def parallelize(self, c, numSlices=None): """ Distribute a local Python collection to form an RDD. >>> sc.parallelize(range(5), 5).glom().collect() [[0], [1], [2], [3], [4]] """ numSlices = numSlices or self.defaultParallelism # Calling the Java parallelize() method with an ArrayList is too slow, # because it sends O(n) Py4J commands. As an alternative, serialized # objects are written to a file and loaded through textFile(). tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir) # Make sure we distribute data evenly if it's smaller than self.batchSize if "__len__" not in dir(c): c = list(c) # Make it a list so we can compute its length batchSize = min(len(c) // numSlices, self._batchSize) if batchSize > 1: serializer = BatchedSerializer(self._unbatched_serializer, batchSize) else: serializer = self._unbatched_serializer serializer.dump_stream(c, tempFile) tempFile.close() readRDDFromFile = self._jvm.PythonRDD.readRDDFromFile jrdd = readRDDFromFile(self._jsc, tempFile.name, numSlices) return RDD(jrdd, self, serializer)
def parallelize(self, c, numSlices=None): """ Distribute a local Python collection to form an RDD. >>> sc.parallelize(range(5), 5).glom().collect() [[0], [1], [2], [3], [4]] """ numSlices = numSlices or self.defaultParallelism # Calling the Java parallelize() method with an ArrayList is too slow, # because it sends O(n) Py4J commands. As an alternative, serialized # objects are written to a file and loaded through textFile(). tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir) # Make sure we distribute data evenly if it's smaller than self.batchSize if "__len__" not in dir(c): c = list(c) # Make it a list so we can compute its length batchSize = min(len(c) // numSlices, self._batchSize) if batchSize > 1: serializer = BatchedSerializer(self._unbatched_serializer, batchSize) else: serializer = self._unbatched_serializer serializer.dump_stream(c, tempFile) tempFile.close() readRDDFromFile = self._jvm.PythonRDD.readRDDFromFile jrdd = readRDDFromFile(self._jsc, tempFile.name, numSlices) return RDD(jrdd, self, serializer)
def parallelize(self, c, numSlices=None): """ Distribute a local Python collection to form an RDD. Using xrange is recommended if the input represents a range for performance. 分发一个本地的Python集合以形成一个RDD。如果输入表示的是一个范围推荐使用xrange。 >>> sc.parallelize([0, 2, 3, 4, 6], 5).glom().collect() [[0], [2], [3], [4], [6]] >>> sc.parallelize(xrange(0, 6, 2), 5).glom().collect() [[], [0], [], [2], [4]] """ numSlices = int(numSlices) if numSlices is not None else self.defaultParallelism if isinstance(c, xrange): size = len(c) if size == 0: return self.parallelize([], numSlices) step = c[1] - c[0] if size > 1 else 1 start0 = c[0] def getStart(split): return start0 + int((split * size / numSlices)) * step def f(split, iterator): return xrange(getStart(split), getStart(split + 1), step) return self.parallelize([], numSlices).mapPartitionsWithIndex(f) # Calling the Java parallelize() method with an ArrayList is too slow, # because it sends O(n) Py4J commands. As an alternative, serialized # objects are written to a file and loaded through textFile(). #对一个ArrayList调用Java的并行化方法速度太慢,因为它要发送O(n)Py4J命令。作为替代,序列化对象被写入到一个文件中,并通过textFile()加载。 tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir) try: # Make sure we distribute data evenly if it's smaller than self.batchSize #确保我们均匀地分布数据如果它小于self.batchsize if "__len__" not in dir(c): c = list(c) # Make it a list so we can compute its length将其转换成list,以便可以计算它的长度 batchSize = max(1, min(len(c) // numSlices, self._batchSize or 1024))#计算块大小 serializer = BatchedSerializer(self._unbatched_serializer, batchSize)#获取分块后的序列化对象 serializer.dump_stream(c, tempFile)#写入临时文件中 tempFile.close()#关闭临时文件 readRDDFromFile = self._jvm.PythonRDD.readRDDFromFile jrdd = readRDDFromFile(self._jsc, tempFile.name, numSlices) finally: # readRDDFromFile eagerily reads the file so we can delete right after. #readRDDFromFile将快速读取文件,以便我们可以在之后删除 os.unlink(tempFile.name)#删除临时文件,如果文件是一个目录则返回一个错误 return RDD(jrdd, self, serializer)
def parallelize(self, c, numSlices=None): """ Distribute a local Python collection to form an RDD. Using xrange is recommended if the input represents a range for performance. >>> sc.parallelize([0, 2, 3, 4, 6], 5).glom().collect() [[0], [2], [3], [4], [6]] >>> sc.parallelize(xrange(0, 6, 2), 5).glom().collect() [[], [0], [], [2], [4]] """ numSlices = int( numSlices) if numSlices is not None else self.defaultParallelism if isinstance(c, xrange): size = len(c) if size == 0: return self.parallelize([], numSlices) step = c[1] - c[0] if size > 1 else 1 start0 = c[0] def getStart(split): return start0 + int((split * size / numSlices)) * step def f(split, iterator): return xrange(getStart(split), getStart(split + 1), step) return self.parallelize([], numSlices).mapPartitionsWithIndex(f) # Calling the Java parallelize() method with an ArrayList is too slow, # because it sends O(n) Py4J commands. As an alternative, serialized # objects are written to a file and loaded through textFile(). tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir) try: # Make sure we distribute data evenly if it's smaller than self.batchSize if "__len__" not in dir(c): c = list(c) # Make it a list so we can compute its length batchSize = max(1, min(len(c) // numSlices, self._batchSize or 1024)) serializer = BatchedSerializer(self._unbatched_serializer, batchSize) serializer.dump_stream(c, tempFile) tempFile.close() readRDDFromFile = self._jvm.PythonRDD.readRDDFromFile jrdd = readRDDFromFile(self._jsc, tempFile.name, numSlices) finally: # readRDDFromFile eagerily reads the file so we can delete right after. os.unlink(tempFile.name) return RDD(jrdd, self, serializer)
def parallelize(self, c, numSlices=None): """ Distribute a local Python collection to form an RDD. Using xrange is recommended if the input represents a range for performance. >>> sc.parallelize([0, 2, 3, 4, 6], 5).glom().collect() [[0], [2], [3], [4], [6]] >>> sc.parallelize(xrange(0, 6, 2), 5).glom().collect() [[], [0], [], [2], [4]] """ numSlices = int(numSlices) if numSlices is not None else self.defaultParallelism if isinstance(c, xrange): size = len(c) if size == 0: return self.parallelize([], numSlices) step = c[1] - c[0] if size > 1 else 1 start0 = c[0] def getStart(split): return start0 + int((split * size / numSlices)) * step def f(split, iterator): return xrange(getStart(split), getStart(split + 1), step) return self.parallelize([], numSlices).mapPartitionsWithIndex(f) # Calling the Java parallelize() method with an ArrayList is too slow, # because it sends O(n) Py4J commands. As an alternative, serialized # objects are written to a file and loaded through textFile(). tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir) try: # Make sure we distribute data evenly if it's smaller than self.batchSize if "__len__" not in dir(c): c = list(c) # Make it a list so we can compute its length batchSize = max(1, min(len(c) // numSlices, self._batchSize or 1024)) serializer = BatchedSerializer(self._unbatched_serializer, batchSize) serializer.dump_stream(c, tempFile) tempFile.close() readRDDFromFile = self._jvm.PythonRDD.readRDDFromFile jrdd = readRDDFromFile(self._jsc, tempFile.name, numSlices) finally: # readRDDFromFile eagerily reads the file so we can delete right after. os.unlink(tempFile.name) return RDD(jrdd, self, serializer)
class ExternalList(object): """ ExternalList can have many items which cannot be hold in memory in the same time. >>> l = ExternalList(list(range(100))) >>> len(l) 100 >>> l.append(10) >>> len(l) 101 >>> for i in range(20240): ... l.append(i) >>> len(l) 20341 >>> import pickle >>> l2 = pickle.loads(pickle.dumps(l)) >>> len(l2) 20341 >>> list(l2)[100] 10 """ LIMIT = 10240 def __init__(self, values): self.values = values self.count = len(values) self._file = None self._ser = None def __getstate__(self): if self._file is not None: self._file.flush() with os.fdopen(os.dup(self._file.fileno()), "rb") as f: f.seek(0) serialized = f.read() else: serialized = b'' return self.values, self.count, serialized def __setstate__(self, item): self.values, self.count, serialized = item if serialized: self._open_file() self._file.write(serialized) else: self._file = None self._ser = None def __iter__(self): if self._file is not None: self._file.flush() # read all items from disks first with os.fdopen(os.dup(self._file.fileno()), 'rb') as f: f.seek(0) for v in self._ser.load_stream(f): yield v for v in self.values: yield v def __len__(self): return self.count def append(self, value): self.values.append(value) self.count += 1 # dump them into disk if the key is huge if len(self.values) >= self.LIMIT: self._spill() def _open_file(self): dirs = _get_local_dirs("objects") d = dirs[id(self) % len(dirs)] if not os.path.exists(d): os.makedirs(d) p = os.path.join(d, str(id(self))) self._file = open(p, "wb+", 65536) self._ser = BatchedSerializer(CompressedSerializer(PickleSerializer()), 1024) os.unlink(p) def __del__(self): if self._file: self._file.close() self._file = None def _spill(self): """ dump the values into disk """ global MemoryBytesSpilled, DiskBytesSpilled if self._file is None: self._open_file() used_memory = get_used_memory() pos = self._file.tell() self._ser.dump_stream(self.values, self._file) self.values = [] gc.collect() DiskBytesSpilled += self._file.tell() - pos MemoryBytesSpilled += (used_memory - get_used_memory()) << 20
class ExternalList(object): """ ExternalList can have many items which cannot be hold in memory in the same time. Examples -------- >>> l = ExternalList(list(range(100))) >>> len(l) 100 >>> l.append(10) >>> len(l) 101 >>> for i in range(20240): ... l.append(i) >>> len(l) 20341 >>> import pickle >>> l2 = pickle.loads(pickle.dumps(l)) >>> len(l2) 20341 >>> list(l2)[100] 10 """ LIMIT = 10240 def __init__(self, values): self.values = values self.count = len(values) self._file = None self._ser = None def __getstate__(self): if self._file is not None: self._file.flush() with os.fdopen(os.dup(self._file.fileno()), "rb") as f: f.seek(0) serialized = f.read() else: serialized = b"" return self.values, self.count, serialized def __setstate__(self, item): self.values, self.count, serialized = item if serialized: self._open_file() self._file.write(serialized) else: self._file = None self._ser = None def __iter__(self): if self._file is not None: self._file.flush() # read all items from disks first with os.fdopen(os.dup(self._file.fileno()), "rb") as f: f.seek(0) for v in self._ser.load_stream(f): yield v for v in self.values: yield v def __len__(self): return self.count def append(self, value): self.values.append(value) self.count += 1 # dump them into disk if the key is huge if len(self.values) >= self.LIMIT: self._spill() def _open_file(self): dirs = _get_local_dirs("objects") d = dirs[id(self) % len(dirs)] if not os.path.exists(d): os.makedirs(d) p = os.path.join(d, str(id(self))) self._file = open(p, "w+b", 65536) self._ser = BatchedSerializer(CompressedSerializer(PickleSerializer()), 1024) os.unlink(p) def __del__(self): if self._file: self._file.close() self._file = None def _spill(self): """dump the values into disk""" global MemoryBytesSpilled, DiskBytesSpilled if self._file is None: self._open_file() used_memory = get_used_memory() pos = self._file.tell() self._ser.dump_stream(self.values, self._file) self.values = [] gc.collect() DiskBytesSpilled += self._file.tell() - pos MemoryBytesSpilled += max(used_memory - get_used_memory(), 0) << 20