Example #1
0
    def __init__(self, ctx, path, fields=None, taskMemory=None):
        """Set up an RDD that spans one or more tabular files.

        ``path`` may be a single string or an iterable of path strings.
        Each path goes through ``self._get_files`` and every file it
        produces is wrapped in a ``TabularFileRDD`` built with ``fields``.
        ``chain`` is expected to flatten the per-path results (assumed to
        be the project's helper -- not verifiable from this block).

        A truthy ``taskMemory`` is stored as ``self.mem``.
        """
        RDD.__init__(self, ctx)
        if taskMemory:
            self.mem = taskMemory

        files = (self._get_files(path)
                 if isinstance(path, six.string_types)
                 else chain(self._get_files(p) for p in path))
        file_rdds = [TabularFileRDD(ctx, name, fields) for name in files]

        # Number the splits consecutively across all per-file RDDs.
        pairs = ((file_rdd, part)
                 for file_rdd in file_rdds
                 for part in file_rdd.splits)
        self._splits = [TabularSplit(index, file_rdd, part)
                        for index, (file_rdd, part) in enumerate(pairs)]

        self._dependencies = [OneToOneDependency(file_rdd)
                              for file_rdd in file_rdds]

        # Only the first underlying RDD is spelled out in the repr.
        preview = ','.join(map(str, file_rdds[:1]))
        self.repr_name = '<%s %d %s...>' % (
            self.__class__.__name__, len(file_rdds), preview)

        # Each split's preferred locations come from the RDD that owns it.
        self._preferred_locs = dict(
            (sp, sp.rdd.preferredLocations(sp.split)) for sp in self._splits)
Example #2
0
    def compute(self, split):
        """Write the rows behind ``split`` to a striped file and yield its path.

        Rows are pulled from ``self.prev`` for every sub-split in
        ``split.splits``, cut down to ``len(self.fields)`` values and
        buffered one list per field.  When the buffered data no longer fits
        in a stripe, the buffers are marshalled, compressed and written as
        one stripe.  After the last row any remaining buffered values are
        written as a final unpadded stripe, followed by a footer made of the
        pickled indices, the marshalled field list and the two footer
        lengths.

        This is a generator: it yields the output path exactly once, after
        the file has been written.

        Raises:
            RuntimeError: if a single marshalled row exceeds
                ``STRIPE_DATA_SIZE``.
            AssertionError: if a compressed stripe header is not smaller
                than ``STRIPE_HEADER_SIZE``.
        """
        # One value buffer per field (column-wise storage).
        buffers = [list() for i in self.fields]
        # Estimate of the data bytes still free in the current stripe.
        remain_size = STRIPE_DATA_SIZE
        path = os.path.join(self.path, '%04d.dt' % split.index)
        # One AdaptiveIndex per entry of self.indices.
        indices = dict((i, AdaptiveIndex()) for i in self.indices)

        def write_stripe(f, compressed, header, padding=True):
            """Write one stripe to ``f``.

            Layout: packed length of the compressed header, the compressed
            header itself, then each chunk of ``compressed`` in order.  With
            ``padding`` true, NUL characters fill the stripe up to
            ``STRIPE_SIZE``.
            """
            h = compress(marshal.dumps(header))
            assert len(h) < STRIPE_HEADER_SIZE
            f.write(struct.pack('I', len(h)))
            f.write(h)
            # Space left after the header; the 4 presumably accounts for the
            # packed length prefix written above -- TODO confirm native 'I'
            # size on target platforms.
            padding_size = STRIPE_SIZE - len(h) - 4
            for c in compressed:
                f.write(c)
                padding_size -= len(c)

            if padding:
                f.write('\0' * padding_size)

        with atomic_file(path) as f:
            stripe_id = 0
            # NOTE(review): `chain` receives a single generator here, so it is
            # presumably the project's flattening helper; each `it` is then
            # treated as one row.
            for it in chain(self.prev.iterator(sp) for sp in split.splits):
                row = it[:len(self.fields)]
                size = len(marshal.dumps(tuple(row)))
                if size > STRIPE_DATA_SIZE:
                    raise RuntimeError('Row too big')

                if size > remain_size:
                    # The running estimate uses uncompressed row sizes, so
                    # compress the buffers to measure the space really used
                    # before deciding to flush.
                    compressed = [compress(marshal.dumps(tuple(b))) for b in buffers]
                    _sizes = tuple(map(len, compressed))
                    _remain_size = STRIPE_DATA_SIZE - sum(_sizes)
                    if size > _remain_size:
                        # Still no room: flush a padded stripe (the header is
                        # the tuple of compressed column sizes) and start over.
                        write_stripe(f, compressed, _sizes)
                        buffers = [list() for i in self.fields]
                        remain_size = STRIPE_DATA_SIZE
                        stripe_id += 1
                    else:
                        # Compression freed enough room; keep filling this
                        # stripe using the measured remainder.
                        remain_size = _remain_size

                remain_size -= size
                for i, value in enumerate(row):
                    buffers[i].append(value)
                    field = self.fields[i]
                    if field in self.indices:
                        # Record which stripe this value will be stored in.
                        indices[field].add(value, stripe_id)

            # Flush whatever is left as a final stripe, without padding.
            if any(buffers):
                compressed = [compress(marshal.dumps(tuple(b))) for b in buffers]
                _sizes = tuple(map(len, compressed))
                write_stripe(f, compressed, _sizes, False)

            # Footer: indices, then fields, then both lengths.  The lengths
            # are packed fields-first, the reverse of the order in which the
            # two blobs are written.
            footer_indices = zlib.compress(cPickle.dumps(indices, -1))
            footer_fields = compress(marshal.dumps(self.fields))
            f.write(footer_indices)
            f.write(footer_fields)
            f.write(struct.pack('II', len(footer_fields), len(footer_indices)))

        yield path
Example #3
0
    def __init__(self, ctx, path, fields = None, taskMemory=None):
        """Set up an RDD that spans one or more tabular files.

        ``path`` may be a single string or an iterable of path strings.
        Each path goes through ``self._get_files`` and every file it
        produces is wrapped in a ``TabularFileRDD`` built with ``fields``.
        ``chain`` is expected to flatten the per-path results (assumed to
        be the project's helper -- not verifiable from this block).

        A truthy ``taskMemory`` is stored as ``self.mem``.
        """
        RDD.__init__(self, ctx)
        if taskMemory:
            self.mem = taskMemory

        if not isinstance(path, basestring):
            files = chain(self._get_files(p) for p in path)
        else:
            files = self._get_files(path)

        # One TabularFileRDD per file.
        file_rdds = []
        for name in files:
            file_rdds.append(TabularFileRDD(ctx, name, fields))
        self.rdds = file_rdds

        # One TabularSplit per (rdd, split) pair, in RDD order.
        collected = []
        for file_rdd in self.rdds:
            for part in file_rdd.splits:
                collected.append(TabularSplit(file_rdd, part))
        self._splits = collected

        # Each underlying RDD becomes a one-to-one dependency.
        deps = []
        for file_rdd in self.rdds:
            deps.append(OneToOneDependency(file_rdd))
        self.dependencies = deps
Example #4
0
    def __init__(self, ctx, path, fields=None, taskMemory=None):
        """Set up an RDD that spans one or more tabular files.

        ``path`` may be a single string or an iterable of path strings.
        Each path goes through ``self._get_files`` and every file it
        produces is wrapped in a ``TabularFileRDD`` built with ``fields``.
        ``chain`` is expected to flatten the per-path results (assumed to
        be the project's helper -- not verifiable from this block).

        A truthy ``taskMemory`` is stored as ``self.mem``.
        """
        RDD.__init__(self, ctx)
        if taskMemory:
            self.mem = taskMemory

        files = (self._get_files(path)
                 if isinstance(path, basestring)
                 else chain(self._get_files(p) for p in path))

        self.rdds = [TabularFileRDD(ctx, name, fields) for name in files]

        # Flatten to (rdd, split) pairs so enumerate() can hand out the
        # consecutive split index.
        pairs = ((file_rdd, part)
                 for file_rdd in self.rdds
                 for part in file_rdd.splits)
        self._splits = [TabularSplit(index, file_rdd, part)
                        for index, (file_rdd, part) in enumerate(pairs)]

        self.dependencies = [OneToOneDependency(file_rdd)
                             for file_rdd in self.rdds]
Example #5
0
    def __init__(self, ctx, path, fields = None, taskMemory=None):
        """Set up an RDD that spans one or more tabular files.

        ``path`` may be a single string or an iterable of path strings.
        Each path goes through ``self._get_files`` and every file it
        produces is wrapped in a ``TabularFileRDD`` built with ``fields``.
        ``chain`` is expected to flatten the per-path results (assumed to
        be the project's helper -- not verifiable from this block).

        A truthy ``taskMemory`` is stored as ``self.mem``.
        """
        RDD.__init__(self, ctx)
        if taskMemory:
            self.mem = taskMemory

        if not isinstance(path, basestring):
            files = chain(self._get_files(p) for p in path)
        else:
            files = self._get_files(path)

        file_rdds = [TabularFileRDD(ctx, name, fields) for name in files]

        # A split's index is its position in the combined list, so the
        # current length of that list is the next index to hand out.
        numbered = []
        for file_rdd in file_rdds:
            for part in file_rdd.splits:
                numbered.append(TabularSplit(len(numbered), file_rdd, part))
        self._splits = numbered

        self._dependencies = [OneToOneDependency(file_rdd)
                              for file_rdd in file_rdds]

        # Only the first underlying RDD is spelled out in the repr.
        first_only = ','.join(str(file_rdd) for file_rdd in file_rdds[:1])
        self.repr_name = '<%s %d %s...>' % (
            self.__class__.__name__, len(file_rdds), first_only)

        # Each split's preferred locations come from the RDD that owns it.
        self._preferred_locs = {
            sp: sp.rdd.preferredLocations(sp.split) for sp in self._splits
        }
Example #6
0
 def filter(self, fun):
     """Return the chained positions of index entries whose key passes ``fun``.

     Only applies when ``self.index_type`` is ``BITMAP_INDEX``; for any
     other index type the method returns ``None``.
     """
     if self.index_type == BITMAP_INDEX:
         matched = (entry.positions()
                    for key, entry in self.index.items()
                    if fun(key))
         return chain(matched)
     return None
Example #7
0
 def filter(self, fun):
     """Select positions from the index by key predicate.

     When ``self.index_type`` equals ``BITMAP_INDEX``, every value of
     ``self.index`` whose key satisfies ``fun`` contributes its
     ``positions()``, and the results are handed to ``chain``.  Any other
     index type yields ``None``.
     """
     if self.index_type == BITMAP_INDEX:
         return chain(
             value.positions()
             for key, value in self.index.items()
             if fun(key)
         )
     return None