Exemple #1
0
    def __init__(self, rdd, path, field_names, indices=None, numSplits=None):
        """Initialise from parent `rdd` and create the output directory `path`.

        Args:
            rdd: parent RDD; its splits are grouped into this RDD's splits.
            path: directory to create. It must not exist yet.
            field_names: field names, either a sequence or a single string
                whose names are separated by commas and/or whitespace.
            indices: optional index field names, given as a sequence or as a
                single string in the same separated format. Every entry must
                be one of `field_names`.
            numSplits: requested number of splits. Capped at the number of
                splits of `rdd`, which is also used when it is falsy.

        Raises:
            RuntimeError: if `path` already exists.
            ValueError: if a field name is duplicated, or an index field is
                not in the field list.
        """
        RDD.__init__(self, rdd.ctx)
        self.prev = rdd
        # 600 on top of the parent's requirement; the unit is whatever
        # `rdd.mem` uses (not visible here).
        self.mem = rdd.mem + 600
        self.path = path
        if os.path.exists(path):
            raise RuntimeError('path already exists: %s' % path)

        if isinstance(field_names, basestring):
            field_names = field_names.replace(',', ' ').split()

        if len(set(field_names)) != len(field_names):
            raise ValueError('duplicated field names')

        self.fields = [str(name) for name in field_names]
        if isinstance(indices, types.StringTypes):
            indices = indices.replace(',', ' ').split()

        self.indices = set()
        if indices:
            for i in indices:
                i = str(i)
                if i not in self.fields:
                    raise ValueError('index field %s not in field list' % i)

                self.indices.add(i)

        # Create the directory only once every argument has been validated.
        # Creating it earlier left an empty directory behind on a ValueError,
        # so a corrected retry failed with 'path already exists'.
        os.makedirs(path)

        prev_splits = len(rdd)
        numSplits = min(numSplits or prev_splits, prev_splits)
        self.numSplits = numSplits
        # s[i] is the index of the first parent split in group i; boundaries
        # are spread evenly over the parent splits and rounded.
        s = [int(round(1.0*prev_splits/numSplits*i)) for i in xrange(numSplits + 1)]
        self._splits = [MultiSplit(i, rdd.splits[s[i]:s[i+1]]) for i in xrange(numSplits)]
        self.dependencies = [OneToRangeDependency(rdd, int(prev_splits/numSplits),
                                                  prev_splits)]
Exemple #2
0
 def __init__(self, rdd, filters):
     """Initialise from a parent `rdd` and store `filters`.

     The parent is recorded as a one-to-one dependency and the splits are
     obtained from `self._get_splits()`.
     """
     RDD.__init__(self, rdd.ctx)
     self.rdd, self.filters = rdd, filters
     # Never request less memory than the parent RDD does.
     self.mem = max(self.mem, rdd.mem)
     self.dependencies = [OneToOneDependency(rdd)]
     # Computed last, after the attributes above have been set.
     self._splits = self._get_splits()
Exemple #3
0
 def __init__(self, rdd, filters):
     """Set up this RDD on top of `rdd`, keeping `filters` for later use.

     Registers `rdd` as a one-to-one dependency and takes the splits from
     `self._get_splits()`.
     """
     RDD.__init__(self, rdd.ctx)
     self.rdd, self.filters = rdd, filters
     # Use the larger of our own and the parent's memory setting.
     self.mem = max(self.mem, rdd.mem)
     self.dependencies = [OneToOneDependency(rdd)]
     # Done at the end so the attributes above are already in place.
     self._splits = self._get_splits()
Exemple #4
0
    def __init__(self, ctx, path, fields=None, taskMemory=None):
        """Build one RDD over every file found under `path`.

        Args:
            ctx: context passed to `RDD.__init__` and to each `TabularFileRDD`.
            path: a single path string, or an iterable of path strings.
            fields: passed through unchanged to each `TabularFileRDD`.
            taskMemory: when truthy, overrides `self.mem`.
        """
        RDD.__init__(self, ctx)
        if taskMemory:
            self.mem = taskMemory

        if isinstance(path, six.string_types):
            files = self._get_files(path)
        else:
            # `_get_files` yields the files of ONE path (see the branch
            # above), so the per-path results must be flattened.
            # `chain(gen)` with a single generator argument does not flatten
            # and would hand whole iterables to TabularFileRDD.
            files = chain.from_iterable(self._get_files(p) for p in path)

        rdds = [TabularFileRDD(ctx, f, fields) for f in files]
        # Number the splits consecutively across all per-file RDDs; the next
        # index is simply the number of splits collected so far.
        self._splits = []
        for rdd in rdds:
            for sp in rdd.splits:
                self._splits.append(TabularSplit(len(self._splits), rdd, sp))
        self._dependencies = [OneToOneDependency(rdd) for rdd in rdds]
        # Only the first per-file RDD is spelled out in the repr.
        self.repr_name = '<%s %d %s...>' % (self.__class__.__name__, len(rdds),
                                            ','.join(
                                                str(rdd) for rdd in rdds[:1]))
        # Preferred locations are delegated to the per-file RDD that owns
        # the wrapped split.
        self._preferred_locs = {}
        for split in self._splits:
            self._preferred_locs[split] = split.rdd.preferredLocations(
                split.split)
Exemple #5
0
 def __init__(self, rdd, filters):
     """Initialise from a parent `rdd` and store `filters`.

     Records `rdd` as a one-to-one dependency, takes the splits from
     `self._get_splits()`, and asks `rdd` for the preferred locations of
     each of those splits.
     """
     RDD.__init__(self, rdd.ctx)
     self.rdd, self.filters = rdd, filters
     # Never request less memory than the parent RDD does.
     self.mem = max(self.mem, rdd.mem)
     self._dependencies = [OneToOneDependency(rdd)]
     self._splits = self._get_splits()
     cls_name = self.__class__.__name__
     self.repr_name = '<%s %s>' % (cls_name, rdd)
     self._preferred_locs = {
         sp: rdd.preferredLocations(sp) for sp in self._splits
     }
Exemple #6
0
    def __init__(self, ctx, path, fields = None, taskMemory=None):
        """Build one RDD over every file found under `path`.

        Args:
            ctx: context passed to `RDD.__init__` and to each `TabularFileRDD`.
            path: a single path string, or an iterable of path strings.
            fields: passed through unchanged to each `TabularFileRDD`.
            taskMemory: when truthy, overrides `self.mem`.
        """
        RDD.__init__(self, ctx)
        if taskMemory:
            self.mem = taskMemory

        if isinstance(path, basestring):
            files = self._get_files(path)
        else:
            # `_get_files` yields the files of ONE path (see the branch
            # above), so the per-path results must be flattened.
            # `chain(gen)` with a single generator argument does not flatten
            # and would hand whole iterables to TabularFileRDD.
            files = chain.from_iterable(self._get_files(p) for p in path)

        self.rdds = [TabularFileRDD(ctx, f, fields) for f in files]
        # One TabularSplit per split of every per-file RDD, in file order.
        self._splits = [TabularSplit(rdd, sp) for rdd in self.rdds for sp in rdd.splits]
        self.dependencies = [OneToOneDependency(rdd) for rdd in self.rdds]
Exemple #7
0
    def __init__(self, ctx, path, fields=None, taskMemory=None):
        """Build one RDD over every file found under `path`.

        Args:
            ctx: context passed to `RDD.__init__` and to each `TabularFileRDD`.
            path: a single path string, or an iterable of path strings.
            fields: passed through unchanged to each `TabularFileRDD`.
            taskMemory: when truthy, overrides `self.mem`.
        """
        RDD.__init__(self, ctx)
        if taskMemory:
            self.mem = taskMemory

        if isinstance(path, basestring):
            files = self._get_files(path)
        else:
            # `_get_files` yields the files of ONE path (see the branch
            # above), so the per-path results must be flattened.
            # `chain(gen)` with a single generator argument does not flatten
            # and would hand whole iterables to TabularFileRDD.
            files = chain.from_iterable(self._get_files(p) for p in path)

        self.rdds = [TabularFileRDD(ctx, f, fields) for f in files]
        # Number the splits consecutively across all per-file RDDs; the next
        # index is simply the number of splits collected so far.
        self._splits = []
        for rdd in self.rdds:
            for sp in rdd.splits:
                self._splits.append(TabularSplit(len(self._splits), rdd, sp))
        self.dependencies = [OneToOneDependency(rdd) for rdd in self.rdds]
Exemple #8
0
    def __init__(self, rdd, path, field_names, indices=None, numSplits=None):
        """Initialise from parent `rdd` and create the output directory `path`.

        Args:
            rdd: parent RDD; its splits are grouped into this RDD's splits.
            path: directory to create. It must not exist yet.
            field_names: field names, either a sequence or a single string
                whose names are separated by commas and/or whitespace.
            indices: optional index field names, in the same formats as
                `field_names`. Every entry must be one of `field_names`.
            numSplits: requested number of splits. Capped at the number of
                splits of `rdd`, which is also used when it is falsy.

        Raises:
            RuntimeError: if `path` already exists.
            ValueError: if a field name is duplicated, or an index field is
                not in the field list.
        """
        RDD.__init__(self, rdd.ctx)
        self.prev = rdd
        # 600 on top of the parent's requirement; the unit is whatever
        # `rdd.mem` uses (not visible here).
        self.mem = rdd.mem + 600
        self.path = path
        if os.path.exists(path):
            raise RuntimeError('path already exists: %s' % path)

        if isinstance(field_names, six.string_types):
            field_names = field_names.replace(',', ' ').split()

        if len(set(field_names)) != len(field_names):
            raise ValueError('duplicated field names')

        self.fields = [str(name) for name in field_names]
        # Same string test as for `field_names`, so that a unicode string on
        # Python 2 is split here too instead of being iterated per character.
        if isinstance(indices, six.string_types):
            indices = indices.replace(',', ' ').split()

        self.indices = set()
        if indices:
            for i in indices:
                i = str(i)
                if i not in self.fields:
                    raise ValueError('index field %s not in field list' % i)

                self.indices.add(i)

        # Create the directory only once every argument has been validated.
        # Creating it earlier left an empty directory behind on a ValueError,
        # so a corrected retry failed with 'path already exists'.
        os.makedirs(path)

        prev_splits = len(rdd)
        numSplits = min(numSplits or prev_splits, prev_splits)
        self.numSplits = numSplits
        # s[i] is the index of the first parent split in group i; boundaries
        # are spread evenly over the parent splits and rounded.
        s = [
            int(round(1.0 * prev_splits / numSplits * i))
            for i in range(numSplits + 1)
        ]
        self._splits = [
            MultiSplit(i, rdd.splits[s[i]:s[i + 1]]) for i in range(numSplits)
        ]
        self._dependencies = [
            OneToRangeDependency(rdd, int(prev_splits / numSplits),
                                 prev_splits)
        ]
        self.repr_name = '<OutputTabularRDD %s %s>' % (path, rdd)
Exemple #9
0
    def __init__(self, ctx, path, fields = None, taskMemory=None):
        """Build one RDD over every file found under `path`.

        Args:
            ctx: context passed to `RDD.__init__` and to each `TabularFileRDD`.
            path: a single path string, or an iterable of path strings.
            fields: passed through unchanged to each `TabularFileRDD`.
            taskMemory: when truthy, overrides `self.mem`.
        """
        RDD.__init__(self, ctx)
        if taskMemory:
            self.mem = taskMemory

        if isinstance(path, basestring):
            files = self._get_files(path)
        else:
            # `_get_files` yields the files of ONE path (see the branch
            # above), so the per-path results must be flattened.
            # `chain(gen)` with a single generator argument does not flatten
            # and would hand whole iterables to TabularFileRDD.
            files = chain.from_iterable(self._get_files(p) for p in path)

        rdds = [TabularFileRDD(ctx, f, fields) for f in files]
        # Number the splits consecutively across all per-file RDDs; the next
        # index is simply the number of splits collected so far.
        self._splits = []
        for rdd in rdds:
            for sp in rdd.splits:
                self._splits.append(TabularSplit(len(self._splits), rdd, sp))
        self._dependencies = [OneToOneDependency(rdd) for rdd in rdds]
        # Only the first per-file RDD is spelled out in the repr.
        self.repr_name = '<%s %d %s...>' % (self.__class__.__name__, len(rdds),
                                  ','.join(str(rdd) for rdd in rdds[:1]))
        # Preferred locations are delegated to the per-file RDD that owns
        # the wrapped split.
        self._preferred_locs = {}
        for split in self._splits:
            self._preferred_locs[split] = split.rdd.preferredLocations(split.split)
Exemple #10
0
 def __getstate__(self):
     """Return the state to pickle as a `(state, serialized_filters)` pair.

     `state` is the base `RDD` state with its 'filters' entry removed; the
     filters travel separately, serialized with `dumps`.
     """
     state = RDD.__getstate__(self)
     del state['filters']
     serialized_filters = dumps(self.filters)
     return (state, serialized_filters)
Exemple #11
0
 def _clear_dependencies(self):
     """Run the base `RDD` clean-up, then drop the reference held in `self.rdd`."""
     RDD._clear_dependencies(self)
     self.rdd = None
Exemple #12
0
 def __getstate__(self):
     """Build the pickling state: base `RDD` state plus serialized filters.

     The 'filters' entry is deleted from the base state and the filters are
     returned alongside it, serialized with `dumps`.
     """
     base_state = RDD.__getstate__(self)
     del base_state['filters']
     packed_filters = dumps(self.filters)
     return (base_state, packed_filters)