Beispiel #1
0
    def gen_rdd(self, **kwargs):
        """Build an RDD of typed rows from the files behind ``self.expr``.

        Keyword Args:
            use_limit: when true and the source is a single path (no
                selectors), only the visible files of the first directory
                that contains any are read. Defaults to False.

        Returns:
            An RDD whose rows are lists holding one converted value per
            entry of ``self.columns``.
        """
        use_limit = kwargs.get('use_limit', False)

        # One converter per column, chosen by the column type. Empty/falsy
        # cells map to the zero value of the column type.
        converters = []
        for _, t in self.columns:
            if t == Type.INT:
                converters.append(lambda x: int(x) if x else 0)
            elif t == Type.FLOAT:
                # 0.0 (not int 0) keeps a FLOAT column homogeneous.
                converters.append(lambda x: float(x) if x else 0.0)
            else:
                converters.append(lambda x: x)

        expr = self.expr
        if self.selectors:
            # Expand every selector glob below the base path into a
            # de-duplicated list of files; matched directories are walked
            # recursively and hidden files are skipped.
            base = expr.file_path
            found = set()
            for s in self.selectors:
                for p in glob.glob('%s/%s' % (base, s)):
                    if not os.path.isdir(p):
                        found.add(p)
                        continue
                    for root, _, names in walk(p, followlinks=True):
                        found.update(os.path.join(root, name)
                                     for name in names
                                     if not name.startswith('.'))
            path = list(found)
        else:
            path = expr.file_path

        if use_limit and isinstance(path, basestring):
            # Sample only the first directory that holds visible files.
            # Hidden files are skipped here as well, consistent with the
            # selector branch above, so dot-files are never read as data.
            for root, _, names in walk(path, followlinks=True):
                visible = [name for name in names
                           if not name.startswith('.')]
                if visible:
                    path = [os.path.join(root, name) for name in visible]
                    break

        rdd = dpark.textFile(path)
        if expr.expr:
            # NOTE(review): expr.expr is evaluated as Python code appended
            # to 'rdd.'; it must only ever come from a trusted source.
            rdd = eval('rdd.' + expr.expr, global_env, {'rdd': rdd})

        # Rows that are still raw text lines get split into fields; the
        # delimiter is guessed from the first row (tab, then comma, else a
        # single space).
        row = rdd.first()
        if isinstance(row, basestring):
            if '\t' in row:
                rdd = rdd.fromCsv('excel-tab')
            elif ',' in row:
                rdd = rdd.fromCsv('excel')
            else:
                rdd = rdd.map(lambda l: l.split(' '))

        return rdd.map(lambda x: [c(x[i]) for i, c in enumerate(converters)])
Beispiel #2
0
 def table(self, path, **kwargs):
     """Open ``path`` as a table file with columns named by ``.field_names``.

     ``path`` may be a single path or a list/tuple of paths; for a
     list/tuple only the first entry is searched. The first
     ``.field_names`` file found while walking supplies the tab-separated
     column names.

     Extra keyword arguments are forwarded to ``self.tableFile``.

     Raises:
         Exception: if no ``.field_names`` file is found.
     """
     dpath = path[0] if isinstance(path, (list, tuple)) else path
     for root, _, names in walk(dpath):
         if '.field_names' in names:
             # Use a context manager so the handle is closed right away.
             with open(os.path.join(root, '.field_names')) as f:
                 # Strip a trailing line terminator so it does not become
                 # part of the last column name.
                 fields = f.read().rstrip('\r\n').split('\t')
             break
     else:
         # Wrap in a 1-tuple: a multi-element tuple ``path`` would be
         # unpacked by ``%`` and raise TypeError instead of this error.
         raise Exception("no .field_names found in %s" % (path,))
     return self.tableFile(path, **kwargs).asTable(fields)
Beispiel #3
0
 def table(self, path, **kwargs):
     """Return the table file at ``path`` with its columns named.

     The column names are read from the first ``.field_names`` file found
     while walking ``path`` (or its first entry, when ``path`` is a list
     or tuple). The file holds the names separated by tabs.

     Keyword arguments are passed through to ``self.tableFile``.

     Raises:
         Exception: if the walk finds no ``.field_names`` file.
     """
     dpath = path[0] if isinstance(path, (list, tuple)) else path
     for root, _, names in walk(dpath):
         if '.field_names' in names:
             p = os.path.join(root, '.field_names')
             # ``with`` closes the file instead of leaking the handle.
             with open(p) as f:
                 content = f.read()
             # A trailing newline must not end up in the last field name.
             fields = content.rstrip('\r\n').split('\t')
             break
     else:
         # ``(path,)`` keeps ``%`` from unpacking a tuple ``path``, which
         # would raise TypeError for tuples with more than one entry.
         raise Exception("no .field_names found in %s" % (path,))
     return self.tableFile(path, **kwargs).asTable(fields)
Beispiel #4
0
 def _get_files(self, path):
     path = os.path.realpath(path)
     if os.path.isdir(path):
         for root,dirs,names in walk(path):
             for n in sorted(names):
                 if not n.startswith('.'):
                     yield os.path.join(root, n)
                     
     else:
         yield path
Beispiel #5
0
    def _get_files(self, path):
        path = os.path.realpath(path)
        if os.path.isdir(path):
            for root,dirs,names in walk(path):
                for n in sorted(names):
                    if not n.startswith('.'):
                        yield os.path.join(root, n)

        else:
            yield path
Beispiel #6
0
    def textFile(self,
                 path,
                 ext='',
                 followLink=True,
                 maxdepth=0,
                 cls=TextFileRDD,
                 *ka,
                 **kws):
        """Create an RDD over a file, a directory tree, or several paths.

        Args:
            path: a single path, or a list/tuple of paths whose RDDs are
                unioned.
            ext: when ``path`` is a directory, only file names ending with
                this suffix are included.
            followLink: passed to ``walk`` as ``followlinks``; when false,
                files that are symlinks are skipped as well.
            maxdepth: when > 0, limits how deep a directory is descended;
                files directly inside ``path`` are at depth 1.
            cls: RDD class to instantiate. For ``TextFileRDD``, paths ending
                in '.bz2' / '.gz' use ``BZip2FileRDD`` / ``GZipFileRDD``.
            *ka, **kws: forwarded to the RDD constructor.

        Returns:
            A single RDD for a file, or the union of the per-file RDDs for
            a directory or a list of paths.
        """
        self.init()
        if isinstance(path, (list, tuple)):
            return self.union([
                self.textFile(p, ext, followLink, maxdepth, cls, *ka, **kws)
                for p in path
            ])

        path = os.path.realpath(path)

        def create_rdd(cls, path, *ka, **kw):
            # Pick a decompressing RDD by file suffix, but only when the
            # caller did not ask for a specific RDD class.
            if cls is TextFileRDD:
                if path.endswith('.bz2'):
                    return BZip2FileRDD(self, path, *ka, **kw)
                elif path.endswith('.gz'):
                    return GZipFileRDD(self, path, *ka, **kw)
            return cls(self, path, *ka, **kw)

        if not os.path.isdir(path):
            return create_rdd(cls, path, *ka, **kws)

        paths = []
        for root, dirs, names in walk(path, followlinks=followLink):
            for n in sorted(names):
                if n.endswith(ext) and not n.startswith('.'):
                    p = os.path.join(root, n)
                    if followLink or not os.path.islink(p):
                        paths.append(p)

            if maxdepth > 0:
                # Depth of ``root`` relative to ``path`` (``path`` itself
                # is 1). A list comprehension is used instead of
                # len(filter(...)), which only works on Python 2.
                parts = [s for s in root[len(path):].split('/') if s]
                if len(parts) + 1 >= maxdepth:
                    # Prune instead of breaking out of the walk: a break
                    # would also drop sibling directories that are within
                    # maxdepth but have not been visited yet.
                    del dirs[:]
                    continue

            # Descend in sorted order and never into hidden directories.
            dirs[:] = sorted(d for d in dirs if not d.startswith('.'))

        rdds = [create_rdd(cls, p, *ka, **kws) for p in paths]
        return self.union(rdds)
Beispiel #7
0
    def textFile(self, path, ext='', followLink=True, maxdepth=0, cls=TextFileRDD, *ka, **kws):
        """Build an RDD from a file, a directory tree, or a list of paths.

        Args:
            path: one path, or a list/tuple of paths (their RDDs are
                unioned).
            ext: suffix a file name must end with to be picked up from a
                directory.
            followLink: forwarded to ``walk`` as ``followlinks``; when
                false, symlinked files are skipped too.
            maxdepth: when > 0, bounds the directory depth that is read;
                files directly under ``path`` count as depth 1.
            cls: RDD class to create. With ``TextFileRDD``, '.bz2' and '.gz'
                paths are opened with ``BZip2FileRDD`` / ``GZipFileRDD``.
            *ka, **kws: passed on to the RDD constructor.

        Returns:
            One RDD for a plain file, otherwise the union of the RDDs of
            all selected files.
        """
        self.init()
        if isinstance(path, (list, tuple)):
            return self.union([self.textFile(p, ext, followLink, maxdepth, cls, *ka, **kws)
                for p in path])

        path = os.path.realpath(path)
        def create_rdd(cls, path, *ka, **kw):
            # Suffix-based choice of a decompressing RDD applies only to
            # the default TextFileRDD class.
            if cls is TextFileRDD:
                if path.endswith('.bz2'):
                    return BZip2FileRDD(self, path, *ka, **kw)
                elif path.endswith('.gz'):
                    return GZipFileRDD(self, path, *ka, **kw)
            return cls(self, path, *ka, **kw)

        if os.path.isdir(path):
            paths = []
            for root, dirs, names in walk(path, followlinks=followLink):
                for n in sorted(names):
                    if n.endswith(ext) and not n.startswith('.'):
                        p = os.path.join(root, n)
                        if followLink or not os.path.islink(p):
                            paths.append(p)

                # Depth of ``root`` below ``path``; ``path`` itself is 1.
                # (List comprehension rather than len(filter(...)), which
                # fails on Python 3.)
                depth = len([s for s in root[len(path):].split('/') if s]) + 1
                if maxdepth > 0 and depth >= maxdepth:
                    # Stop descending here, but keep walking: breaking out
                    # would skip sibling directories still within maxdepth.
                    dirs[:] = []
                else:
                    # Visit sub-directories in sorted order, skipping
                    # hidden ones.
                    dirs[:] = sorted(d for d in dirs if not d.startswith('.'))

            rdds = [create_rdd(cls, p, *ka, **kws)
                     for p in paths]
            return self.union(rdds)
        else:
            return create_rdd(cls, path, *ka, **kws)