コード例 #1
0
ファイル: tabular.py プロジェクト: zouzias/dpark
    def _get_files(self, path):
        """Yield the file paths that *path* stands for.

        *path* is first resolved with ``os.path.realpath``.  If the result
        is not a directory it is yielded as the only item.  Otherwise the
        tree is traversed with ``walk`` and, for each visited directory,
        every entry whose name does not start with ``'.'`` is yielded in
        sorted name order, joined onto that directory.
        """
        resolved = os.path.realpath(path)

        # Single file (or anything that is not a directory): nothing to walk.
        if not os.path.isdir(resolved):
            yield resolved
            return

        for dirpath, _subdirs, filenames in walk(resolved):
            for filename in sorted(filenames):
                if filename.startswith('.'):
                    # Dot-files are skipped.
                    continue
                yield os.path.join(dirpath, filename)
コード例 #2
0
ファイル: tabular.py プロジェクト: douban/dpark
    def _get_files(self, path):
        """Generate the concrete files behind *path*.

        The argument is canonicalised with ``os.path.realpath``.  A
        directory is expanded via ``walk``: within each directory the
        names are sorted and those beginning with ``'.'`` are dropped.
        Anything that is not a directory is yielded unchanged (after
        canonicalisation).
        """
        target = os.path.realpath(path)
        if os.path.isdir(target):
            for folder, _, entries in walk(target):
                visible = [e for e in sorted(entries) if not e.startswith('.')]
                for entry in visible:
                    yield os.path.join(folder, entry)
        else:
            yield target
コード例 #3
0
ファイル: context.py プロジェクト: rohithreddy/dpark
    def table(self, path, **kwargs):
        """Load the table stored at *path*, using its ``.field_names`` file.

        *path* may be a single path or a list/tuple of paths; in the latter
        case only the first entry is searched for the ``.field_names`` file.
        The search uses ``walk`` and stops at the first directory that
        contains such a file.  Its content is split on tab characters to
        obtain the field names.

        Extra keyword arguments are forwarded to ``self.tableFile``.

        Returns the result of ``self.tableFile(path, **kwargs).asTable(fields)``.

        Raises ``Exception`` when no ``.field_names`` file is found.
        """
        dpath = path[0] if isinstance(path, (list, tuple)) else path
        for root, dirs, names in walk(dpath):
            if '.field_names' in names:
                p = os.path.join(root, '.field_names')
                with open(p) as f:
                    fields = f.read().split('\t')

                break
        else:
            # Wrap in a 1-tuple: with a bare tuple on the right of ``%`` its
            # items are taken as separate format arguments, so a
            # multi-element ``path`` would raise TypeError and hide this
            # error.
            raise Exception("no .field_names found in %s" % (path,))
        return self.tableFile(path, **kwargs).asTable(fields)
コード例 #4
0
ファイル: context.py プロジェクト: douban/dpark
    def table(self, path, **kwargs):
        """Build a table from *path* with the field names stored next to it.

        When *path* is a list or tuple, its first element is the directory
        that gets searched; otherwise *path* itself is searched.  ``walk``
        is iterated until a directory listing contains ``.field_names``;
        that file is read and split on ``'\\t'`` to produce the field list.

        ``**kwargs`` are passed through to ``self.tableFile``.

        Returns ``self.tableFile(path, **kwargs).asTable(fields)``.

        Raises ``Exception`` if the walk finishes without finding a
        ``.field_names`` file.
        """
        dpath = path[0] if isinstance(path, (list, tuple)) else path
        for root, dirs, names in walk(dpath):
            if '.field_names' in names:
                p = os.path.join(root, '.field_names')
                with open(p) as f:
                    fields = f.read().split('\t')

                break
        else:
            # ``path`` may be a tuple; pass it as a single format argument so
            # the ``%`` operator does not unpack it (which raises TypeError
            # for tuples of length != 1).
            raise Exception("no .field_names found in %s" % (path,))
        return self.tableFile(path, **kwargs).asTable(fields)
コード例 #5
0
ファイル: context.py プロジェクト: rohithreddy/dpark
    def textFile(self,
                 path,
                 ext='',
                 followLink=True,
                 maxdepth=0,
                 cls=TextFileRDD,
                 *ka,
                 **kws):
        """Create an RDD from a file, a directory tree, or a list of those.

        ``self.init()`` is called first.

        Args:
            path: A single path, or a list/tuple of paths.  For a list or
                tuple, each entry is loaded recursively with the same
                options and the results are combined with ``self.union``.
            ext: Only directory entries whose name ends with this value are
                collected (it is passed to ``str.endswith``).  It is not
                applied when *path* is a single non-directory.
            followLink: Passed to ``walk`` as ``followlinks``.  When false,
                entries for which ``os.path.islink`` is true are skipped.
            maxdepth: When greater than 0, limits how deep the directory
                walk goes; *path* itself is depth 1.  0 means no limit.
            cls: RDD class instantiated as ``cls(self, file_path, *ka, **kws)``.
                When it is ``TextFileRDD``, paths ending in ``.bz2`` or
                ``.gz`` use ``BZip2FileRDD`` / ``GZipFileRDD`` instead.
            *ka, **kws: Forwarded to the RDD constructor.

        Returns:
            For a directory (or list/tuple), the ``self.union`` of one RDD
            per collected file; otherwise a single RDD for the file.
        """
        self.init()
        if isinstance(path, (list, tuple)):
            return self.union([
                self.textFile(p, ext, followLink, maxdepth, cls, *ka, **kws)
                for p in path
            ])

        path = os.path.realpath(path)

        def create_rdd(_cls, _path, *_ka, **_kw):
            # Compressed inputs get a dedicated reader, but only when the
            # caller kept the default RDD class.
            if _cls is TextFileRDD:
                if _path.endswith('.bz2'):
                    return BZip2FileRDD(self, _path, *_ka, **_kw)
                elif _path.endswith('.gz'):
                    return GZipFileRDD(self, _path, *_ka, **_kw)
            return _cls(self, _path, *_ka, **_kw)

        if not os.path.isdir(path):
            return create_rdd(cls, path, *ka, **kws)

        paths = []
        for root, dirs, names in walk(path, followlinks=followLink):
            if maxdepth > 0:
                depth = len(
                    [_f for _f in root[len(path):].split('/') if _f]) + 1
                if depth >= maxdepth:
                    # Prune in place so the walk does not descend further.
                    del dirs[:]
                if depth > maxdepth:
                    # Skip only this directory.  A ``break`` here would end
                    # the whole walk and drop sibling directories that have
                    # not been visited yet.
                    continue
            for n in sorted(names):
                if n.endswith(ext) and not n.startswith('.'):
                    p = os.path.join(root, n)
                    if followLink or not os.path.islink(p):
                        paths.append(p)
            # Visit sub-directories in sorted order and never enter hidden
            # ones; the list must be edited in place for the walk to see it.
            dirs[:] = sorted(d for d in dirs if not d.startswith('.'))

        rdds = [create_rdd(cls, p, *ka, **kws) for p in paths]
        return self.union(rdds)
コード例 #6
0
ファイル: dpark_mfs.py プロジェクト: zxh1986123/dpark
def walk_dir(path, followlinks=False):
    """Collect the visible files below *path* and report how long it took.

    The tree is traversed with ``walk``.  In every directory the file names
    are handled in sorted order; names starting with ``'.'`` are ignored,
    and when *followlinks* is false, entries that are symbolic links are
    ignored as well.  Sub-directories are visited in sorted order and those
    starting with ``'.'`` are not entered.

    A one-line summary with the number of files and the elapsed seconds is
    printed before the list of paths is returned.
    """
    started = time.time()
    collected = []

    for dirpath, subdirs, filenames in walk(path, followlinks=followlinks):
        for filename in sorted(filenames):
            if filename.startswith('.'):
                continue
            full_path = os.path.join(dirpath, filename)
            if not followlinks and os.path.islink(full_path):
                continue
            collected.append(full_path)
        # Slice assignment keeps the very list object that the walk
        # inspects, so the ordering and pruning take effect.
        subdirs[:] = sorted(d for d in subdirs if not d.startswith('.'))

    elapsed = time.time() - started
    print("walk {} files use {}s".format(len(collected), elapsed))
    return collected
コード例 #7
0
ファイル: context.py プロジェクト: douban/dpark
    def textFile(self, path, ext='', followLink=True, maxdepth=0, cls=TextFileRDD, *ka, **kws):
        """Return an RDD over the text file(s) found at *path*.

        ``self.init()`` is invoked before anything else.

        * If *path* is a list or tuple, every element is loaded through
          ``textFile`` with the same options and the RDDs are merged with
          ``self.union``.
        * If *path* (after ``os.path.realpath``) is a directory, it is
          traversed with ``walk(path, followlinks=followLink)``.  Files are
          collected when their name ends with *ext* and does not start with
          ``'.'``; symbolic links are skipped unless *followLink* is true.
          Directories starting with ``'.'`` are not entered, and the rest
          are visited in sorted order.  A positive *maxdepth* bounds the
          traversal depth, counting *path* itself as depth 1.  The
          collected files are turned into RDDs and merged with
          ``self.union``.
        * Otherwise a single RDD for *path* is returned; *ext* is not
          checked in this case.

        Each file becomes ``cls(self, file_path, *ka, **kws)``, except that
        with the default ``TextFileRDD`` class, ``.bz2`` and ``.gz`` files
        are opened with ``BZip2FileRDD`` and ``GZipFileRDD`` respectively.
        """
        self.init()
        if isinstance(path, (list, tuple)):
            return self.union([self.textFile(p, ext, followLink, maxdepth, cls, *ka, **kws)
                               for p in path])

        path = os.path.realpath(path)

        def create_rdd(_cls, _path, *_ka, **_kw):
            # Only the default class is swapped for a compression-aware one;
            # an explicitly chosen class is always respected.
            if _cls is TextFileRDD:
                if _path.endswith('.bz2'):
                    return BZip2FileRDD(self, _path, *_ka, **_kw)
                elif _path.endswith('.gz'):
                    return GZipFileRDD(self, _path, *_ka, **_kw)
            return _cls(self, _path, *_ka, **_kw)

        if os.path.isdir(path):
            paths = []
            for root, dirs, names in walk(path, followlinks=followLink):
                if maxdepth > 0:
                    depth = len([_f for _f in root[len(path):].split('/') if _f]) + 1
                    if depth >= maxdepth:
                        # At the limit: empty the list in place so nothing
                        # below this directory is walked.
                        del dirs[:]
                    if depth > maxdepth:
                        # Do not ``break``: that would stop the entire walk
                        # and lose shallower directories still to be visited.
                        continue
                for n in sorted(names):
                    if n.endswith(ext) and not n.startswith('.'):
                        p = os.path.join(root, n)
                        if followLink or not os.path.islink(p):
                            paths.append(p)
                # In-place update: sorted traversal, hidden directories pruned.
                dirs[:] = sorted(d for d in dirs if not d.startswith('.'))

            rdds = [create_rdd(cls, p, *ka, **kws)
                    for p in paths]
            return self.union(rdds)
        else:
            return create_rdd(cls, path, *ka, **kws)