def gen_rdd(self, **kwargs):
    """Build the RDD of typed rows for this table expression.

    Input files come from ``self.expr.file_path``. When ``self.selectors``
    is set, each selector is globbed relative to that path and matching
    directories are expanded to the non-hidden files beneath them.

    Keyword Args:
        use_limit: when true and the source is a single path (not a
            selector-built list), only the visible files of the first
            directory yielded by ``walk`` that contains any are read.

    Returns:
        An RDD whose rows are lists holding one converted value per entry
        of ``self.columns``.
    """
    use_limit = kwargs.get('use_limit', False)

    # One converter per declared column. Empty cells in numeric columns
    # become 0 instead of making int()/float() raise.
    converters = []
    for _, t in self.columns:
        if t == Type.INT:
            converters.append(lambda x: int(x) if x else 0)
        elif t == Type.FLOAT:
            converters.append(lambda x: float(x) if x else 0)
        else:
            converters.append(lambda x: x)

    expr = self.expr
    if self.selectors:
        base = expr.file_path
        found = set()
        for s in self.selectors:
            for p in glob.glob('%s/%s' % (base, s)):
                if not os.path.isdir(p):
                    found.add(p)
                    continue
                for root, _, names in walk(p, followlinks=True):
                    found.update(os.path.join(root, name)
                                 for name in names
                                 if not name.startswith('.'))
        # Sorted so the input order does not depend on set iteration order.
        path = sorted(found)
    else:
        path = expr.file_path

    if use_limit and isinstance(path, basestring):
        for root, _, names in walk(path, followlinks=True):
            # Explicit file paths are read as-is by textFile, so hidden
            # files (e.g. metadata such as '.field_names') must be dropped
            # here or they would be parsed as data.
            visible = [name for name in names if not name.startswith('.')]
            if visible:
                path = [os.path.join(root, name) for name in visible]
                break

    rdd = dpark.textFile(path)
    if expr.expr:
        # NOTE(review): expr.expr is evaluated as Python source; it must
        # only ever come from a trusted origin.
        rdd = eval('rdd.' + expr.expr, global_env, {'rdd': rdd})

    # Sniff the first row: raw text lines still need splitting into cells.
    row = rdd.first()
    if isinstance(row, basestring):
        if '\t' in row:
            rdd = rdd.fromCsv('excel-tab')
        elif ',' in row:
            rdd = rdd.fromCsv('excel')
        else:
            rdd = rdd.map(lambda l: l.split(' '))

    return rdd.map(lambda x: [c(x[i]) for i, c in enumerate(converters)])
def table(self, path, **kwargs):
    """Open ``path`` as a table whose column names come from ``.field_names``.

    The directory tree under ``path`` (or under its first element when a
    list/tuple is given) is walked, and the first ``.field_names`` file
    found supplies the tab-separated column names.

    Args:
        path: a path, or a list/tuple of paths, passed on to ``tableFile``.
        **kwargs: forwarded unchanged to ``tableFile``.

    Returns:
        The result of ``self.tableFile(path, **kwargs).asTable(fields)``.

    Raises:
        Exception: if no ``.field_names`` file is found.
    """
    dpath = path[0] if isinstance(path, (list, tuple)) else path
    for root, _, names in walk(dpath):
        if '.field_names' in names:
            # Close the handle deterministically, and drop a trailing line
            # ending so it does not become part of the last field name.
            with open(os.path.join(root, '.field_names')) as f:
                fields = f.read().rstrip('\r\n').split('\t')
            break
    else:
        # Wrap in a 1-tuple: a tuple `path` would otherwise be unpacked as
        # the format arguments and raise TypeError instead of this error.
        raise Exception("no .field_names found in %s" % (path,))
    return self.tableFile(path, **kwargs).asTable(fields)
def _get_files(self, path):
    """Yield the file paths that ``path`` stands for.

    ``path`` is resolved with ``os.path.realpath`` first. A directory yields
    every file beneath it whose name does not start with a dot, with names
    sorted inside each directory; anything else yields the resolved path
    itself.
    """
    target = os.path.realpath(path)
    if not os.path.isdir(target):
        yield target
        return

    for folder, _subdirs, filenames in walk(target):
        visible = (fn for fn in sorted(filenames) if not fn.startswith('.'))
        for fn in visible:
            yield os.path.join(folder, fn)
def textFile(self, path, ext='', followLink=True, maxdepth=0, cls=TextFileRDD, *ka, **kws):
    """Create an RDD over a file, a directory tree, or a list of paths.

    Args:
        path: a file path, a directory path, or a list/tuple of either.
        ext: when reading a directory, only files whose name ends with
            this suffix are included.
        followLink: passed to ``walk`` as ``followlinks``; when false,
            files that are symlinks are skipped as well.
        maxdepth: deepest directory level to read, counting files directly
            inside ``path`` as depth 1. Values <= 0 mean no limit.
        cls: RDD class built for each file. With the default
            ``TextFileRDD``, ``.bz2`` and ``.gz`` files are opened with
            ``BZip2FileRDD`` / ``GZipFileRDD`` instead.
        *ka, **kws: forwarded to the RDD constructor.

    Returns:
        One RDD for a plain file path, otherwise the union of the per-file
        (or per-path) RDDs.
    """
    self.init()
    if isinstance(path, (list, tuple)):
        return self.union([
            self.textFile(p, ext, followLink, maxdepth, cls, *ka, **kws)
            for p in path
        ])

    path = os.path.realpath(path)

    def create_rdd(cls, path, *ka, **kw):
        # Compressed files only get special handling for the default class.
        if cls is TextFileRDD:
            if path.endswith('.bz2'):
                return BZip2FileRDD(self, path, *ka, **kw)
            elif path.endswith('.gz'):
                return GZipFileRDD(self, path, *ka, **kw)
        return cls(self, path, *ka, **kw)

    if not os.path.isdir(path):
        return create_rdd(cls, path, *ka, **kws)

    limited = maxdepth > 0
    paths = []
    for root, dirs, names in walk(path, followlinks=followLink):
        depth = 0
        if limited:
            parts = [part for part in root[len(path):].split(os.sep) if part]
            depth = len(parts) + 1
            if depth > maxdepth:
                # Skip only this subtree. Breaking out here would also drop
                # shallower sibling directories not yet visited by a
                # depth-first walk.
                del dirs[:]
                continue

        for n in sorted(names):
            if n.endswith(ext) and not n.startswith('.'):
                p = os.path.join(root, n)
                if followLink or not os.path.islink(p):
                    paths.append(p)

        if limited and depth >= maxdepth:
            # Nothing below this level is wanted; prune in place so the
            # walk does not descend any further.
            del dirs[:]
        else:
            # In-place edit: visit sub-directories in sorted order and
            # never descend into hidden ones.
            dirs[:] = sorted(d for d in dirs if not d.startswith('.'))

    rdds = [create_rdd(cls, p, *ka, **kws) for p in paths]
    return self.union(rdds)
def textFile(self, path, ext='', followLink=True, maxdepth=0, cls=TextFileRDD, *ka, **kws):
    """Create an RDD over a file, a directory tree, or a list of paths.

    Args:
        path: a file path, a directory path, or a list/tuple of either.
        ext: when reading a directory, only files whose name ends with
            this suffix are included.
        followLink: passed to ``walk`` as ``followlinks``; when false,
            files that are symlinks are skipped as well.
        maxdepth: deepest directory level to read, counting files directly
            inside ``path`` as depth 1. Values <= 0 mean no limit.
        cls: RDD class built for each file. With the default
            ``TextFileRDD``, ``.bz2`` and ``.gz`` files are opened with
            ``BZip2FileRDD`` / ``GZipFileRDD`` instead.
        *ka, **kws: forwarded to the RDD constructor.

    Returns:
        One RDD for a plain file path, otherwise the union of the per-file
        (or per-path) RDDs.
    """
    self.init()
    if isinstance(path, (list, tuple)):
        return self.union([
            self.textFile(p, ext, followLink, maxdepth, cls, *ka, **kws)
            for p in path
        ])

    path = os.path.realpath(path)

    def create_rdd(cls, path, *ka, **kw):
        # Compressed files only get special handling for the default class.
        if cls is TextFileRDD:
            if path.endswith('.bz2'):
                return BZip2FileRDD(self, path, *ka, **kw)
            elif path.endswith('.gz'):
                return GZipFileRDD(self, path, *ka, **kw)
        return cls(self, path, *ka, **kw)

    if not os.path.isdir(path):
        return create_rdd(cls, path, *ka, **kws)

    limited = maxdepth > 0
    paths = []
    for root, dirs, names in walk(path, followlinks=followLink):
        depth = 0
        if limited:
            parts = [part for part in root[len(path):].split(os.sep) if part]
            depth = len(parts) + 1
            if depth > maxdepth:
                # Skip only this subtree. Breaking out here would also drop
                # shallower sibling directories not yet visited by a
                # depth-first walk.
                del dirs[:]
                continue

        for n in sorted(names):
            if n.endswith(ext) and not n.startswith('.'):
                p = os.path.join(root, n)
                if followLink or not os.path.islink(p):
                    paths.append(p)

        if limited and depth >= maxdepth:
            # Nothing below this level is wanted; prune in place so the
            # walk does not descend any further.
            del dirs[:]
        else:
            # In-place edit: visit sub-directories in sorted order and
            # never descend into hidden ones.
            dirs[:] = sorted(d for d in dirs if not d.startswith('.'))

    rdds = [create_rdd(cls, p, *ka, **kws) for p in paths]
    return self.union(rdds)