def realfile_(self, hadoop, outdir, tmpdir):
    """Return a path to an uncompressed version of ``self.file``.

    If ``self.file`` ends in ``.gz`` or ``.bz2`` a decompression command is
    handed to ``hadoop.syscall`` that writes the plain content to
    ``<tmpdir>/ttable``, and that path is returned.  Otherwise
    ``self.file`` is returned unchanged and nothing is executed.

    Args:
        hadoop: object providing ``syscall(command)``; the command uses
            shell redirection, so it is presumably run through a shell
            (confirm against the ``hadoop`` helper).
        outdir: not used by this method.
        tmpdir: directory that receives the decompressed ``ttable`` file.

    Returns:
        ``os.path.join(tmpdir, 'ttable')`` for compressed input, else
        ``self.file``.
    """
    # Anchor on the end of the name: an unanchored search would also fire
    # on paths that merely contain ".gz"/".bz2" somewhere (for example a
    # directory called "run.gzipped") and feed plain files to gunzip.
    match = re.search(r'\.(gz|bz2)$', self.file)
    if match is None:
        return self.file

    tool = 'gunzip' if match.group(1) == 'gz' else 'bunzip2'
    target = os.path.join(tmpdir, 'ttable')
    hadoop.syscall('%s -c %s > %s' % (tool, self.file, target))
    return target
def run(self, hadoop):
    """Run this step, or re-upload its outputs if they already exist.

    Each entry of ``self.output`` is a local path; its basename is the
    name used on the ``hadoop`` side.

    * If there is at least one output and every output path exists
      locally, each one is passed to ``hadoop.put(local, name)`` and
      ``self.execute`` is not run.
    * Otherwise ``self.execute`` is passed to ``hadoop.syscall`` and each
      output is then fetched with ``hadoop.getmerge(name, local)`` when
      ``self.stage == 'training'``, or ``hadoop.get(name, local)`` for any
      other stage.

    Args:
        hadoop: object providing ``put``, ``get``, ``getmerge`` and
            ``syscall``.
    """
    targets = self.output
    has_targets = len(targets) > 0
    # (local path, remote name) pairs; the two are always used together.
    pairs = [(path, os.path.basename(path)) for path in targets]

    if has_targets and all([os.path.exists(path) for path in targets]):
        # Results are already on disk: upload them instead of recomputing.
        for local_path, remote_name in pairs:
            hadoop.put(local_path, remote_name)
        return

    hadoop.syscall(self.execute)
    for local_path, remote_name in pairs:
        fetch = hadoop.getmerge if self.stage == 'training' else hadoop.get
        fetch(remote_name, local_path)
def run(self, hadoop, outdir, tmpdir):
    """Run this step on its input file, or re-upload existing outputs.

    Every entry of ``self.output`` gets ``self.suffix`` appended to form
    the local path; the remote name is the entry's basename plus the same
    suffix.

    * If there is at least one output and every suffixed local path
      exists, each is passed to ``hadoop.put(local, name)`` and nothing
      is executed.
    * Otherwise the input is resolved with ``self.realfile_``, the
      ``self.execute`` template is filled in with ``file`` and ``suffix``
      via ``PTemplate(...).safe_substitute`` and handed to
      ``hadoop.syscall``; each output is then fetched with
      ``hadoop.getmerge(name, local)`` when ``self.stage == 'training'``
      or ``hadoop.get(name, local)`` otherwise.  Finally
      ``self.cleanfile_`` is called for the resolved input.

    Args:
        hadoop: object providing ``put``, ``get``, ``getmerge`` and
            ``syscall``.
        outdir: forwarded to ``realfile_`` / ``cleanfile_``.
        tmpdir: forwarded to ``realfile_`` / ``cleanfile_``.
    """
    has_outputs = len(self.output) > 0
    local_paths = [o + self.suffix for o in self.output]
    remote_names = [os.path.basename(o) + self.suffix for o in self.output]
    # (local path, remote name) pairs; the two are always used together.
    # NOTE: the loop variables must not be called ``os`` -- that would make
    # ``os`` a local name for the whole function and break every
    # ``os.path`` call above with an unbound-name error.
    pairs = list(zip(local_paths, remote_names))

    if has_outputs and all(os.path.exists(p) for p in local_paths):
        # Results are already on disk: upload them instead of recomputing.
        for local_path, remote_name in pairs:
            hadoop.put(local_path, remote_name)
        return

    real_file = self.realfile_(hadoop, outdir, tmpdir)
    try:
        execute = PTemplate(self.execute).safe_substitute(
            {'file': real_file, 'suffix': self.suffix})
        hadoop.syscall(execute)
        for local_path, remote_name in pairs:
            if self.stage == 'training':
                hadoop.getmerge(remote_name, local_path)
            else:
                hadoop.get(remote_name, local_path)
    finally:
        # Always drop the temporary decompressed copy, even when the job
        # or one of the fetches raises.
        self.cleanfile_(real_file, hadoop, outdir, tmpdir)
def cleanfile_(self, file, hadoop, outdir, tmpdir):
    """Remove the temporary ``ttable`` copy if one was used.

    When ``file`` equals ``self.file`` nothing is done.  Otherwise an
    ``rm -rf`` command for ``<tmpdir>/ttable`` is handed to
    ``hadoop.syscall``.

    Args:
        file: the path that was actually used as input.
        hadoop: object providing ``syscall(command)``.
        outdir: not used by this method.
        tmpdir: directory holding the temporary ``ttable`` file.
    """
    if file == self.file:
        # The original input was used directly; there is nothing to delete.
        return
    scratch = os.path.join(tmpdir, 'ttable')
    hadoop.syscall('rm -rf %s' % scratch)