Beispiel #1
0
 def realfile_(self,hadoop,outdir,tmpdir):
     """Return a path to an uncompressed version of ``self.file``.

     If ``self.file`` ends in ``.gz`` or ``.bz2`` it is decompressed into
     ``<tmpdir>/ttable`` through ``hadoop.syscall`` and that path is
     returned.  Otherwise ``self.file`` itself is returned and no command
     is issued.

     ``outdir`` is accepted for signature compatibility but is not used.

     NOTE(review): the paths are interpolated into the command string
     unquoted; presumably ``hadoop.syscall`` hands it to a shell, so paths
     containing spaces or shell metacharacters would break -- confirm.
     """
     decompressors = {'gz': 'gunzip', 'bz2': 'bunzip2'}
     # Anchor at the end of the path: an unanchored pattern would also fire
     # on names that merely contain ".gz"/".bz2" (e.g. "archive.gz.d/table").
     m = re.search(r'\.(gz|bz2)$',self.file)
     if not m:
         return self.file
     target = os.path.join(tmpdir,'ttable')
     hadoop.syscall('%s -c %s > %s' % (decompressors[m.group(1)],self.file,target))
     return target
Beispiel #2
0
 def run(self,hadoop):
     """Run this step, or reuse its outputs if they already exist locally.

     When ``self.output`` is non-empty and every listed path exists, each
     path is passed to ``hadoop.put`` together with its basename and
     nothing is executed.  Otherwise ``self.execute`` is run through
     ``hadoop.syscall`` and each output is then fetched by basename with
     ``hadoop.getmerge`` (when ``self.stage == 'training'``) or
     ``hadoop.get`` (any other stage).
     """
     # (local path, basename) pairs; the two are always used together.
     pairs = [(path, os.path.basename(path)) for path in self.output]

     if pairs and all(os.path.exists(path) for path, _ in pairs):
         for path, base in pairs:
             hadoop.put(path, base)
         return

     hadoop.syscall(self.execute)
     if pairs:
         if self.stage == 'training':
             fetch = hadoop.getmerge
         else:
             fetch = hadoop.get
         for path, base in pairs:
             fetch(base, path)
Beispiel #3
0
 def run(self,hadoop,outdir,tmpdir):
     """Run this step on ``self.file``, or reuse existing suffixed outputs.

     Every entry of ``self.output`` is extended with ``self.suffix``.  When
     there is at least one output and all suffixed paths exist locally,
     each is passed to ``hadoop.put`` with its suffixed basename and
     nothing is executed.  Otherwise the input is resolved through
     ``self.realfile_``, ``self.execute`` is expanded with the ``file`` and
     ``suffix`` placeholders and run through ``hadoop.syscall``, the
     outputs are fetched with ``hadoop.getmerge`` (``self.stage ==
     'training'``) or ``hadoop.get`` (otherwise), and finally
     ``self.cleanfile_`` is called for the resolved input.
     """
     # (suffixed local path, suffixed basename) pairs; always used together.
     # The loop variables must not be called ``os``: that name would become
     # local to the whole function and break every ``os.path`` call here.
     pairs = [(o + self.suffix, os.path.basename(o) + self.suffix)
              for o in self.output]

     if pairs and all(os.path.exists(local_path) for local_path, _ in pairs):
         for local_path, remote_name in pairs:
             hadoop.put(local_path, remote_name)
         return

     realfile = self.realfile_(hadoop,outdir,tmpdir)
     execute = PTemplate(self.execute).safe_substitute({'file':realfile,'suffix':self.suffix})
     hadoop.syscall(execute)
     for local_path, remote_name in pairs:
         if self.stage == 'training':
             hadoop.getmerge(remote_name, local_path)
         else:
             hadoop.get(remote_name, local_path)
     self.cleanfile_(realfile,hadoop,outdir,tmpdir)
Beispiel #4
0
 def cleanfile_(self,file,hadoop,outdir,tmpdir):
     """Remove the temporary ``<tmpdir>/ttable`` copy when one was used.

     ``file`` is the path the step actually worked on.  If it equals
     ``self.file`` nothing is done; otherwise ``rm -rf`` is issued for
     ``<tmpdir>/ttable`` through ``hadoop.syscall``.  ``outdir`` is unused.
     """
     if file == self.file:
         return
     scratch = os.path.join(tmpdir,'ttable')
     hadoop.syscall('rm -rf %s' % scratch)