def execute(self, data):
    """Swap the first two fields of every line of the input file.

    Reads the file named by ``data['in']``, splits each stripped line on
    ``self.mr_delimiter`` and writes ``<second><delimiter><first>`` to
    ``<input>.reduced``.  Lines with fewer than two fields are skipped and
    any fields after the second are dropped.

    :param data: dict-like pipeline state; ``data['in']`` is the input path.
    :returns: the same ``data`` object, with ``data['in']`` pointing at the
        newly written file.
    """
    Plugin.init(self, data)
    self.target = data.get('in')
    out = '%s.reduced' % (self.target)

    # Context managers guarantee both handles are closed (and the output
    # flushed) even when reading or writing raises part-way through.
    with open(self.target, 'r') as reader:
        with open(out, 'w') as writer:
            for line in reader:
                fields = line.strip().split(self.mr_delimiter)
                try:
                    key = fields[0]
                    value = fields[1]
                except IndexError:
                    # Malformed line (no second field): skip it.
                    continue
                writer.write('%s%s%s\n' % (value, self.mr_delimiter, key))

    data['in'] = out
    Plugin.terminate(self, data)
    return data
def execute(self, data):
    """Emit ``<second field><delimiter>1`` for every line of the input file.

    Reads the file named by ``data['in']``, splits each stripped line on
    ``self.mr_delimiter`` and writes the second field followed by the
    delimiter and the literal ``1`` to ``<input>.<self.suffix>``.

    If ``data`` carries no (or an empty) ``'in'`` entry, ``data`` is returned
    untouched right after ``Plugin.init`` and ``Plugin.terminate`` is not
    called.

    :param data: dict-like pipeline state; ``data['in']`` is the input path.
    :returns: the same ``data`` object, with ``data['in']`` pointing at the
        output file.
    """
    Plugin.init(self, data)
    self.target = data.get('in')
    if not self.target:
        return data

    out = '%s.%s' % (self.target, self.suffix)
    data['in'] = out

    # The output handle used to be left open, so buffered records could be
    # missing when the next stage read the file.  ``with`` closes (and
    # flushes) both handles, also on error.  The output is still opened
    # first, as before.
    with open(out, 'w') as writer:
        with open(self.target, 'r') as reader:
            for line in reader:
                fields = line.strip().split(self.mr_delimiter)
                if len(fields) < 2:
                    # Malformed line (no second field): skip it instead of
                    # aborting the whole run with an IndexError.
                    continue
                writer.write('%s%s1\n' % (fields[1], self.mr_delimiter))

    Plugin.terminate(self, data)
    return data
def execute(self, data):
    """Swap the first two fields of every line of the input file.

    Reads the file named by ``data['in']``, splits each stripped line on
    ``self.mr_delimiter`` and writes ``<second><delimiter><first>`` to
    ``<input>.<self.suffix>``.  Lines with fewer than two fields are skipped
    and any fields after the second are dropped.

    ``data['in']`` is redirected to the output path before any work is done.
    When ``self.skip`` is set the method returns at that point, without
    writing the file and without calling ``Plugin.terminate``.

    :param data: dict-like pipeline state; ``data['in']`` is the input path.
    :returns: the same ``data`` object, with ``data['in']`` pointing at the
        output path.
    """
    Plugin.init(self, data)
    self.target = data.get('in')
    output = '%s.%s' % (self.target, self.suffix)
    data['in'] = output
    if self.skip:
        return data

    # Context managers guarantee both handles are closed (and the output
    # flushed) even when reading or writing raises part-way through.
    with open(self.target, 'r') as reader:
        with open(output, 'w') as writer:
            for line in reader:
                fields = line.strip().split(self.mr_delimiter)
                try:
                    key = fields[0]
                    value = fields[1]
                except IndexError:
                    # Malformed line (no second field): skip it.
                    continue
                writer.write('%s%s%s\n' % (value, self.mr_delimiter, key))

    Plugin.terminate(self, data)
    return data
def execute(self, data):
    """Sum the integer values of adjacent lines that share the same key.

    Reads ``<key><delimiter><int>`` lines from the file named by
    ``data['in']`` and writes one ``<key><delimiter><sum>`` line per run of
    consecutive equal keys to ``<input>.reduced``.  Only neighbouring lines
    are merged, so the input presumably has to be grouped/sorted by key
    already -- verify against the caller.  Lines with a missing or
    non-integer second field are skipped.

    ``data['in']`` is redirected to the output path before any work is done.
    When ``self.skip`` is set the method returns at that point, without
    writing the file and without calling ``Plugin.terminate``.

    :param data: dict-like pipeline state; ``data['in']`` is the input path.
    :returns: the same ``data`` object, with ``data['in']`` pointing at the
        output path.
    """
    started = time.time()
    # Parenthesised operand plus trailing comma: under Python 2 this is still
    # a print statement that suppresses the newline, exactly like
    # ``print 'reducing...',``.
    print('reducing...'),
    Plugin.init(self, data)
    self.target = data.get('in')
    out = '%s.reduced' % (self.target)
    data['in'] = out
    if self.skip:
        return data

    prev_key = None
    total = 0
    # Context managers close both handles even if a read/write fails.
    with open(self.target, 'r') as reader:
        with open(out, 'w') as writer:
            for line in reader:
                fields = line.strip().split(self.mr_delimiter)
                try:
                    key = fields[0]
                    value = int(fields[1])
                except (IndexError, ValueError):
                    # Missing or non-numeric value: skip the line.
                    continue
                # Compare against None explicitly: a truthiness test would
                # treat an empty-string key as "no previous key" and fold
                # its total into the following key.
                if prev_key is not None and prev_key != key:
                    writer.write('%s%s%d\n'
                                 % (prev_key, self.mr_delimiter, total))
                    total = value
                else:
                    total += value
                prev_key = key
            # Flush the final run -- but only if at least one valid line was
            # seen, otherwise a bogus "None<delimiter>0" record is written.
            if prev_key is not None:
                writer.write('%s%s%d\n'
                             % (prev_key, self.mr_delimiter, total))

    Plugin.terminate(self, data)
    elapsed = time.time() - started
    # time.time() differences are seconds, so label them as such (the old
    # message claimed milliseconds).  Spacing matches the former
    # ``print ' .done ', t, '[ms]'`` output.
    print(' .done  %s [s]' % (elapsed,))
    return data
def execute(self, data):
    """Sort the input file via per-worker chunk sorts followed by a merge.

    The result is written to ``<input>.<self.suffix>``; both ``data['sorted']``
    and ``data['in']`` are set to that path.  When ``self.skip`` is set the
    method returns right after publishing the path, without doing any work
    and without calling ``Plugin.terminate``.

    Steps:
      1. Optionally (``self.temp_clean``) delete every file in
         ``self.temp_dir``.
      2. Start ``self.num_threads`` ``OnMemorySortWorker`` instances and wait
         for all of them.
      3. Call ``self._merge_sort`` repeatedly until exactly one file is left
         in ``self.temp_dir``, then move that file to the output path.

    :param data: dict-like pipeline state; ``data['in']`` is the input path.
    :returns: the same ``data`` object with ``'sorted'`` and ``'in'`` updated.
    """
    Plugin.init(self, data)
    self.target = data.get('in')
    # Named ``sorted_path`` so the builtin ``sorted`` is not shadowed.
    sorted_path = '%s.%s' % (self.target, self.suffix)
    # add to data
    data['sorted'] = sorted_path
    data['in'] = sorted_path
    if self.skip:
        return data

    started = time.time()
    stdout.write('# process split sort ...')
    # The original opened the input here and never used or closed the handle.
    # Keep the fail-fast behaviour for an unreadable input, but release the
    # handle immediately.
    open(self.target, 'r').close()

    # Clear the temp dir.
    stderr.write('# clear temp files ... ')
    if self.temp_clean and os.path.exists(self.temp_dir):
        for name in os.listdir(self.temp_dir):
            os.remove(os.path.join(self.temp_dir, name))
    stderr.write(' .done\n')

    # This part is parallelised with worker threads: create all workers,
    # start them all, then wait for every one to finish.
    workers = [
        OnMemorySortWorker(self, index, self.target, self.num_threads)
        for index in range(self.num_threads)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    stdout.write(' done. %f [s]\n' % (time.time() - started))

    # merge
    started = time.time()
    stdout.write('# process merge sort and output to %s ...' % (sorted_path))
    num_tmp_files = len(os.listdir(self.temp_dir))
    # NOTE(review): this loop only ends when exactly one file remains in
    # temp_dir; if the workers produced no files it may never terminate,
    # depending on what _merge_sort does -- confirm.
    while True:
        self._merge_sort(sorted_path, num_tmp_files)
        remaining = os.listdir(self.temp_dir)
        if len(remaining) == 1:
            break

    # Replace any previous output with the single merged temp file.
    if os.path.exists(sorted_path):
        os.remove(sorted_path)
    os.rename('%s/%s' % (self.temp_dir, remaining[0]), sorted_path)
    stdout.write(' done. %f [s]\n' % (time.time() - started))

    # add to data
    data['sorted'] = sorted_path
    data['in'] = sorted_path
    Plugin.terminate(self, data)
    return data