Esempio n. 1
0
    def execute(self, data):
        """Swap the first two delimited fields of every input line.

        Reads the file named by ``data['in']`` and writes
        ``<value><delimiter><key>`` lines to ``<input>.reduced``, where key
        and value are the first and second fields of each line split on
        ``self.mr_delimiter``. Any further fields are dropped, and lines
        with fewer than two fields are skipped.

        Args:
            data: Mapping shared between plugins; ``data['in']`` is the
                input path and is replaced by the output path.

        Returns:
            The same ``data`` object with ``data['in']`` pointing at the
            newly written file.
        """
        Plugin.init(self, data)

        self.target = data.get('in')
        out = '%s.reduced' % (self.target)

        # Context managers close both files even if the loop raises.
        with open(self.target, 'r') as src, open(out, 'w') as dst:
            for line in src:
                fields = line.strip().split(self.mr_delimiter)
                try:
                    key = fields[0]
                    value = fields[1]
                except IndexError:
                    # Malformed line (fewer than two fields): skip it.
                    continue

                dst.write('%s%s%s\n' % (value, self.mr_delimiter, key))

        data['in'] = out

        Plugin.terminate(self, data)
        return data
Esempio n. 2
0
    def execute(self, data):
        """Emit ``<second field><delimiter>1`` for every line of the input.

        Reads the file named by ``data['in']``, splits each line on
        ``self.mr_delimiter`` and writes the second field followed by the
        delimiter and the literal count ``1`` to
        ``<input>.<self.suffix>``.

        Args:
            data: Mapping shared between plugins; ``data['in']`` is the
                input path and is replaced by the output path.

        Returns:
            The same ``data`` object. When ``data['in']`` is missing or
            empty it is returned unchanged, without calling
            ``Plugin.terminate``.

        Raises:
            IndexError: If a line has no second field.
        """
        Plugin.init(self, data)

        self.target = data.get('in')

        # Nothing to process: hand the data back untouched.
        if not self.target:
            return data

        out = '%s.%s' % (self.target, self.suffix)
        data['in'] = out

        # The output file used to be left open; the context manager makes
        # sure it is flushed and closed (also when a line is malformed).
        with open(out, 'w') as dst, open(self.target, 'r') as src:
            for line in src:
                fields = line.strip().split(self.mr_delimiter)
                dst.write('%s%s1\n' % (fields[1], self.mr_delimiter))

        Plugin.terminate(self, data)

        return data
Esempio n. 3
0
    def execute(self, data):
        """Swap the first two delimited fields of every input line.

        Reads the file named by ``data['in']`` and writes
        ``<value><delimiter><key>`` lines to ``<input>.<self.suffix>``,
        where key and value are the first and second fields of each line
        split on ``self.mr_delimiter``. Lines with fewer than two fields
        are skipped.

        Args:
            data: Mapping shared between plugins; ``data['in']`` is the
                input path and is replaced by the output path.

        Returns:
            The same ``data`` object. ``data['in']`` is updated before the
            skip check, so when ``self.skip`` is truthy the new path is
            returned without any file being read or written (and without
            calling ``Plugin.terminate``).
        """
        Plugin.init(self, data)

        self.target = data.get('in')
        output = '%s.%s' % (self.target, self.suffix)
        data['in'] = output

        if self.skip:
            return data

        # Context managers close both files even if the loop raises.
        with open(self.target, 'r') as src, open(output, 'w') as dst:
            for line in src:
                fields = line.strip().split(self.mr_delimiter)
                try:
                    key = fields[0]
                    value = fields[1]
                except IndexError:
                    # Malformed line (fewer than two fields): skip it.
                    continue

                dst.write('%s%s%s\n' % (value, self.mr_delimiter, key))

        Plugin.terminate(self, data)

        return data
Esempio n. 4
0
    def execute(self, data):
        t = time.time()

        print 'reducing...',

        Plugin.init(self, data)

        self.target = data.get('in')
        out = '%s.reduced' %(self.target)
        data['in'] = out
        
        if self.skip:
            return data

        io = open(self.target, 'r')
        oio = open(out, 'w')

        pk = None
        pv = 0
        for i, l in enumerate(io):
            ss = l.strip().split(self.mr_delimiter)
            try:
                k = ss[0]
                v = int(ss[1])
            except:
                continue
            
            if pk and pk != k:
                oio.write('%s%s%d\n' %(pk, self.mr_delimiter, pv))
                pv = v
            else:
                pv += v

            pk = k
        

        # last
        oio.write('%s%s%d\n' %(pk, self.mr_delimiter, pv))

        io.close()
        oio.close()
        
        data['in'] = out
        
        Plugin.terminate(self, data)
        
        t = time.time() - t
        
        print ' .done ', t, '[ms]'
        
        return data
Esempio n. 5
0
    def execute(self, data):
        """Sort the input file via worker-based split sort plus merge.

        Starts ``self.num_threads`` ``OnMemorySortWorker`` instances on the
        file named by ``data['in']``, waits for all of them, then calls
        ``self._merge_sort`` repeatedly until exactly one file is left in
        ``self.temp_dir``. That file is moved to ``<input>.<self.suffix>``.

        Args:
            data: Mapping shared between plugins; ``data['in']`` is the
                input path. Both ``data['sorted']`` and ``data['in']`` are
                set to the output path.

        Returns:
            The same ``data`` object. The output path is published before
            the skip check, so when ``self.skip`` is truthy it is returned
            without any sorting (and without calling ``Plugin.terminate``).

        Raises:
            IOError: If the input file cannot be opened for reading.
        """
        Plugin.init(self, data)
        self.target = data.get('in')

        # Named sorted_path rather than ``sorted`` to avoid shadowing the
        # builtin.
        sorted_path = '%s.%s' % (self.target, self.suffix)
        # Publish the output path up front so it is set even when skipping.
        data['sorted'] = sorted_path
        data['in'] = sorted_path

        if self.skip:
            return data

        t = time.time()

        stdout.write('# process split sort ...')

        # Fail fast if the input is unreadable. The handle itself is not
        # needed here (the workers receive the path), so close it right
        # away instead of leaking it.
        open(self.target, 'r').close()

        # Clear the temp dir.
        stderr.write('# clear temp files ... ')
        if self.temp_clean and os.path.exists(self.temp_dir):
            for f in os.listdir(self.temp_dir):
                os.remove(os.path.join(self.temp_dir, f))
        stderr.write(' .done\n')

        # Split sort, parallelised across worker threads.
        # NOTE(review): presumably each worker writes its sorted chunks
        # into self.temp_dir -- confirm in OnMemorySortWorker.
        workers = [
            OnMemorySortWorker(self, i, self.target, self.num_threads)
            for i in range(self.num_threads)
        ]

        for w in workers:
            w.start()

        for w in workers:
            w.join()

        t = time.time() - t
        stdout.write(' done. %f [s]\n' % (t))

        # Merge phase.
        t = time.time()
        stdout.write('# process merge sort and output to %s ...'
                     % (sorted_path))

        # NOTE(review): the temp-file count is taken once and passed
        # unchanged to every _merge_sort call, and the loop only ends when
        # exactly one temp file remains -- confirm _merge_sort always
        # converges to one file (e.g. when the workers produced none).
        num_tmp_files = len(os.listdir(self.temp_dir))
        while True:
            self._merge_sort(sorted_path, num_tmp_files)

            fs = os.listdir(self.temp_dir)
            if len(fs) == 1:
                break

        # Replace any previous output with the single remaining temp file.
        if os.path.exists(sorted_path):
            os.remove(sorted_path)

        os.rename(os.path.join(self.temp_dir, fs[0]), sorted_path)

        t = time.time() - t
        stdout.write(' done. %f [s]\n' % (t))

        Plugin.terminate(self, data)
        return data