Example #1
 def emit(self, key, value):
     self.progress()
     if self.writer:
         # a record writer is available: write the pair out directly
         self.writer.emit(key, value)
     else:
         # no writer: send the pair over the up-link protocol; mappers
         # serialize keys and values first when private encoding is on
         if self._is_mapper and self._private_encoding:
             key = private_encode(key)
             value = private_encode(value)
         if self.partitioner:
             part = self.partitioner.partition(key, self.n_reduces)
             self.up_link.send(self.up_link.PARTITIONED_OUTPUT,
                               part, key, value)
         else:
             self.up_link.send(self.up_link.OUTPUT, key, value)
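This emit() variant (and the later ones) delegates partitioning to self.partitioner.partition(key, self.n_reduces), but the partitioner itself is not shown on this page. A minimal, deterministic stand-in that satisfies the same interface might look like the sketch below; the class name HashPartitioner and the crc32 choice are illustrative assumptions, not the original project's implementation.

 import zlib

 class HashPartitioner(object):
     """Sketch of a partitioner compatible with partition(key, n_reduces)."""

     def partition(self, key, n_reduces):
         # use crc32 rather than the built-in hash(), which is salted
         # per process and would scatter identical keys across runs
         if not isinstance(key, bytes):
             key = str(key).encode("utf-8")
         return zlib.crc32(key) % n_reduces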
Example #2
 def initialize_break_points(cls,
                             n_reducers,
                             sampled_records,
                             input_dir,
                             n_threads=2):
     # list the 'part*' record files that make up the HDFS input directory
     file_infos = [
         i for i in hdfs.lsl(input_dir)
         if (i['kind'] == 'file'
             and os.path.basename(i['name']).startswith('part'))
     ]
     n_files = len(file_infos)
     total_size = sum(map(lambda _: int(_['size']), file_infos))
     # records are fixed length, so the record count follows from the size
     n_records = total_size // RECORD_LENGTH
     assert n_records > sampled_records
     # visit every df-th file so that at most n_reducers files are sampled
     df = max(n_files // n_reducers, 1)
     paths = [
         i['name'] for i in it.islice(file_infos, 0, df * n_reducers, df)
     ]
     break_points = cls.get_break_points(sampled_records // n_reducers,
                                         n_reducers, paths, n_threads)
     # pair each break point with a reducer index (1 .. n_reducers - 1)
     vals = [_ for _ in zip(break_points, range(1, n_reducers))]
     selector = Selector(vals)
     # cache the privately encoded selector on disk and return its path
     bp_path = os.path.join(cls.TMP_DIR, cls.BREAK_POINTS_CACHE_FILE)
     with io.open(bp_path, "wb") as f:
         f.write(srl.private_encode(selector))
     return bp_path
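This example caches the Selector by writing srl.private_encode(selector) to bp_path. Assuming srl.private_decode is the inverse operation, as the round-trip test further down suggests, the cached break points could be loaded back roughly as follows; the helper name load_break_points is illustrative, and srl refers to the same serialization module used above.

 import io

 def load_break_points(bp_path):
     # read the cached selector back; assumes srl.private_decode is the
     # inverse of srl.private_encode (see the round-trip test below)
     with io.open(bp_path, "rb") as f:
         return srl.private_decode(f.read())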
Example #3
 def emit(self, key, value):
     self.progress()
     if self.writer:
         self.writer.emit(key, value)
     else:
         if self._private_encoding:
             key = private_encode(key)
             value = private_encode(value)
         else:
             # Python 2 idiom: coerce non-text keys/values to unicode strings
             key = (key if type(key) in [str, unicode] else unicode(key))
             value = (value
                      if type(value) in [str, unicode] else unicode(value))
         if self.partitioner:
             part = self.partitioner.partition(key, self.n_reduces)
             self.up_link.send('partitionedOutput', part, key, value)
         else:
             self.up_link.send('output', key, value)
Example #4
 def emit(self, key, value):
     self.progress()
     if self.writer:
         self.writer.emit(key, value)
     else:
         if self._private_encoding:
             key = private_encode(key)
             value = private_encode(value)
         else:
             key = (key if type(key) in [str, unicode]
                    else unicode(key))
             value = (value if type(value) in [str, unicode]
                      else unicode(value))
         if self.partitioner:
             part = self.partitioner.partition(key, self.n_reduces)
             self.up_link.send('partitionedOutput', part, key, value)
         else:
             self.up_link.send('output', key, value)
Example #5
 def initialize_break_points(cls, n_reducers, sampled_records,
                             input_dir, n_threads=2):
     file_infos = [i for i in hdfs.lsl(input_dir)
                   if (i['kind'] == 'file' and
                       os.path.basename(i['name']).startswith('part'))]
     n_files = len(file_infos)
     total_size = sum(map(lambda _: int(_['size']), file_infos))
     n_records = total_size // RECORD_LENGTH
     assert n_records > sampled_records
     df = max(n_files // n_reducers, 1)
     paths = [i['name']
              for i in it.islice(file_infos, 0, df * n_reducers, df)]
     break_points = cls.get_break_points(sampled_records // n_reducers,
                                         n_reducers, paths, n_threads)
     vals = [_ for _ in zip(break_points, range(1, n_reducers))]
     selector = Selector(vals)
     bp_path = os.path.join(cls.TMP_DIR, cls.BREAK_POINTS_CACHE_FILE)
     with io.open(bp_path, "wb") as f:
         f.write(srl.private_encode(selector))
     return bp_path
Example #6
 def test_private_serialize(self):
     for obj in [1, 0.4, "Hello", [1, 2, 3], {"key": "value"}]:
         self.assertEqual(obj, srl.private_decode(srl.private_encode(obj)))
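None of the examples show private_encode and private_decode themselves; the test above only establishes that they form a round-trip pair over plain Python objects. A minimal stand-in built on pickle, which is an assumption about the real implementation rather than a copy of it, would be:

 import pickle

 def private_encode(obj):
     # serialize an arbitrary Python object to a byte string
     return pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)

 def private_decode(data):
     # inverse of private_encode
     return pickle.loads(data)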