Example #1
 def test_serialize_to_string(self):
     numbers = random.sample(xrange(-18999289888, 18999289888), 10000)
     for n in numbers:
         s = srl.serialize_to_string(n)
         stream = StringIO(s)
         x = srl.deserialize_vint(stream)
         self.assertEqual(n, x)
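The test shows the contract these helpers follow: `serialize_to_string` turns an integer into its variable-length ("vint") byte-string encoding, and `deserialize_vint` reads the value back from any file-like stream. A minimal standalone round trip, with the imports the snippet leaves out (the `pydoop.utils.serialize` module path and the `StringIO` flavour are assumptions inferred from the `srl` alias and the Python 2 style of the test):

 # Minimal round-trip sketch (Python 2); the module path is an assumption.
 from StringIO import StringIO
 import pydoop.utils.serialize as srl  # assumed home of serialize_to_string

 value = -18999289888
 encoded = srl.serialize_to_string(value)           # vint-encoded byte string
 decoded = srl.deserialize_vint(StringIO(encoded))
 assert decoded == value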
Example #2
 def write_map_down_stream(self, file_in, job_conf, num_reducers,
                           authorization=None, input_split=''):
     """
     Prepares a binary file with all the downward (from hadoop to the
     pipes program) command flow. If `file_in` is `not None`, it will
     simulate the behavior of hadoop `TextLineReader` FIXME and add to
     the command flow a mapItem instruction for each line of `file_in`.
     Otherwise, it assumes that the pipes program will use the
     `input_split` variable and take care of record reading by itself.
     """
     input_key_type = 'org.apache.hadoop.io.LongWritable'
     input_value_type = 'org.apache.hadoop.io.Text'
     piped_input = file_in is not None
     self.tempf = tempfile.NamedTemporaryFile('r+', prefix='pydoop-tmp')
     f = self.tempf.file
     self.logger.debug('writing map input data to %s', f.name)
     down_stream = BinaryWriter(f)
     self.write_header_down_stream(down_stream, authorization, job_conf)
     down_stream.send('runMap', input_split, num_reducers, piped_input)
     if piped_input:
         down_stream.send('setInputTypes', input_key_type, input_value_type)
         pos = file_in.tell()
         for l in file_in:
             self.logger.debug("Line: %s", l)
             k = serialize_to_string(pos)
             down_stream.send('mapItem', k, l)
             pos = file_in.tell()
         down_stream.send('close')
     self.logger.debug('\tdone writing, rewinding')
     f.seek(0)
     return f
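In the piped-input branch above, each line of `file_in` becomes one mapItem command whose key is the line's starting byte offset, serialized with `serialize_to_string`. A self-contained sketch of just that keying loop, collecting pairs in a list instead of sending them through a `BinaryWriter` and keeping the offsets as plain ints:

 from StringIO import StringIO

 file_in = StringIO("first line\nsecond line\n")
 items = []
 pos = file_in.tell()
 for line in file_in:  # StringIO keeps tell() accurate during iteration
     items.append((pos, line))  # real code sends serialize_to_string(pos), line
     pos = file_in.tell()
 print items  # [(0, 'first line\n'), (11, 'second line\n')]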
Example #3
 def next(self):
     # stop after we have read past the end of this input split; the record
     # that crosses the split boundary is still emitted
     if self.bytes_read > self.isplit.length:
         raise StopIteration
     key = serialize_to_string(self.isplit.offset + self.bytes_read)
     record = self.file.readline()
     if record == "":  # end of file
         raise StopIteration
     self.bytes_read += len(record)
     return (key, record)
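The `next` method above is the record-producing half of a Python record reader: it yields `(serialized byte offset, line)` pairs until it has read past the end of its input split. A self-contained sketch of the surrounding protocol (class and attribute names mirror the snippet but are otherwise assumptions; the key is kept as a plain int instead of calling `serialize_to_string`):

 from StringIO import StringIO

 class Split(object):  # stand-in for the isplit object used above
     def __init__(self, offset, length):
         self.offset, self.length = offset, length

 class LineReader(object):
     def __init__(self, f, isplit):
         self.file, self.isplit = f, isplit
         self.file.seek(isplit.offset)
         self.bytes_read = 0

     def __iter__(self):
         return self

     def next(self):  # same logic as the example, minus key serialization
         if self.bytes_read > self.isplit.length:
             raise StopIteration
         key = self.isplit.offset + self.bytes_read
         record = self.file.readline()
         if record == "":  # end of file
             raise StopIteration
         self.bytes_read += len(record)
         return (key, record)

 for k, v in LineReader(StringIO("a\nbb\nccc\n"), Split(0, 4)):
     print k, repr(v)  # 0 'a\n', then 2 'bb\n' (the record that crosses the
                       # split boundary is still emitted), then iteration stops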
Example #4
    def write_map_down_stream(self, file_in, job_conf, num_reducers,
                              authorization=None, input_split=''):
        """
        Prepares a binary file with all the downward (from hadoop to the
        pipes program) command flow. If `file_in` is `not None`, it will
        simulate the behavior of hadoop `TextLineReader` FIXME and add to
        the command flow a mapItem instruction for each line of `file_in`.
        Otherwise, it assumes that the pipes program will use the
        `input_split` variable and take care of record reading by itself.
        """
        input_key_type = 'org.apache.hadoop.io.LongWritable'
        input_value_type = 'org.apache.hadoop.io.Text'
        piped_input = file_in is not None
        self.tempf = tempfile.NamedTemporaryFile('r+', prefix='pydoop-tmp')
        f = self.tempf.file
        self.logger.debug('writing map input data to %s', f.name)
        down_stream = BinaryWriter(f)
        self.write_header_down_stream(down_stream, authorization, job_conf)
        down_stream.send('runMap', input_split, num_reducers, piped_input)
        if piped_input:
            down_stream.send('setInputTypes', input_key_type, input_value_type)
            if AVRO_INPUT in job_conf:
                # any side (key or value) that is not Avro-encoded falls back
                # to a serializer returning the empty string
                serializers = defaultdict(lambda: lambda r: '')
                avro_input = job_conf[AVRO_INPUT].upper()
                reader = get_avro_reader(file_in)

                if avro_input == 'K' or avro_input == 'KV':
                    serializer = AvroSerializer(
                        job_conf.get(AVRO_KEY_INPUT_SCHEMA)
                    )
                    serializers['K'] = serializer.serialize

                if avro_input == 'V' or avro_input == 'KV':
                    serializer = AvroSerializer(
                        job_conf.get(AVRO_VALUE_INPUT_SCHEMA)
                    )
                    serializers['V'] = serializer.serialize

                for record in reader:
                    if avro_input == 'KV':
                        record_k = record['key']
                        record_v = record['value']
                    else:
                        record_v = record_k = record

                    down_stream.send(
                        'mapItem',
                        serializers['K'](record_k),
                        serializers['V'](record_v),
                    )

            else:
                pos = file_in.tell()
                for l in file_in:
                    self.logger.debug("Line: %s", l)
                    k = serialize_to_string(pos)
                    down_stream.send('mapItem', k, l)
                    pos = file_in.tell()
            down_stream.send('close')
        self.logger.debug('done writing, rewinding')
        f.seek(0)
        return f
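The Avro branch above picks one serializer per side (key and/or value) according to the `AVRO_INPUT` mode ('K', 'V' or 'KV'), with any side that is not Avro-encoded falling back to a serializer that returns the empty string. A self-contained sketch of that selection logic, using a dummy serializer factory in place of `AvroSerializer` (which needs a real Avro schema):

 from collections import defaultdict

 def pick_serializers(avro_input, make_serializer):
     # sides not explicitly configured serialize to the empty string
     serializers = defaultdict(lambda: lambda r: '')
     if avro_input in ('K', 'KV'):
         serializers['K'] = make_serializer('key schema')
     if avro_input in ('V', 'KV'):
         serializers['V'] = make_serializer('value schema')
     return serializers

 dummy = lambda schema: (lambda record: '<%s:%r>' % (schema, record))
 s = pick_serializers('V', dummy)
 print repr(s['K']({'x': 1}))  # '' -- the key side uses the empty default
 print repr(s['V']({'x': 1}))  # "<value schema:{'x': 1}>"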