def test_serialize_to_string(self):
    # round-trip a random sample of (possibly negative) integers
    # through the variable-length integer codec in `srl`
    numbers = random.sample(xrange(-18999289888, 18999289888), 10000)
    for n in numbers:
        s = srl.serialize_to_string(n)
        stream = StringIO(s)
        x = srl.deserialize_vint(stream)
        self.assertEqual(n, x)
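# For reference, a minimal sketch of the encoding `serialize_to_string`
# presumably produces: Hadoop's WritableUtils-style variable-length
# integer (VInt/VLong), which `deserialize_vint` above reads back.
# `write_vlong` is a hypothetical stand-in, not pydoop's actual codec.
import struct


def write_vlong(stream, i):
    # values in [-112, 127] are stored in a single byte
    if -112 <= i <= 127:
        stream.write(struct.pack('b', i))
        return
    length = -112
    if i < 0:
        i ^= -1  # one's complement; the sign is carried by the length byte
        length = -120
    tmp = i
    while tmp != 0:
        tmp >>= 8
        length -= 1
    stream.write(struct.pack('b', length))
    length = -(length + 120) if length < -120 else -(length + 112)
    for idx in range(length, 0, -1):  # payload bytes, big-endian
        stream.write(struct.pack('B', (i >> ((idx - 1) * 8)) & 0xFF))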
def write_map_down_stream(self, file_in, job_conf, num_reducers,
                          authorization=None, input_split=''):
    """
    Prepare a binary file containing the full downward (from Hadoop to
    the pipes program) command flow.

    If `file_in` is not `None`, simulate the behavior of Hadoop's
    `TextLineReader` (FIXME) and add a `mapItem` instruction to the
    command flow for each line of `file_in`.  Otherwise, assume that
    the pipes program will use the `input_split` variable and take
    care of record reading by itself.
    """
    input_key_type = 'org.apache.hadoop.io.LongWritable'
    input_value_type = 'org.apache.hadoop.io.Text'
    piped_input = file_in is not None
    self.tempf = tempfile.NamedTemporaryFile('r+', prefix='pydoop-tmp')
    f = self.tempf.file
    self.logger.debug('writing map input data to %s', f.name)
    down_stream = BinaryWriter(f)
    self.write_header_down_stream(down_stream, authorization, job_conf)
    down_stream.send('runMap', input_split, num_reducers, piped_input)
    if piped_input:
        down_stream.send('setInputTypes', input_key_type, input_value_type)
        pos = file_in.tell()
        for l in file_in:
            self.logger.debug("Line: %s", l)
            k = serialize_to_string(pos)  # key = byte offset of the line
            down_stream.send('mapItem', k, l)
            pos = file_in.tell()
    down_stream.send('close')
    self.logger.debug('done writing, rewinding')
    f.seek(0)
    return f
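# Usage sketch (hedged: `sim` stands for an instance of the class that
# defines write_map_down_stream, and `job_conf` for a suitable job
# configuration -- both are assumptions, not part of the code above):
from StringIO import StringIO

text = StringIO("first line\nsecond line\n")
f = sim.write_map_down_stream(text, job_conf, num_reducers=1)
# f now contains, in order: the header commands (written by
# write_header_down_stream), runMap, setInputTypes, one mapItem per
# input line keyed by the line's byte offset, and finally close.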
def next(self):
    if self.bytes_read > self.isplit.length:
        raise StopIteration
    key = serialize_to_string(self.isplit.offset + self.bytes_read)
    record = self.file.readline()
    if record == "":  # end of file
        raise StopIteration
    self.bytes_read += len(record)
    return (key, record)
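# For context, a minimal sketch of the reader this `next` belongs to,
# assuming `isplit` exposes `offset` and `length` as the code above
# implies (names are illustrative, not pydoop's actual class):
class TrivialLineReader(object):

    def __init__(self, file_, isplit):
        self.file = file_
        self.isplit = isplit
        self.bytes_read = 0
        self.file.seek(isplit.offset)
        if isplit.offset > 0:
            # a partial first line belongs to the previous split, whose
            # reader overruns its boundary by exactly one record
            self.bytes_read += len(self.file.readline())

    def __iter__(self):
        return self

    # next() as defined above: the record straddling the split's end is
    # still returned; StopIteration is raised on the following call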
def write_map_down_stream(self, file_in, job_conf, num_reducers,
                          authorization=None, input_split=''):
    """
    Prepare a binary file containing the full downward (from Hadoop to
    the pipes program) command flow.

    If `file_in` is not `None`, simulate the behavior of Hadoop's
    `TextLineReader` (FIXME) and add a `mapItem` instruction to the
    command flow for each line of `file_in`.  Otherwise, assume that
    the pipes program will use the `input_split` variable and take
    care of record reading by itself.
    """
    input_key_type = 'org.apache.hadoop.io.LongWritable'
    input_value_type = 'org.apache.hadoop.io.Text'
    piped_input = file_in is not None
    self.tempf = tempfile.NamedTemporaryFile('r+', prefix='pydoop-tmp')
    f = self.tempf.file
    self.logger.debug('writing map input data to %s', f.name)
    down_stream = BinaryWriter(f)
    self.write_header_down_stream(down_stream, authorization, job_conf)
    down_stream.send('runMap', input_split, num_reducers, piped_input)
    if piped_input:
        down_stream.send('setInputTypes', input_key_type, input_value_type)
        if AVRO_INPUT in job_conf:
            # missing parts (key or value) serialize to the empty string
            serializers = defaultdict(lambda: lambda r: '')
            avro_input = job_conf[AVRO_INPUT].upper()
            reader = get_avro_reader(file_in)
            if avro_input == 'K' or avro_input == 'KV':
                serializer = AvroSerializer(
                    job_conf.get(AVRO_KEY_INPUT_SCHEMA)
                )
                serializers['K'] = serializer.serialize
            if avro_input == 'V' or avro_input == 'KV':
                serializer = AvroSerializer(
                    job_conf.get(AVRO_VALUE_INPUT_SCHEMA)
                )
                serializers['V'] = serializer.serialize
            for record in reader:
                if avro_input == 'KV':
                    record_k = record['key']
                    record_v = record['value']
                else:
                    record_v = record_k = record
                down_stream.send(
                    'mapItem',
                    serializers['K'](record_k),
                    serializers['V'](record_v),
                )
        else:
            pos = file_in.tell()
            for l in file_in:
                self.logger.debug("Line: %s", l)
                k = serialize_to_string(pos)  # key = byte offset of the line
                down_stream.send('mapItem', k, l)
                pos = file_in.tell()
    down_stream.send('close')
    self.logger.debug('done writing, rewinding')
    f.seek(0)
    return f
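# To exercise the Avro branch, `file_in` must be an Avro container
# file.  A hedged sketch with the avro Python package: the "KV" record
# layout is an assumption matching the record['key'] / record['value']
# access above.
import avro.schema
from avro.datafile import DataFileWriter
from avro.io import DatumWriter

KV_SCHEMA = avro.schema.parse("""
{"type": "record", "name": "KV", "fields": [
    {"name": "key", "type": "string"},
    {"name": "value", "type": "long"}
]}
""")

with open('kv.avro', 'wb') as fp:
    writer = DataFileWriter(fp, DatumWriter(), KV_SCHEMA)
    writer.append({"key": "foo", "value": 1})
    writer.append({"key": "bar", "value": 2})
    writer.close()
# With job_conf[AVRO_INPUT] == 'KV' and the key/value input schemas set
# accordingly, each record is split into its 'key' and 'value' fields
# before being sent as a mapItem.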