Example #1
# assumes: import pydoop.pipes as pp; import pydoop.hdfs as hdfs
def __init__(self, context):
    super(Reader, self).__init__()
    self.isplit = pp.InputSplit(context.getInputSplit())
    self.file = hdfs.open(self.isplit.filename)
    self.file.seek(self.isplit.offset)
    self.bytes_read = 0
    if self.isplit.offset > 0:
        # discard the first partial line: it was already read by the
        # reader of the previous split
        discarded = self.file.readline()
        self.bytes_read += len(discarded)
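The offset check follows the usual text-split convention: a reader whose split starts mid-file discards its first (partial) line, because the reader of the previous split reads one line past its own boundary to finish the record it started. A minimal sketch of the companion next() and getProgress() methods, modeled on Pydoop's full word-count example (struct is assumed to be imported):

def next(self):
    if self.bytes_read > self.isplit.length:  # one line past the split end
        return (False, "", "")
    key = struct.pack(">q", self.isplit.offset + self.bytes_read)
    record = self.file.readline()
    if record == "":  # end of file
        return (False, "", "")
    self.bytes_read += len(record)
    return (True, key, record)

def getProgress(self):
    return min(self.bytes_read / float(self.isplit.length), 1.0)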
Example #2
# assumes: import logging; import pydoop.pipes as pp; import pydoop.hdfs as hdfs
def __init__(self, context):
    super(Reader, self).__init__()
    self.logger = logging.getLogger("Reader")
    self.isplit = pp.InputSplit(context.getInputSplit())
    # log the split's coordinates for debugging
    for a in "filename", "offset", "length":
        self.logger.debug("isplit.%s = %r" % (a, getattr(self.isplit, a)))
    self.file = hdfs.open(self.isplit.filename)
    self.logger.debug("readline chunk size = %r" % self.file.chunk_size)
    self.file.seek(self.isplit.offset)
    self.bytes_read = 0
    if self.isplit.offset > 0:
        # discard the first partial line: it was already read by the
        # reader of the previous split
        discarded = self.file.readline()
        self.bytes_read += len(discarded)
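Since this variant opens an HDFS file handle, the matching close() method should release it. A sketch, assuming the pydoop.hdfs file object exposes its filesystem handle as .fs, as Pydoop's examples do:

def close(self):
    self.logger.debug("closing open handles")
    self.file.close()
    self.file.fs.close()  # assumption: .fs is the owning filesystem handle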
Example #3
# assumes: import pydoop.pipes as pp; a module-level "logger" is configured
def __init__(self, context):
    super(SequenceFileReader, self).__init__()
    self.isplit = pp.InputSplit(context.getInputSplit())
    logger.debug("isplit filename: %s", self.isplit.filename)
    logger.debug("isplit offset: %s", self.isplit.offset)
    logger.debug("isplit length: %s", self.isplit.length)
    self.seq_file = _HdfsSequenceFileReader(path=self.isplit.filename,
                                            start=self.isplit.offset,
                                            length=self.isplit.length)

    # instantiate empty key/value objects once; each record is
    # deserialized into them on every read
    key_class = self.seq_file.getKeyClass()
    value_class = self.seq_file.getValueClass()
    self._key = key_class()
    self._value = value_class()
    logger.debug("done initializing pydoop.reader.SequenceFileReader")