def __init__(self, context):
    super(Reader, self).__init__()
    self.isplit = pp.InputSplit(context.getInputSplit())
    self.file = hdfs.open(self.isplit.filename)
    self.file.seek(self.isplit.offset)
    self.bytes_read = 0
    if self.isplit.offset > 0:
        discarded = self.file.readline()  # read by reader of previous split
        self.bytes_read += len(discarded)
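The __init__ above only positions the reader within its input split; records are actually emitted by the reader's next method. The following is a minimal sketch of matching next and getProgress methods, assuming the old pydoop.pipes RecordReader interface (where next returns a (have_record, key, value) triple) and a module-level import of struct:

def next(self):
    if self.bytes_read > self.isplit.length:  # end of input split
        return (False, "", "")
    key = struct.pack(">q", self.isplit.offset + self.bytes_read)
    record = self.file.readline()
    if not record:  # end of file
        return (False, "", "")
    self.bytes_read += len(record)
    return (True, key, record)

def getProgress(self):
    # fraction of the split consumed so far, capped at 1.0
    return min(float(self.bytes_read) / self.isplit.length, 1.0)

Reading one line past the split's end, combined with discarding the first partial line whenever offset > 0, ensures that every line is processed by exactly one reader. The same reader can also be instrumented with debug logging, as in the variant below.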
def __init__(self, context):
    super(Reader, self).__init__()
    self.logger = logging.getLogger("Reader")
    self.isplit = pp.InputSplit(context.getInputSplit())
    for a in "filename", "offset", "length":
        self.logger.debug("isplit.%s = %r" % (a, getattr(self.isplit, a)))
    self.file = hdfs.open(self.isplit.filename)
    self.logger.debug("readline chunk size = %r" % self.file.chunk_size)
    self.file.seek(self.isplit.offset)
    self.bytes_read = 0
    if self.isplit.offset > 0:
        # read by reader of previous split
        discarded = self.file.readline()
        self.bytes_read += len(discarded)
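Since the reader holds an open HDFS file handle, it should also release it once the framework is done with it. A minimal close sketch, assuming the file object returned by hdfs.open exposes a close() method and an fs attribute referencing the underlying filesystem handle:

def close(self):
    self.logger.debug("closing open handles")
    self.file.close()
    self.file.fs.close()

Record readers are not limited to line-oriented text input: the following variant reads Hadoop sequence files instead.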
def __init__(self, context):
    super(SequenceFileReader, self).__init__()
    self.isplit = pp.InputSplit(context.getInputSplit())
    logger.debug("isplit filename: %s", self.isplit.filename)
    logger.debug("isplit offset: %s", self.isplit.offset)
    logger.debug("isplit length: %s", self.isplit.length)
    self.seq_file = _HdfsSequenceFileReader(
        path=self.isplit.filename,
        start=self.isplit.offset,
        length=self.isplit.length,
    )
    key_class = self.seq_file.getKeyClass()
    value_class = self.seq_file.getValueClass()
    self._key = key_class()
    self._value = value_class()
    logger.debug("done initializing pydoop.reader.SequenceFileReader")
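For comparison with the text reader, here is a sketch of how next and getProgress could look in the sequence file case. It assumes _HdfsSequenceFileReader mirrors Hadoop's SequenceFile.Reader interface, i.e., that next(key, val) deserializes the next record into the preallocated key/value instances and returns a false value at end of stream, and that getPosition() reports the current stream offset:

def next(self):
    # the preallocated key/value objects are filled in place, Hadoop-style
    if not self.seq_file.next(self._key, self._value):
        return (False, "", "")  # no more records in this split
    return (True, str(self._key), str(self._value))

def getProgress(self):
    # fraction of the split consumed, from the current stream position
    pos = self.seq_file.getPosition()
    return min(float(pos - self.isplit.offset) / self.isplit.length, 1.0)

Preallocating a single key and value object in __init__ and reusing them for every record avoids one allocation per record, the same design choice Hadoop's own SequenceFile.Reader encourages.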