# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

"""
This example includes only the bare minimum required to run wordcount.
See wordcount-full.py for an example that uses counters, RecordReader, etc.
"""

import pydoop.pipes as pp


class Mapper(pp.Mapper):

    def map(self, context):
        words = context.getInputValue().split()
        for w in words:
            context.emit(w, "1")


class Reducer(pp.Reducer):

    def reduce(self, context):
        s = 0
        while context.nextValue():
            s += int(context.getInputValue())
        context.emit(context.getInputKey(), str(s))


if __name__ == "__main__":
    pp.runTask(pp.Factory(Mapper, Reducer))
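# --- A minimal local sketch (not part of the original example) of the
# dataflow the script above relies on: the framework calls map() once per
# input record, groups the emitted pairs by key, then calls reduce() once
# per group.  simulate_wordcount is a hypothetical helper, pure Python,
# no Hadoop needed.

from collections import defaultdict


def simulate_wordcount(lines):
    groups = defaultdict(list)
    for line in lines:
        for w in line.split():  # same tokenization as Mapper.map
            groups[w].append("1")  # the framework groups values by key
    # same summation as Reducer.reduce
    return dict((k, str(sum(int(v) for v in vs))) for k, vs in groups.items())

# e.g. simulate_wordcount(["a b a", "b c"]) -> {'a': '2', 'b': '2', 'c': '1'}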
        self.file.fs.close()

    def emit(self, key, value):
        self.file.write("%s%s%s\n" % (key, self.sep, value))


class Partitioner(pp.Partitioner):

    def __init__(self, context):
        super(Partitioner, self).__init__(context)
        self.logger = logging.getLogger("Partitioner")

    def partition(self, key, numOfReduces):
        reducer_id = (hash(key) & sys.maxint) % numOfReduces
        self.logger.debug("reducer_id: %r" % reducer_id)
        return reducer_id


if __name__ == "__main__":
    pp.runTask(pp.Factory(
        Mapper, Reducer,
        record_reader_class=Reader,
        record_writer_class=Writer,
        partitioner_class=Partitioner,
        combiner_class=Reducer,
    ))

# Local Variables:
# mode: python
# End:
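# --- A small illustration (not in the original) of the masking idiom used in
# partition() above.  Assumes Python 2, where sys.maxint exists.  ANDing with
# sys.maxint clears the sign bit, so even a negative hash() maps to an index
# in [0, numOfReduces), mirroring Java's (hash & Integer.MAX_VALUE) % n idiom.

import sys

for h in (12345, -12345):
    print h, (h & sys.maxint) % 4  # both masked values are non-negative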
RecordReader, etc.
"""

import re

import pydoop.pipes as pp


class Mapper(pp.Mapper):

    def __init__(self, context):
        super(Mapper, self).__init__(context)
        print context  # debugging: dump the task context at construction time

    def map(self, context):
        words = re.sub('[^0-9a-zA-Z]+', ' ', context.getInputValue()).split()
        for w in words:
            context.emit(w, "1")


class Reducer(pp.Reducer):

    def __init__(self, context):
        super(Reducer, self).__init__(context)
        print "Reduce"  # debugging: announce reducer construction

    def reduce(self, context):
        s = 0
        while context.nextValue():
            s += int(context.getInputValue())
        context.emit(context.getInputKey(), str(s))


if __name__ == "__main__":
    pp.runTask(pp.Factory(mapper_class=Mapper, reducer_class=Reducer))
def run_task():
    return pp.runTask(pp.Factory(Mapper, Reducer))
        super(Mapper, self).__init__(context)
        context.setStatus("Initialization started")
        self.excluded_counter = context.getCounter("IPCOUNT", "EXCLUDED_LINES")
        jc = context.getJobConf()
        pu.jc_configure(self, jc, "ipcount.excludes", "excludes_fn", "")
        if self.excludes_fn:
            with open(self.excludes_fn) as f:
                self.excludes = set(l.strip() for l in f if not l.isspace())
        else:
            self.excludes = set()
        context.setStatus("Initialization done")

    def map(self, context):
        ip = context.getInputValue().split(None, 1)[0]
        if ip not in self.excludes:
            context.emit(ip, "1")
        else:
            context.incrementCounter(self.excluded_counter, 1)


class Reducer(pp.Reducer):

    def reduce(self, context):
        s = 0
        while context.nextValue():
            s += int(context.getInputValue())
        context.emit(context.getInputKey(), str(s))


if __name__ == "__main__":
    pp.runTask(pp.Factory(Mapper, Reducer, combiner_class=Reducer))
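# --- A rough sketch (not the actual pydoop.utils implementation) of what the
# pu.jc_configure call above does: copy a job conf property onto the instance
# as an attribute, falling back to a default when the property is unset.
# jc_configure_sketch is a hypothetical name; the hasKey()/get() accessors are
# assumed from the pipes JobConf interface.

def jc_configure_sketch(obj, jc, key, attr, default):
    value = jc.get(key) if jc.hasKey(key) else default
    setattr(obj, attr, value)  # e.g. sets self.excludes_fn above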
        super(Reader, self).__init__()
        self.isplit = pp.InputSplit(context.getInputSplit())
        self.file = hdfs.open(self.isplit.filename)
        self.file.seek(self.isplit.offset)
        self.bytes_read = 0
        if self.isplit.offset > 0:
            discarded = self.file.readline()  # read by reader of previous split
            self.bytes_read += len(discarded)

    def close(self):
        self.file.close()
        self.file.fs.close()

    def next(self):
        if self.bytes_read > self.isplit.length:  # end of input split
            return (False, "", "")
        key = struct.pack(">q", self.isplit.offset + self.bytes_read)
        record = self.file.readline()
        if record == "":  # end of file
            return (False, "", "")
        self.bytes_read += len(record)
        return (True, key, record)

    def getProgress(self):
        return min(float(self.bytes_read) / self.isplit.length, 1.0)


if __name__ == "__main__":
    pp.runTask(pp.Factory(Mapper, Reducer, record_reader_class=Reader))
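# --- Illustration (not in the original): the key emitted by next() above is
# the record's byte offset packed as a big-endian signed 64-bit integer, i.e.
# the raw form of the LongWritable key that Hadoop's standard Java
# LineRecordReader produces.

import struct

key = struct.pack(">q", 1024)
# key is an 8-byte string: '\x00\x00\x00\x00\x00\x00\x04\x00'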