def reducer_count(k, vs):
    """Count the occurrences of the key.

    Every value in ``vs`` is consumed but its content is ignored; only the
    number of values matters. The count is emitted under ``k`` as a string
    (``"0"`` when ``vs`` yields nothing).
    """
    occurrences = sum(1 for _ in vs)
    emit(k, str(occurrences))
def reducer_simplesum(k, vs):
    """Sum the counts of the key.

    Each value in ``vs`` is converted with ``float()`` and added to a running
    total, which is emitted under ``k`` as a string.
    """
    # Start from the integer 0 so that an empty value stream emits "0".
    total = 0
    # Plain left-to-right accumulation: keeps the floating-point rounding
    # order exactly as written.
    for raw in vs:
        total = total + float(raw)
    emit(k, str(total))
def step_cleanup(self):
    """Compute the last record and emit the accumulated data.

    Calls ``self._compute()`` one final time, then walks ``self._output`` and
    emits every entry: a list value is emitted as its items joined by tabs
    (each item converted with ``str()``); any other value is emitted as
    ``str(value)``.
    """
    # Last record for a multi-line record. The processing is duplicated for
    # a single-line record.
    self._compute()
    for key, value in self._output.iteritems():
        # isinstance() instead of an exact type comparison, so list
        # subclasses are tab-joined too rather than emitted as a repr.
        if isinstance(value, list):
            emit(key, "\t".join(map(str, value)))
        else:
            emit(key, str(value))
def step_cleanup(self):
    """Compute the last record and emit the accumulated data.

    Runs ``self._compute()`` once more, then iterates over ``self._output``
    and emits each key. List values are emitted as tab-separated fields
    (every item passed through ``str()``); all other values are emitted as
    ``str(value)``.
    """
    # Last record for a multi-line record. The processing is duplicated for
    # a single-line record.
    self._compute()
    for key, value in self._output.iteritems():
        # isinstance() also accepts list subclasses, which an exact
        # type(v) == type([]) comparison would emit as a repr string.
        if isinstance(value, list):
            emit(key, "\t".join(map(str, value)))
        else:
            emit(key, str(value))
def reducer(k, vs):
    """Tag every value of the key with a suffix chosen from its last field.

    The text after the last tab of each value is looked up in a small table
    ('102' -> 'A', '103' -> 'B', '241' -> 'C'); anything else maps to 'Z'.
    The value is then emitted under ``k`` as ``value + "#" + suffix``.
    """
    suffix_by_pid = {'102': 'A', '103': 'B', '241': 'C'}
    for line in vs:
        # rpartition yields the text after the last tab, or the whole
        # string when there is no tab at all.
        last_field = line.rpartition("\t")[2]
        emit(k, line + "#" + suffix_by_pid.get(last_field, 'Z'))
def reducer_listsum(k, vs):
    """Sum the list values of the key, column by column.

    Each value in ``vs`` is a tab-separated line of numbers. The length of
    the list is taken from the first data line; every later line is added
    element-wise over that many columns. The totals are emitted under ``k``
    as a tab-joined string (an empty string when ``vs`` yields nothing).

    Note that vs is of type hceutil.ReduceValues, which is not subscriptable,
    so it is only ever iterated.

    Raises:
        ValueError: if a field cannot be converted with ``float()``.
        IndexError: if a later line has fewer columns than the first one.
    """
    totals = None  # becomes the first parsed row; None means "no row yet"
    width = 0
    for v in vs:
        # Build a real list: the accumulator is measured with len(),
        # indexed and updated in place, which a lazy map() object does
        # not support.
        delta = [float(field) for field in v.split("\t")]
        if totals is None:
            totals = delta
            width = len(totals)
        else:
            # Index by the first row's width on purpose: extra columns in
            # later rows are ignored, missing ones raise IndexError.
            for i in range(width):
                totals[i] += delta[i]
    emit(k, "\t".join(map(str, totals or [])))
def reducer_listsum(k, vs):
    """Sum the list values of the key, column by column.

    Each value in ``vs`` is a tab-separated line of numbers. The length of
    the list is taken from the first data line; every later line is added
    element-wise over that many columns. The totals are emitted under ``k``
    as a tab-joined string (an empty string when ``vs`` yields nothing).

    Note that vs is of type hceutil.ReduceValues, which is not subscriptable,
    so it is only ever iterated.

    Raises:
        ValueError: if a field cannot be converted with ``float()``.
        IndexError: if a later line has fewer columns than the first one.
    """
    totals = None  # becomes the first parsed row; None means "no row yet"
    width = 0
    for v in vs:
        # Build a real list: the accumulator is measured with len(),
        # indexed and updated in place, which a lazy map() object does
        # not support.
        delta = [float(field) for field in v.split("\t")]
        if totals is None:
            totals = delta
            width = len(totals)
        else:
            # Index by the first row's width on purpose: extra columns in
            # later rows are ignored, missing ones raise IndexError.
            for i in range(width):
                totals[i] += delta[i]
    emit(k, "\t".join(map(str, totals or [])))
def mapper(k, v):
    """Map one text line to a (baiduid, fields) record.

    k == None, v is a text line.

    The line is parsed with the module-level ``rec``. When parsing succeeds,
    the ``baiduid`` attribute is not '-', and the ``pid`` entry of
    ``urlfields`` (default '0') is one of "102", "103" or "241", the mapper
    emits ``baiduid`` with the tab-joined timesz, ip, url, referer and pid.
    All other lines are skipped silently.

    A ValueError raised anywhere during this processing causes the raw line
    to be printed to stderr and the module-level ``nbad`` counter to be
    incremented. The function returns True in every case.
    """
    global rec, nbad
    try:
        if not rec.parseLine(v):
            return True

        baiduid = rec.attr("baiduid")
        if baiduid == '-':
            return True

        pid = rec.attr("urlfields").get("pid", '0')
        if pid not in ("102", "103", "241"):
            return True

        timestamp = rec.attr('timesz')
        client_ip = rec.attr('ip')
        url = rec.attr('url')
        referer = rec.attr('referer')
        fields = [timestamp, client_ip, url, referer, pid]
        emit(baiduid, "\t".join(fields))
        return True
    except ValueError:
        print >> sys.stderr, v
        nbad += 1
        return True
def reducer_cat(k, vs):
    """Simply cat: emit every value of the key, one emit per value.

    Each value in ``vs`` is converted with ``str()`` and emitted under ``k``.
    """
    for item in vs:
        text = str(item)
        emit(k, text)
def reducer_cat(k, vs):
    """Simply cat: pass each value of the key through unchanged in order.

    Every value in ``vs`` is turned into a string with ``str()`` and emitted
    under ``k``, one emit call per value.
    """
    for item in vs:
        as_text = str(item)
        emit(k, as_text)