import struct

from pydoop.mapreduce.pipes import run_task, Factory
from pydoop.mapreduce.api import Mapper, Reducer


class FilterMapper(Mapper):
    """
    Read (destination, follower count) records and funnel them all to a
    single reducer by emitting them under one constant key.
    """
    def map(self, context):
        dst, cnt = context.key, context.value
        cnt = struct.unpack(">i", cnt)[0]  # value is a big-endian 32-bit int
        context.emit(0, (dst, cnt))


class FilterReducer(Reducer):
    """
    Rank all (destination, follower count) pairs by count and emit the
    top 50.
    """
    def reduce(self, context):
        counts = {dst: cnt for dst, cnt in context.values}
        ranked = sorted(counts.items(), key=lambda t: t[1], reverse=True)
        # Slicing guards against inputs with fewer than 50 records, which
        # would make a fixed range(50) loop raise IndexError.
        for dst, cnt in ranked[:50]:
            context.emit(dst, cnt)


if __name__ == "__main__":
    factory = Factory(FilterMapper, FilterReducer)
    run_task(factory)
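
Since only the 50 largest counts are needed, the reduce body can also be written with a heap-based selection instead of a full sort. A minimal standard-library sketch of that alternative, reusing the imports above and assuming each destination appears at most once in the input (as the upstream counting job guarantees):

import heapq


class TopKFilterReducer(Reducer):
    """Same job as FilterReducer, but selects the top 50 with a heap."""

    def reduce(self, context):
        # nlargest keeps at most 50 items in memory rather than
        # materializing and sorting the whole value list.
        for dst, cnt in heapq.nlargest(50, context.values, key=lambda t: t[1]):
            context.emit(dst, cnt)
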
"""
Count the followers of each node.

Input: a directed graph given as "src dst" edges,
       e.g. "3 4" means that person 3 follows person 4.
Output: (destination, follower count),
        e.g. "4 2" means that node 4 has 2 followers.
"""
from pydoop.mapreduce.pipes import run_task, Factory
from pydoop.mapreduce.api import Mapper, Reducer


class DstCountMapper(Mapper):

    def map(self, context):
        # Emit the destination of each "src dst" edge with a count of one.
        dst = context.value.split()[1]
        context.emit(dst, "1")


class DstCountReducer(Reducer):

    def reduce(self, context):
        # Sum the per-edge counts for each destination.  With
        # auto_serialize=False the values arrive as strings, so convert
        # before summing and emit the result back as a string.
        s = sum(int(v) for v in context.values)
        context.emit(context.key, str(s))


if __name__ == "__main__":
    factory = Factory(DstCountMapper, DstCountReducer)
    run_task(factory, auto_serialize=False)
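
The follower-count flow can be sanity-checked without a cluster. A self-contained sketch in plain Python (the simulate helper is illustrative scaffolding, not a pydoop API):

from collections import defaultdict


def simulate(edges):
    # Map phase: one (dst, 1) record per "src dst" edge.
    mapped = [(line.split()[1], 1) for line in edges]
    # Shuffle phase: group values by key, as the framework would.
    groups = defaultdict(list)
    for key, value in mapped:
        groups[key].append(value)
    # Reduce phase: sum the counts per destination.
    return {dst: sum(counts) for dst, counts in groups.items()}


print(simulate(["1 4", "2 4", "3 1"]))  # {'4': 2, '1': 1}
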
import re
import logging

from pydoop.mapreduce.pipes import run_task, Factory
from pydoop.mapreduce.api import Mapper, Reducer

logging.basicConfig()
LOGGER = logging.getLogger(__name__)


class TMapper(Mapper):

    def __init__(self, ctx):
        super(TMapper, self).__init__(ctx)
        self.ctx = ctx
        LOGGER.info("Mapper instantiated")

    def map(self, ctx):
        # Split the input line into alphanumeric words.
        words = re.sub('[^0-9a-zA-Z]+', ' ', ctx.value).split()
        for w in words:
            ctx.emit(w, 1)


class TReducer(Reducer):

    def __init__(self, ctx):
        super(TReducer, self).__init__(ctx)
        self.ctx = ctx
        LOGGER.info("Reducer instantiated")

    def reduce(self, ctx):
        s = sum(ctx.values)
        # Note: we explicitly write the value as a str.
        ctx.emit(ctx.key, str(s))


FACTORY = Factory(mapper_class=TMapper, reducer_class=TReducer)


def main():
    run_task(FACTORY)


if __name__ == "__main__":
    main()
import struct

from pydoop.mapreduce.pipes import run_task, Factory
from pydoop.mapreduce.api import Mapper, Reducer


class FilterMapper(Mapper):
    """
    Process a wordcount output stream, emitting only records relative to
    words whose count is equal to or above the configured threshold.
    """
    def __init__(self, context):
        super(FilterMapper, self).__init__(context)
        jc = context.job_conf
        self.threshold = jc.get_int("filter.occurrence.threshold")

    def map(self, context):
        word, occurrence = context.key, context.value
        occurrence = struct.unpack(">i", occurrence)[0]
        if occurrence >= self.threshold:
            context.emit(word, str(occurrence))


class FilterReducer(Reducer):

    def reduce(self, context):
        # Pass the filtered records through unchanged; a do-nothing
        # reduce would swallow the mapper's output and emit nothing.
        for value in context.values:
            context.emit(context.key, value)


if __name__ == "__main__":
    run_task(Factory(FilterMapper, FilterReducer))
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import struct
import re

from pydoop.mapreduce.pipes import run_task, Factory
from pydoop.mapreduce.api import Mapper, Reducer


class WordCountMapper(Mapper):

    def map(self, context):
        # Split the input line into alphanumeric words.
        words = re.sub('[^0-9a-zA-Z]+', ' ', context.value).split()
        for w in words:
            context.emit(w, 1)


class WordCountReducer(Reducer):

    def reduce(self, context):
        s = sum(context.values)
        # Write the count as a big-endian 32-bit integer, the format the
        # downstream filter jobs unpack with struct.unpack(">i", ...).
        context.emit(context.key, struct.pack(">i", s))


if __name__ == "__main__":
    run_task(Factory(WordCountMapper, WordCountReducer))
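
The ">i" pack format is what ties this job to the filter mappers in this section, which unpack the same four bytes. A quick standard-library round trip shows the encoding:

import struct

packed = struct.pack(">i", 42)         # 4 bytes, big-endian: b'\x00\x00\x00*'
print(packed)
print(struct.unpack(">i", packed)[0])  # back to 42
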
""" Filter out words whose occurrence falls below a specified value. """ import struct from pydoop.mapreduce.pipes import run_task, Factory from pydoop.mapreduce.api import Mapper class FilterMapper(Mapper): """ Process a wordcount output stream, emitting only records relative to words whose count is equal to or above the configured threshold. """ def __init__(self, context): super(FilterMapper, self).__init__(context) jc = context.job_conf self.threshold = jc.get_int("filter.occurrence.threshold") def map(self, context): word, occurrence = context.key, context.value occurrence = struct.unpack(">i", occurrence)[0] if occurrence >= self.threshold: context.emit(word, str(occurrence)) if __name__ == "__main__": factory = Factory(FilterMapper) run_task(factory, raw_values=True)
# under the License.
#
# END_COPYRIGHT

from pydoop.mapreduce.pipes import run_task, Factory
from pydoop.mapreduce.api import Mapper, Reducer


class WordCountMapper(Mapper):

    def map(self, context):
        # With auto_serialize=False, emitted keys and values must already
        # be strings, so the per-word count is emitted as "1".
        for w in context.value.split():
            context.emit(w, "1")


class WordCountReducer(Reducer):

    def reduce(self, context):
        # Values arrive as strings; convert before summing, then write
        # the total back out as a string.
        s = sum(int(v) for v in context.values)
        context.emit(context.key, str(s))


if __name__ == "__main__":
    factory = Factory(WordCountMapper, WordCountReducer)
    run_task(factory, auto_serialize=False)
from pydoop.mapreduce.pipes import run_task, Factory


def main():
    # Mapper and Reducer are the job's concrete classes (defined elsewhere
    # in this module); reusing the reducer as a combiner pre-aggregates
    # map output locally before the shuffle.
    return run_task(Factory(Mapper, Reducer, combiner_class=Reducer))
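
For context, here is one way the surrounding module could look: a minimal wordcount-style job in which the reducer doubles as the combiner. This is an illustrative reconstruction, not the original module the fragment came from:

from pydoop.mapreduce.pipes import run_task, Factory
from pydoop.mapreduce import api


class Mapper(api.Mapper):

    def map(self, context):
        for w in context.value.split():
            context.emit(w, 1)


class Reducer(api.Reducer):

    def reduce(self, context):
        # Summing is associative and commutative, so the same class can
        # safely run as a combiner on map output and as the final reducer.
        context.emit(context.key, sum(context.values))


def main():
    return run_task(Factory(Mapper, Reducer, combiner_class=Reducer))


if __name__ == "__main__":
    main()
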