from MapReduce import MapReduce

mr = MapReduce()


def mapper(record):
    """Emit (word, filename) for every whitespace-separated word.

    record[0] is the source filename, record[1] the document text.
    """
    filename = record[0]
    text = record[1]
    for word in text.split():
        mr.emit_intermediate(word, filename)


def reducer(key, values):
    """Emit (word, de-duplicated list of filenames containing it)."""
    mr.emit((key, list(set(values))))


if __name__ == '__main__':
    import sys
    import json

    # Context manager guarantees the input file is closed; the original
    # left the handle open for the lifetime of the process.
    with open(sys.argv[1]) as inputdata:
        mr.execute(inputdata, mapper, reducer)
    with open('inverted_index.json', 'w') as outfile:
        json.dump(mr.result, outfile)
order_id = record[1] #Mapper will be called on all the orders first(As the data file contains the records of orders before line_items) #As the output of mapper is fed to reducer,all the output to be displayed need to be present in the output of the Mapper. #So emit_intermdiate all the records with order_id as the key #So for every record the id,record is emmited if record_type == "order": mr.emit_intermediate(order_id,record) elif record_type == "line_item": mr.emit_intermediate(order_id,record) #Reducer funtion is called for every record in the output of the map phase (here it is the global dictionary mr.intermediate) def reducer(key,list_of_values): #for all list_of_values first field will be order record and all the others will be line_item records #ie list_of_values[0] will be the orders and list_of_values[1:n] will be the list_item records #Every order is emitted with all the list_items having the same order ID current = 1; while current < len(list_of_values): mr.emit((list_of_values[0],list_of_values[current])) current = current + 1; ##################################################### if __name__ == '__main__': inputdata = open(sys.argv[1]) #execute funtion steps #1.Mapper function for every record (Every mapper appends the output to a global dictionary by calling the functin mr.emit_intermediate ..Line number 18 ) #2.Reducer is called for every record in the output of the map phase (Every reducer appends the data to the global array mr.result) #3.The final result array is printed mr.execute(inputdata, mapper, reducer)
import sys
import json
from math import ceil

from MapReduce import MapReduce

map_reduce_obj = MapReduce()


def mapper(record):
    """Count, within one chunk of baskets, the baskets containing each
    candidate itemset.

    record is a list of baskets. For every candidate read from the file
    named by sys.argv[2] (one JSON list per line), emits
    (candidate_tuple, (support_count_in_chunk, baskets_in_chunk)).
    """
    number_of_baskets = len(record)
    # The original opened the candidate file without ever closing it —
    # once per chunk; the context manager closes the handle.
    # Also fixes a latent NameError: sys/json were used but never imported.
    with open(sys.argv[2]) as candidate_list:
        for line in candidate_list:
            candidate = json.loads(line.strip())
            count = 0
            for basket in record:
                # The candidate is supported by a basket when every one of
                # its items appears in that basket.
                if not set(candidate) - set(basket):
                    count += 1
            map_reduce_obj.emit_intermediate(
                tuple(candidate), (count, number_of_baskets))


def reduce(key, list_of_value):
    """Sum per-chunk counts and emit candidates with >= 30% support."""
    total_count = 0
    total_baskets = 0
    for count, baskets in list_of_value:
        total_count += count
        total_baskets += baskets
    threshold = ceil(total_baskets * 0.3)
    if total_count >= threshold:
        map_reduce_obj.emit([list(key), total_count])


if __name__ == '__main__':
    # Close the input file deterministically (previously leaked).
    with open(sys.argv[1]) as input_data:
        map_reduce_obj.execute(input_data, mapper, reduce)
import sys
from MapReduce import MapReduce

mr = MapReduce()


def mapper(record):
    """Emit the sequence minus its trailing 10 characters, with count 1.

    record is a (document identifier, document contents) pair; the
    identifier itself is ignored.
    """
    _identifier, sequence = record
    trimmed = sequence[:-10]
    mr.emit_intermediate(trimmed, 1)
    # Part 3


def reducer(key, list_of_values):
    """Emit each distinct trimmed sequence once; the counts are unused."""
    mr.emit(key)
    # Part 4


with open(sys.argv[1]) as f:
    mr.execute(f, mapper, reducer)
from MapperMatrixVector import MapperMatrixVector
from ReducerMatrixVector import ReducerMatrixVector
from MapReduce import MapReduce
from FileHelper import FileHelper

# Build the mapper/reducer pair.  The vector is stored inside the mapper
# instance; the matrix is streamed from file.
reducer_instance = ReducerMatrixVector()
mapper_instance = MapperMatrixVector('dataFiles/b')

# File(s) holding the matrix entries.
matrix_files = ['dataFiles/A']

# Run the MapReduce job.
job = MapReduce(mapper_instance, reducer_instance, matrix_files, 0, 1)
result_dict = job.execute()

# Persist the result dictionary.
output_path = 'outputs/' + 'matrixVectorResults.txt'
FileHelper.writeDictionnary(output_path, result_dict)
from MapperCountingWords import MapperCountingWords
from ReducerCountingWords import ReducerCountingWords
from MapReduce import MapReduce
from FileHelper import FileHelper

# Mapper/reducer pair for the word-counting job.
word_mapper = MapperCountingWords()
word_reducer = ReducerCountingWords()

# Pre-process the input: flatten the text file(s) into one word per line.
source_files = ['dataFiles/text']  # more than one text file is allowed
parsed_file = 'dataFiles/textParsed'
FileHelper.transformTextIntoListOfWords(source_files, parsed_file)

# Run the MapReduce job.
job = MapReduce(word_mapper, word_reducer, [parsed_file], silent=-1, nThreads=5)
result_dict = job.execute()

# Persist the result dictionary.
output_path = 'outputs/' + 'coutingWordsResults.txt'
FileHelper.writeDictionnary(output_path, result_dict)
from MapReduce import MapReduce
import itertools
import sys

map_reduce_obj = MapReduce()


def mapper(record):
    """Emit the friendship edge in both directions.

    record is a (person_a, person_b) pair; keying on each endpoint lets
    the reducer see the complete friend list of every person.
    """
    person_a, person_b = record[0], record[1]
    map_reduce_obj.emit_intermediate(person_a, person_b)
    map_reduce_obj.emit_intermediate(person_b, person_a)


def reducer(key, list_of_values):
    """Emit [friend1, friend2, key] for every pair of key's friends.

    Each pair is sorted so the same pair is written consistently no
    matter in which order the two friends arrived.
    """
    # Iterate the combinations lazily instead of materialising the whole
    # list of pairs up front as the original did.
    for first, second in itertools.combinations(list_of_values, 2):
        pair = sorted((first, second))
        pair.append(key)
        map_reduce_obj.emit(pair)


if __name__ == '__main__':
    # Context manager closes the input file; the original leaked the handle.
    with open(sys.argv[1]) as input_data:
        map_reduce_obj.execute(input_data, mapper, reducer)