def do_job(pages_num, index, processed, to_do, another_to_do, letter, robot):
    mapper = MapReduce()
    while pages_num >= 0:
        try:
            url = to_do.get_nowait()
        except queue.Empty:
            break
        else:
            if UrlProcessor.get_name(url)[0] in letter:
                if url in processed or not robot.is_allowed(url):
                    continue
                processed.append(url)
                parsed_page = UrlProcessor.get_parsed_page(url)
                paper_link = UrlProcessor.check_arxiv(url)
                if not paper_link:
                    word_counts = mapper([url])
                    word_counts.sort(key=operator.itemgetter(1))
                    word_counts.reverse()
                    index[url] = word_counts
                else:
                    save_structured()
                pages_num -= 1
                for url_new in UrlProcessor.get_links(url, parsed_page):
                    to_do.put(url_new)
            else:
                another_to_do.put(url)
    return True
def main():
    description = ('%s\n%s' % (__author__, __description__))
    epilog = ('%s\n%s' % (__credits__, __copyright__))
    parser = argparse.ArgumentParser(description=description, epilog=epilog)
    parser.add_argument('-i', '--input', dest='input',
                        help='Text file in input', type=str, required=True)
    parser.add_argument('-t', '--top', dest='top',
                        help='Maximum number of words to print out',
                        type=int, default=50)
    options = parser.parse_args()

    num_words = options.top
    data = read_data(options.input)

    mapreduce = MapReduce(mapper, reducer)
    word_counts = mapreduce(data)
    word_counts.sort(key=operator.itemgetter(1))
    word_counts.reverse()

    print('\nTOP %d WORDS BY FREQUENCY\n' % num_words)
    show = word_counts[:num_words]
    longest = max(len(word) for word, count in show)
    for word, count in show:
        print('%-*s: %5s' % (longest + 1, word, count))
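# The driver above expects `mapper` and `reducer` callables (and `read_data`) that
# are defined elsewhere in the original module. As a hedged illustration only, with
# hypothetical names example_mapper/example_reducer and assumed signatures, a
# word-count pair producing (word, count) tuples could look like:
def example_mapper(text):
    # Emit a (word, 1) pair for every whitespace-separated token.
    return [(word.lower(), 1) for word in text.split()]


def example_reducer(item):
    # Collapse one (word, [1, 1, ...]) group into a (word, total) pair.
    word, occurrences = item
    return (word, sum(occurrences))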
def main(): """ Contributors: - Scot Matson """ # TODO: This needs to be dynamically generated. The data gets destroyed # during runtime - GitHub does not store empty directories. # # Going parallel, we may want to use multiple directories to help # separate data sets, i.e., web1, web2, web3....... etc. data_directory = './web_pages/' # This is the biggest bottleneck in the application # Parallelizing this piece would give a major efficiency boost Crawler.crawl_web('http://www.sjsu.edu', 10) # Build a list of the files we have filepaths = list() for filename in os.listdir(data_directory): filepaths.append(data_directory + filename) map_data = list() for filepath in filepaths: fh = open(filepath, 'r') map_data.append(MapReduce.map(filepaths[0], fh)) fh.close() # Remove files after parsing os.remove(filepath) intermediate_data = dict() # Shuffle data set intermediate_data = MapReduce.unshuffle(map_data) # Reduce should be condensing the data reduced_data = dict() reduced_data = MapReduce.reduce(intermediate_data) # Output results in reverse sequential ordering by value csvfile = open('out/results.csv', 'w') writer = csv.writer(csvfile, delimiter=',') for tag in sorted(reduced_data, key=reduced_data.get, reverse=True): print(tag + ': '+ str(reduced_data[tag])) writer.writerow([tag, reduced_data[tag]]) csvfile.close()
    def __init__(self, input_dir, output_dir, n_mappers, n_reducers):
        MapReduce.__init__(self, input_dir, output_dir, n_mappers, n_reducers)
    name = split_values[0]
    if (name == 'Shubham' or name == 'Dhruv' or name == 'Shantam'):
        emitter((name, int(split_values[1])))


# Define custom reducer function
def udf_reducer(key, values, emitter):
    result = 0
    for v in values:
        result += v
    emitter(key, result / len(values))


# Execute MapReduce job
m, r, f, kill_idx = getConf("data/config_test_score.txt")
f = "data/" + f
mapred = MapReduce(m, r, f, udf_mapper, udf_reducer, kill_idx)

# Sequential test verification
print("Verifying MapReduce results:")

# Read MapReduce results
output_arr = mapred.read_output()
averages_mapred = {}
for line in output_arr:
    name, average = line.rsplit(':', 1)
    averages_mapred[name] = float(average)

# Compute sequential results
averages_seq = {'Shubham': 0, 'Dhruv': 0, 'Shantam': 0}
score_count = {'Shubham': 0, 'Dhruv': 0, 'Shantam': 0}
with open('data/test_scores.txt', 'r') as reader:
    input_data = reader.readlines()
for idx, line in enumerate(input_data):
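# The block above begins partway through `udf_mapper`; its header and the line
# producing `split_values` are not part of this excerpt. A hedged reconstruction,
# for reference only: the colon delimiter and the "Name:score" record format in
# data/test_scores.txt are assumptions, not confirmed by the source.
def udf_mapper_sketch(key, value, emitter):
    split_values = value.split(':')  # assumed record format "Name:score"
    name = split_values[0]
    if name in ('Shubham', 'Dhruv', 'Shantam'):
        emitter((name, int(split_values[1])))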
#!/usr/bin/env python
# -*- coding: utf-8 -*-
## File name: main.py
import sys

sys.path.append('common')
sys.path.append('ub_model/mr')

from mapreduce import MapReduce
from mr_fac import MrFac
from productline_fac import ProductLineFac

if __name__ == '__main__':
    productline = sys.argv[1]
    mrname = sys.argv[2]

    mrfac = MrFac()
    mr = mrfac.getmr(mrname, productline)
    mrrunner = MapReduce()
    mrrunner.execute(mr)
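# Invocation note (hedged): the entry point above takes the product line and the
# MR job name as positional command-line arguments; the placeholders below simply
# reuse the variable names from the script.
#
#     python main.py <productline> <mrname>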
def udf_mapper2(key, value, emitter):
    split_values = value.split(':')
    # Same key so that we can handle all the data in one udf_reducer call
    emitter((0, (split_values[0], float(split_values[1]))))


# Reducer 2: Sort the remaining values and persist the top K
def udf_reducer2(key, values, emitter):
    values = sorted(values, key=lambda x: x[1])
    for v in values[:K]:
        emitter(v[0])


# Execute MapReduce job
m, r, f, kill_idx = getConf("data/config_knn.txt")
f = "data/" + f
mapred = MapReduce(m, r, f, udf_mapper, udf_reducer, kill_idx)
output_path_from_previous_job = os.path.abspath(mapred.OUT_DIR)
mapred2 = MapReduce(m, 1, output_path_from_previous_job, udf_mapper2,
                    udf_reducer2, -1)

# Sequential test verification
print("Verifying MapReduce results:")

# Read MapReduce results
output_arr = mapred2.read_output()
knn_mapred = set()
for line in output_arr:
    knn_mapred.add(line)

# Compute sequential results
knn_seq = set()
with open('data/knn-dataset.txt', 'r') as reader:
    input_data = reader.readlines()
# NOTE: the original excerpt starts partway through this function; the header and
# the initialisation of `output` below are a reconstruction, not part of the source.
def transform_file(filename):
    output = []
    with bz2.BZ2File(filename) as file_:
        for i, line in enumerate(file_):
            text = ujson.loads(line)['body']
            doc = nlp(text)
            out = transform_doc(doc)
            output.append(out)
    print('Done processing file: {}'.format(filename))
    return output


def write_func(items):
    with io.open('/home/newscred/Workspace/extractor/train/output.txt', 'a+',
                 encoding='utf8') as f:
        for x in items:
            try:
                f.write(x)
            except Exception:
                pass


if __name__ == '__main__':
    input_files = glob.glob('train/*.bz2')
    mapper = MapReduce(transform_file, write_func)
    word_counts = mapper(input_files)
    print('Done processing!')
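# `transform_doc` and `nlp` are defined elsewhere in the original module (`nlp` is
# presumably a loaded spaCy pipeline). A hedged placeholder for transform_doc,
# assuming it only serialises a parsed document back to one line of text; the real
# function very likely does more than this:
def transform_doc_sketch(doc):
    return ' '.join(token.text for token in doc) + '\n'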
def main():
    if len(sys.argv) not in [2, 3]:
        print("Usage: master.py <input-url> [chunk-size]")
        sys.exit(1)

    input_url = sys.argv[1]
    rds_host = os.environ["RDS_HOST"]
    rds_port = int(os.environ["RDS_PORT"])
    bucket_url = os.environ["AWS_S3_BUCKET"]
    master_id = os.environ["MASTER_ID"]
    app_name = os.environ["APP_NAME"]

    chunk_size = 25_000_000
    if len(sys.argv) == 3:
        chunk_size = int(sys.argv[2])
        if chunk_size <= 0:
            raise RuntimeError(
                f"Chunk size must be a positive number: got {chunk_size}")

    kube = authenticate_kubernetes()
    bucket_name = s3helper.get_bucket_from_s3_url(bucket_url)
    mr = MapReduce(master_id, kube, RANGES, MAPPER_IMAGE, REDUCER_IMAGE, app_name)

    work_done = False
    state = 0

    # Computing chunk sizes is slow: we want to compute it in the background while
    # spinning up mappers, to allow us to reduce the output of the mappers
    def spawn_mappers():
        for chunk in s3helper.get_chunks(input_url, chunk_size):
            chunk_computer_output.put(chunk)
        chunk_computer_output.close()

    chunk_computer = multiprocessing.Process(target=spawn_mappers)
    chunk_computer_output = multiprocessing.Queue()

    print("Starting to compute chunks")
    chunk_computer.start()

    # Event loop updates state and looks for possible reduction.
    # Terminates when state isn't changed, no reducers are started
    # and there are no running jobs.
    while (work_done or mr.is_active() or chunk_computer.is_alive()
           or not chunk_computer_output.empty()):
        work_done = False
        mr.update_state()
        print(
            f"State {state} - Mappers: [{mr.mappers}] Reducers: [{mr.reducers}]"
        )
        state += 1

        try:
            while True:
                c1, c2 = chunk_computer_output.get(block=False)
                mr.start_mapper(input_url, bucket_url, str(c1), str(c2),
                                ",".join(RANGES))
        except multiprocessing.queues.Empty:
            pass

        # Reduce mappers before other reducers.
        # Logic behind the explicit order is that the results of mappers are not
        # as far along the reduction process, so will need more time to be processed.
        # Termination condition is that there are not enough completed mappers
        # to start a new reducer with AND the mappers which are completed are
        # not the last few.
        while len(mr.mappers.completed) >= NUM_MAPPERS_TO_REDUCERS or (
                len(mr.mappers.running) == 0 and len(mr.mappers.completed) > 0):
            to_reduce, remaining = take_at_most_n(mr.mappers.completed,
                                                  NUM_MAPPERS_TO_REDUCERS)
            mr.mappers.completed = remaining
            for tag in RANGES:
                mr.start_reducer(
                    tag,
                    ",".join(
                        get_s3_url(bucket_url, mapper.metadata.name, tag)
                        for mapper in to_reduce),
                    bucket_url,
                )
            work_done = True

        # Reduce multiple reducers when they are compatible.
        # Termination condition is slightly different because the final completed
        # reducer does not need to be reduced.
        for tag in RANGES:
            while len(mr.reducers[tag].completed) >= NUM_REDUCERS_TO_REDUCERS or (
                    len(mr.reducers[tag].running) == 0
                    and len(mr.reducers[tag].completed) > 1):
                to_reduce, remaining = take_at_most_n(
                    mr.reducers[tag].completed, NUM_REDUCERS_TO_REDUCERS)
                mr.reducers[tag].completed = remaining
                mr.start_reducer(
                    tag,
                    ",".join(
                        get_s3_url(bucket_url, reducer.metadata.name, "")
                        for reducer in to_reduce),
                    bucket_url,
                )
                work_done = True

        time.sleep(EVENT_LOOP_UPDATE_INTERVAL)

    print("Processing reducer outputs")

    # Collect the reducer outputs into a single dictionary
    output = {"word": [], "letter": []}
    for tag in RANGES:
        if len(mr.reducers[tag].completed) < 1:
            # It's valid for the input to contain no letters in a range
            continue
        elif len(mr.reducers[tag].completed) > 1:
            raise RuntimeError(
                f"Expected exactly one reducer for {tag}: got {mr.reducers[tag]}"
            )

        final_reducer_id = mr.reducers[tag].completed[0].metadata.name
        reducer_output = json.loads(
            s3helper.download_file(bucket_name, final_reducer_id).decode())
        output["word"].extend(reducer_output["word"].items())
        output["letter"].extend(reducer_output["letter"].items())

    # Sort outputs: decreasing by frequency, increasing by word
    for r in output:
        output[r].sort(key=lambda x: x[0])
        output[r].sort(key=lambda x: x[1], reverse=True)

    print("Writing results to database")
    write_to_db(rds_host, rds_port, output)
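# `take_at_most_n` is referenced in the event loop above but not shown in this
# excerpt. A hedged sketch of the behaviour the loop relies on (split a list into
# the first n items and the remainder), assuming plain Python lists:
def take_at_most_n_sketch(items, n):
    # Returns (up to n items to reduce now, items left for a later pass).
    return items[:n], items[n:]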
import glob

import numpy as np

from mapreduce import MapReduce

input_images = [f for f in glob.glob("data/LC08*.tif", recursive=True)]
output_image = "data/output.tif"


def map_func(image):
    nir = image[0:1, :, :]
    red = image[2:3, :, :]
    ndvi = (nir - red) / (nir + red + 0.000001)
    ndvi = np.array(ndvi * 1000).astype(np.int32)
    return ndvi


def reduce_func(images):
    return np.max(images, axis=0)


mapreduce = MapReduce(input_images=input_images,
                      output_image=output_image,
                      map_func=map_func,
                      reduce_func=reduce_func)
mapreduce.run()
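# Band-layout note (hedged): `map_func` above indexes each image as a
# (band, row, col) array with NIR in band 0 and red in band 2; how the arrays are
# actually produced is up to the MapReduce implementation, which is not shown here.
# A sketch of loading one Landsat GeoTIFF into that layout with rasterio
# (rasterio is an assumption; the original may read the rasters differently):
import rasterio


def load_bands_sketch(path):
    with rasterio.open(path) as src:
        return src.read()  # numpy array shaped (bands, rows, cols)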
    def __init__(self, np=None):
        MapReduce.__init__(self, backend=backends.ThreadBackend)
    def __init__(self, np=None):
        MapReduce.__init__(self, backend=backends.ProcessBackend)
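# Usage note (hedged): the two constructors above differ only in the backend they
# pass to MapReduce.__init__. The enclosing class names are not shown in this
# excerpt, so ThreadedMapReduce/ProcessMapReduce below are hypothetical stand-ins;
# a thread backend shares memory and suits I/O-bound map functions, while a process
# backend uses separate interpreters and suits CPU-bound ones.
#
#     job = ThreadedMapReduce()   # hypothetical name for the thread-backend class
#     job = ProcessMapReduce()    # hypothetical name for the process-backend class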