def distribute_shuffle(machine):
    """Run ``slave.py`` with argument ``3`` on ``machine`` over ssh.

    The ssh call disables strict host-key checking and is handed to
    ``cmd_bash`` with ``wait=False`` and ``timeout=1200``.

    Args:
        machine: Host name appended to ``asenet@`` in the ssh target.

    Returns:
        A ``(cmd_bash return value, machine)`` tuple.
    """
    # Single literal: ssh prefix followed by the remote python invocation.
    remote_call = (
        f"ssh -o 'StrictHostKeyChecking=no' asenet@{machine}"
        " python3 /tmp/asenet/slave.py 3"
    )
    return cmd_bash(remote_call, wait=False, timeout=1200), machine
def launch_reduce(machine):
    """Run ``slave.py`` with argument ``2`` on ``machine`` and collect its output.

    ``cmd_bash`` is called with ``result=True``; the first two items of what
    it returns are used as the return code and the result payload.

    Args:
        machine: Host name appended to ``asenet@`` in the ssh target.

    Returns:
        A ``(return code, machine, result)`` tuple.
    """
    ssh_command = f"ssh asenet@{machine} python3 /tmp/asenet/slave.py 2"
    logging.debug(ssh_command)

    outcome = cmd_bash(ssh_command, result=True)
    exit_status, payload = outcome[0], outcome[1]

    logging.debug(f"reduce done on machine {machine} RETURN, {outcome}")
    return exit_status, machine, payload
def launch_map(machine, file):
    """Run ``slave.py`` with arguments ``0`` and ``file`` on ``machine`` over ssh.

    Args:
        machine: Host name appended to ``asenet@`` in the ssh target.
        file: Value passed as the last argument of the remote script.

    Returns:
        A ``(cmd_bash return value, machine, file)`` tuple.
    """
    remote_script = f"python3 /tmp/asenet/slave.py 0 {file}"
    exit_status = cmd_bash(f"ssh asenet@{machine} {remote_script}")
    return exit_status, machine, file
def launch_shuffle(machine, machines):
    """Run ``slave.py`` with argument ``1`` and the machine list on ``machine``.

    ``machines`` is interpolated inside double quotes, and the whole remote
    command is wrapped in single quotes for ssh.

    Args:
        machine: Host name appended to ``asenet@`` in the ssh target.
        machines: Value formatted into the remote command as its last argument.

    Returns:
        A ``(cmd_bash return value, machine)`` tuple.
    """
    # Build the remote part first, then wrap it in single quotes for ssh.
    remote_script = f'python3 /tmp/asenet/slave.py 1 "{machines}"'
    ssh_command = f"ssh asenet@{machine} '{remote_script}'"
    logging.debug(ssh_command)
    return cmd_bash(ssh_command, wait=True), machine
def _parse_master_args(argv):
    """Return ``(deploy, filename)`` parsed from ``argv``, or ``None`` if invalid.

    Accepted forms are ``[prog, file]`` and ``[prog, '--deploy', file]``.
    A trailing extra argument after a plain ``file`` is ignored.
    """
    args = argv[1:]
    if not args or len(args) > 2:
        return None
    if args[0] == '--deploy':
        # --deploy is only valid when followed by the input file name.
        return (True, args[1]) if len(args) == 2 else None
    return False, args[0]


def _run_clean_and_deploy():
    """Run ``clean.py`` then ``deploy.py``; return True if both return code 0."""
    logging.info(" START Clean and deploy")
    logging.debug(" launch Clean")
    return_clean = cmd_bash("python3 clean.py",
                            result=True,
                            wait=False,
                            timeout=100)
    logging.debug(return_clean)
    logging.debug(" launch deploy")
    return_deploy = cmd_bash("python3 deploy.py",
                             result=True,
                             wait=False,
                             timeout=100)
    logging.debug(return_deploy)
    return return_clean[0] == 0 and return_deploy[0] == 0


def main():
    """Drive a full run: optional deploy, split, map, shuffle, reduce.

    Command line: ``./master.py {--deploy} file`` where ``file`` is looked up
    under ``input/``. Each phase is timed and its duration is logged.

    Exits with status 1 when the arguments are invalid, when the clean/deploy
    step reports a non-zero return code, or when the machines file lists no
    machine. Relies on the module-level ``splits_folder`` and ``root_folder``.
    """
    start = time.time()

    # Validate the command line before doing any work, so a missing file
    # argument is reported as a usage error rather than an IndexError.
    parsed = _parse_master_args(sys.argv)
    if parsed is None:
        print('usage: ./master.py {--deploy} file')
        sys.exit(1)
    deploy, filename = parsed
    logging.debug("test debug")

    # Launch clean and deploy (optional)
    if deploy:
        if not _run_clean_and_deploy():
            # Without this guard the run would continue with no usable
            # deployment (and previously crashed on an unbound filename).
            logging.error(" Clean and deploy failed, aborting")
            sys.exit(1)
        point0 = time.time()
        elapsed = point0 - start
        # Restart the clock so the final total excludes deployment time.
        start = point0
        logging.info(f' Clean and deploy took : {elapsed:.5f} seconds')

    filepath = f"input/{filename}"

    # Count how many words
    txt_len, unique_words, number_of_characters = get_txt_words(filepath)
    logging.info(
        f" Text has {number_of_characters} caracters, {txt_len} words, {unique_words} different words"
    )

    # Remove existing splits
    return_code_del_splits = cmd_bash("rm -rf /tmp/asenet/splits/*")
    if return_code_del_splits == 0:
        logging.info(" Removed splits from local folder")

    # 1 split per up machine
    with open('/tmp/asenet/machines.txt') as f:
        machines_list = f.read().splitlines()
    if not machines_list:
        # Avoid a ZeroDivisionError when sizing the chunks below.
        logging.error(" No machine listed in /tmp/asenet/machines.txt")
        sys.exit(1)
    logging.info(f' NB machines used : {len(machines_list)}')
    logging.debug('machines up: ' + str(machines_list))

    # Make as many chunks as machines; the 1.10 factor adds 10% headroom
    # because line breaks are not counted in the character total.
    char_chunk = ceil(number_of_characters / len(machines_list) * 1.10)

    # Create splits files
    splits = create_splits(filepath, char_chunk, splits_folder)
    logging.debug(f'splits {splits}')
    logging.info(
        f" Created {len(splits)} splits of {char_chunk} char size from text")
    point1 = time.time()
    elapsed = point1 - start
    logging.info(f' Splits Creation took : {elapsed:.5f} secondes')

    # Assign each split to a machine, cycling through the machine list
    split_files = list(
        map(os.path.basename, glob.glob(splits_folder + "*.txt")))
    file_machine_assign = {
        file: machines_list[position % len(machines_list)]
        for position, file in enumerate(split_files)
    }

    # Create split dir before scp
    multiprocess(make_dir,
                 machines_list,
                 repeat(splits_folder),
                 action="Create Splits Folder")
    point2 = time.time()
    elapsed = point2 - point1
    logging.info(f' Folder Created in: {elapsed:.5f} secondes')

    # Send machine file
    multiprocess(scp_file,
                 repeat(root_folder),
                 file_machine_assign.values(),
                 repeat('machines.txt'),
                 action="Send machines.txt")
    point2b = time.time()
    elapsed = point2b - point2
    logging.info(f' Send machine list: {elapsed:.5f} secondes')

    # SCP split files; the first item of the result is used as the list of
    # machines for the following phases.
    machines_working = multiprocess(scp_file,
                                    repeat(splits_folder),
                                    file_machine_assign.values(),
                                    file_machine_assign.keys(),
                                    action="Splits upload")[0]
    logging.info(f"Sent split {file_machine_assign}")
    logging.debug(f"marchines working {machines_working}")
    point3 = time.time()
    elapsed = point3 - point2b
    logging.info(f' Splits files sent in: {elapsed:.5f} secondes')

    # Launch map
    multiprocess(launch_map,
                 machines_working,
                 split_files,
                 action="Launch map")
    point4 = time.time()
    elapsed = point4 - point3
    logging.info(f' MAPS FINISHED in: {elapsed:.5f} secondes')

    # Make and distribute shuffle files
    multiprocess(launch_shuffle,
                 machines_working,
                 repeat(machines_working),
                 action="Shuffling")
    point5 = time.time()
    elapsed = point5 - point4
    logging.info(f' SHUFFLE FINISHED in : {elapsed:.5f} secondes')

    # Launch reduce
    reduce_results = multiprocess(launch_reduce,
                                  machines_working,
                                  action="Reduce")
    logging.debug(f"Reduce results {reduce_results}")
    point7 = time.time()
    elapsed = point7 - point5
    logging.info(f' REDUCE FINISHED in : {elapsed:.5f} secondes')

    # Gather results
    word_count_final = show_results(reduce_results)
    logging.debug(f"word final {word_count_final}")

    end = time.time()
    logging.info(f"MAP/REDUCE in {(end - start)} s")
    logging.debug("END")