Ejemplo n.º 1
0
def distribute_shuffle(machine):
    """Run step ``3`` of the slave script on ``machine`` over ssh.

    The remote command is ``python3 /tmp/asenet/slave.py 3``, executed as
    user ``asenet`` with ssh strict host key checking turned off.  The
    command line is handed to ``cmd_bash`` with ``wait=False`` and a
    timeout of 1200.

    Args:
        machine: Host name (or address) of the slave to contact.

    Returns:
        A ``(cmd_bash result, machine)`` tuple, so that the caller can tell
        which host a given result belongs to.
    """
    # NOTE(review): "3" is presumably the shuffle-distribution step of
    # slave.py, judging by this function's name -- confirm against slave.py.
    remote_cmd = (
        f"ssh -o 'StrictHostKeyChecking=no' asenet@{machine}"
        " python3 /tmp/asenet/slave.py 3"
    )
    exit_status = cmd_bash(remote_cmd, wait=False, timeout=1200)
    return exit_status, machine
Ejemplo n.º 2
0
def launch_reduce(machine):
    """Run step ``2`` of the slave script on ``machine`` and collect its output.

    The remote command ``python3 /tmp/asenet/slave.py 2`` is executed over
    ssh as user ``asenet`` through ``cmd_bash(..., result=True)``.  Both the
    command line and the raw ``cmd_bash`` answer are logged at debug level.

    Args:
        machine: Host name (or address) of the slave to contact.

    Returns:
        A ``(first item of the cmd_bash answer, machine, second item of the
        cmd_bash answer)`` tuple.  Presumably these items are the return
        code and the captured output of the remote command -- confirm
        against ``cmd_bash``.
    """
    ssh_line = f"ssh asenet@{machine} python3 /tmp/asenet/slave.py 2"
    logging.debug(ssh_line)

    outcome = cmd_bash(ssh_line, result=True)
    # Pick both fields apart before logging, like the answer is consumed
    # by the caller: status first, payload second.
    exit_status, payload = outcome[0], outcome[1]
    logging.debug(f"reduce done on machine {machine}  RETURN, {outcome}")

    return exit_status, machine, payload
Ejemplo n.º 3
0
def launch_map(machine, file):
    """Run step ``0`` of the slave script on ``machine`` for one split file.

    Executes ``python3 /tmp/asenet/slave.py 0 <file>`` over ssh as user
    ``asenet`` through ``cmd_bash``.

    Args:
        machine: Host name (or address) of the slave to contact.
        file: Name of the split file, passed verbatim as the last argument
            of the remote command.

    Returns:
        A ``(cmd_bash result, machine, file)`` tuple.
    """
    # NOTE(review): "0" is presumably the map step of slave.py, judging by
    # this function's name -- confirm against slave.py.
    ssh_line = f"ssh asenet@{machine} python3 /tmp/asenet/slave.py 0 {file}"
    return cmd_bash(ssh_line), machine, file
Ejemplo n.º 4
0
def launch_shuffle(machine, machines):
    """Run step ``1`` of the slave script on ``machine``, giving it ``machines``.

    The remote command is
    ``python3 /tmp/asenet/slave.py 1 "<machines>"``, wrapped in single
    quotes and executed over ssh as user ``asenet`` through
    ``cmd_bash(..., wait=True)``.  The full command line is logged at debug
    level first.

    Args:
        machine: Host name (or address) of the slave to contact.
        machines: Value interpolated between the double quotes of the remote
            command (its ``str()`` form is what ends up on the command line).

    Returns:
        A ``(cmd_bash result, machine)`` tuple.
    """
    # NOTE(review): if `machines` is a Python list, its repr contains single
    # quotes that sit inside the single-quoted remote command; slave.py
    # presumably copes with what the shell leaves of it -- verify before
    # touching the quoting.
    ssh_target = f"asenet@{machine}"
    remote_invocation = f'python3 /tmp/asenet/slave.py 1 "{machines}"'
    ssh_line = f"ssh {ssh_target} '{remote_invocation}'"
    logging.debug(ssh_line)

    exit_status = cmd_bash(ssh_line, wait=True)
    return exit_status, machine
Ejemplo n.º 5
0
def main():
    """Drive a complete map / shuffle / reduce run from the master node.

    Command line: ``./master.py [--deploy] file`` where ``file`` is looked up
    under ``input/``.  With ``--deploy``, ``clean.py`` and ``deploy.py`` are
    run first and the job only proceeds when both report a zero return code.

    Steps, in order:

    1. optional clean + deploy;
    2. count words / characters of the input text;
    3. wipe local splits and cut the text into one chunk per machine listed
       in ``/tmp/asenet/machines.txt``;
    4. create the splits folder on every machine, send ``machines.txt`` and
       the split files;
    5. launch map, shuffle and reduce on the machines that received a split;
    6. gather and show the reduce results.

    Each step logs how long it took.  The total logged at the end excludes
    the clean/deploy phase, because the start time is reset once that phase
    has finished.

    Exits with status 1 (``SystemExit``) when the command line is invalid,
    when clean or deploy fails, or when the machine list is empty.
    """
    start = time.time()

    # Validate the command line before doing any work: the file name is
    # mandatory, and it is the *second* argument when --deploy is given.
    # Without this check a missing argument surfaced as an IndexError.
    args = sys.argv[1:]
    deploy_requested = bool(args) and args[0] == '--deploy'
    min_args = 2 if deploy_requested else 1
    if not min_args <= len(args) <= 2:
        print('usage: ./master.py {--deploy} file')
        sys.exit(1)

    logging.debug("test debug")

    ## Launch clean and Deploy (optional)
    if deploy_requested:
        filename = args[1]

        logging.info(" START Clean and deploy")
        logging.debug(" launch Clean")
        cmd_clean = "python3 clean.py"
        return_clean = cmd_bash(cmd_clean,
                                result=True,
                                wait=False,
                                timeout=100)
        logging.debug(return_clean)

        logging.debug(" launch deploy")
        cmd_deploy = "python3 deploy.py"
        return_deploy = cmd_bash(cmd_deploy,
                                 result=True,
                                 wait=False,
                                 timeout=100)
        logging.debug(return_deploy)

        # Index 0 of the cmd_bash answer is treated as the return code.
        # Stop here on failure: carrying on used to crash later on an
        # unbound `filename`, hiding the real cause.
        if return_clean[0] != 0 or return_deploy[0] != 0:
            logging.error(
                f" Clean and deploy failed (clean: {return_clean[0]}, "
                f"deploy: {return_deploy[0]}), aborting")
            sys.exit(1)

        point0 = time.time()
        elapsed = point0 - start
        # Restart the clock so the job timings do not include deployment.
        start = point0
        logging.info(f' Clean and deploy took : {elapsed:.5f} seconds')
    else:
        filename = args[0]

    filepath = f"input/{filename}"

    ## Count how many words
    txt_len, unique_words, number_of_characters = get_txt_words(filepath)
    logging.info(
        f" Text has {number_of_characters} caracters, {txt_len} words, {unique_words} different words"
    )

    ## Remove existing splits
    # NOTE(review): this path is hard-coded while the rest of the function
    # uses `splits_folder` -- presumably the same directory; confirm.
    cmd_del_splits = "rm -rf /tmp/asenet/splits/*"
    return_code_del_splits = cmd_bash(cmd_del_splits)
    if return_code_del_splits == 0:
        logging.info(" Removed splits from local folder")

    ## 1 split per up machine
    with open('/tmp/asenet/machines.txt') as f:
        machines_list = f.read().splitlines()

    logging.info(f' NB machines used : {len(machines_list)}')
    logging.debug('machines up: ' + str(machines_list))

    # An empty list would otherwise end in a ZeroDivisionError just below.
    if not machines_list:
        logging.error(" No machine listed in /tmp/asenet/machines.txt, aborting")
        sys.exit(1)

    # Making as many chunks as machines (adding 10% more due to line breaks
    # not counted by len)
    char_chunk = ceil(number_of_characters / len(machines_list) * 1.10)

    ## Create Splits Files:
    splits = create_splits(filepath, char_chunk, splits_folder)
    logging.debug(f'splits {splits}')
    logging.info(
        f" Created {len(splits)} splits of {char_chunk} char size from text")
    point1 = time.time()
    elapsed = point1 - start
    logging.info(f' Splits Creation took : {elapsed:.5f} secondes')

    ## Assign each split to a machine (round-robin over the machine list)
    split_files = list(
        map(os.path.basename, glob.glob(splits_folder + "*.txt")))
    file_machine_assign = {
        file: machines_list[index % len(machines_list)]
        for index, file in enumerate(split_files)
    }

    ## Create split Dir before scp
    multiprocess(make_dir,
                 machines_list,
                 repeat(splits_folder),
                 action="Create Splits Folder")
    point2 = time.time()
    elapsed = point2 - point1
    logging.info(f' Folder Created in: {elapsed:.5f} secondes')

    ## Send Machine file
    multiprocess(scp_file,
                 repeat(root_folder),
                 file_machine_assign.values(),
                 repeat('machines.txt'),
                 action="Send machines.txt")
    point2b = time.time()
    elapsed = point2b - point2
    logging.info(f' Send machine list: {elapsed:.5f} secondes')

    ## SCP split file
    # The first item of the multiprocess answer is used from here on as the
    # list of machines to run the job on.
    machines_working = multiprocess(scp_file,
                                    repeat(splits_folder),
                                    file_machine_assign.values(),
                                    file_machine_assign.keys(),
                                    action="Splits upload")[0]
    logging.info(f"Sent split {file_machine_assign}")
    logging.debug(f"marchines working {machines_working}")
    point3 = time.time()
    elapsed = point3 - point2b
    logging.info(f' Splits files sent in: {elapsed:.5f} secondes')

    ## Launch map
    multiprocess(launch_map,
                 machines_working,
                 split_files,
                 action="Launch map")
    point4 = time.time()
    elapsed = point4 - point3
    logging.info(f' MAPS FINISHED in: {elapsed:.5f} secondes')

    ## Make and distribute Shuffle files
    multiprocess(launch_shuffle,
                 machines_working,
                 repeat(machines_working),
                 action="Shuffling")
    point5 = time.time()
    elapsed = point5 - point4
    logging.info(f' SHUFFLE FINISHED in : {elapsed:.5f} secondes')

    ## Launch Reduce
    reduce_results = multiprocess(launch_reduce,
                                  machines_working,
                                  action="Reduce")
    logging.debug(f"Reduce results {reduce_results}")
    point7 = time.time()
    elapsed = point7 - point5
    logging.info(f' REDUCE FINISHED in : {elapsed:.5f} secondes')

    ## Gather results
    word_count_final = show_results(reduce_results)
    logging.debug(f"word final {word_count_final}")

    end = time.time()
    logging.info(f"MAP/REDUCE in {(end - start)} s")
    logging.debug("END")