Example #1
 def run(self, num_run):
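     """Run the benchmark num_run times on this cluster: delete_hdfs is reset to 'false'
     after the first iteration so later runs reuse the dataset, and OS/IO errors only
     skip the affected run."""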
     with utils.open_cfg(mode='w') as cfg:
         cfg['out_folders'] = {}
         if 'delete_hdfs' not in cfg['main']:
             cfg['main']['delete_hdfs'] = 'true'
         cfg['main']['num_run'] = str(num_run)
         '''
         sess_file = Path("session.txt")
         session_no = 0
         if sess_file.exists():
             with open("session.txt", 'r') as f:
                 fc = f.read()
                 session_no = int(fc) + 1 if len(fc) > 0 else 0
                 f.close()
         with open("session.txt", 'w') as f:
                 f.write(str(session_no))
                 f.close()
         '''
     for i in range(num_run):
         if self.cluster_id == c.CLUSTER_MAP['spark']:
             print(bold('Experiment ({}/{})'.format(i + 1, num_run)))
         try:
             self.retrieve_nodes()
             with utils.open_cfg(mode='w') as cfg:
                 cfg['main']['iter_num'] = str(i + 1)
             x_run.run_benchmark(self.nodes)
             if i == 0:
                 with utils.open_cfg(mode='w') as cfg:
                     cfg['main']['delete_hdfs'] = 'false'
         except (OSError, IOError) as exc:
             print('ERROR: {}\n\nSkipping Experiment ({}/{})'.format(
                 exc, i + 1, num_run))
def submit(args):
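    # Submit one experiment per JSON file: read BenchmarkName, reset the config (keeping
    # [hdfs]), record the experiment settings and launch num_run runs via run_xspark.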
    cluster_id = c.CLUSTER_MAP['spark']
    num_run = args.num_runs
    reuse_dataset = args.reuse_dataset
    #exp_filepath = args.exp_file_path if args.exp_file_path else "experiment.json"
    exp_filepaths = args.exp_file_paths if args.exp_file_paths else ["experiment.json"]
    for exp_filepath in exp_filepaths:
        exp_file = Path(exp_filepath)
        if exp_file.exists():
            experiment = json.load(open(exp_filepath))
            try:
                benchmark = experiment["BenchmarkName"]
            except KeyError as error:
                print("ERROR:  {} in experiment file: {}".format(error, exp_filepath))
                exit(1) 
        with utils.open_cfg(mode='w') as cfg:
            for s in cfg.sections():
                if s != 'hdfs':
                    cfg.remove_section(s)
            cfg['main'] = {}
            cfg['main']['tool_on_master'] = 'false'
            cfg['main']['experiment_file'] = exp_filepath
            cfg['main']['process_on_server'] = str(c.PROCESS_ON_SERVER)
            cfg['main']['iter_num'] = str(1)
            cfg['submit'] = {}
            cfg[benchmark] = {}
            #cfg[benchmark]['profile_name'] = '{}'.format(c.VAR_PAR_MAP[benchmark]['profile_name'])
            if reuse_dataset:
                cfg['main']['delete_hdfs'] = str(not reuse_dataset)
        print(bold('Submit experiment {} performing {} runs for benchmark {} on cluster {}'
                   .format(exp_filepath, num_run, benchmark, cluster_id)))
        run_xspark(current_cluster='spark', num_instance=0, num_run=num_run,
                   cluster_id=cluster_id, run=1, terminate=0, reboot=0)
def run_xspark(current_cluster, num_instance=c.NUM_INSTANCE, num_run=c.NUM_RUN, cluster_id=c.CLUSTER_ID, terminate=c.TERMINATE,
               run=c.RUN, reboot=c.REBOOT, assume_yes=False):
    """ Main function;
    * Launch NUMINSTANCE virtual machines
    * Run DagSymb Application
    * Download Log
    * Plot data from log
    """
    print(header('run_xspark(num_instance={}, num_run={}, cluster_id={},terminate={}, run={}, reboot={})'
          .format(num_instance, num_run, cluster_id, terminate, run, reboot)))
    # get cfg_file and initialize main settings
    with utils.open_cfg(mode='w') as cfg:
        if 'main' not in cfg:
            cfg['main'] = {}
        cfg.set('main', 'current_cluster', current_cluster)
        if 'tool_on_master' not in cfg['main']:
            cfg.set('main', 'tool_on_master', 'false')

    bench_instance = BenchInstanceFactory.get_bench_instance(c.PROVIDER, cluster_id)
    setup_ok = True

    if num_instance > 0:
        setup_ok = bench_instance.setup(num_instance, assume_yes)

    if reboot:
        bench_instance.reboot()

    if setup_ok and run:
        bench_instance.run(num_run)

    if terminate:
        bench_instance.terminate()
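# --- Hedged usage sketch (not part of the original source): minimal argparse wiring
# --- that could drive submit() above. The flag names and defaults are assumptions,
# --- not the tool's actual command-line interface.
if __name__ == '__main__':
    import argparse
    sketch_parser = argparse.ArgumentParser(description='xSpark benchmark driver (sketch)')
    sketch_parser.add_argument('--num-runs', dest='num_runs', type=int, default=1)
    sketch_parser.add_argument('--reuse-dataset', dest='reuse_dataset', action='store_true')
    sketch_parser.add_argument('--exp-file-paths', dest='exp_file_paths', nargs='*', default=None)
    submit(sketch_parser.parse_args())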
Example #4
 def run_disabled(self, num_run):
     with utils.open_cfg(mode='w') as cfg:
         cfg['out_folders'] = {}
         cfg['main']['delete_hdfs'] = 'true'
     for i in range(num_run):
         if self.cluster_id == c.CLUSTER_MAP['spark']:
             print(bold('Experiment ({}/{})'.format(i + 1, num_run)))
         try:
             self.retrieve_nodes()
             x_run.run_benchmark(self.nodes)
             if i == 0:
                 with utils.open_cfg(mode='w') as cfg:
                     cfg['main']['delete_hdfs'] = 'false'
         except (OSError, IOError) as exc:
             print('ERROR: {}\n\nSkipping Experiment ({}/{})'.format(
                 exc, i + 1, num_run))
def kill_cluster(cluster):
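    # Tear down the given cluster (terminate only, no run) and clear its section in the config.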
    cluster_id = c.CLUSTER_MAP[cluster]
    print(bold('Terminate {}...'.format(cluster_id)))
    run_xspark(current_cluster=cluster,
               num_instance=0,
               cluster_id=cluster_id,
               run=0,
               terminate=1,
               reboot=0)
    with utils.open_cfg(mode='w') as cfg:
        cfg[cluster] = {}
def setup_application_agnostic(args):
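    # Wipe the config when (re)creating HDFS, then set up either both clusters or just the requested one.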
    cluster = args.cluster
    num_instances = args.num_instances
    assume_yes = args.assume_yes
    if cluster == 'all' or cluster == 'hdfs':
        with utils.open_cfg(mode='w') as cfg:
            for s in cfg.sections():
                cfg.remove_section(s)
    if cluster == 'all':
        setup_cluster('hdfs', num_instances, assume_yes)
        setup_cluster('spark', num_instances, assume_yes)
    else:
        setup_cluster(cluster, num_instances, assume_yes)
def profile(args):
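    # Profile each experiment file: reset the config (keeping [hdfs]), record profiling settings,
    # run the benchmark, then average runs and deploy the profile unless processing happens on the server.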
    cluster_id = c.CLUSTER_MAP['spark']
    num_run = args.num_runs
    reuse_dataset = args.reuse_dataset
    #exp_filepath = args.exp_file_path if args.exp_file_path else "experiment.json"
    exp_filepaths = args.exp_file_paths if args.exp_file_paths else ["experiment.json"]
    num_experiments = len(exp_filepaths)
    spark_seq = args.spark_seq if args.spark_seq else False
    index = 0
    for exp_filepath in exp_filepaths:
        exp_file = Path(exp_filepath)
        index += 1
        if exp_file.exists():
            experiment = json.load(open(exp_filepath))
            try:
                benchmark = experiment["BenchmarkName"]
                #benchmark = experiment["BenchmarkBench"][0]
            except KeyError as error:
                print("ERROR:  {} in experiment file: {}".format(error, exp_filepath))
                exit(1) 
        with utils.open_cfg(mode='w') as cfg:
            for s in cfg.sections():
                if s != 'hdfs':
                    cfg.remove_section(s)
            cfg['main'] = {}
            cfg['main']['tool_on_master'] = 'false'
            cfg['main']['experiment_file'] = exp_filepath
            cfg['main']['process_on_server'] = str(c.PROCESS_ON_SERVER)
            cfg['main']['iter_num'] = str(1) #vboxvm
            cfg['main']['num_experiments'] = str(num_experiments)
            cfg['main']['experiment_num'] = str(index)
            #cfg['main']['cluster_id'] = cluster_id
            cfg['profile'] = {}
            cfg['profile']['spark_seq'] = str(spark_seq)
            cfg[benchmark] = {}
            cfg[benchmark]['profile_name'] = '{}'.format(c.VAR_PAR_MAP[benchmark]['profile_name'])
            if reuse_dataset:
                cfg['main']['delete_hdfs'] = str(not reuse_dataset)
            
        print(bold('Profile experiment {} performing {} runs for benchmark {} on cluster {}'
                   .format(exp_filepath, num_run, benchmark, cluster_id)))
        run_xspark(current_cluster='spark', num_instance=0, num_run=num_run,
                   cluster_id=cluster_id, run=1, terminate=0, reboot=0)
        if not c.PROCESS_ON_SERVER:
            average_runs.main(profile_name=utils.get_cfg()[benchmark]['profile_name'])
            deploy_profile(benchmark, cluster_id)
def launch_symex(args):
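    # Launch a user-specified application (jar, class, arguments) num_run times;
    # optionally run log profiling and time analysis afterwards.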
    cluster_id = c.CLUSTER_MAP['spark']
    app_name = args.app_name
    app_jar = args.app_jar
    app_class = args.app_class
    app_args = args.app_args
    num_run = args.num_runs
    reuse_dataset = args.reuse_dataset
    max_executors = args.max_executors
    num_partitions = args.num_partitions
    arg_string = ''
    with utils.open_cfg(mode='w') as cfg:
        for s in cfg.sections():
            if s != 'hdfs':
                cfg.remove_section(s)
        cfg['main'] = {}
        cfg['main']['profile'] = 'true' if args.profile else 'false'
        cfg['main']['time_analysis'] = 'true' if args.time_analysis else 'false'
        cfg['main']['tool_on_master'] = 'false'
        cfg['main']['app_name'] = '{}'.format(app_name)
        cfg['main']['app_jar'] = '{}'.format(app_jar)
        cfg['main']['app_class'] = '{}'.format(app_class)
        cfg['app_args'] = {}
        app_arg_pos = 0
        for app_arg in app_args:
            cfg['app_args'][str(app_arg_pos)] = '{}'.format(app_arg)
            arg_string += ' {}'.format(app_arg)
            app_arg_pos += 1
        arg_string += ' {}'.format(str(num_partitions))
        cfg['main']['child_args_string'] = '{}'.format(arg_string)
        cfg['main']['num_partitions'] = str(num_partitions)
        if reuse_dataset:
            cfg['main']['delete_hdfs'] = str(not reuse_dataset)
        if max_executors:
            cfg['main']['max_executors'] = str(max_executors)
    print(bold('Launch {} Experiments for application {} on cluster {} with args: {}'
               .format(num_run, app_name, cluster_id, arg_string)))
    
    run_xspark(current_cluster='spark', num_instance=0, num_run=num_run,
               cluster_id=cluster_id, run=1, terminate=0, reboot=0)
    if not c.PROCESS_ON_SERVER:
        if args.profile:
            run_log_profiling(None)
    if args.time_analysis:
        run_time_analysis(None)
def setup(args):
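    # Record setup options in the config (wiping it for 'all'/'hdfs'), then set up the requested cluster(s).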
    cluster = args.cluster
    num_instances = args.num_instances
    assume_yes = args.assume_yes
    with utils.open_cfg(mode='w') as cfg:
        if cluster == 'all' or cluster == 'hdfs':
            for s in cfg.sections():
                cfg.remove_section(s)
        if 'main' not in cfg:
            cfg['main'] = {}
        cfg.set('main', 'setup', 'true')
        if args.app_dir:
            cfg.set('main', 'appdir', args.app_dir)
    if cluster == 'all':
        setup_cluster('hdfs', num_instances, assume_yes)
        setup_cluster('spark', num_instances, assume_yes)
    else:
        setup_cluster(cluster, num_instances, assume_yes)
def launch_exp(args):
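    # Launch the chosen benchmark once per value of the varied parameter, rebuilding the
    # config each time; optionally profile and analyse timings afterwards.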
    cluster_id = c.CLUSTER_MAP['spark']
    var_par = args.var_par
    bench = args.benchmark
    num_run = args.num_runs
    reuse_dataset = args.reuse_dataset
    max_executors = args.max_executors
    num_partitions = args.num_partitions
    for v in var_par:
        with utils.open_cfg(mode='w') as cfg:
            for s in cfg.sections():
                if s != 'hdfs':
                    cfg.remove_section(s)
            cfg['main'] = {}
            cfg['main']['profile'] = 'true' if args.profile else 'false'
            cfg['main']['time_analysis'] = 'true' if args.time_analysis else 'false'
            cfg['main']['tool_on_master'] = 'false'
            cfg['main']['benchmark'] = bench
            cfg[bench] = {}
            cfg[bench][c.VAR_PAR_MAP[bench]['var_name']] = '{}'.format(v)
            cfg[bench]['num_partitions'] = str(num_partitions)
            if reuse_dataset:
                cfg['main']['delete_hdfs'] = str(not reuse_dataset)
            if max_executors:
                cfg['main']['max_executors'] = str(max_executors)
        print(
            bold(
                'Launch {} Experiments for benchmark {} on cluster {} with {}={}...'
                .format(num_run, bench, cluster_id,
                        c.VAR_PAR_MAP[bench]['var_name'], v)))
        run_xspark(current_cluster='spark',
                   num_instance=0,
                   num_run=num_run,
                   cluster_id=cluster_id,
                   run=1,
                   terminate=0,
                   reboot=0)
        if not c.PROCESS_ON_SERVER:
            if args.profile:
                run_log_profiling(None)
        if args.time_analysis:
            run_time_analysis(None)
def profile_disabled(args):
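    # Disabled/legacy profiling entry point: profile the benchmark for each varied-parameter
    # value, then average the runs and deploy the profile.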
    cluster_id = c.CLUSTER_MAP['spark']
    var_par = args.var_par
    exp_profile_name = args.exp_profile_name if args.exp_profile_name else ""
    benchmark = args.benchmark
    num_run = args.num_runs
    max_executors = args.max_executors
    num_partitions = args.num_partitions
    for v in var_par:
        with utils.open_cfg(mode='w') as cfg:
            cfg['main'] = {}
            cfg['main']['profile'] = 'true'
            cfg['main']['tool_on_master'] = 'false'
            cfg['main']['benchmark'] = benchmark
            cfg['main']['iter_num'] = str(1)  #vboxvm
            cfg[benchmark] = {}
            cfg[benchmark][c.VAR_PAR_MAP[benchmark]['var_name']] = '({}, {})'.format(
                c.VAR_PAR_MAP[benchmark]['default'][0], v)
            cfg[benchmark]['profile_name'] = args.exp_profile_name if args.exp_profile_name \
                else '{}'.format(c.VAR_PAR_MAP[benchmark]['profile_name'])
            cfg[benchmark]['num_partitions'] = str(num_partitions)
            if max_executors:
                cfg['main']['max_executors'] = str(max_executors)
        print(
            bold(
                'Profile {} performing {} runs for benchmark {} on cluster {} with {}={}...'
                .format(exp_profile_name, num_run, benchmark, cluster_id,
                        c.VAR_PAR_MAP[benchmark]['var_name'], v)))
        run_xspark(current_cluster='spark',
                   num_instance=0,
                   num_run=num_run,
                   cluster_id=cluster_id,
                   run=1,
                   terminate=0,
                   reboot=0)
        #profiling.main()
        average_runs.main(
            profile_name=utils.get_cfg()[benchmark]['profile_name'])
        #run_log_profiling(args.local)
        deploy_profile(benchmark, cluster_id)
def run_xspark_disabled(current_cluster,
                        num_instance=c.NUM_INSTANCE,
                        num_run=c.NUM_RUN,
                        cluster_id=c.CLUSTER_ID,
                        terminate=c.TERMINATE,
                        run=c.RUN,
                        reboot=c.REBOOT,
                        assume_yes=False):
    """ Main function;
    * Launch spot request of NUMINSTANCE
    * Run Benchmark
    * Download Log
    * Plot data from log
    """
    print(
        header(
            'run_xspark(num_instance={}, num_run={}, cluster_id={},terminate={}, run={}, reboot={})'
            .format(num_instance, num_run, cluster_id, terminate, run,
                    reboot)))
    # get cfg_file and initialize main settings
    with utils.open_cfg(mode='w') as cfg:
        if 'main' not in cfg:
            cfg['main'] = {}
        cfg.set('main', 'current_cluster', current_cluster)

    bench_instance = BenchInstanceFactory.get_bench_instance(
        c.PROVIDER, cluster_id)
    setup_ok = True

    if num_instance > 0:
        println("setup_ok = bench_instance.setup(num_instance, assume_yes)")

    if reboot:
        println("bench_instance.reboot()")

    if setup_ok and run:
        bench_instance.run(num_run)

    if terminate:
        println("bench_instance.terminate()")
def submit_symex(args):
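    # Submit one symbolic-execution experiment per JSON file: read the application settings,
    # scale its arguments by DataMultiplier, write them to the config and launch the runs.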
    cluster_id = c.CLUSTER_MAP['spark']
    num_run = args.num_runs
    reuse_dataset = args.reuse_dataset
    exp_filepaths = args.exp_file_paths if args.exp_file_paths else ["experiment.json"]
    num_experiments = len(exp_filepaths)
    index = 0
    app_name = ''
    app_jar = ''
    app_class = ''
    guard_evaluator_class = ''
    num_partitions = ''
    app_args = {}
    meta_profile_name = ''
    for exp_filepath in exp_filepaths:
        exp_file = Path(exp_filepath)
        index += 1
        if exp_file.exists():
            experiment = json.load(open(exp_filepath))
            try:
                app_name = experiment["AppName"]
                app_jar = experiment["AppJar"]
                app_class = experiment["AppClass"]
                guard_evaluator_class = experiment["GuardEvaluatorClass"]
                num_partitions = experiment["NumPartitions"]
                app_args = experiment["AppConf"]
                data_multiplier = experiment["DataMultiplier"] if experiment.get("DataMultiplier") else 1
                meta_profile_name = experiment["MetaProfileName"] if experiment.get("MetaProfileName") else meta_profile_name
            except KeyError as error:
                print("ERROR:  {} in experiment file: {}".format(error, exp_filepath))
                exit(1) 
        with utils.open_cfg(mode='w') as cfg:
            for s in cfg.sections():
                if s != 'hdfs':
                    cfg.remove_section(s)
            cfg['main'] = {}
            cfg['main']['app_name'] = app_name
            cfg['main']['app_jar'] = app_jar
            cfg['main']['app_class'] = app_class
            cfg['main']['guard_evaluator_class'] = guard_evaluator_class
            cfg['main']['tool_on_master'] = 'false'
            cfg['main']['experiment_file'] = exp_filepath
            cfg['main']['process_on_server'] = str(c.PROCESS_ON_SERVER)
            cfg['experiment'] = {}
            cfg['experiment']['app_name'] = app_name
            cfg['experiment']['profile_name'] = app_name
            cfg['experiment']['meta_profile_name'] = meta_profile_name
            cfg['app_args'] = {}
            arg_string = ''
            not_to_scale_args = ["pastMonths", "inputFile", "outputFile", "delimiter", "parallelism", "minimumCompressionProgress", "progressCounter"]
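            # Arguments named above are passed through unchanged; every other argument value is
            # multiplied by DataMultiplier before being written to the config and the child argument string.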
            for key_app_arg in sorted(app_args.keys(), key=lambda k: int(k)):
                app_arg_name = '{}'.format(app_args[key_app_arg]["Name"])
                app_arg_val = '{}'.format(app_args[key_app_arg]["Value"]) 
                app_arg_value = app_arg_val if app_arg_name in not_to_scale_args else '{}'.format(int(app_arg_val) * int(data_multiplier)) 
                cfg['app_args']['arg'+key_app_arg+': ' + app_arg_name] = app_arg_value 
                arg_string += ' {}'.format(app_arg_value)
            #arg_string += ' {}'.format(str(num_partitions))
            cfg['main']['child_args_string'] = '{}'.format(arg_string)
            cfg['main']['num_partitions'] = str(num_partitions)
            cfg['main']['iter_num'] = str(1) #vboxvm
            cfg['main']['num_experiments'] = str(num_experiments)
            cfg['main']['experiment_num'] = str(index)
            
            if reuse_dataset:
                cfg['main']['delete_hdfs'] = str(not reuse_dataset)
            
        print(bold('Submit experiment {} performing {} runs for application {} on cluster {}'
                   .format(exp_filepath, num_run, app_name, cluster_id)))
        run_xspark(current_cluster='spark', num_instance=0, num_run=num_run,
                   cluster_id=cluster_id, run=1, terminate=0, reboot=0)
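# --- Hedged example (not from the original source): shape of the experiment JSON that
# --- submit_symex()/profile_symex() expect, inferred from the keys accessed above.
# --- All concrete values below are made up for illustration only.
# {
#   "AppName": "ExampleApp",
#   "AppJar": "example-app.jar",
#   "AppClass": "example.Main",
#   "GuardEvaluatorClass": "example.GuardEvaluator",
#   "NumPartitions": 100,
#   "DataMultiplier": 1,
#   "MetaProfileName": "example-meta",
#   "AppConf": {
#     "0": {"Name": "inputFile", "Value": "input.txt"},
#     "1": {"Name": "numRecords", "Value": "1000"}
#   }
# }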
def profile_symex(args):
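    # Like submit_symex(), but records profiling settings as well; afterwards the per-run
    # profiles are averaged, the DAGs are joined and the (meta-)profiles are deployed.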
    cluster_id = c.CLUSTER_MAP['spark']
    num_run = args.num_runs
    reuse_dataset = args.reuse_dataset
    #exp_filepath = args.exp_file_path if args.exp_file_path else "experiment.json"
    exp_filepaths = args.exp_file_paths if args.exp_file_paths else ["experiment.json"]
    num_experiments = len(exp_filepaths)
    spark_seq = args.spark_seq if args.spark_seq else False
    index = 0
    app_name = ''
    app_jar = ''
    app_class = ''
    guard_evaluator_class = ''
    num_partitions = ''
    app_args = {}
    meta_profile_name = ''
    for exp_filepath in exp_filepaths:
        exp_file = Path(exp_filepath)
        index += 1
        if exp_file.exists():
            experiment = json.load(open(exp_filepath))
            try:
                app_name = experiment["AppName"]
                app_jar = experiment["AppJar"]
                app_class = experiment["AppClass"]
                guard_evaluator_class = experiment["GuardEvaluatorClass"]
                num_partitions = experiment["NumPartitions"]
                app_args = experiment["AppConf"]
                data_multiplier = experiment["DataMultiplier"] if experiment.get("DataMultiplier") else 1
                meta_profile_name = experiment["MetaProfileName"] if experiment.get("MetaProfileName") else meta_profile_name
            except KeyError as error:
                print("ERROR:  {} in experiment file: {}".format(error, exp_filepath))
                exit(1) 
        with utils.open_cfg(mode='w') as cfg:
            for s in cfg.sections():
                if s != 'hdfs':
                    cfg.remove_section(s)
            cfg['main'] = {}
            cfg['main']['app_name'] = app_name
            cfg['main']['app_jar'] = app_jar
            cfg['main']['app_class'] = app_class
            cfg['main']['guard_evaluator_class'] = guard_evaluator_class
            cfg['main']['tool_on_master'] = 'false'
            cfg['main']['experiment_file'] = exp_filepath
            cfg['main']['process_on_server'] = str(c.PROCESS_ON_SERVER)
            cfg['experiment'] = {}
            cfg['experiment']['app_name'] = app_name
            cfg['experiment']['profile_name'] = app_name
            cfg['experiment']['meta_profile_name'] = meta_profile_name
            cfg['app_args'] = {}
            arg_string = ''
            not_to_scale_args = ["pastMonths", "inputFile", "outputFile", "delimiter", "parallelism", "minimumCompressionProgress", "progressCounter"]
            for key_app_arg in sorted(app_args.keys(), key=lambda k: int(k)):
                app_arg_name = '{}'.format(app_args[key_app_arg]["Name"])
                app_arg_val = '{}'.format(app_args[key_app_arg]["Value"]) 
                app_arg_value = app_arg_val if app_arg_name in not_to_scale_args else '{}'.format(int(app_arg_val) * int(data_multiplier)) 
                cfg['app_args']['arg'+key_app_arg+': ' + app_arg_name] = app_arg_value 
                arg_string += ' {}'.format(app_arg_value)
            #arg_string += ' {}'.format(str(num_partitions))
            cfg['main']['child_args_string'] = '{}'.format(arg_string)
            cfg['main']['num_partitions'] = str(num_partitions)
            cfg['main']['iter_num'] = str(1) #vboxvm
            cfg['main']['num_experiments'] = str(num_experiments)
            cfg['main']['experiment_num'] = str(index)
            #cfg['main']['cluster_id'] = cluster_id
            cfg['profile'] = {}
            cfg['profile']['spark_seq'] = str(spark_seq)
            cfg['profile']['profile_name'] = app_name
            cfg['profile']['metaprofile_name'] = meta_profile_name
            
            if reuse_dataset:
                cfg['main']['delete_hdfs'] = str(not reuse_dataset)
                
        print(bold('Profile experiment {} performing {} runs for application {} on cluster {}'
                   .format(exp_filepath, num_run, app_name, cluster_id)))
        run_xspark(current_cluster='spark', num_instance=0, num_run=num_run,
                   cluster_id=cluster_id, run=1, terminate=0, reboot=0)
        if not c.PROCESS_ON_SERVER:
            average_runs.main(profile_name=utils.get_cfg()['experiment']['profile_name'])
            
    join_jsons.join_dags(OUTPUT_DIR)
    #join_jsons.join_dags("spark_log_profiling"+os.sep+"avg_json")
    
    deploy_meta_profile(meta_profile_name, cluster_id, True)
    
    #upload all the normal (non-meta) profiles
    for filename in os.listdir(OUTPUT_DIR):
        profilename = filename.split(os.sep)[-1].split(".")[0]
        profile_fname = filename.split(os.sep)[-1]
        if profilename != meta_profile_name and "collection" not in profilename and profile_fname.split(".")[-1] == "json":
            deploy_meta_profile(profilename, cluster_id)
Example #15
def main(input_dir=INPUT_DIR, json_out_dir=OUTPUT_DIR, reprocess=False):
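    """Parse every Spark event log (app-*) in input_dir together with its driver .dat/.err
    log, build a per-stage profile (durations, record/byte counters, nominal rates, weights)
    and write it as JSON into json_out_dir, moving the processed event log to processed_logs."""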
    processed_dir = os.path.join(ROOT_DIR, 'processed_logs')
    if reprocess:
        input_dir = processed_dir
    make_sure_path_exists(input_dir)
    make_sure_path_exists(processed_dir)
    print(
        "Start log profiling: \ninput_dir:\t{}\nprocessed_dir:\t{}\noutput_dir:\t{}"
        .format(input_dir, processed_dir, json_out_dir))
    log_index = 0
    for log in glob.glob(os.path.join(input_dir, 'app-*')):
        app_name = ""
        is_errfile = False
        app_start_time = 0
        app_end_time = 0
        app_act_start_time = 0
        app_act_end_time = 0
        dat_folder = 'home/ubuntu/dagsymb/num/' + log.split('.')[0].split(
            os.sep)[-1]
        print(ROOT_DIR, os.getcwd(), dat_folder)
        files = os.listdir(dat_folder)
        print("Files: ", files)
        dat_file = ""
        dat_files = [
            x for x in files if x.split('.')[-1] == 'dat' and (
                x.split('.')[-2] == 'app' or x.split('_')[-2] == 'run')
        ]
        dat_file = dat_files[0] if len(dat_files) > 0 else ''
        if dat_file == '':
            err_files = [
                x for x in files if x.split('.')[-1] == 'err'
                and x.split('.')[-2] != 'scheduling-throughput'
            ]
            dat_file = err_files[0] if len(err_files) > 0 else ''
            is_errfile = True
            print("Files .err: ", dat_files)
        dat_filepath = dat_folder + '/' + dat_file
        print(".dat filePath: ", dat_filepath)

        try:
            with utils.open_cfg(rpath=dat_folder) as cfg:
                profile_suffix = cfg['experiment']['profile_name'].split(
                    "-")[-1]
        except Exception:
            profile_suffix = log_index

        stages = []
        last_stage = 0

        # Build stage dictionary
        stage_dict = OrderedDict()
        if ".bz" in log:
            file_open = bz2.BZ2File(log, "r")
        else:
            file_open = open(log)

        with file_open as logfile:
            #print(log)
            start_stage = 0
            if dat_file != '':
                with open(dat_filepath) as dat:
                    start_stage = 1 if contains_generation(dat) else 0

            for line in logfile:
                if ".bz" in log:
                    line = line.decode("utf-8")
                data = json.loads(line)
                try:
                    if data["Event"] == "SparkListenerApplicationStart":
                        app_name = data["App Name"]
                        app_start_time = data["Timestamp"]
                        stage_dict["jobs"] = {}
                        id_symbols = []
                    elif data["Event"] == "SparkListenerApplicationEnd":
                        app_end_time = data["Timestamp"]
                    elif data["Event"] == "SparkListenerStageSubmitted":
                        # print(data)
                        stage = data["Stage Info"]
                        stage_id = int(stage["Stage ID"]) - start_stage
                        if stage_id < 0:
                            continue
                        stages.append(stage_id)
                        if stage_id > last_stage:
                            last_stage = stage_id
                        if stage_id not in stage_dict.keys():
                            stage_dict[stage_id] = {}
                            if stage_id == 0:
                                stage_dict[0]["monocoretotalduration"] = 0
                                stage_dict[0]["totalduration"] = 0
                                stage_dict[0]["actualtotalduration"] = 0
                            stage_dict[stage_id]["duration"] = 0
                            stage_dict[stage_id]["name"] = stage['Stage Name']
                            stage_dict[stage_id]["genstage"] = False
                            #print(stage["Parent IDs"])
                            stage_dict[stage_id]["parentsIds"] = list(
                                map(lambda x: x - start_stage,
                                    stage["Parent IDs"]))
                            stage_dict[stage_id]["nominalrate"] = 0.0
                            stage_dict[stage_id]["weight"] = 0
                            stage_dict[stage_id]["RDDIds"] = {
                                x["RDD ID"]: {
                                    "name": x["Name"],
                                    "callsite": x["Callsite"]
                                }
                                for x in stage["RDD Info"]
                            }
                            stage_dict[stage_id]["skipped"] = False
                            stage_dict[stage_id]["cachedRDDs"] = []
                            stage_dict[stage_id]["numtask"] = 0
                            stage_dict[stage_id]["recordsread"] = 0.0
                            stage_dict[stage_id]["shufflerecordsread"] = 0.0
                            stage_dict[stage_id]["recordswrite"] = 0.0
                            stage_dict[stage_id]["shufflerecordswrite"] = 0.0
                            stage_dict[stage_id]["bytesread"] = 0.0
                            stage_dict[stage_id]["shufflebytesread"] = 0.0
                            stage_dict[stage_id]["byteswrite"] = 0.0
                            stage_dict[stage_id]["shufflebyteswrite"] = 0.0
                            for rdd_info in stage["RDD Info"]:
                                storage_level = rdd_info["Storage Level"]
                                if storage_level["Use Disk"] or storage_level["Use Memory"] or \
                                        storage_level["Deserialized"]:
                                    stage_dict[stage_id]["cachedRDDs"].append(
                                        rdd_info["RDD ID"])
                    elif data["Event"] == "SparkListenerStageCompleted":
                        # print(data)
                        stage_id = data["Stage Info"]["Stage ID"] - start_stage
                        #print(stage_id)
                        if stage_id < 0: continue
                        stage_dict[stage_id]["numtask"] = data["Stage Info"][
                            'Number of Tasks']
                        for acc in data["Stage Info"]["Accumulables"]:
                            if acc["Name"] == "internal.metrics.executorRunTime":
                                stage_dict[stage_id]["monocoreduration"] = int(
                                    acc["Value"])
                                stage_dict[0]["monocoretotalduration"] += int(
                                    acc["Value"])
                            if acc["Name"] == "internal.metrics.input.recordsRead":
                                stage_dict[stage_id]["recordsread"] = acc[
                                    "Value"]
                            if acc["Name"] == "internal.metrics.shuffle.read.recordsRead":
                                stage_dict[stage_id][
                                    "shufflerecordsread"] = acc["Value"]
                            if acc["Name"] == "internal.metrics.output.recordsWrite":
                                stage_dict[stage_id]["recordswrite"] = acc[
                                    "Value"]
                            if acc["Name"] == "internal.metrics.shuffle.write.recordsWritten":
                                stage_dict[stage_id][
                                    "shufflerecordswrite"] = acc["Value"]
                            if acc["Name"] == "internal.metrics.input.bytesRead":
                                stage_dict[stage_id]["bytesread"] = acc[
                                    "Value"]
                            if acc["Name"] == "internal.metrics.shuffle.read.localBytesRead":
                                stage_dict[stage_id]["shufflebytesread"] = acc[
                                    "Value"]
                            if acc["Name"] == "internal.metrics.output.bytesWrite":
                                stage_dict[stage_id]["byteswrite"] = acc[
                                    "Value"]
                            if acc["Name"] == "internal.metrics.shuffle.write.bytesWritten":
                                stage_dict[stage_id][
                                    "shufflebyteswrite"] = acc["Value"]
                except KeyError as e:
                    print(e)

        skipped = []
        if ".bz" in log:
            file_open = bz2.BZ2File(log, "r")
        else:
            file_open = open(log)
        with file_open as logfile:
            for line in logfile:
                if ".bz" in log:
                    line = line.decode("utf-8")
                data = json.loads(line)
                try:
                    if data["Event"] == "SparkListenerJobStart":
                        # print(data)
                        job_id = data["Job ID"]
                        stage_dict["jobs"][job_id] = {}
                        # print(stage_dict["jobs"])
                        id_symb_root = sorted(data["Stage Infos"],
                                              key = lambda k: k["Stage ID"])[-1]["Stage Name"]\
                                              .replace(" at ", "_") + "_"
                        seq = 0
                        while id_symb_root + str(seq) in id_symbols:
                            seq += 1
                        id_symb = id_symb_root + str(seq)
                        id_symbols.append(id_symb)
                        stage_dict["jobs"][job_id]["id-symb"] = id_symb
                        stage_dict["jobs"][job_id]["stages"] = sorted(
                            data["Stage IDs"])

                        for stage in data["Stage Infos"]:
                            stage_id = stage["Stage ID"] - start_stage
                            if stage_id < 0: continue
                            if stage_id not in stage_dict.keys():
                                stage_dict[stage_id] = {}
                                stage_dict[stage_id]["duration"] = 0
                                stage_dict[stage_id]["name"] = stage[
                                    'Stage Name']
                                stage_dict[stage_id]["genstage"] = False
                                stage_dict[stage_id]["parentsIds"] = list(
                                    map(lambda x: x - start_stage,
                                        stage["Parent IDs"]))
                                stage_dict[stage_id]["nominalrate"] = 0.0
                                stage_dict[stage_id]["weight"] = 0
                                stage_dict[stage_id]["RDDIds"] = {
                                    x["RDD ID"]: {
                                        "name": x["Name"],
                                        "callsite": x["Callsite"]
                                    }
                                    for x in stage["RDD Info"]
                                }
                                stage_dict[stage_id]["skipped"] = True
                                stage_dict[stage_id]["cachedRDDs"] = []
                                stage_dict[stage_id]["numtask"] = 0
                                stage_dict[stage_id]["recordsread"] = 0.0
                                stage_dict[stage_id][
                                    "shufflerecordsread"] = 0.0
                                stage_dict[stage_id]["recordswrite"] = 0.0
                                stage_dict[stage_id][
                                    "shufflerecordswrite"] = 0.0
                                stage_dict[stage_id]["bytesread"] = 0.0
                                stage_dict[stage_id]["shufflebytesread"] = 0.0
                                stage_dict[stage_id]["byteswrite"] = 0.0
                                stage_dict[stage_id]["shufflebyteswrite"] = 0.0
                                for rdd_info in stage["RDD Info"]:
                                    storage_level = rdd_info["Storage Level"]
                                    if storage_level["Use Disk"] or storage_level["Use Memory"] or \
                                            storage_level["Deserialized"]:
                                        stage_dict[stage_id][
                                            "cachedRDDs"].append(
                                                rdd_info["RDD ID"])
                                skipped.append(stage_id)
                except KeyError:
                    pass
        stage_dict_key_stages = [k for k in stage_dict.keys() if k != "jobs"]
        # Replace skipped stage id in parents ids based on RDD IDs
        for skipped_id in skipped:
            for stage_id1 in stage_dict_key_stages:  #stage_dict.keys():
                if stage_id1 != skipped_id and stage_dict[skipped_id]["RDDIds"] == \
                        stage_dict[stage_id1]["RDDIds"]:
                    for stage_id2 in stage_dict_key_stages:  #stage_dict.keys():
                        if skipped_id in stage_dict[stage_id2]["parentsIds"]:
                            stage_dict[stage_id2]["parentsIds"].remove(
                                skipped_id)
                            stage_dict[stage_id2]["parentsIds"].append(
                                stage_id1)

        # stage_dict_key_stages = [k for k in stage_dict.keys() if k != "jobs"]
        for stage in stage_dict_key_stages:
            if len(stage_dict[stage]["parentsIds"]) == 0:
                try:
                    cached = list(stage_dict[stage]["cachedRDDs"])
                except KeyError:
                    cached = []
                for i in range(0, stage):
                    try:
                        for rdd in cached:
                            if rdd in stage_dict[i]["cachedRDDs"]:
                                stage_dict[stage]["parentsIds"].append(i)
                                cached.remove(rdd)
                    except KeyError:
                        pass

        #stages = list(stage_dict.keys())
        stages = stage_dict_key_stages
        stages_not_skipped = [s for s in stages if s not in skipped]
        stage_act_start_times = [0] * len(stages)
        stage_act_end_times = [0] * len(stages)

        if dat_file != '':
            fdat = open(dat_filepath)
            #print("fdat: ", fdat)
            with fdat as dat:
                #print(dat)
                for line in dat:
                    tokens = line.split(' ')
                    #print("after line 373")
                    if len(tokens) > 6:
                        if tokens[4] == 'Submitting' and (
                                tokens[5] == 'ResultStage'
                                or tokens[5] == 'ShuffleMapStage') and (
                                    start_stage == 0 or tokens[6] != '0'):
                            date = tokens[0]
                            time = tokens[1]
                            stage_act_start_times[
                                int(tokens[6]) -
                                start_stage] = date_time_to_timestamp_ms(
                                    date, time)
                            if tokens[6] == str(start_stage):
                                app_act_start_time = date_time_to_timestamp_ms(
                                    date, time)

                        if (tokens[4] == 'ResultStage'
                                or tokens[4] == 'ShuffleMapStage'
                            ) and tokens[9] == 'finished' and (
                                start_stage == 0 or tokens[5] != '0'):
                            date = tokens[0]
                            time = tokens[1]
                            stage_act_end_times[
                                int(tokens[5]) -
                                start_stage] = date_time_to_timestamp_ms(
                                    date, time)
                            if tokens[5] == str(last_stage + start_stage):
                                app_act_end_time = date_time_to_timestamp_ms(
                                    date, time)
        else:
            print('_run.dat file not found, no actualdurations calculated')

        sum_of_stages_durations = 0
        for i in stages:
            #if i != "jobs":
            stage_dict[i][
                "duration"] = stage_act_end_times[i] - stage_act_start_times[i]
            sum_of_stages_durations += stage_dict[i]["duration"]

        stage_dict[0]["totalduration"] = sum_of_stages_durations
        stage_dict[0][
            "actualtotalduration"] = app_act_end_time - app_act_start_time
        '''
        # Replace skipped stage id in parents ids based on RDD IDs
        for skipped_id in skipped:
            for stage_id1 in stage_dict.keys():
                if stage_id1 != skipped_id and stage_dict[skipped_id]["RDDIds"] == \
                        stage_dict[stage_id1]["RDDIds"]:
                    for stage_id2 in stage_dict.keys():
                        if skipped_id in stage_dict[stage_id2]["parentsIds"]:
                            stage_dict[stage_id2]["parentsIds"].remove(skipped_id)
                            stage_dict[stage_id2]["parentsIds"].append(stage_id1)

        for stage in stage_dict.keys():
            if len(stage_dict[stage]["parentsIds"]) == 0:
                try:
                    cached = list(stage_dict[stage]["cachedRDDs"])
                except KeyError:
                    None
                for i in range(0, stage):
                    try:
                        for rdd in cached:
                            if rdd in stage_dict[i]["cachedRDDs"]:
                                stage_dict[stage]["parentsIds"].append(i)
                                cached.remove(rdd)
                    except KeyError:
                        None
        '''
        if stage_dict:
            gather_records_rw(stage_dict)
            #print(stage_dict)

            # REPEATER = re.compile(r"(.+?)\1+$")
            # def repeated(s):
            #     match = REPEATER.match(s)
            #     return match.group(1) if match else None
            #
            # # Find iterations
            # lenparent = []
            # for key in stageDict.keys():
            #     lenparent.append(str(len(stageDict[key]['Parent IDs'])))
            # i = 0
            # stage_repeated = None
            # while stage_repeated == None and i < len(lenparent):
            #     stage_repeated = repeated("".join(lenparent[i:]))
            #     i += 1
            # print(i, stage_repeated)

            # def setWeight(key):
            #     for parentid in stageDict[key]['parentsIds']:
            #         w1 = stageDict[key]["weight"] + 1
            #         w2 = stageDict[parentid]["weight"]
            #         stageDict[parentid]["weight"] = max(w1, w2)
            #         setWeight(parentid)
            #
            # # Set weights
            # for key in reversed(stageDict.keys()):
            #     setWeight(key)

            #stage_to_do = len(list(stage_dict.keys())) - len(skipped)
            stage_to_do = len(stage_dict_key_stages) - len(skipped)
            #for stage_id in sorted(stage_dict.keys()):
            for stage_id in sorted(stage_dict_key_stages):
                parent_output = 0
                parent_input = 0
                parent_output_bytes = 0
                parent_input_bytes = 0
                if stage_id not in skipped:
                    stage_dict[stage_id]["weight"] = stage_to_do
                    stage_to_do -= 1
                    for parent_id in stage_dict[stage_id]["parentsIds"]:
                        parent_output += stage_dict[parent_id]["recordswrite"]
                        parent_output += stage_dict[parent_id][
                            "shufflerecordswrite"]
                        parent_input += stage_dict[parent_id]["recordsread"]
                        parent_input += stage_dict[parent_id][
                            "shufflerecordsread"]
                        parent_output_bytes += stage_dict[parent_id][
                            "byteswrite"]
                        parent_output_bytes += stage_dict[parent_id][
                            "shufflebyteswrite"]
                        parent_input_bytes += stage_dict[parent_id][
                            "bytesread"]
                        parent_input_bytes += stage_dict[parent_id][
                            "shufflebytesread"]
                    if parent_output != 0:
                        stage_dict[stage_id]["nominalrate"] = parent_output / (
                            stage_dict[stage_id]["monocoreduration"] / 1000.0)
                        stage_dict[stage_id][
                            "nominalrate_bytes"] = parent_input_bytes / (
                                stage_dict[stage_id]["monocoreduration"] /
                                1000.0)
                    elif parent_input != 0:
                        stage_dict[stage_id]["nominalrate"] = parent_input / (
                            stage_dict[stage_id]["monocoreduration"] / 1000.0)
                        stage_dict[stage_id][
                            "nominalrate_bytes"] = parent_input_bytes / (
                                stage_dict[stage_id]["monocoreduration"] /
                                1000.0)
                    else:
                        stage_input = stage_dict[stage_id][
                            "recordsread"] + stage_dict[stage_id][
                                "shufflerecordsread"]
                        stage_input_bytes = stage_dict[stage_id][
                            "bytesread"] + stage_dict[stage_id][
                                "shufflebytesread"]
                        if stage_input != 0 and stage_input != stage_dict[
                                stage_id]["numtask"]:
                            stage_dict[stage_id][
                                "nominalrate"] = stage_input / (
                                    stage_dict[stage_id]["monocoreduration"] /
                                    1000.0)
                            stage_dict[stage_id][
                                "nominalrate_bytes"] = stage_input_bytes / (
                                    stage_dict[stage_id]["monocoreduration"] /
                                    1000.0)
                        else:
                            stage_output = stage_dict[stage_id][
                                "recordswrite"] + stage_dict[stage_id][
                                    "shufflerecordswrite"]
                            stage_output_bytes = stage_dict[stage_id][
                                "byteswrite"] + stage_dict[stage_id][
                                    "shufflebyteswrite"]
                            stage_dict[stage_id][
                                "nominalrate"] = stage_output / (
                                    stage_dict[stage_id]["monocoreduration"] /
                                    1000.0)
                            stage_dict[stage_id][
                                "nominalrate_bytes"] = stage_output_bytes / (
                                    stage_dict[stage_id]["monocoreduration"] /
                                    1000.0)
                    if stage_dict[stage_id]["nominalrate"] == 0.0:
                        stage_dict[stage_id]["genstage"] = True

            totalduration = stage_dict[0]["monocoretotalduration"]
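            # Blend each stage's topological weight with the ratio of the remaining
            # single-core time to the stage's own single-core duration.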
            #for key in stage_dict.keys():
            for key in stage_dict_key_stages:
                if key not in skipped:
                    old_weight = stage_dict[key]["weight"]
                    stage_dict[key]["weight"] = np.mean([
                        old_weight,
                        totalduration / stage_dict[key]["monocoreduration"]
                    ])
                    totalduration -= stage_dict[key]["monocoreduration"]

            # Create json output
            stage_dict[0]["jobs"] = stage_dict["jobs"]
            stage_dict.pop("jobs")
            '''
            # with open(os.path.join(path, re.sub("[^a-zA-Z0-9.-]", "_", app_name)+"-"+str(log_index)+".json"),
            with open(os.path.join(path, re.sub("[^a-zA-Z0-9.-]", "_", app_name)+".json"),
                      "w") as jsonoutput:
                json.dump(stage_dict, jsonoutput, indent=4, sort_keys=True)
            #log_index += 1
            '''
            '''
            stages = list(stage_dict.keys())
            stages_not_skipped = [s for s in stages if s not in skipped]
            stage_act_start_times = [0] * len(stages)
            stage_act_end_times = [0] * len(stages)

            if dat_file != '':
                fdat = open(dat_filepath)
                #print("fdat: ", fdat)
                with fdat as dat:
                    #print(dat)
                    for line in dat:
                        tokens = line.split(' ')
                        #print("after line 373")
                        if len(tokens) > 6:
                            if tokens[4] == 'Submitting' and (tokens[5] == 'ResultStage' or tokens[5] == 'ShuffleMapStage') and (start_stage == 0 or tokens[6] != '0'):
                                date = tokens[0]
                                time = tokens [1]
                                stage_act_start_times[int(tokens[6]) - start_stage] = date_time_to_timestamp_ms(date, time)
                                if tokens[6] == str(start_stage):
                                    app_act_start_time = date_time_to_timestamp_ms(date, time)

                            if (tokens[4] == 'ResultStage' or tokens[4] == 'ShuffleMapStage') and tokens[9] == 'finished' and (start_stage == 0 or tokens[5] != '0'):
                                date = tokens[0]
                                time = tokens [1]
                                stage_act_end_times[int(tokens[5]) - start_stage] = date_time_to_timestamp_ms(date, time)
                                if tokens[5] == str(last_stage + start_stage):
                                    app_act_end_time = date_time_to_timestamp_ms(date, time)
            else:
                print('_run.dat file not found, no actualdurations calculated')
            
            for i in stages:
                stage_dict[i]["duration"] = stage_act_end_times[i] - stage_act_start_times[i]

            stage_dict[0]["totalduration"] = app_act_end_time - app_act_start_time
            '''
            # create output dir
            log_name = os.path.basename(log)

            output_dir = os.path.join(
                OUTPUT_DIR,
                re.sub("[^a-zA-Z0-9.-]", "_", app_name) + "_" +
                log_name.split("-")[1]) if not json_out_dir else json_out_dir
            make_sure_path_exists(output_dir)
            # Create json output
            datagen_strings = ['datagen', 'scheduling-throughput']
            out_filename = 'app_datagen.json' if any(x in app_name.lower() for x in datagen_strings) \
                else re.sub("[^a-zA-Z0-9.-]", "_", app_name)+"-"+str(profile_suffix)+"_"+log_name.split("-")[1]+ ".json"
            #out_filename = 'app_datagen.json' if any(x in app_name.lower() for x in datagen_strings) else 'app.json'
            print('ROOT_DIR: {}\nAPP_NAME: {}\noutputdir: {}\noutfilename:{}'.
                  format(ROOT_DIR, app_name, output_dir, out_filename))
            with open(os.path.join(output_dir, out_filename),
                      "w") as jsonoutput:
                json.dump(stage_dict, jsonoutput, indent=4, sort_keys=True)
            #os.rename(log, os.path.join(processed_dir, os.path.basename(log_name)))
            os.rename(log, os.path.join(processed_dir, log_name))

        log_index += 1
Example #16
print("MAX_EXECUTOR in process_on_server b4 local assignment: " + str(c.MAX_EXECUTOR))
c.cfg_dict["MaxExecutor"] = c.MAX_EXECUTOR = str(end_index - 1)
print("MAX_EXECUTOR in process_on_server after local assignment: " + str(c.MAX_EXECUTOR))
c.CONFIG_DICT["Control"]["MaxExecutor"] = c.MAX_EXECUTOR
c.cfg_dict["ConfigDict"] = c.CONFIG_DICT        
c.update_config_parms(c)
#print("process_on_server config_instance.cfg_dict: ")
#pp.pprint(c.cfg_dict)
#print("passed to log.download: c.CONFIG_DICT: ")
#pp.pprint(c.CONFIG_DICT)
# DOWNLOAD LOGS
output_folder = log.download(logfolder, [i for i in nodes[:end_index]],
                             master_ip, output_folder, c.CONFIG_DICT)

with open_cfg() as cfg:
    profile = True if 'profile' in cfg else False
    profile_option = cfg.getboolean(
        'main',
        'profile') if 'main' in cfg and 'profile' in cfg['main'] else False
    if profile or profile_option:  # Profiling
        processing.main()  # Profiling
        for filename in os.listdir(
                './spark_log_profiling/output_json/'):  # Profiling
            if output_folder.split("/")[-1].split("-")[-1] in filename:  # Profiling
                shutil.copy('./spark_log_profiling/output_json/' + filename,
                            output_folder + "/" + filename)  # Profiling

run.write_config(output_folder)
Example #17
from drivers.ccglibcloud.ec2spot import set_spot_drivers
from drivers.azurearm.driver import set_azurearm_driver
from util.utils import get_cfg, write_cfg, open_cfg
#import config as c
import pprint
pp = pprint.PrettyPrinter(indent=4)
#from configure import config_instance
import libcloud.common.base

libcloud.common.base.RETRY_FAILED_HTTP_REQUESTS = True

folder = sys.argv[1]
#folder = "home/ubuntu/dagsymb/num/app-20190128162903-0000"

cfg_filename = os.path.join(folder, c.CLUSTERS_CFG_FILENAME)
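# Read the deadline recorded for this run from the cluster config stored alongside the logs.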
with open_cfg(r_path=cfg_filename) as cfg:
    c.CONFIG_DICT["Deadline"] = c.cfg_dict["Deadline"] = c.DEADLINE = int(
        cfg["experiment"]["deadline"])
    c.cfg_dict["ConfigDict"] = c.CONFIG_DICT
    print(cfg_filename, c.CONFIG_DICT["Deadline"])

try:
    print('in plot')
    plot.plot(folder)
except Exception as e:
    print("Plot failed: ", e)
try:
    print('in metrics')
    metrics.compute_metrics(folder)
except Exception as e:
    print("Metrics failed: ", e)