def run_system_command(cmd, rec, shell=False, env=None, noop=False, raise_on_error=True):
    """Run ``cmd`` through ``system_command`` and record the outcome in ``rec``.

    Args:
        cmd: Command handed unchanged to ``system_command``.
        rec: Mutable mapping updated in place with the keys ``command``,
            ``utc_begin``, ``utc_end``, ``elapsed_sec``, ``error``,
            ``exit_code``, ``command_timed_out``, ``output`` and ``errors``.
        shell: Forwarded to ``system_command``.
        env: Forwarded to ``system_command``.
        noop: Forwarded to ``system_command``; also suppresses the raise below.
        raise_on_error: Raise after recording when the exit code is non-zero.

    Raises:
        Exception: The exit code is non-zero, ``raise_on_error`` is true and
            ``noop`` is false.
    """
    began = datetime.datetime.utcnow()
    # The helper is always told not to raise, so the record is filled in
    # before this function decides whether to raise.
    return_code, output, errors = system_command(
        cmd,
        print_command=True,
        print_output=True,
        raise_on_error=False,
        env=env,
        shell=shell,
        noop=noop,
    )
    ended = datetime.datetime.utcnow()
    logging.info('exit_code=%d' % return_code)

    failed = return_code != 0
    rec.update({
        'command': cmd,
        'utc_begin': began.isoformat(),
        'utc_end': ended.isoformat(),
        'elapsed_sec': time_duration_to_seconds(ended - began),
        'error': failed,
        'exit_code': return_code,
        # An exit code of -1 is what this function reports as a timeout.
        'command_timed_out': return_code == -1,
        'output': output,
        'errors': errors,
    })

    if failed and raise_on_error and not noop:
        raise Exception('System command returned %d: %s' % (return_code, cmd))
def configure_environment(self):
    """Run the parent environment setup, then per-database preparation.

    For ``db_type == 'hawq'`` the HAWQ init script is stopped and started
    when ``restart_hawq`` is set in the test config. For
    ``db_type == 'impala'`` an ``invalidate metadata`` statement is issued
    through ``impala-shell``. Other database types need no extra work here.
    """
    config = self.test_config
    super(SqlTest, self).configure_environment()

    db_type = config['db_type']
    if db_type == 'impala':
        impalad_address = '%s:%d' % (
            config.get('impalad_host', 'localhost'),
            config.get('impalad_port', 21000),
        )
        invalidate_cmd = [
            'impala-shell',
            '--impalad', impalad_address,
            '--database', self.db_name(),
            '-q', 'invalidate metadata',
        ]
        system_command(
            invalidate_cmd,
            print_command=True,
            print_output=True,
            raise_on_error=True,
            shell=False,
        )
    elif db_type == 'hawq' and config.get('restart_hawq', False):
        for action in ('stop', 'start'):
            system_command('/etc/init.d/hawq %s' % action)
def run_mapred_job(self, key_prefix='', raise_on_error=False):
    """Run the Hadoop command stored in the test config and record its result.

    Every config key read or written by this method is prefixed with
    ``key_prefix``, so several jobs can share one ``test_config`` record.

    Args:
        key_prefix: String prepended to every config key used here, e.g.
            ``'hsgen:'`` reads ``'hsgen:hadoop_command'``.
        raise_on_error: When true, raise after recording if the command
            returned a non-zero exit code. Previously this flag was ignored
            and the method always raised on failure.

    Raises:
        Exception: The job failed and ``raise_on_error`` is true.
    """
    rec = self.test_config

    def key(name):
        # Same formatting as the literal '%s<name>' % key_prefix keys.
        return '%s%s' % (key_prefix, name)

    # Build environment for command: overlay the configured variables on a
    # copy of the current process environment.
    env = None
    hadoop_command_env = rec.get(key('hadoop_command_env'))
    if hadoop_command_env:
        env = dict(os.environ)
        env.update(hadoop_command_env)

    t0 = datetime.datetime.utcnow()
    return_code, output, errors = system_command(
        rec[key('hadoop_command')],
        print_command=True,
        print_output=True,
        timeout=rec.get(key('command_timeout_sec')),
        raise_on_error=False,
        shell=False,
        noop=rec.get(key('noop'), False),
        env=env)
    t1 = datetime.datetime.utcnow()
    td = t1 - t0

    rec[key('utc_begin')] = t0.isoformat()
    rec[key('utc_end')] = t1.isoformat()
    rec[key('elapsed_sec')] = time_duration_to_seconds(td)
    rec[key('error')] = (return_code != 0)
    # An exit code of -1 is what this method reports as a timeout.
    rec[key('command_timed_out')] = (return_code == -1)
    rec[key('exit_code')] = return_code
    rec[key('output')] = output
    rec[key('errors')] = errors

    # Job counters and the job id are parsed from the captured ``errors`` text;
    # a missing counter becomes NaN.
    rec[key('bytes_read_hdfs')] = float(
        regex_first_group('Bytes Read=(.*)', errors,
                          return_on_no_match='nan', search=True))
    rec[key('bytes_written_hdfs')] = float(
        regex_first_group('Bytes Written=(.*)', errors,
                          return_on_no_match='nan', search=True))
    rec[key('hadoop_job_id')] = regex_first_group(
        'Running job: (job_[0-9_]+)', errors, search=True)

    if raise_on_error and rec[key('error')]:
        raise Exception('Hadoop job failed')
def run_mapred_job(self):
    """Run ``test_config['hadoop_command']`` and record the result in the config.

    Timing, exit status, captured output, HDFS byte counters and the Hadoop
    job id are written back into ``self.test_config``. This method does not
    raise on a failed command; failure is reported through the ``error`` key.
    """
    config = self.test_config

    # NOTE(review): the original indentation was lost; the whole body is
    # assumed to run inside the metrics collector context - confirm.
    with self.metrics_collector_context():
        self.start_metrics()

        # Overlay configured variables on a copy of the process environment.
        env = None
        extra_env = config.get('hadoop_command_env')
        if extra_env:
            env = dict(os.environ)
            env.update(extra_env)

        logging.info('*****************************************************************')
        logging.info(config['test_desc'])

        started = datetime.datetime.utcnow()
        exit_code, output, errors = system_command(
            config['hadoop_command'],
            print_command=True,
            print_output=True,
            raise_on_error=False,
            shell=False,
            noop=config['noop'],
            env=env,
            timeout=config.get('command_timeout_sec', None),
        )
        finished = datetime.datetime.utcnow()

        config['utc_begin'] = started.isoformat()
        config['utc_end'] = finished.isoformat()
        config['elapsed_sec'] = time_duration_to_seconds(finished - started)
        config.update({
            'error': exit_code != 0,
            # An exit code of -1 is what this method reports as a timeout.
            'command_timed_out': exit_code == -1,
            'exit_code': exit_code,
            'output': output,
            'errors': errors,
        })

        # Counters and the job id are parsed from the captured ``errors`` text.
        bytes_read = regex_first_group(
            'Bytes Read=(.*)', errors, return_on_no_match='nan', search=True)
        config['bytes_read_hdfs'] = float(bytes_read)
        bytes_written = regex_first_group(
            'Bytes Written=(.*)', errors, return_on_no_match='nan', search=True)
        config['bytes_written_hdfs'] = float(bytes_written)
        config['hadoop_job_id'] = regex_first_group(
            'Running job: (job_[0-9_]+)', errors, search=True)

        self.get_completed_job_info()
def run_test(self):
    """Run one OpenMessaging benchmark in the driver pod and record the result.

    Steps: deploy, write driver/workload YAML files, run ``bin/benchmark``
    via ``kubectl exec``, read the JSON results back from the pod, copy and
    unpack the log archive locally, then optionally persist the record.

    Raises:
        TimeoutException: The benchmark command returned exit code -1.
        Exception: The benchmark command returned any other non-zero code.
        subprocess.CalledProcessError: ``git log``, ``kubectl cp`` or ``tar`` failed.
    """
    rec = self.test_config
    self.deploy()

    git_log = subprocess.run(
        ['git', 'log', '--oneline', '-1'], capture_output=True, check=True)
    git_commit = git_log.stdout.decode()

    test_uuid = rec['test_uuid']
    driver = rec['driver']
    workload = rec['workload']
    numWorkers = rec['numWorkers']
    localWorker = rec['localWorker']
    namespace = rec['namespace']
    driver_pod = 'examples-openmessaging-benchmarking-driver'

    params = {
        'test_uuid': test_uuid,
        'utc_begin': rec['utc_begin'],
        'driver': driver,
        'workload': workload,
        'numWorkers': numWorkers,
        'git_commit': git_commit,
    }
    # Encode all parameters in the workload name attribute so they get
    # written to the results file. This must happen before payloadFile is set.
    workload['name'] = json.dumps(params)
    print(yaml.dump(params, default_flow_style=False))

    driver_file_name = '/tmp/driver-' + test_uuid + '.yaml'
    workload_file_name = '/tmp/workload-' + test_uuid + '.yaml'
    payload_file_name = '/tmp/payload-' + test_uuid + '.data'
    workload['payloadFile'] = payload_file_name
    create_yaml_file(driver, driver_file_name, namespace)
    create_yaml_file(workload, workload_file_name, namespace)

    workers_args = ''
    if not localWorker:
        worker_urls = [
            'http://%s-openmessaging-benchmarking-worker-%d.%s-openmessaging-benchmarking-worker:8080'
            % (namespace, worker_number, namespace)
            for worker_number in range(numWorkers)
        ]
        workers_args = '--workers %s' % ','.join(worker_urls)

    # The steps are chained with '&&' so a failing step stops the rest.
    bash_script = ' && '.join([
        'rm -f /tmp/logs.tar.gz',
        'dd if=/dev/urandom of=' + payload_file_name
        + ' bs=' + str(workload['messageSize']) + ' count=1 status=none',
        'bin/benchmark --drivers ' + driver_file_name
        + ' ' + workers_args + ' ' + workload_file_name,
        'tar -czvf /tmp/logs-' + test_uuid + '.tar.gz *' + test_uuid + '*.json',
        'rm -f ' + payload_file_name,
    ])
    kubectl_exec = ['kubectl', 'exec', '-n', namespace, driver_pod, '--', 'bash', '-c']
    cmd = kubectl_exec + [bash_script]

    rec['_status_node'].set_status('Running command: %s' % str(cmd))

    started = datetime.datetime.utcnow()
    return_code, output, errors = system_command(
        cmd,
        print_output=True,
        shell=False,
        # Allow five minutes beyond the configured test duration.
        timeout=(workload['testDurationMinutes'] + 5) * 60,
        raise_on_error=False,
        noop=rec['noop'],
    )
    finished = datetime.datetime.utcnow()
    logging.info('exit_code=%d' % return_code)

    rec['utc_begin'] = started.isoformat()
    rec['utc_end'] = finished.isoformat()
    rec['elapsed_sec'] = time_duration_to_seconds(finished - started)
    rec['error'] = (return_code != 0)
    rec['exit_code'] = return_code
    rec['command_timed_out'] = (return_code == -1)
    rec['output'] = output
    rec['errors'] = errors

    # Collect the benchmark's JSON output to store in results.json.
    # NOTE(review): this runs even when the benchmark failed or noop is set;
    # json.load raises if the pod produced no JSON - confirm this is intended.
    cmd = kubectl_exec + ['cat *' + test_uuid + '*.json']
    return_code, results_json, errors = system_command(
        cmd, print_output=False, shell=False, raise_on_error=False)
    rec['omb_results'] = json.load(StringIO(results_json.decode()))

    # Collect and extract logs (outside of results.json) (not required).
    local_archive = 'logs/logs-%s.tar.gz' % test_uuid
    remote_archive = '%s/%s-openmessaging-benchmarking-driver:/tmp/logs-%s.tar.gz' % (
        namespace, namespace, test_uuid)
    subprocess.run(['kubectl', 'cp', remote_archive, local_archive], check=True)
    subprocess.run(['tar', '-xzvf', local_archive, '-C', 'logs'], check=True)

    rec['run_as_test'] = rec['test']
    if 'record_as_test' in rec:
        rec['test'] = rec['record_as_test']
    if 'result_filename' in rec:
        record_result(rec, rec['result_filename'])

    # Failures are raised only after the record has been persisted.
    if rec['command_timed_out']:
        raise TimeoutException()
    if rec['error']:
        raise Exception('Command failed')
def submit_slurm_jobs(args):
    """Submit one ``sbatch`` job per sample and then follow the job logs.

    Samples come from ``args.sample_id`` and from each file listed in
    ``args.sample_id_file``; each entry is a comma-separated record whose
    first field is the sample id. ``args.batch_uuid`` is generated when it
    is ``None``. With ``args.noop`` set, no log directory is created and the
    final ``tail`` is skipped.
    """
    if args.batch_uuid is None:
        args.batch_uuid = str(uuid.uuid4())
    logging.info('batch_uuid=%s' % args.batch_uuid)

    log_dir = os.path.join(args.log_dir, args.batch_uuid)
    logging.info('log_dir=%s' % log_dir)

    sample_records = [entry.split(',') for entry in args.sample_id]
    for sample_id_file_name in args.sample_id_file:
        with open(sample_id_file_name) as f:
            sample_records.extend(line.rstrip('\n').split(',') for line in f)
    logging.info('sample_records=%s', str(sample_records))

    if args.cancel_jobs:
        # Cancels every Slurm job owned by the current user.
        system_command(
            ['scancel', '-u', os.environ['USER']],
            print_command=True,
            print_output=True,
            raise_on_error=True,
            shell=False,
            noop=args.noop,
        )

    if not args.noop:
        os.makedirs(log_dir, exist_ok=True)

    # NOTE(review): the original indentation was lost; flush_caches is assumed
    # to run regardless of noop - confirm.
    flush_caches(args)

    log_files = []
    for sample_rec in sample_records:
        sample_id = sample_rec[0]
        log_file = os.path.join(log_dir, '%s.log' % sample_id)
        log_files.append(log_file)
        job_name = '%s__%s' % (sample_id, args.batch_uuid)

        sbatch_cmd = [
            'sbatch',
            '--gres', 'gpu:%d' % args.num_gpus,
            '--job-name', job_name,
            '--output', log_file,
            '--cpus-per-task', '%d' % args.num_cpus,
            '--requeue',
        ]
        # Pin the job to a node only when exactly one host was given.
        if len(args.host) == 1:
            sbatch_cmd.extend(['--nodelist', ','.join(args.host)])
        sbatch_cmd.extend([
            'parabricks_germline_pipeline.py',
            '--sample_id', sample_id,
            '--batch_uuid', args.batch_uuid,
            '--num_cpus', '%d' % args.num_cpus,
        ])
        sbatch_cmd.extend(args.unknown_args)

        system_command(
            sbatch_cmd,
            print_command=True,
            print_output=True,
            raise_on_error=True,
            shell=False,
            noop=args.noop,
        )

    logging.info('Jobs started. Logging to: %s' % log_dir)

    if not args.noop:
        # Follow all job logs; tail -F runs until it is interrupted.
        subprocess.run(['tail', '-n', '1000', '-F'] + log_files)
def _record_query_stats(rec, errors, elapsed_pattern, rows_pattern):
    """Parse query duration and row count from ``errors`` and store them in ``rec``."""
    query_elapsed_sec = regex_first_group(
        elapsed_pattern, errors, return_on_no_match='nan', search=True,
        flags=re.MULTILINE, match_last=True)
    if query_elapsed_sec == 'nan':
        logging.warning('Time taken not returned by command.')
        rec['error'] = True
    rec['query_elapsed_sec'] = float(query_elapsed_sec)
    rec['non_query_elapsed_sec'] = rec['elapsed_sec'] - rec['query_elapsed_sec']

    # Some queries do not report fetched rows; those are recorded as 0.
    query_rows_returned = regex_first_group(
        rows_pattern, errors, return_on_no_match='0', search=True,
        flags=re.MULTILINE)
    rec['query_rows_returned'] = int(query_rows_returned)

    logging.info(
        'error=%d, query_elapsed_sec=%f, non_query_elapsed_sec=%f, query_rows_returned=%d'
        % (rec['error'], rec['query_elapsed_sec'],
           rec['non_query_elapsed_sec'], rec['query_rows_returned']))


def run_query(query_config):
    """Run one SQL query file with the configured engine and record the result.

    Args:
        query_config: Mutable mapping describing the query. It is updated in
            place with the command, timing, exit status, captured output and
            parsed statistics, and is also the return value. ``db_type`` must
            be ``'hawq'``, ``'hive'`` or ``'impala'``.

    Returns:
        The same mapping that was passed in.

    Raises:
        ValueError: ``db_type`` is not a supported engine. The previous code
            attempted to raise a plain string, which is a ``TypeError``.
    """
    rec = query_config
    print_output = rec.get('print_output', True)
    stream_id = rec.get('stream_id', 0)

    # The database name may itself be a %-template over the record.
    rec['db_name'] = rec['db_name'] % rec

    if rec.get('kill_all_yarn_jobs_before_each_query', False):
        kill_all_yarn_jobs()

    rec['query_filename_contents'] = read_file_to_string(rec['query_filename'])

    shell = False
    db_type = rec['db_type']

    # Build query command.
    if db_type == 'hawq':
        cmd = [
            'psql',
            '-v', 'ON_ERROR_STOP=1',
            '-d', rec['db_name'],
            '-tAf', rec['query_filename'],
        ]
    elif db_type == 'hive':
        # Derive the Tez JVM heap size from the container size unless it was
        # set explicitly.
        if ('hiveconf:hive.tez.java.opts' not in rec
                and 'java_opts_xmx_ratio' in rec
                and 'hiveconf:hive.tez.container.size' in rec):
            rec['hiveconf:hive.tez.java.opts'] = '-Xmx%dm' % (
                rec['hiveconf:hive.tez.container.size'] * rec['java_opts_xmx_ratio'])
        # Every 'hiveconf:<prop>' key in the record becomes a --hiveconf option.
        hiveconf = []
        for k, v in rec.items():
            prop = regex_first_group('^hiveconf:(.*)', k)
            if prop:
                hiveconf.extend(['--hiveconf', '"%s=%s"' % (prop, v)])
        cmd = [
            'hive',
            '--database', rec['db_name'],
            '-f', rec['query_filename'],
        ]
        if 'hive_init_file' in rec:
            cmd.extend(['-i', rec['hive_init_file']])
            # Record contents of file in result.
            rec['hive_init_file_contents'] = read_file_to_string(
                rec['hive_init_file'])
        cmd.extend(hiveconf)
    elif db_type == 'impala':
        cmd = [
            'impala-shell',
            '--impalad', '%s:%d' % (rec.get('impalad_host', 'localhost'),
                                    rec.get('impalad_port', 21000)),
            '--database', rec['db_name'],
            '-f', rec['query_filename'],
            '-B',  # turn off pretty printing
            '-o', '/dev/null',
        ]
        if rec.get('profile_query'):
            cmd.extend(['--show_profiles'])
    else:
        raise ValueError('Unknown db_type: %s' % db_type)

    logging.info('%d: # %s' % (stream_id, ' '.join(cmd)))
    rec['query_command'] = cmd

    # Run query.
    t0 = datetime.datetime.utcnow()
    return_code, output, errors = system_command(
        cmd,
        print_command=False,
        print_output=print_output,
        timeout=rec.get('command_timeout_sec', None),
        raise_on_error=False,
        shell=shell)
    t1 = datetime.datetime.utcnow()
    td = t1 - t0

    rec['utc_begin'] = t0.isoformat()
    rec['utc_end'] = t1.isoformat()
    rec['elapsed_sec'] = time_duration_to_seconds(td)
    rec['error'] = (return_code != 0)
    rec['exit_code'] = return_code
    # An exit code of -1 is what this function reports as a timeout.
    rec['command_timed_out'] = (return_code == -1)
    rec['output'] = output
    rec['errors'] = errors
    rec['record_type'] = 'query_result'

    # Parse query output to determine elapsed time and rows returned.
    if db_type == 'hive':
        rec['application_id'] = regex_first_group(
            r'\(Executing on YARN cluster with App id (application_.*)\)$',
            errors, return_on_no_match=None, search=True, flags=re.MULTILINE)
        # The last occurrence of 'Time taken' holds the actual query duration.
        _record_query_stats(
            rec, errors,
            elapsed_pattern='Time taken: ([0-9.]+) seconds',
            rows_pattern='Fetched: ([0-9]+) row')
    elif db_type == 'impala':
        # Example line: Fetched 100 row(s) in 0.98s
        _record_query_stats(
            rec, errors,
            elapsed_pattern=r'Fetched [0-9]+ row\(s\) in ([0-9.]+)s',
            rows_pattern=r'Fetched ([0-9]+) row\(s\)')
    else:
        # No engine-specific parsing: the whole wall-clock time is the query time.
        rec['query_elapsed_sec'] = rec['elapsed_sec']
        rec['non_query_elapsed_sec'] = 0.0
        rec['query_rows_returned'] = np.nan

    # Handle errors.
    # NOTE(review): the original indentation was lost; the nesting of the
    # statements below is the most plausible reading - confirm.
    if rec['error']:
        logging.info('%d: return_code=%d' % (stream_id, return_code))
        if not print_output:
            logging.info('%d: %s' % (stream_id, output))
        if db_type == 'hive':
            # Kill YARN application
            if rec['application_id']:
                kill_yarn_job(rec['application_id'])

    if errors != '':
        if not print_output:
            logging.info('%d: %s' % (stream_id, errors))

    if not rec['error']:
        logging.info('%d: %s: %0.3f seconds' %
                     (stream_id, rec['query_filename'], rec['elapsed_sec']))

    return rec
def process_sample(args):
    """Run the enabled Parabricks stages for one sample and record the outcome.

    The stages (``fq2bam``, ``germline``, ``haplotypecaller``, ``deepvariant``)
    are selected by the matching boolean attributes of ``args``. A private copy
    of the Parabricks installation is unpacked into a per-sample temp directory.
    Any exception is held until timing has been recorded and the result has been
    written to ``args.summary_file`` (when set), and is then re-raised.

    NOTE(review): the original indentation was lost; statement nesting below is
    the most plausible reading - confirm.
    """
    logging.info('BEGIN')
    record_uuid = str(uuid.uuid4())
    sample_id = args.sample_id
    hostname = socket.gethostname()
    logging.info('record_uuid=%s' % record_uuid)
    logging.info('sample_id=%s' % sample_id)
    logging.info('hostname=%s' % hostname)

    started = datetime.datetime.utcnow()

    rec = {
        'batch_uuid': args.batch_uuid,
        'record_uuid': record_uuid,
        'sample_id': sample_id,
        'hostname': hostname,
        'args': args.__dict__,
    }
    exception = None

    try:
        input_dir = os.path.join(args.input_dir, sample_id)
        output_dir = os.path.join(args.output_dir, sample_id)
        temp_dir = os.path.join(args.temp_dir, sample_id)
        rec['input_dir'] = input_dir
        rec['output_dir'] = output_dir
        rec['temp_dir'] = temp_dir
        logging.debug('input_dir=%s' % input_dir)
        logging.debug('output_dir=%s' % output_dir)
        logging.debug('temp_dir=%s' % temp_dir)

        # Start from an empty temp directory.
        if not args.noop and os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
        os.makedirs(temp_dir, exist_ok=True)
        os.makedirs(output_dir, exist_ok=True)

        # Create copy of Parabricks installation just for this process.
        # If installation directories are used by different processes
        # concurrently, corruption in the Singularity image may occur.
        system_command(
            ['tar', '-xzvf', args.parabricks_install_tgz_file, '-C', temp_dir],
            print_command=True,
            print_output=True,
            raise_on_error=True,
            shell=False,
            noop=args.noop,
        )
        pbrun_file_name = os.path.join(temp_dir, 'parabricks', 'pbrun')
        logging.debug('pbrun_file_name=%s' % pbrun_file_name)
        assert os.path.exists(pbrun_file_name)

        # Slurm sets CUDA_VISIBLE_DEVICES but pbrun requires NVIDIA_VISIBLE_DEVICES.
        env = os.environ.copy()
        cuda_visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES', '0,1,2,3')
        logging.info('cuda_visible_devices=%s' % cuda_visible_devices)
        num_gpus = len(cuda_visible_devices.split(','))
        logging.info('num_gpus=%d' % num_gpus)
        env['NVIDIA_VISIBLE_DEVICES'] = cuda_visible_devices
        rec['env'] = env
        rec['cuda_visible_devices'] = cuda_visible_devices
        rec['num_gpus'] = num_gpus

        # Discover input files named <i>_1.fq.gz / <i>_2.fq.gz. A "pair" keeps
        # whichever of the two files exist.
        fq_pairs = []
        fq_file_sizes = []
        for i in range(args.max_num_fq_pairs):
            candidates = [
                os.path.join(input_dir, '%d_%d.fq.gz' % (i, j)) for j in (1, 2)
            ]
            pair = [name for name in candidates if os.path.isfile(name)]
            fq_file_sizes.extend(os.path.getsize(name) for name in pair)
            if pair:
                fq_pairs.append(pair)
        logging.debug('fq_pairs=%s' % str(fq_pairs))
        rec['fq_pairs'] = fq_pairs
        logging.info('fq_file_sizes=%s' % str(fq_file_sizes))
        rec['fq_file_sizes'] = fq_file_sizes

        # One --in-fq group per pair, each followed by its read-group header.
        in_fq_cmd = []
        for i, fq_pair in enumerate(fq_pairs):
            header = '@RG\\tID:%d\\tLB:lib1\\tPL:bar\\tSM:%s\\tPU:%d' % (
                i, sample_id, i)
            in_fq_cmd.extend(['--in-fq'] + fq_pair + [header])
        logging.debug('in_fq_cmd=%s' % str(in_fq_cmd))

        bam_file_name = os.path.join(output_dir, '%s.bam' % sample_id)
        gvcf_file_name = os.path.join(output_dir, '%s.g.vcf' % sample_id)
        dv_gvcf_file_name = os.path.join(output_dir, '%s_dv.g.vcf' % sample_id)

        # Paths and options shared by several stages.
        ref_fasta = os.path.join(args.reference_files_dir,
                                 'Homo_sapiens_assembly38.fasta')
        recal_file_name = os.path.join(output_dir, '%s.txt' % sample_id)
        known_sites_args = [
            '--knownSites',
            os.path.join(args.reference_files_dir,
                         'Mills_and_1000G_gold_standard.indels.hg38.vcf.gz'),
            '--knownSites',
            os.path.join(args.reference_files_dir,
                         'Homo_sapiens_assembly38.dbsnp138.vcf'),
        ]
        resource_args = ['--tmp-dir', temp_dir, '--num-gpus', '%d' % num_gpus]

        if args.fq2bam:
            cmd = [
                pbrun_file_name, 'fq2bam',
                '--ref', ref_fasta,
                '--out-bam', bam_file_name,
                '--out-recal-file', recal_file_name,
            ] + known_sites_args + resource_args + in_fq_cmd
            rec['fq2bam_result'] = {}
            run_system_command(cmd, rec['fq2bam_result'], env=env, noop=args.noop)

        if args.germline:
            cmd = [
                pbrun_file_name, 'germline',
                '--ref', ref_fasta,
                '--out-bam', bam_file_name,
                '--out-recal-file', recal_file_name,
            ] + known_sites_args + [
                '--out-variants', gvcf_file_name,
                '--gvcf',
            ] + resource_args + in_fq_cmd
            rec['germline_result'] = {}
            run_system_command(cmd, rec['germline_result'], env=env, noop=args.noop)
            rec['haplotypecaller_gvcf_file_size_bytes'] = os.path.getsize(
                gvcf_file_name)

        rec['bam_file_size_bytes'] = os.path.getsize(bam_file_name)
        logging.debug('bam_file_size_bytes=%d' % rec['bam_file_size_bytes'])

        if args.haplotypecaller:
            cmd = [
                pbrun_file_name, 'haplotypecaller',
                '--ref', ref_fasta,
                '--in-bam', bam_file_name,
                '--in-recal-file', recal_file_name,
                '--out-variants', gvcf_file_name,
                '--gvcf',
            ] + resource_args
            rec['haplotypecaller_result'] = {}
            run_system_command(cmd, rec['haplotypecaller_result'], env=env,
                               noop=args.noop)
            rec['haplotypecaller_gvcf_file_size_bytes'] = os.path.getsize(
                gvcf_file_name)

        if args.deepvariant:
            # deepvariant uses the bam output of fq2bam or germline.
            cmd = [
                pbrun_file_name, 'deepvariant',
                '--ref', ref_fasta,
                '--in-bam', bam_file_name,
                '--out-variants', dv_gvcf_file_name,
                '--gvcf',
            ] + resource_args
            rec['deepvariant_result'] = {}
            run_system_command(cmd, rec['deepvariant_result'], env=env,
                               noop=args.noop)
            rec['deepvariant_gvcf_file_size_bytes'] = os.path.getsize(
                dv_gvcf_file_name)

        # The temp directory is removed only on the success path.
        if not args.noop and os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)

    except Exception as e:
        # Hold the exception so timing and the summary are still written.
        exception = e
        rec['error'] = True

    finished = datetime.datetime.utcnow()
    rec['utc_begin'] = started.isoformat()
    rec['utc_end'] = finished.isoformat()
    rec['elapsed_sec'] = time_duration_to_seconds(finished - started)

    if args.summary_file:
        record_result(rec, args.summary_file)

    logging.info('END')
    if exception:
        raise exception
def run_test(self):
    """Run the TPCx-HS benchmark (HSGen, HSSort, HSValidate) and record the result.

    The three Hadoop commands are built up front and stored in the test
    config under ``hsgen:``, ``hssort:`` and ``hsvalidate:`` prefixed keys.
    The benchmark directory is then cleared and the three jobs run in order,
    each followed by a listing of its output directory. The result is
    recorded whether or not the run succeeded.

    Raises:
        Exception: Any phase failed (``'Test failed'``), raised after the
            result has been recorded.
    """
    rec = self.test_config

    data_size_MB = rec['data_size_MB']
    base_directory = rec['base_directory'] % rec
    test_directory = '%s/TPCx-HS-benchmark' % base_directory
    sort_input_directory = '%s/HSsort-input' % test_directory
    sort_output_directory = '%s/HSsort-output' % test_directory
    validate_output_directory = '%s/HSValidate' % test_directory

    rec['data_size_TB'] = rec['data_size_MB'] / 1e6
    rec['sf'] = rec['data_size_TB']
    rec['error'] = False

    #
    # Build commands
    #

    # HSGen: record count is derived from the data size at 100 bytes per record.
    rec_size = 100
    recs = int(data_size_MB * 1000.0 * 1000.0 / rec_size)
    rec['hsgen:hadoop_command'] = [
        'hadoop', 'jar', rec['jar'], 'HSGen',
        *get_hadoop_parameters(rec),
        str(recs), sort_input_directory,
    ]

    # HSSort
    rec['hssort:hadoop_command'] = [
        'hadoop', 'jar', rec['jar'], 'HSSort',
        *get_hadoop_parameters(rec),
        sort_input_directory, sort_output_directory,
    ]

    # HSValidate: same parameters, minus the map/reduce task counts.
    jar_for_validate = rec['jar']
    hsvalidate_config = rec.copy()
    del hsvalidate_config['map_tasks']
    del hsvalidate_config['reduce_tasks']
    rec['hsvalidate:hadoop_command'] = [
        'hadoop', 'jar', jar_for_validate, 'HSValidate',
        *get_hadoop_parameters(hsvalidate_config),
        sort_output_directory, validate_output_directory,
    ]

    for key in ('hsgen:hadoop_command', 'hssort:hadoop_command',
                'hsvalidate:hadoop_command'):
        logging.info('%s: %s' % (key, rec[key]))

    #
    # Prepare for benchmark
    #

    self.hadoop_authenticate()
    self.configure_environment()
    self.delete_hadoop_directory('%s/*' % test_directory)
    system_command(['hadoop', 'fs', '-expunge'],
                   print_command=True, print_output=True,
                   raise_on_error=True, shell=False)

    logging.info('Sleeping for %0.0f seconds' % rec['sleep_after_delete_sec'])
    time.sleep(rec['sleep_after_delete_sec'])

    phases = (
        ('hsgen:', sort_input_directory),
        ('hssort:', sort_output_directory),
        ('hsvalidate:', validate_output_directory),
    )

    # NOTE(review): the original indentation was lost; the timed run is
    # assumed to sit inside the metrics context and the result handling
    # after it - confirm.
    with self.metrics_collector_context():
        self.start_metrics()

        #
        # Run benchmark
        #

        started = datetime.datetime.utcnow()
        try:
            for key_prefix, listed_directory in phases:
                self.run_mapred_job(key_prefix=key_prefix, raise_on_error=True)
                system_command(
                    ['hdfs', 'dfs', '-ls', '%s/*' % listed_directory],
                    print_command=True, print_output=True,
                    raise_on_error=True, shell=False)
        except BaseException:
            # Catches everything, exactly like a bare except, so that any
            # interruption is logged and flagged as an error.
            logging.error('EXCEPTION: %s' % traceback.format_exc())
            rec['error'] = True
        finished = datetime.datetime.utcnow()
        rec['elapsed_sec'] = time_duration_to_seconds(finished - started)

    if not rec['error']:
        rec['total_io_rate_MB_per_sec'] = rec['data_size_MB'] / rec['elapsed_sec']
        # Without a configured node count the per-node rate is NaN.
        rec['io_rate_MB_per_sec_per_storage_node'] = (
            rec['total_io_rate_MB_per_sec']
            / rec.get('storage_num_nodes', float('nan')))
        rec['HSph@SF'] = rec['sf'] / (rec['elapsed_sec'] / 3600.0)
        logging.info('RESULT: elapsed_sec=%f, HSph@SF=%f' %
                     (rec['elapsed_sec'], rec['HSph@SF']))

    self.record_result()
    if rec['error']:
        raise Exception('Test failed')