Example #1
def run_system_command(cmd,
                       rec,
                       shell=False,
                       env=None,
                       noop=False,
                       raise_on_error=True):
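    """Run cmd via system_command(), record timing and results in rec,
    and optionally raise on a non-zero exit code."""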
    t0 = datetime.datetime.utcnow()
    return_code, output, errors = system_command(
        cmd,
        print_command=True,
        print_output=True,
        raise_on_error=False,
        env=env,
        shell=shell,
        noop=noop,
    )
    t1 = datetime.datetime.utcnow()
    td = t1 - t0
    logging.info('exit_code=%d' % return_code)
    rec['command'] = cmd
    rec['utc_begin'] = t0.isoformat()
    rec['utc_end'] = t1.isoformat()
    rec['elapsed_sec'] = time_duration_to_seconds(td)
    rec['error'] = (return_code != 0)
    rec['exit_code'] = return_code
    rec['command_timed_out'] = (return_code == -1)
    rec['output'] = output
    rec['errors'] = errors
    if not noop and raise_on_error and return_code != 0:
        raise Exception('System command returned %d: %s' % (return_code, cmd))
Example #2
    def configure_environment(self):
        config = self.test_config
        super(SqlTest, self).configure_environment()

        db_type = config['db_type']

        if db_type == 'hawq':
            if config.get('restart_hawq', False):
                system_command('/etc/init.d/hawq stop')
                system_command('/etc/init.d/hawq start')

        elif db_type == 'impala':
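            # Invalidate Impala's metadata cache so newly created tables and partitions are visible.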
            cmd = []
            cmd.extend(['impala-shell'])
            cmd.extend([
                '--impalad',
                '%s:%d' % (config.get('impalad_host', 'localhost'),
                           config.get('impalad_port', 21000))
            ])
            cmd.extend(['--database', self.db_name()])
            cmd.extend(['-q', 'invalidate metadata'])
            system_command(cmd,
                           print_command=True,
                           print_output=True,
                           raise_on_error=True,
                           shell=False)
    def run_mapred_job(self, key_prefix='', raise_on_error=False):
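        """Run the Hadoop command stored under '<key_prefix>hadoop_command' and
        record timing, output, and HDFS counters under the same key prefix."""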
        rec = self.test_config

        # Build environment for command.
        env = None
        hadoop_command_env = rec.get('%shadoop_command_env' % key_prefix)
        if hadoop_command_env:
            env = dict(os.environ)
            env.update(hadoop_command_env)

        t0 = datetime.datetime.utcnow()

        return_code, output, errors = system_command(
            rec['%shadoop_command' % key_prefix],
            print_command=True,
            print_output=True,
            timeout=rec.get('%scommand_timeout_sec' % key_prefix),
            raise_on_error=False,
            shell=False,
            noop=rec.get('%snoop' % key_prefix, False),
            env=env)

        t1 = datetime.datetime.utcnow()
        td = t1 - t0

        rec['%sutc_begin' % key_prefix] = t0.isoformat()
        rec['%sutc_end' % key_prefix] = t1.isoformat()
        rec['%selapsed_sec' % key_prefix] = time_duration_to_seconds(td)
        rec['%serror' % key_prefix] = (return_code != 0)
        rec['%scommand_timed_out' % key_prefix] = (return_code == -1)
        rec['%sexit_code' % key_prefix] = return_code
        rec['%soutput' % key_prefix] = output
        rec['%serrors' % key_prefix] = errors
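        # Extract HDFS byte counters and the YARN job ID from the command's stderr.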
        rec['%sbytes_read_hdfs' % key_prefix] = float(
            regex_first_group('Bytes Read=(.*)',
                              errors,
                              return_on_no_match='nan',
                              search=True))
        rec['%sbytes_written_hdfs' % key_prefix] = float(
            regex_first_group('Bytes Written=(.*)',
                              errors,
                              return_on_no_match='nan',
                              search=True))
        rec['%shadoop_job_id' % key_prefix] = regex_first_group(
            'Running job: (job_[0-9_]+)', errors, search=True)

        if raise_on_error and rec['%serror' % key_prefix]:
            raise Exception('Hadoop job failed')
    def run_mapred_job(self):
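        """Run the configured Hadoop command while collecting metrics, then
        record timing, output, and HDFS counters in the test config."""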
        config = self.test_config

        with self.metrics_collector_context():
            self.start_metrics()

            # Build environment for command.
            env = None
            hadoop_command_env = config.get('hadoop_command_env')
            if hadoop_command_env:
                env = dict(os.environ)
                env.update(hadoop_command_env)

            logging.info('*****************************************************************')
            logging.info(config['test_desc'])
            
            t0 = datetime.datetime.utcnow()

            exit_code, output, errors = system_command(
                config['hadoop_command'],
                print_command=True,
                print_output=True,
                raise_on_error=False,
                shell=False,
                noop=config['noop'],
                env=env,
                timeout=config.get('command_timeout_sec', None))

            t1 = datetime.datetime.utcnow()
            td = t1 - t0

            config['utc_begin'] = t0.isoformat()
            config['utc_end'] = t1.isoformat()
            config['elapsed_sec'] = time_duration_to_seconds(td)
            config['error'] = (exit_code != 0)
            config['command_timed_out'] = (exit_code == -1)
            config['exit_code'] = exit_code
            config['output'] = output
            config['errors'] = errors

            config['bytes_read_hdfs'] = float(regex_first_group('Bytes Read=(.*)', errors, return_on_no_match='nan', search=True))
            config['bytes_written_hdfs'] = float(regex_first_group('Bytes Written=(.*)', errors, return_on_no_match='nan', search=True))
            config['hadoop_job_id'] = regex_first_group('Running job: (job_[0-9_]+)', errors, search=True)

            self.get_completed_job_info()
    def run_test(self):
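        """Run the OpenMessaging benchmark in the Kubernetes driver pod and
        collect its results and logs into the test record."""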
        rec = self.test_config

        self.deploy()

        git_commit = subprocess.run(['git', 'log', '--oneline', '-1'],
                                    capture_output=True,
                                    check=True).stdout.decode()

        test_uuid = rec['test_uuid']
        driver = rec['driver']
        workload = rec['workload']
        numWorkers = rec['numWorkers']
        localWorker = rec['localWorker']
        namespace = rec['namespace']

        params = {
            'test_uuid': test_uuid,
            'utc_begin': rec['utc_begin'],
            'driver': driver,
            'workload': workload,
            'numWorkers': numWorkers,
            'git_commit': git_commit,
        }
        # Encode all parameters in workload name attribute so they get written to the results file.
        workload['name'] = json.dumps(params)
        print(yaml.dump(params, default_flow_style=False))

        driver_file_name = '/tmp/driver-' + test_uuid + '.yaml'
        workload_file_name = '/tmp/workload-' + test_uuid + '.yaml'
        payload_file_name = '/tmp/payload-' + test_uuid + '.data'

        workload['payloadFile'] = payload_file_name

        create_yaml_file(driver, driver_file_name, namespace)
        create_yaml_file(workload, workload_file_name, namespace)

        if localWorker:
            workers_args = ''
        else:
            workers = [
                'http://%s-openmessaging-benchmarking-worker-%d.%s-openmessaging-benchmarking-worker:8080'
                % (namespace, worker_number, namespace)
                for worker_number in range(numWorkers)
            ]
            workers_args = '--workers %s' % ','.join(workers)

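        # Run the benchmark inside the driver pod: create a random payload,
        # run bin/benchmark, then archive the per-test JSON results.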
        cmd = [
            'kubectl', 'exec', '-n', namespace,
            'examples-openmessaging-benchmarking-driver', '--', 'bash', '-c',
            'rm -f /tmp/logs.tar.gz' + ' && dd if=/dev/urandom of=' +
            payload_file_name + ' bs=' + str(workload['messageSize']) +
            ' count=1 status=none' + ' && bin/benchmark --drivers ' +
            driver_file_name + ' ' + workers_args + ' ' + workload_file_name +
            ' && tar -czvf /tmp/logs-' + test_uuid + '.tar.gz *' + test_uuid +
            '*.json' + ' && rm -f ' + payload_file_name
        ]
        rec['_status_node'].set_status('Running command: %s' % str(cmd))

        t0 = datetime.datetime.utcnow()

        return_code, output, errors = system_command(
            cmd,
            print_output=True,
            shell=False,
            timeout=(workload['testDurationMinutes'] + 5) * 60,
            raise_on_error=False,
            noop=rec['noop'],
        )

        t1 = datetime.datetime.utcnow()
        td = t1 - t0

        logging.info('exit_code=%d' % return_code)

        rec['utc_begin'] = t0.isoformat()
        rec['utc_end'] = t1.isoformat()
        rec['elapsed_sec'] = time_duration_to_seconds(td)
        rec['error'] = (return_code != 0)
        rec['exit_code'] = return_code
        rec['command_timed_out'] = (return_code == -1)
        rec['output'] = output
        rec['errors'] = errors

        # Collect logs to store in results.json
        cmd = [
            'kubectl',
            'exec',
            '-n',
            namespace,
            'examples-openmessaging-benchmarking-driver',
            '--',
            'bash',
            '-c',
            'cat *' + test_uuid + '*.json',
        ]
        return_code, results_json, errors = system_command(
            cmd, print_output=False, shell=False, raise_on_error=False)
        rec['omb_results'] = json.load(StringIO(results_json.decode()))

        # Collect and extract logs (outside of results.json) (not required)
        cmd = [
            'kubectl',
            'cp',
            '%s/%s-openmessaging-benchmarking-driver:/tmp/logs-%s.tar.gz' %
            (namespace, namespace, test_uuid),
            'logs/logs-%s.tar.gz' % test_uuid,
        ]
        subprocess.run(cmd, check=True)
        cmd = [
            'tar',
            '-xzvf',
            'logs/logs-%s.tar.gz' % test_uuid,
            '-C',
            'logs',
        ]
        subprocess.run(cmd, check=True)

        rec['run_as_test'] = rec['test']
        if 'record_as_test' in rec:
            rec['test'] = rec['record_as_test']
        if 'result_filename' in rec:
            record_result(rec, rec['result_filename'])
        if rec['command_timed_out']:
            raise TimeoutException()
        if rec['error']:
            raise Exception('Command failed')
def submit_slurm_jobs(args):
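    """Submit one sbatch job per sample to run parabricks_germline_pipeline.py,
    then tail the resulting log files."""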
    if args.batch_uuid is None:
        args.batch_uuid = str(uuid.uuid4())
    logging.info('batch_uuid=%s' % args.batch_uuid)

    log_dir = os.path.join(args.log_dir, args.batch_uuid)
    logging.info('log_dir=%s' % log_dir)

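    # Gather sample records from --sample_id arguments and from any sample ID
    # files (one comma-separated record per line).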
    sample_records = [line.split(',') for line in args.sample_id]
    for sample_id_file_name in args.sample_id_file:
        with open(sample_id_file_name) as f:
            sample_records += [line.rstrip('\n').split(',') for line in f]

    logging.info('sample_records=%s', str(sample_records))

    if args.cancel_jobs:
        cmd = ['scancel', '-u', os.environ['USER']]
        system_command(
            cmd,
            print_command=True,
            print_output=True,
            raise_on_error=True,
            shell=False,
            noop=args.noop,
        )

        # for host in args.host:
        #     cmd = 'docker stop \\$(docker ps -a -q --filter ancestor=parabricks/release:v2.3.2 --format="{{.ID}}")'
        #     ssh('root', host, cmd, raise_on_error=False)

    if not args.noop:
        os.makedirs(log_dir, exist_ok=True)

    flush_caches(args)

    log_files = []
    for sample_rec in sample_records:
        sample_id = sample_rec[0]
        log_file = os.path.join(log_dir, '%s.log' % sample_id)
        log_files += [log_file]
        job_name = '%s__%s' % (sample_id, args.batch_uuid)
        cmd = [
            'sbatch',
            '--gres',
            'gpu:%d' % args.num_gpus,
            '--job-name',
            job_name,
            '--output',
            log_file,
            '--cpus-per-task',
            '%d' % args.num_cpus,
            # '--mem-per-cpu', '%d' % args.mem_per_cpu,
            '--requeue',
        ]
        if len(args.host) == 1:
            cmd += ['--nodelist', ','.join(args.host)]
        cmd += [
            'parabricks_germline_pipeline.py',
            '--sample_id',
            sample_id,
            '--batch_uuid',
            args.batch_uuid,
            '--num_cpus',
            '%d' % args.num_cpus,
            # '--mem_per_cpu', '%d' % args.mem_per_cpu,
        ]
        cmd += args.unknown_args
        return_code, output, errors = system_command(
            cmd,
            print_command=True,
            print_output=True,
            raise_on_error=True,
            shell=False,
            noop=args.noop,
        )

    logging.info('Jobs started. Logging to: %s' % log_dir)
    if not args.noop: subprocess.run(['tail', '-n', '1000', '-F'] + log_files)
Example #7
def run_query(query_config):
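    """Build and run a single query for the configured db_type (hawq, hive, or
    impala), then parse timing and row counts from its output."""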
    rec = query_config
    print_output = rec.get('print_output', True)
    stream_id = rec.get('stream_id', 0)

    rec['db_name'] = rec['db_name'] % rec

    if rec.get('kill_all_yarn_jobs_before_each_query', False):
        kill_all_yarn_jobs()

    rec['query_filename_contents'] = read_file_to_string(rec['query_filename'])

    shell = False
    db_type = rec['db_type']

    # Build query command.

    if db_type == 'hawq':
        cmd = []
        cmd.extend(['psql'])
        cmd.extend(['-v', 'ON_ERROR_STOP=1'])
        cmd.extend(['-d', rec['db_name']])
        cmd.extend(['-tAf', rec['query_filename']])

    elif db_type == 'hive':
        if ('hiveconf:hive.tez.java.opts' not in rec
                and 'java_opts_xmx_ratio' in rec
                and 'hiveconf:hive.tez.container.size' in rec):
            rec['hiveconf:hive.tez.java.opts'] = '-Xmx%dm' % (
                rec['hiveconf:hive.tez.container.size'] *
                rec['java_opts_xmx_ratio'])
        hiveconf = []
        for k, v in rec.items():
            prop = regex_first_group('^hiveconf:(.*)', k)
            if prop:
                hiveconf.extend(['--hiveconf', '"%s=%s"' % (prop, v)])
        cmd = []
        cmd.extend(['hive'])
        cmd.extend(['--database', rec['db_name']])
        cmd.extend(['-f', rec['query_filename']])
        if 'hive_init_file' in rec:
            cmd.extend(['-i', rec['hive_init_file']])
            # Record contents of file in result.
            rec['hive_init_file_contents'] = read_file_to_string(
                rec['hive_init_file'])
        cmd.extend(hiveconf)

    elif db_type == 'impala':
        cmd = []
        cmd.extend(['impala-shell'])
        cmd.extend([
            '--impalad',
            '%s:%d' % (rec.get('impalad_host',
                               'localhost'), rec.get('impalad_port', 21000))
        ])
        cmd.extend(['--database', rec['db_name']])
        cmd.extend(['-f', rec['query_filename']])
        cmd.extend(['-B'])  # turn off pretty printing
        cmd.extend(['-o', '/dev/null'])
        if rec.get('profile_query'):
            cmd.extend(['--show_profiles'])

    else:
        raise ValueError('Unknown db_type: %s' % db_type)

    logging.info('%d: # %s' % (stream_id, ' '.join(cmd)))
    rec['query_command'] = cmd

    t0 = datetime.datetime.utcnow()

    # Run query.

    return_code, output, errors = system_command(cmd,
                                                 print_command=False,
                                                 print_output=print_output,
                                                 timeout=rec.get(
                                                     'command_timeout_sec',
                                                     None),
                                                 raise_on_error=False,
                                                 shell=shell)

    t1 = datetime.datetime.utcnow()
    td = t1 - t0

    rec['utc_begin'] = t0.isoformat()
    rec['utc_end'] = t1.isoformat()
    rec['elapsed_sec'] = time_duration_to_seconds(td)
    rec['error'] = (return_code != 0)
    rec['exit_code'] = return_code
    rec['command_timed_out'] = (return_code == -1)
    rec['output'] = output
    rec['errors'] = errors
    rec['record_type'] = 'query_result'

    # Parse query output to determine elapsed time and rows returned.

    if db_type == 'hive':
        rec['application_id'] = regex_first_group(
            '\\(Executing on YARN cluster with App id (application_.*)\\)$',
            errors,
            return_on_no_match=None,
            search=True,
            flags=re.MULTILINE)

        # Extract actual query duration from stderr text. Note that we must find the last occurrence of 'Time taken'.
        query_elapsed_sec = regex_first_group('Time taken: ([0-9.]+) seconds',
                                              errors,
                                              return_on_no_match='nan',
                                              search=True,
                                              flags=re.MULTILINE,
                                              match_last=True)
        if query_elapsed_sec == 'nan':
            logging.warning('Time taken not returned by command.')
            rec['error'] = True
        rec['query_elapsed_sec'] = float(query_elapsed_sec)
        rec['non_query_elapsed_sec'] = rec['elapsed_sec'] - rec[
            'query_elapsed_sec']

        # Extract row count from stderr text. Note that some queries will not report fetched rows.
        query_rows_returned = regex_first_group('Fetched: ([0-9]+) row',
                                                errors,
                                                return_on_no_match='0',
                                                search=True,
                                                flags=re.MULTILINE)
        rec['query_rows_returned'] = int(query_rows_returned)

        logging.info(
            'error=%d, query_elapsed_sec=%f, non_query_elapsed_sec=%f, query_rows_returned=%d'
            % (rec['error'], rec['query_elapsed_sec'],
               rec['non_query_elapsed_sec'], rec['query_rows_returned']))

    elif db_type == 'impala':
        # Extract actual query duration from stderr text.
        # Fetched 100 row(s) in 0.98s
        query_elapsed_sec = regex_first_group(
            'Fetched [0-9]+ row\\(s\\) in ([0-9.]+)s',
            errors,
            return_on_no_match='nan',
            search=True,
            flags=re.MULTILINE,
            match_last=True)
        if query_elapsed_sec == 'nan':
            logging.warning('Time taken not returned by command.')
            rec['error'] = True
        rec['query_elapsed_sec'] = float(query_elapsed_sec)
        rec['non_query_elapsed_sec'] = rec['elapsed_sec'] - rec[
            'query_elapsed_sec']

        # Extract row count from stderr text. Note that some queries will not report fetched rows.
        query_rows_returned = regex_first_group('Fetched ([0-9]+) row\\(s\\)',
                                                errors,
                                                return_on_no_match='0',
                                                search=True,
                                                flags=re.MULTILINE)
        rec['query_rows_returned'] = int(query_rows_returned)

        logging.info(
            'error=%d, query_elapsed_sec=%f, non_query_elapsed_sec=%f, query_rows_returned=%d'
            % (rec['error'], rec['query_elapsed_sec'],
               rec['non_query_elapsed_sec'], rec['query_rows_returned']))

    else:
        rec['query_elapsed_sec'] = rec['elapsed_sec']
        rec['non_query_elapsed_sec'] = 0.0
        rec['query_rows_returned'] = np.nan

    # Handle errors.

    if rec['error']:
        logging.info('%d: return_code=%d' % (stream_id, return_code))
        if not print_output:
            logging.info('%d: %s' % (stream_id, output))

        if db_type == 'hive':
            # Kill YARN application
            if rec['application_id']:
                kill_yarn_job(rec['application_id'])

    if errors != '':
        if not print_output:
            logging.info('%d: %s' % (stream_id, errors))

    if not rec['error']:
        logging.info('%d: %s: %0.3f seconds' %
                     (stream_id, rec['query_filename'], rec['elapsed_sec']))

    return rec
Example #8
def process_sample(args):
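    """Run the requested Parabricks stages (fq2bam, germline, haplotypecaller,
    deepvariant) for a single sample and record the results."""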
    logging.info('BEGIN')
    record_uuid = str(uuid.uuid4())
    sample_id = args.sample_id
    hostname = socket.gethostname()
    logging.info('record_uuid=%s' % record_uuid)
    logging.info('sample_id=%s' % sample_id)
    logging.info('hostname=%s' % hostname)

    t0 = datetime.datetime.utcnow()

    rec = {}
    rec['batch_uuid'] = args.batch_uuid
    rec['record_uuid'] = record_uuid
    rec['sample_id'] = sample_id
    rec['hostname'] = hostname
    rec['args'] = args.__dict__

    exception = None

    try:
        input_dir = os.path.join(args.input_dir, sample_id)
        output_dir = os.path.join(args.output_dir, sample_id)
        temp_dir = os.path.join(args.temp_dir, sample_id)
        rec['input_dir'] = input_dir
        rec['output_dir'] = output_dir
        rec['temp_dir'] = temp_dir

        logging.debug('input_dir=%s' % input_dir)
        logging.debug('output_dir=%s' % output_dir)
        logging.debug('temp_dir=%s' % temp_dir)

        if not args.noop and os.path.exists(temp_dir): shutil.rmtree(temp_dir)
        os.makedirs(temp_dir, exist_ok=True)
        os.makedirs(output_dir, exist_ok=True)

        # Create copy of Parabricks installation just for this process.
        # If installation directories are used by different processes concurrently, corruption
        # in the Singularity image may occur.
        cmd = [
            'tar', '-xzvf', args.parabricks_install_tgz_file, '-C', temp_dir
        ]
        system_command(
            cmd,
            print_command=True,
            print_output=True,
            raise_on_error=True,
            shell=False,
            noop=args.noop,
        )
        pbrun_file_name = os.path.join(temp_dir, 'parabricks', 'pbrun')
        logging.debug('pbrun_file_name=%s' % pbrun_file_name)
        assert os.path.exists(pbrun_file_name)

        # Slurm sets CUDA_VISIBLE_DEVICES but pbrun requires NVIDIA_VISIBLE_DEVICES.
        env = os.environ.copy()
        cuda_visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES',
                                              '0,1,2,3')
        logging.info('cuda_visible_devices=%s' % cuda_visible_devices)
        num_gpus = len(cuda_visible_devices.split(','))
        logging.info('num_gpus=%d' % num_gpus)
        env['NVIDIA_VISIBLE_DEVICES'] = cuda_visible_devices
        rec['env'] = env
        rec['cuda_visible_devices'] = cuda_visible_devices
        rec['num_gpus'] = num_gpus

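        # Find paired FASTQ input files named <i>_1.fq.gz and <i>_2.fq.gz and record their sizes.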
        fq_pairs = []
        fq_file_sizes = []
        for i in range(args.max_num_fq_pairs):
            pair = []
            for j in range(1, 3):
                filename = os.path.join(input_dir, '%d_%d.fq.gz' % (i, j))
                if os.path.isfile(filename):
                    pair += [filename]
                    fq_file_sizes += [os.path.getsize(filename)]
            if pair:
                fq_pairs += [pair]
        logging.debug('fq_pairs=%s' % str(fq_pairs))
        rec['fq_pairs'] = fq_pairs
        logging.info('fq_file_sizes=%s' % str(fq_file_sizes))
        rec['fq_file_sizes'] = fq_file_sizes

        in_fq_cmd = []
        for i, fq_pair in enumerate(fq_pairs):
            header = '@RG\\tID:%d\\tLB:lib1\\tPL:bar\\tSM:%s\\tPU:%d' % (
                i, sample_id, i)
            in_fq_cmd += ['--in-fq'] + fq_pair + [header]

        logging.debug('in_fq_cmd=%s' % str(in_fq_cmd))

        bam_file_name = os.path.join(output_dir, '%s.bam' % sample_id)
        gvcf_file_name = os.path.join(output_dir, '%s.g.vcf' % sample_id)
        dv_gvcf_file_name = os.path.join(output_dir, '%s_dv.g.vcf' % sample_id)

        if args.fq2bam:
            cmd = [
                pbrun_file_name,
                'fq2bam',
                '--ref',
                os.path.join(args.reference_files_dir,
                             'Homo_sapiens_assembly38.fasta'),
                '--out-bam',
                bam_file_name,
                '--out-recal-file',
                os.path.join(output_dir, '%s.txt' % sample_id),
                '--knownSites',
                os.path.join(
                    args.reference_files_dir,
                    'Mills_and_1000G_gold_standard.indels.hg38.vcf.gz'),
                '--knownSites',
                os.path.join(args.reference_files_dir,
                             'Homo_sapiens_assembly38.dbsnp138.vcf'),
                '--tmp-dir',
                temp_dir,
                '--num-gpus',
                '%d' % num_gpus,
            ]
            cmd += in_fq_cmd
            rec['fq2bam_result'] = {}
            run_system_command(cmd,
                               rec['fq2bam_result'],
                               env=env,
                               noop=args.noop)

        if args.germline:
            cmd = [
                pbrun_file_name,
                'germline',
                '--ref',
                os.path.join(args.reference_files_dir,
                             'Homo_sapiens_assembly38.fasta'),
                '--out-bam',
                bam_file_name,
                '--out-recal-file',
                os.path.join(output_dir, '%s.txt' % sample_id),
                '--knownSites',
                os.path.join(
                    args.reference_files_dir,
                    'Mills_and_1000G_gold_standard.indels.hg38.vcf.gz'),
                '--knownSites',
                os.path.join(args.reference_files_dir,
                             'Homo_sapiens_assembly38.dbsnp138.vcf'),
                '--out-variants',
                gvcf_file_name,
                '--gvcf',
                '--tmp-dir',
                temp_dir,
                '--num-gpus',
                '%d' % num_gpus,
            ]
            cmd += in_fq_cmd
            rec['germline_result'] = {}
            run_system_command(cmd,
                               rec['germline_result'],
                               env=env,
                               noop=args.noop)
            rec['haplotypecaller_gvcf_file_size_bytes'] = os.path.getsize(
                gvcf_file_name)

        rec['bam_file_size_bytes'] = os.path.getsize(bam_file_name)
        logging.debug('bam_file_size_bytes=%d' % rec['bam_file_size_bytes'])

        if args.haplotypecaller:
            cmd = [
                pbrun_file_name,
                'haplotypecaller',
                '--ref',
                os.path.join(args.reference_files_dir,
                             'Homo_sapiens_assembly38.fasta'),
                '--in-bam',
                bam_file_name,
                '--in-recal-file',
                os.path.join(output_dir, '%s.txt' % sample_id),
                '--out-variants',
                gvcf_file_name,
                '--gvcf',
                '--tmp-dir',
                temp_dir,
                '--num-gpus',
                '%d' % num_gpus,
            ]
            rec['haplotypecaller_result'] = {}
            run_system_command(cmd,
                               rec['haplotypecaller_result'],
                               env=env,
                               noop=args.noop)
            rec['haplotypecaller_gvcf_file_size_bytes'] = os.path.getsize(
                gvcf_file_name)

        if args.deepvariant:
            # deepvariant uses the bam output of fq2bam or germline.
            cmd = [
                pbrun_file_name,
                'deepvariant',
                '--ref',
                os.path.join(args.reference_files_dir,
                             'Homo_sapiens_assembly38.fasta'),
                '--in-bam',
                bam_file_name,
                '--out-variants',
                dv_gvcf_file_name,
                '--gvcf',
                '--tmp-dir',
                temp_dir,
                '--num-gpus',
                '%d' % num_gpus,
            ]
            rec['deepvariant_result'] = {}
            run_system_command(cmd,
                               rec['deepvariant_result'],
                               env=env,
                               noop=args.noop)
            rec['deepvariant_gvcf_file_size_bytes'] = os.path.getsize(
                dv_gvcf_file_name)

        if not args.noop and os.path.exists(temp_dir): shutil.rmtree(temp_dir)

    except Exception as e:
        exception = e
        rec['error'] = True

    t1 = datetime.datetime.utcnow()
    td = t1 - t0
    rec['utc_begin'] = t0.isoformat()
    rec['utc_end'] = t1.isoformat()
    rec['elapsed_sec'] = time_duration_to_seconds(td)

    if args.summary_file:
        record_result(rec, args.summary_file)

    logging.info('END')
    if exception: raise exception
    def run_test(self):
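        """Run the TPCx-HS phases (HSGen, HSSort, HSValidate) and compute the
        HSph@SF metric."""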
        rec = self.test_config

        data_size_MB = rec['data_size_MB']
        base_directory = rec['base_directory'] % rec
        test_directory = '%s/TPCx-HS-benchmark' % base_directory
        sort_input_directory = '%s/HSsort-input' % test_directory
        sort_output_directory = '%s/HSsort-output' % test_directory
        validate_output_directory = '%s/HSValidate' % test_directory

        rec['data_size_TB'] = rec['data_size_MB'] / 1e6
        rec['sf'] = rec['data_size_TB']
        rec['error'] = False

        #
        # Build commands
        #

        # HSGen
        rec_size = 100
        recs = int(data_size_MB * 1000.0 * 1000.0 / rec_size)
        cmd = []
        cmd.extend(['hadoop', 'jar', rec['jar'], 'HSGen'])
        cmd.extend(get_hadoop_parameters(rec))
        cmd.extend([str(recs), sort_input_directory])
        rec['hsgen:hadoop_command'] = cmd

        # HSSort
        cmd = []
        cmd.extend(['hadoop', 'jar', rec['jar'], 'HSSort'])
        cmd.extend(get_hadoop_parameters(rec))
        cmd.extend([sort_input_directory, sort_output_directory])
        rec['hssort:hadoop_command'] = cmd

        # HSValidate
        cmd = []
        cmd.extend(['hadoop', 'jar', rec['jar'], 'HSValidate'])
        hsvalidate_config = rec.copy()
        del hsvalidate_config['map_tasks']
        del hsvalidate_config['reduce_tasks']
        cmd.extend(get_hadoop_parameters(hsvalidate_config))
        cmd.extend([sort_output_directory, validate_output_directory])
        rec['hsvalidate:hadoop_command'] = cmd

        for key in [
                'hsgen:hadoop_command', 'hssort:hadoop_command',
                'hsvalidate:hadoop_command'
        ]:
            logging.info('%s: %s' % (key, rec[key]))

        #
        # Prepare for benchmark
        #

        self.hadoop_authenticate()
        self.configure_environment()
        self.delete_hadoop_directory('%s/*' % test_directory)
        system_command(['hadoop', 'fs', '-expunge'],
                       print_command=True,
                       print_output=True,
                       raise_on_error=True,
                       shell=False)
        logging.info('Sleeping for %0.0f seconds' %
                     rec['sleep_after_delete_sec'])
        time.sleep(rec['sleep_after_delete_sec'])

        with self.metrics_collector_context():
            self.start_metrics()

            #
            # Run benchmark
            #

            t0 = datetime.datetime.utcnow()

            try:
                # HSGen
                self.run_mapred_job(key_prefix='hsgen:', raise_on_error=True)
                system_command(
                    ['hdfs', 'dfs', '-ls',
                     '%s/*' % sort_input_directory],
                    print_command=True,
                    print_output=True,
                    raise_on_error=True,
                    shell=False)

                # HSSort
                self.run_mapred_job(key_prefix='hssort:', raise_on_error=True)
                system_command(
                    ['hdfs', 'dfs', '-ls',
                     '%s/*' % sort_output_directory],
                    print_command=True,
                    print_output=True,
                    raise_on_error=True,
                    shell=False)

                # HSValidate
                self.run_mapred_job(key_prefix='hsvalidate:',
                                    raise_on_error=True)
                system_command(
                    ['hdfs', 'dfs', '-ls',
                     '%s/*' % validate_output_directory],
                    print_command=True,
                    print_output=True,
                    raise_on_error=True,
                    shell=False)
            except Exception:
                logging.error('EXCEPTION: %s' % traceback.format_exc())
                rec['error'] = True

            t1 = datetime.datetime.utcnow()
            td = t1 - t0

        rec['elapsed_sec'] = time_duration_to_seconds(td)
        if not rec['error']:
            rec['total_io_rate_MB_per_sec'] = rec['data_size_MB'] / rec[
                'elapsed_sec']
            rec['io_rate_MB_per_sec_per_storage_node'] = rec[
                'total_io_rate_MB_per_sec'] / rec.get('storage_num_nodes',
                                                      float('nan'))
            rec['HSph@SF'] = rec['sf'] / (rec['elapsed_sec'] / 3600.0)
            logging.info('RESULT: elapsed_sec=%f, HSph@SF=%f' %
                         (rec['elapsed_sec'], rec['HSph@SF']))
        self.record_result()
        if rec['error']:
            raise Exception('Test failed')