Example #1
0
def run_system_command(cmd,
                       rec,
                       shell=False,
                       env=None,
                       noop=False,
                       raise_on_error=True):
    """Run ``cmd`` through ``system_command`` and store the outcome in ``rec``.

    The following keys are written to ``rec``: ``command``, ``utc_begin``,
    ``utc_end``, ``elapsed_sec``, ``error``, ``exit_code``,
    ``command_timed_out``, ``output`` and ``errors``.

    Args:
        cmd: Command forwarded unchanged to ``system_command``.
        rec: Mutable mapping that receives the result fields.
        shell: Forwarded to ``system_command``.
        env: Forwarded to ``system_command``.
        noop: Forwarded to ``system_command``; also suppresses the exception
            raised by this function.
        raise_on_error: When true (and ``noop`` is false), raise after
            recording if the exit code is non-zero.

    Raises:
        Exception: The exit code was non-zero, ``raise_on_error`` is true and
            ``noop`` is false.
    """
    began = datetime.datetime.utcnow()
    # The underlying call never raises on a bad exit code, so that the result
    # is always recorded before this function decides whether to raise.
    return_code, output, errors = system_command(
        cmd,
        print_command=True,
        print_output=True,
        raise_on_error=False,
        env=env,
        shell=shell,
        noop=noop,
    )
    ended = datetime.datetime.utcnow()
    elapsed = ended - began
    logging.info('exit_code=%d' % return_code)

    rec['command'] = cmd
    rec['utc_begin'] = began.isoformat()
    rec['utc_end'] = ended.isoformat()
    rec['elapsed_sec'] = time_duration_to_seconds(elapsed)
    failed = return_code != 0
    rec['error'] = failed
    rec['exit_code'] = return_code
    # This code treats an exit code of -1 as the timeout marker.
    rec['command_timed_out'] = (return_code == -1)
    rec['output'] = output
    rec['errors'] = errors

    if noop or not raise_on_error:
        return
    if failed:
        raise Exception('System command returned %d: %s' % (return_code, cmd))
Example #2
0
def run_query_stream(queue, stream_config):
    """Run every query of one stream in order and publish the results.

    For each entry of ``stream_config['query_filenames']`` a per-query record
    is derived from ``stream_config``, passed to ``run_query`` and then put on
    ``queue``. After the last query a ``query_stream_summary`` record is put on
    ``queue`` as well.

    Args:
        queue: Object with a ``put`` method that receives the records.
        stream_config: Mapping with at least ``stream_id`` and
            ``query_filenames``; it is copied, not modified.
    """
    stream_id = stream_config['stream_id']
    logging.info('%d: Stream begin' % stream_id)
    began = datetime.datetime.utcnow()
    any_query_failed = False

    filenames = stream_config['query_filenames']
    for query_index, query_filename in enumerate(filenames):
        logging.info('%d: query_index=%d, query_filename=%s' %
                     (stream_id, query_index, query_filename))
        # Each query gets its own record; the full filename list is replaced
        # by the single query being run.
        query_rec = stream_config.copy()
        query_rec.pop('query_filenames')
        query_rec['query_index'] = query_index
        query_rec['query_filename'] = query_filename
        run_query(query_rec)
        if query_rec['error']:
            any_query_failed = True
        # Publish the query_result record; the comment in the original code
        # says these are collected and recorded by SqlBatchTest.run_test().
        queue.put(query_rec)

    ended = datetime.datetime.utcnow()
    summary = stream_config.copy()
    summary['record_type'] = 'query_stream_summary'
    summary['utc_begin'] = began.isoformat()
    summary['utc_end'] = ended.isoformat()
    summary['elapsed_sec'] = time_duration_to_seconds(ended - began)
    summary['error'] = any_query_failed
    # Publish the query_stream_summary record on the same queue.
    queue.put(summary)

    logging.info('%d: Stream end' % stream_id)
Example #3
0
    def run_mapred_job(self, key_prefix='', raise_on_error=False):
        """Run the configured Hadoop command and record its outcome.

        All configuration keys read from, and result keys written to,
        ``self.test_config`` are prefixed with ``key_prefix``.

        Args:
            key_prefix: Prefix applied to every configuration/result key.
            raise_on_error: Accepted but not consulted by this method; a
                failed command always raises.

        Raises:
            Exception: The command exited with a non-zero code.
        """
        rec = self.test_config
        prefix = '%s' % key_prefix

        # Overlay configured variables on a copy of the process environment.
        extra_env = rec.get(prefix + 'hadoop_command_env')
        env = None
        if extra_env:
            env = dict(os.environ)
            env.update(extra_env)

        began = datetime.datetime.utcnow()
        return_code, output, errors = system_command(
            rec[prefix + 'hadoop_command'],
            print_command=True,
            print_output=True,
            timeout=rec.get(prefix + 'command_timeout_sec'),
            raise_on_error=False,
            shell=False,
            noop=rec.get(prefix + 'noop', False),
            env=env)
        ended = datetime.datetime.utcnow()
        elapsed = ended - began

        failed = return_code != 0
        rec[prefix + 'utc_begin'] = began.isoformat()
        rec[prefix + 'utc_end'] = ended.isoformat()
        rec[prefix + 'elapsed_sec'] = time_duration_to_seconds(elapsed)
        rec[prefix + 'error'] = failed
        # This code treats an exit code of -1 as the timeout marker.
        rec[prefix + 'command_timed_out'] = (return_code == -1)
        rec[prefix + 'exit_code'] = return_code
        rec[prefix + 'output'] = output
        rec[prefix + 'errors'] = errors

        # Counters and the job id are scraped from the command's error output;
        # missing byte counters are stored as NaN.
        bytes_read = regex_first_group('Bytes Read=(.*)',
                                       errors,
                                       return_on_no_match='nan',
                                       search=True)
        rec[prefix + 'bytes_read_hdfs'] = float(bytes_read)
        bytes_written = regex_first_group('Bytes Written=(.*)',
                                          errors,
                                          return_on_no_match='nan',
                                          search=True)
        rec[prefix + 'bytes_written_hdfs'] = float(bytes_written)
        job_id = regex_first_group('Running job: (job_[0-9_]+)',
                                   errors,
                                   search=True)
        rec[prefix + 'hadoop_job_id'] = job_id

        if rec[prefix + 'error']:
            raise Exception('Hadoop job failed')
Example #4
0
    def run_mapred_job(self):
        """Run the configured Hadoop command inside the metrics collector context.

        Reads ``hadoop_command``, ``noop``, ``test_desc`` and the optional
        ``hadoop_command_env`` / ``command_timeout_sec`` from
        ``self.test_config``, then writes timing, exit status, captured output
        and values scraped from the error output back into it. A failed
        command is recorded but does not raise here.
        """
        config = self.test_config

        with self.metrics_collector_context():
            self.start_metrics()

            # Overlay configured variables on a copy of the process environment.
            extra_env = config.get('hadoop_command_env')
            env = None
            if extra_env:
                env = dict(os.environ)
                env.update(extra_env)

            logging.info('*****************************************************************')
            logging.info(config['test_desc'])

            began = datetime.datetime.utcnow()
            exit_code, output, errors = system_command(
                config['hadoop_command'],
                print_command=True,
                print_output=True,
                raise_on_error=False,
                shell=False,
                noop=config['noop'],
                env=env,
                timeout=config.get('command_timeout_sec', None))
            ended = datetime.datetime.utcnow()
            elapsed = ended - began

            config['utc_begin'] = began.isoformat()
            config['utc_end'] = ended.isoformat()
            config['elapsed_sec'] = time_duration_to_seconds(elapsed)
            config['error'] = (exit_code != 0)
            # This code treats an exit code of -1 as the timeout marker.
            config['command_timed_out'] = (exit_code == -1)
            config['exit_code'] = exit_code
            config['output'] = output
            config['errors'] = errors

            # Counters and the job id are scraped from the command's error
            # output; missing byte counters are stored as NaN.
            bytes_read = regex_first_group('Bytes Read=(.*)',
                                           errors,
                                           return_on_no_match='nan',
                                           search=True)
            config['bytes_read_hdfs'] = float(bytes_read)
            bytes_written = regex_first_group('Bytes Written=(.*)',
                                              errors,
                                              return_on_no_match='nan',
                                              search=True)
            config['bytes_written_hdfs'] = float(bytes_written)
            config['hadoop_job_id'] = regex_first_group(
                'Running job: (job_[0-9_]+)', errors, search=True)

            self.get_completed_job_info()
    def run_test(self):
        """Deploy, run ``bin/benchmark`` over SSH and record the result.

        Reads ``test_uuid``, ``driver``, ``workload``, ``numWorkers`` and
        ``localWorker`` from ``self.test_config``. The driver configuration is
        completed with connection settings read from the deployed YAML files
        under ``/opt/benchmark``, the driver and workload YAML files are
        written via ``self.create_yaml_file``, and the benchmark command is run
        through ``self.ssh``. Timing, exit status, output and the collected
        results JSON are stored in ``self.test_config`` and, when
        ``result_filename`` is present, written with ``record_result``.

        Raises:
            TimeoutException: The benchmark command timed out (exit code -1).
            Exception: The driver name is unsupported, or the command failed,
                or the results could not be collected.
        """
        rec = self.test_config
        test_uuid = rec['test_uuid']
        driver = rec['driver']
        workload = rec['workload']
        numWorkers = rec['numWorkers']
        localWorker = rec['localWorker']

        def read_remote_yaml(path):
            # yaml.safe_load is used instead of yaml.load: calling yaml.load
            # without a Loader fails on PyYAML >= 6 and can construct
            # arbitrary objects on older versions. These files are plain
            # configuration documents.
            _, text, _ = self.ssh('cat ' + path)
            return yaml.safe_load(StringIO(text))

        self.inspect_environment()

        workload['name'] = test_uuid
        driver_file_name = '/tmp/driver-' + test_uuid + '.yaml'
        workload_file_name = '/tmp/workload-' + test_uuid + '.yaml'
        payload_file_name = '/tmp/payload-' + test_uuid + '.data'
        workload['payloadFile'] = payload_file_name

        self.deploy()

        if localWorker:
            # TODO: Doesn't work because workers.yaml exists.
            workers_args = ''
        else:
            workers = read_remote_yaml('/opt/benchmark/workers.yaml')['workers']
            workers = workers[0:numWorkers]
            logging.info("workers=%s" % str(workers))
            rec['omb_workers'] = workers
            workers_args = '--workers %s' % ','.join(workers)

        # Copy the connection settings of the deployed driver into the driver
        # configuration used for this test.
        if driver['name'] == 'Pravega':
            deployed_driver = read_remote_yaml(
                '/opt/benchmark/driver-pravega/pravega.yaml')
            driver['client']['controllerURI'] = deployed_driver['client'][
                'controllerURI']
        elif driver['name'] == 'Pulsar':
            deployed_driver = read_remote_yaml(
                '/opt/benchmark/driver-pulsar/pulsar.yaml')
            driver['client']['serviceUrl'] = deployed_driver['client'][
                'serviceUrl']
            driver['client']['httpUrl'] = deployed_driver['client']['httpUrl']
        elif driver['name'] == 'Kafka':
            deployed_driver = read_remote_yaml(
                '/opt/benchmark/driver-kafka/kafka.yaml')
            driver['commonConfig'] = deployed_driver['commonConfig']
        else:
            raise Exception('Unsupported driver')

        self.create_yaml_file(driver, driver_file_name)
        self.create_yaml_file(workload, workload_file_name)

        # The payload file is filled with messageSize random bytes before the
        # benchmark starts.
        cmd = ('cd /opt/benchmark' + ' && sudo chmod go+rw .' +
               ' && dd if=/dev/urandom of=' + payload_file_name + ' bs=' +
               str(workload['messageSize']) + ' count=1 status=none' +
               ' && bin/benchmark --drivers ' + driver_file_name + ' ' +
               workers_args + ' ' + workload_file_name)
        rec['_status_node'].set_status('Running command: %s' % str(cmd))

        t0 = datetime.datetime.utcnow()

        return_code, output, errors = self.ssh(cmd, raise_on_error=False)

        t1 = datetime.datetime.utcnow()
        td = t1 - t0

        rec['utc_begin'] = t0.isoformat()
        rec['utc_end'] = t1.isoformat()
        rec['elapsed_sec'] = time_duration_to_seconds(td)
        rec['error'] = (return_code != 0)
        rec['exit_code'] = return_code
        # This code treats an exit code of -1 as the timeout marker.
        rec['command_timed_out'] = (return_code == -1)
        rec['output'] = output
        rec['errors'] = errors

        # Collect results to store in results.json. A failure here marks the
        # test as failed but still lets the record be written below.
        try:
            return_code, results_json, errors = self.ssh(
                'cat /opt/benchmark/*' + test_uuid + '*.json',
                print_output=False,
            )
            rec['omb_results'] = json.load(StringIO(results_json.decode()))
        except Exception as e:
            logging.warning('Unable to collect logs: %s' % e)
            rec['error'] = True

        rec['run_as_test'] = rec['test']
        if 'record_as_test' in rec:
            rec['test'] = rec['record_as_test']
        if 'result_filename' in rec:
            record_result(rec, rec['result_filename'])
        if rec['command_timed_out']:
            raise TimeoutException()
        if rec['error']:
            raise Exception('Command failed')
    def run_test(self):
        """Deploy, run the benchmark in the driver pod via kubectl and record the result.

        Reads ``test_uuid``, ``driver``, ``workload``, ``numWorkers``,
        ``localWorker``, ``namespace``, ``noop`` and ``utc_begin`` from
        ``self.test_config``. All parameters are encoded as JSON in the
        workload name so they end up in the results file. Timing, exit status,
        output and the collected results JSON are stored in
        ``self.test_config`` and, when ``result_filename`` is present, written
        with ``record_result``.

        Collecting the results JSON and the log archive is best-effort: a
        failure there is logged (and, for the results JSON, marks the test as
        failed) so that the record is still written and a timeout is still
        reported as ``TimeoutException``.

        Raises:
            TimeoutException: The benchmark command timed out (exit code -1).
            Exception: The command failed or its results could not be
                collected.
        """
        rec = self.test_config

        self.deploy()

        git_commit = subprocess.run(['git', 'log', '--oneline', '-1'],
                                    capture_output=True,
                                    check=True).stdout.decode()

        test_uuid = rec['test_uuid']
        driver = rec['driver']
        workload = rec['workload']
        numWorkers = rec['numWorkers']
        localWorker = rec['localWorker']
        namespace = rec['namespace']

        params = {
            'test_uuid': test_uuid,
            'utc_begin': rec['utc_begin'],
            'driver': driver,
            'workload': workload,
            'numWorkers': numWorkers,
            'git_commit': git_commit,
        }
        # Encode all parameters in workload name attribute so they get written to the results file.
        workload['name'] = json.dumps(params)
        print(yaml.dump(params, default_flow_style=False))

        driver_file_name = '/tmp/driver-' + test_uuid + '.yaml'
        workload_file_name = '/tmp/workload-' + test_uuid + '.yaml'
        payload_file_name = '/tmp/payload-' + test_uuid + '.data'

        workload['payloadFile'] = payload_file_name

        create_yaml_file(driver, driver_file_name, namespace)
        create_yaml_file(workload, workload_file_name, namespace)

        if localWorker:
            workers_args = ''
        else:
            workers = [
                'http://%s-openmessaging-benchmarking-worker-%d.%s-openmessaging-benchmarking-worker:8080'
                % (namespace, worker_number, namespace)
                for worker_number in range(numWorkers)
            ]
            workers_args = '--workers %s' % ','.join(workers)

        # NOTE(review): the pod name is hard-coded with the 'examples' prefix
        # here, while the 'kubectl cp' below derives it from namespace;
        # confirm which is intended.
        cmd = [
            'kubectl', 'exec', '-n', namespace,
            'examples-openmessaging-benchmarking-driver', '--', 'bash', '-c',
            'rm -f /tmp/logs.tar.gz' + ' && dd if=/dev/urandom of=' +
            payload_file_name + ' bs=' + str(workload['messageSize']) +
            ' count=1 status=none' + ' && bin/benchmark --drivers ' +
            driver_file_name + ' ' + workers_args + ' ' + workload_file_name +
            ' && tar -czvf /tmp/logs-' + test_uuid + '.tar.gz *' + test_uuid +
            '*.json' + ' && rm -f ' + payload_file_name
        ]
        rec['_status_node'].set_status('Running command: %s' % str(cmd))

        t0 = datetime.datetime.utcnow()

        # The timeout is the configured test duration plus five minutes.
        return_code, output, errors = system_command(
            cmd,
            print_output=True,
            shell=False,
            timeout=(workload['testDurationMinutes'] + 5) * 60,
            raise_on_error=False,
            noop=rec['noop'],
        )

        t1 = datetime.datetime.utcnow()
        td = t1 - t0

        logging.info('exit_code=%d' % return_code)

        rec['utc_begin'] = t0.isoformat()
        rec['utc_end'] = t1.isoformat()
        rec['elapsed_sec'] = time_duration_to_seconds(td)
        rec['error'] = (return_code != 0)
        rec['exit_code'] = return_code
        # This code treats an exit code of -1 as the timeout marker.
        rec['command_timed_out'] = (return_code == -1)
        rec['output'] = output
        rec['errors'] = errors

        # Collect results to store in results.json. When the benchmark failed
        # or timed out the output may be missing or not valid JSON; that must
        # not prevent the record from being written below.
        cmd = [
            'kubectl',
            'exec',
            '-n',
            namespace,
            'examples-openmessaging-benchmarking-driver',
            '--',
            'bash',
            '-c',
            'cat *' + test_uuid + '*.json',
        ]
        try:
            return_code, results_json, errors = system_command(
                cmd, print_output=False, shell=False, raise_on_error=False)
            rec['omb_results'] = json.load(StringIO(results_json.decode()))
        except Exception as e:
            logging.warning('Unable to collect results: %s' % e)
            rec['error'] = True

        # Collect and extract logs (outside of results.json) (not required).
        # Failures are logged only, so they cannot mask the benchmark outcome.
        try:
            cmd = [
                'kubectl',
                'cp',
                '%s/%s-openmessaging-benchmarking-driver:/tmp/logs-%s.tar.gz' %
                (namespace, namespace, test_uuid),
                'logs/logs-%s.tar.gz' % test_uuid,
            ]
            subprocess.run(cmd, check=True)
            cmd = [
                'tar',
                '-xzvf',
                'logs/logs-%s.tar.gz' % test_uuid,
                '-C',
                'logs',
            ]
            subprocess.run(cmd, check=True)
        except (subprocess.CalledProcessError, OSError) as e:
            logging.warning('Unable to collect logs: %s' % e)

        rec['run_as_test'] = rec['test']
        if 'record_as_test' in rec:
            rec['test'] = rec['record_as_test']
        if 'result_filename' in rec:
            record_result(rec, rec['result_filename'])
        if rec['command_timed_out']:
            raise TimeoutException()
        if rec['error']:
            raise Exception('Command failed')
Example #7
0
def run_query(query_config):
    """Run one query file against the configured database and record the outcome.

    The command line is built for ``db_type`` ``hawq`` (psql), ``hive`` or
    ``impala`` (impala-shell), executed with ``system_command``, and the
    timing, exit status, captured output and values parsed from the error
    output are written back into ``query_config``.

    Args:
        query_config: Mutable mapping with at least ``db_name``, ``db_type``
            and ``query_filename``. Optional keys read here include
            ``print_output``, ``stream_id``, ``command_timeout_sec``,
            ``kill_all_yarn_jobs_before_each_query``, ``hive_init_file``,
            ``hiveconf:*``, ``java_opts_xmx_ratio``, ``impalad_host``,
            ``impalad_port`` and ``profile_query``.

    Returns:
        The same mapping that was passed in, updated in place.

    Raises:
        Exception: ``db_type`` is not one of the supported values.
    """
    rec = query_config
    print_output = rec.get('print_output', True)
    stream_id = rec.get('stream_id', 0)

    # db_name may contain %(key)s placeholders that are filled from the record.
    rec['db_name'] = rec['db_name'] % rec

    if rec.get('kill_all_yarn_jobs_before_each_query', False):
        kill_all_yarn_jobs()

    rec['query_filename_contents'] = read_file_to_string(rec['query_filename'])

    shell = False
    db_type = rec['db_type']

    # Build query command.

    if db_type == 'hawq':
        cmd = []
        cmd.extend(['psql'])
        cmd.extend(['-v', 'ON_ERROR_STOP=1'])
        cmd.extend(['-d', rec['db_name']])
        cmd.extend(['-tAf', rec['query_filename']])

    elif db_type == 'hive':
        # Derive the Tez JVM heap size from the container size unless it was
        # set explicitly.
        if ('hiveconf:hive.tez.java.opts' not in rec
                and 'java_opts_xmx_ratio' in rec
                and 'hiveconf:hive.tez.container.size' in rec):
            rec['hiveconf:hive.tez.java.opts'] = '-Xmx%dm' % (
                rec['hiveconf:hive.tez.container.size'] *
                rec['java_opts_xmx_ratio'])
        # Every 'hiveconf:<prop>' key becomes a --hiveconf argument.
        hiveconf = []
        for k, v in rec.items():
            prop = regex_first_group('^hiveconf:(.*)', k)
            if prop:
                hiveconf.extend(['--hiveconf', '"%s=%s"' % (prop, v)])
        cmd = []
        cmd.extend(['hive'])
        cmd.extend(['--database', rec['db_name']])
        cmd.extend(['-f', rec['query_filename']])
        if 'hive_init_file' in rec:
            cmd.extend(['-i', rec['hive_init_file']])
            # Record contents of file in result.
            rec['hive_init_file_contents'] = read_file_to_string(
                rec['hive_init_file'])
        cmd.extend(hiveconf)

    elif db_type == 'impala':
        cmd = []
        cmd.extend(['impala-shell'])
        cmd.extend([
            '--impalad',
            '%s:%d' % (rec.get('impalad_host',
                               'localhost'), rec.get('impalad_port', 21000))
        ])
        cmd.extend(['--database', rec['db_name']])
        cmd.extend(['-f', rec['query_filename']])
        cmd.extend(['-B'])  # turn off pretty printing
        cmd.extend(['-o', '/dev/null'])
        if rec.get('profile_query'):
            cmd.extend(['--show_profiles'])

    else:
        # Raising a bare string is itself a TypeError; raise a real exception
        # so the caller sees the intended message.
        raise Exception('Unknown db_type')

    logging.info('%d: # %s' % (stream_id, ' '.join(cmd)))
    rec['query_command'] = cmd

    t0 = datetime.datetime.utcnow()

    # Run query.

    return_code, output, errors = system_command(cmd,
                                                 print_command=False,
                                                 print_output=print_output,
                                                 timeout=rec.get(
                                                     'command_timeout_sec',
                                                     None),
                                                 raise_on_error=False,
                                                 shell=shell)

    t1 = datetime.datetime.utcnow()
    td = t1 - t0

    rec['utc_begin'] = t0.isoformat()
    rec['utc_end'] = t1.isoformat()
    rec['elapsed_sec'] = time_duration_to_seconds(td)
    rec['error'] = (return_code != 0)
    rec['exit_code'] = return_code
    # This code treats an exit code of -1 as the timeout marker.
    rec['command_timed_out'] = (return_code == -1)
    rec['output'] = output
    rec['errors'] = errors
    rec['record_type'] = 'query_result'

    # Parse query output to determine elapsed time and rows returned.

    if db_type == 'hive':
        rec['application_id'] = regex_first_group(
            '\\(Executing on YARN cluster with App id (application_.*)\\)$',
            errors,
            return_on_no_match=None,
            search=True,
            flags=re.MULTILINE)

        # Extract actual query duration from stderr text. Note that we must find the last occurrence of 'Time taken'.
        query_elapsed_sec = regex_first_group('Time taken: ([0-9.]+) seconds',
                                              errors,
                                              return_on_no_match='nan',
                                              search=True,
                                              flags=re.MULTILINE,
                                              match_last=True)
        if query_elapsed_sec == 'nan':
            # A missing duration is treated as a failed query.
            logging.warning('Time taken not returned by command.')
            rec['error'] = True
        rec['query_elapsed_sec'] = float(query_elapsed_sec)
        rec['non_query_elapsed_sec'] = rec['elapsed_sec'] - rec[
            'query_elapsed_sec']

        # Extract row count from stderr text. Note that some queries will not report fetched rows.
        query_rows_returned = regex_first_group('Fetched: ([0-9]+) row',
                                                errors,
                                                return_on_no_match='0',
                                                search=True,
                                                flags=re.MULTILINE)
        rec['query_rows_returned'] = int(query_rows_returned)

        logging.info(
            'error=%d, query_elapsed_sec=%f, non_query_elapsed_sec=%f, query_rows_returned=%d'
            % (rec['error'], rec['query_elapsed_sec'],
               rec['non_query_elapsed_sec'], rec['query_rows_returned']))

    elif db_type == 'impala':
        # Extract actual query duration from stderr text.
        # Fetched 100 row(s) in 0.98s
        query_elapsed_sec = regex_first_group(
            'Fetched [0-9]+ row\\(s\\) in ([0-9.]+)s',
            errors,
            return_on_no_match='nan',
            search=True,
            flags=re.MULTILINE,
            match_last=True)
        if query_elapsed_sec == 'nan':
            # A missing duration is treated as a failed query.
            logging.warning('Time taken not returned by command.')
            rec['error'] = True
        rec['query_elapsed_sec'] = float(query_elapsed_sec)
        rec['non_query_elapsed_sec'] = rec['elapsed_sec'] - rec[
            'query_elapsed_sec']

        # Extract row count from stderr text. Note that some queries will not report fetched rows.
        query_rows_returned = regex_first_group('Fetched ([0-9]+) row\\(s\\)',
                                                errors,
                                                return_on_no_match='0',
                                                search=True,
                                                flags=re.MULTILINE)
        rec['query_rows_returned'] = int(query_rows_returned)

        logging.info(
            'error=%d, query_elapsed_sec=%f, non_query_elapsed_sec=%f, query_rows_returned=%d'
            % (rec['error'], rec['query_elapsed_sec'],
               rec['non_query_elapsed_sec'], rec['query_rows_returned']))

    else:
        # No per-query timing is parsed for other database types; the whole
        # command duration is used instead.
        rec['query_elapsed_sec'] = rec['elapsed_sec']
        rec['non_query_elapsed_sec'] = 0.0
        rec['query_rows_returned'] = np.nan

    # Handle errors.

    if rec['error']:
        logging.info('%d: return_code=%d' % (stream_id, return_code))
        if not print_output:
            logging.info('%d: %s' % (stream_id, output))

        if db_type == 'hive':
            # Kill YARN application
            if rec['application_id']:
                kill_yarn_job(rec['application_id'])

    if errors != '':
        if not print_output:
            logging.info('%d: %s' % (stream_id, errors))

    if not rec['error']:
        logging.info('%d: %s: %0.3f seconds' %
                     (stream_id, rec['query_filename'], rec['elapsed_sec']))

    return rec
Example #8
0
    def run_test(self):
        """Run a batch of query streams in parallel processes and record the results.

        The query files matching ``query_filespec`` are sorted and, when
        ``random_seed`` is non-zero, shuffled for each of the ``stream_count``
        streams (optionally truncated to ``queries_per_stream``). Each stream
        runs ``run_query_stream`` in its own process and reports records
        through a ``multiprocessing.Queue``. Every received record is written
        with ``record_result`` and collected under ``child_messages`` of the
        final ``query_batch_summary`` record.

        Raises:
            Exception: At least one query failed or a stream process exited
                with a non-zero code.
        """
        config = self.test_config

        config['root_test_uuid'] = config['test_uuid']
        child_messages = {}

        # Create random query list for each stream
        config['query_filenames'] = sorted(
            glob_file_list(config['query_filespec']))
        random.seed(config['random_seed'])
        stream_configs = []
        queries_per_stream = config.get('queries_per_stream', 0)
        for stream_id in range(0, config.get('stream_count', 1)):
            stream_config = config.copy()
            stream_config['stream_id'] = stream_id
            if config['random_seed'] != 0:
                random.shuffle(config['query_filenames'])
            # config.copy() is shallow, so without a snapshot every stream
            # would share the one list that is shuffled in place and all
            # streams would end up with the order of the last shuffle.
            if queries_per_stream > 0:
                stream_config['query_filenames'] = config['query_filenames'][
                    0:queries_per_stream]
            else:
                stream_config['query_filenames'] = list(
                    config['query_filenames'])
            logging.info('Queries for stream %d: %s' %
                         (stream_config['stream_id'], ' '.join(
                             stream_config['query_filenames'])))
            stream_configs.append(stream_config)

        self.hadoop_authenticate()
        self.configure_environment()

        with self.metrics_collector_context():
            self.start_metrics()

            error_count = 0
            success_count = 0
            t0 = datetime.datetime.utcnow()

            # Start stream processes
            active_streams = {}
            queue = multiprocessing.Queue()
            for stream_config in stream_configs:
                stream_config = stream_config.copy()
                del stream_config[
                    '_status_node']  # We can't send this between processes.
                stream_id = stream_config['stream_id']
                process = multiprocessing.Process(target=run_query_stream,
                                                  args=(queue, stream_config))
                process.start()
                active_streams[stream_id] = {
                    'process': process,
                    'stream_config': stream_config
                }

            # Monitor stream processes
            while active_streams:
                # Update status
                status_text = 'successful queries=%d, errors=%d' % (
                    success_count, error_count)
                status_node = config['_status_node']
                status_node.set_status(status_text, destroy_children=False)

                # Handle any completed stream processes. Iterate over a
                # snapshot of the keys because finished streams are removed
                # from the dict inside the loop.
                for stream_id in list(active_streams):
                    process = active_streams[stream_id]['process']
                    if not process.is_alive():
                        logging.info('Stream %d is done' % stream_id)
                        process.join()
                        return_code = process.exitcode
                        if return_code != 0:
                            # An uncaught exception has occurred. Normal query failures are not handled here.
                            logging.error('Stream %d returned error %d' %
                                          (stream_id, return_code))
                            error_count += 1
                        del active_streams[stream_id]

                # Process messages (individual query results, stream results) from stream processes
                try:
                    while True:
                        # Wait up to 1 second for next message in queue.
                        message = queue.get(True, 1)
                        # Create a new test_uuid for this child record.
                        # The query batch test_uuid is in root_test_uuid.
                        message['record_uuid'] = str(uuid.uuid4())
                        message['test_uuid'] = message['record_uuid']
                        # Record individual message to a file for immediate visibility.
                        record_result(message, message['result_filename'])
                        # Also add to child_messages key of the query batch record.
                        record_type = message['record_type']
                        if record_type not in child_messages:
                            child_messages[record_type] = []
                        child_messages[record_type].append(message)
                        # Count successful and error queries.
                        if message['record_type'] == 'query_result':
                            if message['error']:
                                error_count += 1
                            else:
                                success_count += 1
                except Queue.Empty:
                    # No more messages for now; go back to monitoring.
                    pass
                except KeyboardInterrupt:
                    raise
                except Exception:
                    # A bad message must not stop the monitoring loop.
                    logging.error('Unexpected error: %s' % sys.exc_info()[0])

            t1 = datetime.datetime.utcnow()
            td = t1 - t0
            logging.info('All streams are done')

        rec = config.copy()
        rec['record_uuid'] = rec['test_uuid']
        rec['record_type'] = 'query_batch_summary'
        rec['utc_begin'] = t0.isoformat()
        rec['utc_end'] = t1.isoformat()
        rec['elapsed_sec'] = time_duration_to_seconds(td)
        rec['error'] = (error_count > 0)
        rec['child_messages'] = child_messages
        record_result(rec, rec['result_filename'])

        logging.info('successful queries=%d, errors=%d' %
                     (success_count, error_count))

        if rec['error']:
            raise Exception('Query batch failed')
Example #9
0
def _run_pbrun_stage(rec, pbrun_file_name, stage, stage_args, env, noop):
    """Run one pbrun sub-command and store its outcome in rec['<stage>_result']."""
    result = {}
    # Attach the result dict before running so that it is part of the record
    # even when run_system_command raises.
    rec['%s_result' % stage] = result
    run_system_command([pbrun_file_name, stage] + stage_args,
                       result,
                       env=env,
                       noop=noop)


def _record_file_size(rec, key, file_name, noop):
    """Store the size in bytes of file_name in rec[key]; does nothing in noop mode."""
    if noop:
        # The commands were not executed, so the output file is not expected to exist.
        return
    rec[key] = os.path.getsize(file_name)


def process_sample(args):
    """Run the selected Parabricks (pbrun) stages for a single sample.

    A private copy of the Parabricks installation is extracted into the
    sample's temp directory, the FASTQ pairs found in the sample's input
    directory are collected, and each enabled stage (fq2bam, germline,
    haplotypecaller, deepvariant) is run through run_system_command. A summary
    record with timing, environment and per-stage results is built and, when
    args.summary_file is set, written with record_result.

    Attributes read from args: sample_id, batch_uuid, input_dir, output_dir,
    temp_dir, noop, parabricks_install_tgz_file, max_num_fq_pairs,
    reference_files_dir, fq2bam, germline, haplotypecaller, deepvariant and
    summary_file.

    When args.noop is set, the checks that depend on files produced by the
    commands (pbrun existence, output file sizes) are skipped.

    Raises:
        Any exception raised while processing the sample. It is re-raised
        only after the summary record has been written, with rec['error']
        set to True.
    """
    logging.info('BEGIN')
    record_uuid = str(uuid.uuid4())
    sample_id = args.sample_id
    hostname = socket.gethostname()
    logging.info('record_uuid=%s' % record_uuid)
    logging.info('sample_id=%s' % sample_id)
    logging.info('hostname=%s' % hostname)

    t0 = datetime.datetime.utcnow()

    rec = {
        'batch_uuid': args.batch_uuid,
        'record_uuid': record_uuid,
        'sample_id': sample_id,
        'hostname': hostname,
        'args': vars(args),
        # Always present in the summary; flipped to True if anything below raises.
        'error': False,
    }

    exception = None
    noop = args.noop

    try:
        input_dir = os.path.join(args.input_dir, sample_id)
        output_dir = os.path.join(args.output_dir, sample_id)
        temp_dir = os.path.join(args.temp_dir, sample_id)
        rec['input_dir'] = input_dir
        rec['output_dir'] = output_dir
        rec['temp_dir'] = temp_dir

        logging.debug('input_dir=%s' % input_dir)
        logging.debug('output_dir=%s' % output_dir)
        logging.debug('temp_dir=%s' % temp_dir)

        # Start from an empty temp directory; anything left by a previous
        # (failed) run for this sample is discarded here.
        if not noop and os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
        os.makedirs(temp_dir, exist_ok=True)
        os.makedirs(output_dir, exist_ok=True)

        # Create copy of Parabricks installation just for this process.
        # If installation directories are used by different processes concurrently, corruption
        # in the Singularity image may occur.
        system_command(
            ['tar', '-xzvf', args.parabricks_install_tgz_file, '-C', temp_dir],
            print_command=True,
            print_output=True,
            raise_on_error=True,
            shell=False,
            noop=noop,
        )
        pbrun_file_name = os.path.join(temp_dir, 'parabricks', 'pbrun')
        logging.debug('pbrun_file_name=%s' % pbrun_file_name)
        # Explicit check instead of `assert` (which is removed under -O).
        # Skipped in noop mode because nothing was extracted.
        if not noop and not os.path.exists(pbrun_file_name):
            raise FileNotFoundError(
                'pbrun not found after extracting %s: %s' %
                (args.parabricks_install_tgz_file, pbrun_file_name))

        # Slurm sets CUDA_VISIBLE_DEVICES but pbrun requires NVIDIA_VISIBLE_DEVICES.
        env = os.environ.copy()
        cuda_visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES',
                                              '0,1,2,3')
        logging.info('cuda_visible_devices=%s' % cuda_visible_devices)
        num_gpus = len(cuda_visible_devices.split(','))
        logging.info('num_gpus=%d' % num_gpus)
        env['NVIDIA_VISIBLE_DEVICES'] = cuda_visible_devices
        rec['env'] = env
        rec['cuda_visible_devices'] = cuda_visible_devices
        rec['num_gpus'] = num_gpus

        # Collect the files <i>_1.fq.gz / <i>_2.fq.gz that exist for each
        # index i; an index with no files at all is skipped.
        fq_pairs = []
        fq_file_sizes = []
        for i in range(args.max_num_fq_pairs):
            candidates = [
                os.path.join(input_dir, '%d_%d.fq.gz' % (i, j))
                for j in (1, 2)
            ]
            pair = [name for name in candidates if os.path.isfile(name)]
            fq_file_sizes.extend(os.path.getsize(name) for name in pair)
            if pair:
                fq_pairs.append(pair)
        logging.debug('fq_pairs=%s' % str(fq_pairs))
        rec['fq_pairs'] = fq_pairs
        logging.info('fq_file_sizes=%s' % str(fq_file_sizes))
        rec['fq_file_sizes'] = fq_file_sizes

        # One --in-fq group (files + read-group header) per discovered pair.
        in_fq_cmd = []
        for i, fq_pair in enumerate(fq_pairs):
            header = '@RG\\tID:%d\\tLB:lib1\\tPL:bar\\tSM:%s\\tPU:%d' % (
                i, sample_id, i)
            in_fq_cmd += ['--in-fq'] + fq_pair + [header]

        logging.debug('in_fq_cmd=%s' % str(in_fq_cmd))

        bam_file_name = os.path.join(output_dir, '%s.bam' % sample_id)
        gvcf_file_name = os.path.join(output_dir, '%s.g.vcf' % sample_id)
        dv_gvcf_file_name = os.path.join(output_dir, '%s_dv.g.vcf' % sample_id)
        recal_file_name = os.path.join(output_dir, '%s.txt' % sample_id)

        # Argument groups shared by several stages.
        ref_args = [
            '--ref',
            os.path.join(args.reference_files_dir,
                         'Homo_sapiens_assembly38.fasta'),
        ]
        known_sites_args = [
            '--knownSites',
            os.path.join(args.reference_files_dir,
                         'Mills_and_1000G_gold_standard.indels.hg38.vcf.gz'),
            '--knownSites',
            os.path.join(args.reference_files_dir,
                         'Homo_sapiens_assembly38.dbsnp138.vcf'),
        ]
        resource_args = ['--tmp-dir', temp_dir, '--num-gpus', '%d' % num_gpus]

        if args.fq2bam:
            _run_pbrun_stage(
                rec, pbrun_file_name, 'fq2bam',
                ref_args +
                ['--out-bam', bam_file_name,
                 '--out-recal-file', recal_file_name] +
                known_sites_args + resource_args + in_fq_cmd,
                env, noop)

        if args.germline:
            _run_pbrun_stage(
                rec, pbrun_file_name, 'germline',
                ref_args +
                ['--out-bam', bam_file_name,
                 '--out-recal-file', recal_file_name] +
                known_sites_args +
                ['--out-variants', gvcf_file_name, '--gvcf'] +
                resource_args + in_fq_cmd,
                env, noop)
            # germline writes the same gvcf file as the haplotypecaller stage,
            # so its size is stored under the same key.
            _record_file_size(rec, 'haplotypecaller_gvcf_file_size_bytes',
                              gvcf_file_name, noop)

        _record_file_size(rec, 'bam_file_size_bytes', bam_file_name, noop)
        if 'bam_file_size_bytes' in rec:
            logging.debug('bam_file_size_bytes=%d' %
                          rec['bam_file_size_bytes'])

        if args.haplotypecaller:
            _run_pbrun_stage(
                rec, pbrun_file_name, 'haplotypecaller',
                ref_args +
                ['--in-bam', bam_file_name,
                 '--in-recal-file', recal_file_name,
                 '--out-variants', gvcf_file_name, '--gvcf'] +
                resource_args,
                env, noop)
            _record_file_size(rec, 'haplotypecaller_gvcf_file_size_bytes',
                              gvcf_file_name, noop)

        if args.deepvariant:
            # deepvariant uses the bam output of fq2bam or germline.
            _run_pbrun_stage(
                rec, pbrun_file_name, 'deepvariant',
                ref_args +
                ['--in-bam', bam_file_name,
                 '--out-variants', dv_gvcf_file_name, '--gvcf'] +
                resource_args,
                env, noop)
            _record_file_size(rec, 'deepvariant_gvcf_file_size_bytes',
                              dv_gvcf_file_name, noop)

        # NOTE: the temp directory is removed only after a fully successful
        # run; after a failure it is left in place.
        if not noop and os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)

    except Exception as e:
        # Defer the failure so that the summary record is still written below.
        exception = e
        rec['error'] = True

    t1 = datetime.datetime.utcnow()
    td = t1 - t0
    rec['utc_begin'] = t0.isoformat()
    rec['utc_end'] = t1.isoformat()
    rec['elapsed_sec'] = time_duration_to_seconds(td)

    if args.summary_file:
        record_result(rec, args.summary_file)

    logging.info('END')
    if exception is not None:
        raise exception
Example #10
0
    def run_test(self):
        """Run the three TPCx-HS jobs (HSGen, HSSort, HSValidate) and record the outcome.

        Works directly on self.test_config. Reads 'data_size_MB',
        'base_directory' (a %-template expanded with the config itself), 'jar',
        'map_tasks', 'reduce_tasks', 'sleep_after_delete_sec' and, optionally,
        'storage_num_nodes'. Writes the three '<job>:hadoop_command' lists,
        'data_size_TB', 'sf', 'error', 'elapsed_sec' and, on success, the
        derived rate metrics.

        Raises:
            Exception: 'Test failed', after the result has been recorded, when
                any benchmark step raised.
        """
        rec = self.test_config

        data_size_MB = rec['data_size_MB']
        base_directory = rec['base_directory'] % rec
        test_directory = '%s/TPCx-HS-benchmark' % base_directory
        sort_input_directory = '%s/HSsort-input' % test_directory
        sort_output_directory = '%s/HSsort-output' % test_directory
        validate_output_directory = '%s/HSValidate' % test_directory

        size_TB = rec['data_size_MB'] / 1e6
        rec['data_size_TB'] = size_TB
        rec['sf'] = size_TB
        rec['error'] = False

        #
        # Build commands
        #

        def hadoop_command(job_class, parameters_config, positional_args):
            # 'hadoop jar <jar> <class>' + generated parameters + positional arguments.
            command = ['hadoop', 'jar', rec['jar'], job_class]
            command.extend(get_hadoop_parameters(parameters_config))
            command.extend(positional_args)
            return command

        # HSGen: the number of records to generate, at 100 bytes per record.
        bytes_per_record = 100
        record_count = int(data_size_MB * 1000.0 * 1000.0 / bytes_per_record)
        rec['hsgen:hadoop_command'] = hadoop_command(
            'HSGen', rec, [str(record_count), sort_input_directory])

        # HSSort
        rec['hssort:hadoop_command'] = hadoop_command(
            'HSSort', rec, [sort_input_directory, sort_output_directory])

        # HSValidate: its parameters are generated from a copy of the config
        # without the map/reduce task settings.
        hsvalidate_config = rec.copy()
        for task_key in ('map_tasks', 'reduce_tasks'):
            del hsvalidate_config[task_key]
        rec['hsvalidate:hadoop_command'] = hadoop_command(
            'HSValidate', hsvalidate_config,
            [sort_output_directory, validate_output_directory])

        command_keys = ('hsgen:hadoop_command', 'hssort:hadoop_command',
                        'hsvalidate:hadoop_command')
        for command_key in command_keys:
            logging.info('%s: %s' % (command_key, rec[command_key]))

        #
        # Prepare for benchmark
        #

        # Options shared by every direct system_command call in this method.
        strict_verbose = dict(print_command=True,
                              print_output=True,
                              raise_on_error=True,
                              shell=False)

        self.hadoop_authenticate()
        self.configure_environment()
        self.delete_hadoop_directory('%s/*' % test_directory)
        system_command(['hadoop', 'fs', '-expunge'], **strict_verbose)
        pause_sec = rec['sleep_after_delete_sec']
        logging.info('Sleeping for %0.0f seconds' % pause_sec)
        time.sleep(pause_sec)

        # Each job is followed by a listing of the directory it wrote to.
        stages = (
            ('hsgen:', sort_input_directory),
            ('hssort:', sort_output_directory),
            ('hsvalidate:', validate_output_directory),
        )

        with self.metrics_collector_context():
            self.start_metrics()

            #
            # Run benchmark
            #

            t0 = datetime.datetime.utcnow()

            try:
                for key_prefix, listed_directory in stages:
                    self.run_mapred_job(key_prefix=key_prefix,
                                        raise_on_error=True)
                    system_command(
                        ['hdfs', 'dfs', '-ls',
                         '%s/*' % listed_directory], **strict_verbose)
            except:
                # Deliberately catches everything: the failure is logged and
                # flagged so the result is still recorded before raising below.
                logging.error('EXCEPTION: %s' % traceback.format_exc())
                rec['error'] = True

            td = datetime.datetime.utcnow() - t0

        rec['elapsed_sec'] = time_duration_to_seconds(td)
        failed = rec['error']
        if not failed:
            elapsed_sec = rec['elapsed_sec']
            total_rate = rec['data_size_MB'] / elapsed_sec
            rec['total_io_rate_MB_per_sec'] = total_rate
            # NaN when the number of storage nodes is not configured.
            storage_nodes = rec.get('storage_num_nodes', float('nan'))
            rec['io_rate_MB_per_sec_per_storage_node'] = (total_rate /
                                                          storage_nodes)
            rec['HSph@SF'] = rec['sf'] / (elapsed_sec / 3600.0)
            logging.info('RESULT: elapsed_sec=%f, HSph@SF=%f' %
                         (elapsed_sec, rec['HSph@SF']))
        self.record_result()
        if failed:
            raise Exception('Test failed')