def run_bfc(self, ctx, params):
    """
    BFC (Bloom Filter) error correcting app for sequencing errors in
    Illumina short reads.
    :param params: instance of mapping from String to unspecified object
    :returns: instance of type "ReportBFCResults" -> structure: parameter
        "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: results
    #BEGIN run_bfc
    log('Running run_bfc with params=')
    pprint(params)

    bfc_cmd = [self.BFC]
    shared_dir = "/kb/module/work/tmp"

    # validate parameters
    if 'workspace_name' not in params:
        raise ValueError('workspace_name parameter is required')
    if 'input_reads_upa' not in params:
        raise ValueError('input_reads_upa parameter is required')
    if 'output_reads_name' not in params:
        raise ValueError('output_reads_name parameter is required')

    # optional flag: drop reads containing unique k-mers
    if params.get('drop_unique_kmer_reads'):
        bfc_cmd.append('-1')

    # optional flag: approximate genome size, e.g. '-s 50M'
    if params.get('est_genome_size'):
        if 'est_genome_size_units' not in params:
            raise ValueError('est_genome_size_units must be set')
        if params['est_genome_size_units'] not in ['G', 'M', 'K', 'g', 'm', 'k']:
            raise ValueError('est_genome_size_units must be G, M or K')
        bfc_cmd.append('-s')
        bfc_cmd.append(str(params['est_genome_size']) + str(params['est_genome_size_units']))

    # optional flag: k-mer size, limited to 63 by bfc
    if params.get('kmer_size'):
        if params['kmer_size'] >= 64:
            raise ValueError('kmer_size must be <= 63')
        bfc_cmd.append('-k')
        bfc_cmd.append(str(params['kmer_size']))

    input_reads_upa = params['input_reads_upa']
    output_reads_name = params['output_reads_name']

    os.chdir(shared_dir)
    output_reads_file = output_reads_name + ".fq"
    bfc_output_file = "bfc_" + output_reads_name + ".fq"
    seqtk_output_file = "seqtk_bfc_" + output_reads_name + ".fq"
    workspace_name = params['workspace_name']

    # get the reads library as a gzipped interleaved file
    reads_params = {'read_libraries': [input_reads_upa],
                    'interleaved': 'true',
                    'gzipped': 'true'}
    ru = _ReadsUtils(self.callbackURL)
    reads = ru.download_reads(reads_params)['files']
    log(reads)

    input_reads_file = os.path.basename(reads[input_reads_upa]['files']['fwd'])
    log('Input reads files:')
    log('  ' + input_reads_file)

    # hardcoding the thread count
    bfc_cmd.append('-t')
    bfc_cmd.append(str(self.THREADS))
    bfc_cmd.append(input_reads_file)
    bfc_cmd.append('>')
    bfc_cmd.append(bfc_output_file)

    log('Running BFC:')
    log('  ' + ' '.join(bfc_cmd))
    bfc_cmd_output = self.run_command(' '.join(bfc_cmd))

    # drop non-paired reads using seqtk
    seqtk_cmd = [self.SEQTK, "dropse", bfc_output_file, ">", seqtk_output_file]
    self.run_command(' '.join(seqtk_cmd))

    # upload reads output
    shutil.copy(seqtk_output_file, output_reads_file)
    out_reads_upa = ru.upload_reads({'fwd_file': os.path.join(shared_dir, output_reads_file),
                                     'interleaved': 1,
                                     'wsname': workspace_name,
                                     'name': output_reads_name,
                                     'source_reads_ref': input_reads_upa})

    # create report
    ws = _Workspace(self.ws_url)
    input_meta = ws.get_objects2({'objects': [{'ref': input_reads_upa}],
                                  'no_data': 1})['data'][0]
    input_reads_name = input_meta['info'][1]
    input_reads_count = input_meta['info'][10]['read_count']
    output_meta = ws.get_objects2({'objects': [{'ref': out_reads_upa['obj_ref']}],
                                   'no_data': 1})['data'][0]
    output_reads_count = output_meta['info'][10]['read_count']

    # get total filtered reads
    filtered_reads = int(input_reads_count) - int(output_reads_count)

    # add commas for readability
    input_reads_count = "{:,}".format(int(input_reads_count))
    output_reads_count = "{:,}".format(int(output_reads_count))
    filtered_reads = "{:,}".format(filtered_reads)
    # kmer_size is optional, so use get() to avoid a KeyError
    k_mer_size = str(params.get('kmer_size'))

    # keep only the summary lines from the bfc output
    bfc_main = '\n'.join(
        [line for line in bfc_cmd_output.split('\n') if line.startswith('[M::main')])

    report = 'Successfully ran bfc on input reads: {}\n'.format(input_reads_name)
    report += 'with command: {}\n\n{}\n'.format(' '.join(bfc_cmd), bfc_main)
    report += 'created object: {} ({})\n\n'.format(output_reads_name,
                                                   out_reads_upa['obj_ref'])
    report += '  input reads: {}\n  k-mer size: {}\n  filtered reads: {}\n  output reads: {}'.format(
        input_reads_count, k_mer_size, filtered_reads, output_reads_count)

    log('Saving report')
    kbr = _KBaseReport(self.callbackURL)
    report_info = kbr.create_extended_report(
        {'message': report,
         'objects_created': [{'ref': out_reads_upa['obj_ref'],
                              'description': 'Corrected reads'}],
         'workspace_name': workspace_name,
         'report_object_name': 'bfc_report_' + str(uuid.uuid4())})

    results = {'report_name': report_info['name'],
               'report_ref': report_info['ref']}
    #END run_bfc

    # At some point might do deeper type checking...
    if not isinstance(results, dict):
        raise ValueError('Method run_bfc return value ' +
                         'results is not type dict as required.')
    # return the results
    return [results]
def build_samples(config: Dict[str, str]) -> Tuple[Samples, KBaseUserLookup, List[str]]:
    '''
    Build the sample service instance from the SDK server provided parameters.

    :param config: The SDK generated configuration.
    :returns: A samples instance, a user lookup instance, and the roles that
        are exempt from read administration restrictions.
    '''
    if not config:
        raise ValueError('config is empty, cannot start service')
    arango_url = _check_string_req(config.get('arango-url'), 'config param arango-url')
    arango_db = _check_string_req(config.get('arango-db'), 'config param arango-db')
    arango_user = _check_string_req(config.get('arango-user'), 'config param arango-user')
    arango_pwd = _check_string_req(config.get('arango-pwd'), 'config param arango-pwd')
    col_sample = _check_string_req(config.get('sample-collection'),
                                   'config param sample-collection')
    col_version = _check_string_req(config.get('version-collection'),
                                    'config param version-collection')
    col_ver_edge = _check_string_req(config.get('version-edge-collection'),
                                     'config param version-edge-collection')
    col_node = _check_string_req(config.get('node-collection'),
                                 'config param node-collection')
    col_node_edge = _check_string_req(config.get('node-edge-collection'),
                                      'config param node-edge-collection')
    col_data_link = _check_string_req(config.get('data-link-collection'),
                                      'config param data-link-collection')
    col_ws_obj_ver = _check_string_req(
        config.get('workspace-object-version-shadow-collection'),
        'config param workspace-object-version-shadow-collection')
    col_schema = _check_string_req(config.get('schema-collection'),
                                   'config param schema-collection')
    auth_root_url = _check_string_req(config.get('auth-root-url'),
                                      'config param auth-root-url')
    auth_token = _check_string_req(config.get('auth-token'), 'config param auth-token')
    full_roles = split_value(config, 'auth-full-admin-roles')
    read_roles = split_value(config, 'auth-read-admin-roles')
    read_exempt_roles = split_value(config, 'auth-read-exempt-roles')
    ws_url = _check_string_req(config.get('workspace-url'), 'config param workspace-url')
    ws_token = _check_string_req(config.get('workspace-read-admin-token'),
                                 'config param workspace-read-admin-token')
    kafka_servers = _check_string(config.get('kafka-bootstrap-servers'),
                                  'config param kafka-bootstrap-servers',
                                  optional=True)
    kafka_topic = None
    if kafka_servers:  # have to start the server twice to test the no-kafka scenario
        kafka_topic = _check_string(config.get('kafka-topic'), 'config param kafka-topic')
    metaval_url = _check_string(config.get('metadata-validator-config-url'),
                                'config param metadata-validator-config-url',
                                optional=True)
    # meta params may have info that shouldn't be logged so don't log any for now.
    # Add code to deal with this later if needed.
    print(f'''
        Starting server with config:
            arango-url: {arango_url}
            arango-db: {arango_db}
            arango-user: {arango_user}
            arango-pwd: [REDACTED FOR YOUR SAFETY AND COMFORT]
            sample-collection: {col_sample}
            version-collection: {col_version}
            version-edge-collection: {col_ver_edge}
            node-collection: {col_node}
            node-edge-collection: {col_node_edge}
            data-link-collection: {col_data_link}
            workspace-object-version-shadow-collection: {col_ws_obj_ver}
            schema-collection: {col_schema}
            auth-root-url: {auth_root_url}
            auth-token: [REDACTED FOR YOUR CONVENIENCE AND ENJOYMENT]
            auth-full-admin-roles: {', '.join(full_roles)}
            auth-read-admin-roles: {', '.join(read_roles)}
            auth-read-exempt-roles: {', '.join(read_exempt_roles)}
            workspace-url: {ws_url}
            workspace-read-admin-token: [REDACTED FOR YOUR ULTIMATE PLEASURE]
            kafka-bootstrap-servers: {kafka_servers}
            kafka-topic: {kafka_topic}
            metadata-validator-config-url: {metaval_url}
    ''')

    # build the validators before trying to connect to arango
    metaval = get_validators(metaval_url) if metaval_url else MetadataValidatorSet()

    arangoclient = _arango.ArangoClient(hosts=arango_url)
    arango_db = arangoclient.db(arango_db, username=arango_user, password=arango_pwd,
                                verify=True)
    storage = _ArangoSampleStorage(
        arango_db,
        col_sample,
        col_version,
        col_ver_edge,
        col_node,
        col_node_edge,
        col_ws_obj_ver,
        col_data_link,
        col_schema,
    )
    storage.start_consistency_checker()
    kafka = _KafkaNotifer(kafka_servers, _cast(str, kafka_topic)) if kafka_servers else None
    user_lookup = KBaseUserLookup(auth_root_url, auth_token, full_roles, read_roles)
    ws = _WS(_Workspace(ws_url, token=ws_token))
    return Samples(storage, user_lookup, metaval, ws, kafka), user_lookup, read_exempt_roles
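# A minimal sketch of the config mapping build_samples consumes, assuming the
# SDK server passes deploy.cfg values through unchanged. Every value below is
# a hypothetical placeholder; the required keys match the _check_string_req
# calls above.
example_config = {
    # ArangoDB connection (placeholders)
    'arango-url': 'http://localhost:8529',
    'arango-db': 'test_db',
    'arango-user': 'test',
    'arango-pwd': 'test123',
    # collection names the storage layer will use (placeholders)
    'sample-collection': 'samples_sample',
    'version-collection': 'samples_version',
    'version-edge-collection': 'samples_ver_edge',
    'node-collection': 'samples_nodes',
    'node-edge-collection': 'samples_nodes_edge',
    'data-link-collection': 'samples_data_link',
    'workspace-object-version-shadow-collection': 'ws_object_version',
    'schema-collection': 'samples_schema',
    # auth and workspace services (placeholders)
    'auth-root-url': 'https://ci.kbase.us/services/auth',
    'auth-token': '<service token>',
    'workspace-url': 'https://ci.kbase.us/services/ws',
    'workspace-read-admin-token': '<read admin token>',
    # the auth-*-admin-roles keys are comma-separated strings handled by
    # split_value; they, kafka-bootstrap-servers, kafka-topic, and
    # metadata-validator-config-url are omitted here as optional
}
# samples, user_lookup, read_exempt_roles = build_samples(example_config)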
def __init__(self, jars_dir: _Path, mongo_controller: _MongoController, mongo_db: str,
             mongo_type_db: str, auth_url: str, root_temp_dir: _Path):
    '''
    Create and start a new Workspace service. An unused port will be selected
    for the server.

    :param jars_dir: The path to the lib/jars dir of the KBase Jars repo
        (https://github.com/kbase/jars), e.g. /path_to_repo/lib/jars.
    :param mongo_controller: A MongoDB controller.
    :param mongo_db: The database in which to store Workspace data.
    :param mongo_type_db: The database in which to store Workspace type
        specifications.
    :param auth_url: The root url of an instance of the KBase auth service.
    :param root_temp_dir: A temporary directory in which to store Workspace
        data and log files. The files will be stored inside a child directory
        that is unique per invocation.
    '''
    if not jars_dir or not _os.access(jars_dir, _os.X_OK):
        raise _TestException(
            'jars_dir {} does not exist or is not executable.'.format(jars_dir))
    if not mongo_controller:
        raise _TestException('mongo_controller must be provided')
    if not mongo_db:
        raise _TestException('mongo_db must be provided')
    if not mongo_type_db:
        raise _TestException('mongo_type_db must be provided')
    if not auth_url:
        raise _TestException('auth_url must be provided')
    if not root_temp_dir:
        raise _TestException('root_temp_dir is None')

    self._mongo = mongo_controller
    self._db = mongo_db
    jars_dir = jars_dir.resolve()
    class_path = self._get_class_path(jars_dir)

    # make temp dirs
    root_temp_dir = root_temp_dir.absolute()
    _os.makedirs(root_temp_dir, exist_ok=True)
    self.temp_dir = _Path(_tempfile.mkdtemp(prefix='WorkspaceController-',
                                            dir=str(root_temp_dir)))
    ws_temp_dir = self.temp_dir.joinpath('temp_files')
    _os.makedirs(ws_temp_dir)

    configfile = self._create_deploy_cfg(self.temp_dir,
                                         ws_temp_dir,
                                         f'localhost:{self._mongo.port}',
                                         mongo_db,
                                         mongo_type_db,
                                         auth_url)
    newenv = _os.environ.copy()
    newenv['KB_DEPLOYMENT_CONFIG'] = configfile

    self.port = _test_utils.find_free_port()
    command = ['java', '-classpath', class_path, _WS_CLASS, str(self.port)]

    self._wslog = self.temp_dir / 'ws.log'
    self._outfile = open(self._wslog, 'w')

    self._proc = _subprocess.Popen(command, stdout=self._outfile,
                                   stderr=_subprocess.STDOUT, env=newenv)

    # poll the server until it responds or ~40 seconds have elapsed
    ws = _Workspace(f'http://localhost:{self.port}')
    for count in range(40):
        err = None
        _time.sleep(1)  # wait for server to start
        try:
            self.version = ws.ver()
            break
        except (_ServerError, _requests.exceptions.ConnectionError) as se:
            err = _TestException(se.args[0])
            err.__cause__ = se
    if err:
        print('Error starting workspace service. Dumping logs and throwing error')
        self._print_ws_logs()
        raise err
    self.startup_count = count + 1
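# A minimal usage sketch, assuming the surrounding class is named
# WorkspaceController (matching the temp dir prefix above), that an
# already-started _MongoController is available, and that a test auth service
# is running; all paths and URLs are hypothetical placeholders.
def _example_start_workspace(mongo: _MongoController) -> None:
    ws = WorkspaceController(
        jars_dir=_Path('/path_to_repo/lib/jars'),   # a kbase/jars checkout
        mongo_controller=mongo,                     # running Mongo controller
        mongo_db='ws_db',                           # database for Workspace data
        mongo_type_db='ws_types_db',                # database for type specs
        auth_url='http://localhost:9000/testmode',  # root URL of a test auth service
        root_temp_dir=_Path('temp_test_dir'))
    # the controller exposes the selected port, the reported server version,
    # and how many poll iterations startup took
    print(ws.port, ws.version, ws.startup_count)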