Example #1
    def loadReadsSet(self):
        if hasattr(self.__class__, 'reads_set_ref'):
            return self.__class__.reads_set_ref
        pe_reads_ref = self.pe_reads_ref
        reads_set_name = 'TophatTestReadsSet'
        # create the set object

        reads_set_data = {
            'description': 'Reads Set for testing Bowtie',
            'items': [
                {'ref': pe_reads_ref, 'label': 'rs1'},
                {'ref': pe_reads_ref, 'label': 'rs2'},
                {'ref': pe_reads_ref, 'label': 'rs3'}
            ]
        }
        # test a save
        set_api = SetAPI(self.srv_wiz_url, service_ver='dev')
        res = set_api.save_reads_set_v1({
            'data': reads_set_data,
            'output_object_name': reads_set_name,
            'workspace': self.getWsName()
        })
        reads_set_ref = res['set_ref']

        print('Loaded ReadsSet: ' + reads_set_ref)
        return reads_set_ref
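To read the set back, the same client exposes get_reads_set_v1 (used in later examples). A minimal sketch against the ref returned above; the printed fields are only illustrative:

    set_api = SetAPI(self.srv_wiz_url, service_ver='dev')
    fetched = set_api.get_reads_set_v1({
        'ref': reads_set_ref,        # ref returned by save_reads_set_v1 above
        'include_item_info': 1
    })
    for item in fetched['data']['items']:
        print(item['ref'], item['label'])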
Example #2
    def __init__(self, config, ctx):
        self.ctx = ctx
        self.scratch = os.path.abspath(config['scratch'])
        self.ws_url = config['workspace-url']
        self.serviceWizardURL = config['srv-wiz-url']
        self.callbackURL = config['SDK_CALLBACK_URL']
        if not os.path.exists(self.scratch):
            os.makedirs(self.scratch)

        self.SE_flag = 'SE'
        self.PE_flag = 'PE'

        SERVICE_VER = 'release'

        # readsUtils_Client
        try:
            self.readsUtils_Client = ReadsUtils(self.callbackURL,
                                                token=self.ctx['token'],
                                                service_ver=SERVICE_VER)
        except Exception as e:
            raise ValueError(
                'Unable to instantiate readsUtils_Client with callbackURL: ' +
                self.callbackURL + ' ERROR: ' + str(e))

        # setAPI_Client
        try:
            #setAPI_Client = SetAPI (url=self.callbackURL, token=self.ctx['token'])  # for SDK local.  local doesn't work for SetAPI
            self.setAPI_Client = SetAPI(
                url=self.serviceWizardURL,
                token=self.ctx['token'])  # for dynamic service
        except Exception as e:
            raise ValueError(
                'Unable to instantiate setAPI_Client with serviceWizardURL: ' +
                self.serviceWizardURL + ' ERROR: ' + str(e))
Example #3
    def fetch_reads_refs_from_sampleset(self, ref, info, validated_params):
        """
        Note: adapted from kbaseapps/kb_hisat2 - file_util.py

        From the given object ref, return a list of all reads objects that are part of that
        object. E.g., if ref is a ReadsSet, return a list of all PairedEndLibrary or SingleEndLibrary
        refs that are members of that ReadsSet. This is returned as a list of dictionaries as follows:
        {
            "ref": reads object reference,
            "condition": condition string associated with that reads object
        }
        Only "ref" is guaranteed to be present; the other keys may or may not be there, depending on
        the reads object or object type behind the initial ref. E.g., an RNASeqSampleSet might have
        condition info for each reads object, but a single PairedEndLibrary may not have that info.
        If ref is already a reads library, just returns a list with ref as its single element.
        """
        obj_type = self.get_type_from_obj_info(info)
        refs = list()
        refs_for_ws_info = list()
        if "KBaseSets.ReadsSet" in obj_type or "KBaseRNASeq.RNASeqSampleSet" in obj_type:
            print("Looking up reads references in ReadsSet object")
            set_api = SetAPI(self.srv_wiz_url)
            reads_set = set_api.get_reads_set_v1({'ref': ref,
                                                  'include_item_info': 0,
                                                  'include_set_item_ref_paths': 1
                                                  })

            for reads in reads_set["data"]["items"]:
                refs.append({'ref': reads['ref_path'],
                             'condition': reads['label']
                             })
                refs_for_ws_info.append({'ref': reads['ref_path']})
        else:
            raise ValueError("Unable to fetch reads reference from object {} "
                             "which is a {}".format(ref, obj_type))

        # get object info so we can name things properly
        infos = self.ws.get_object_info3({'objects': refs_for_ws_info})['infos']

        name_ext = '_alignment'
        if 'output_alignment_suffix' in validated_params \
                and validated_params['output_alignment_suffix'] is not None:
            ext = validated_params['output_alignment_suffix'].replace(' ', '')
            if ext:
                name_ext = ext

        unique_name_lookup = {}
        for k in range(0, len(refs)):
            refs[k]['info'] = infos[k]
            name = infos[k][1]
            if name not in unique_name_lookup:
                unique_name_lookup[name] = 1
            else:
                unique_name_lookup[name] += 1
                name = name + '_' + str(unique_name_lookup[name])
            name = name + name_ext
            refs[k]['alignment_output_name'] = name

        return refs
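Each entry in the list returned above combines the set item with its workspace info and a unique output name. An illustrative (made-up) entry looks like:

    example_entry = {
        'ref': '123/4/5;678/9/1',                 # ref_path to the reads library
        'condition': 'wild_type',                 # label taken from the set item
        'info': [4, 'my_reads', 'KBaseFile.PairedEndLibrary-2.0', ...],  # object_info tuple
        'alignment_output_name': 'my_reads_alignment'
    }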
Example #4
 def __init__(self, scratch_dir, callback_url, workspace_url, srv_wiz_url):
     self.scratch_dir = scratch_dir
     self.rau = ReadsAlignmentUtils(callback_url)
     self.kbr = KBaseReport(callback_url)
     self.dfu = DataFileUtil(callback_url)
     self.set_api = SetAPI(srv_wiz_url)
     self.ws = Workspace(workspace_url)
     self.valid_commands = ['bamqc', 'multi-bamqc']
Example #5
    def __init__(self, scratch_dir, workspace_url, callback_url, srv_wiz_url, provenance):
        self.workspace_url = workspace_url
        self.callback_url = callback_url
        self.srv_wiz_url = srv_wiz_url
        self.au = AssemblyUtil(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url, service_ver='beta')
        self.scratch = scratch_dir
        self.working_dir = scratch_dir
        self.prog_runner = Program_Runner(self.STAR_BIN, self.scratch)
        self.provenance = provenance
        self.ws_client = Workspace(self.workspace_url)

        self.parallel_runner = KBParallel(self.callback_url)
        self.qualimap = kb_QualiMap(self.callback_url, service_ver='dev')
        self.set_api_client = SetAPI(self.srv_wiz_url, service_ver='dev')
        self.eu = ExpressionUtils(self.callback_url, service_ver='beta')
Example #6
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('kb_Msuite'):
            cls.cfg[nameval[0]] = nameval[1]
        # Getting username from Auth profile for token
        authServiceUrl = cls.cfg['auth-service-url']
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({
            'token': token,
            'user_id': user_id,
            'provenance': [{
                'service': 'kb_Msuite',
                'method': 'please_never_use_it_in_production',
                'method_params': []
            }],
            'authenticated': 1
        })
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL)
        cls.serviceImpl = kb_Msuite(cls.cfg)
        cls.scratch = cls.cfg['scratch']
        cls.callback_url = os.environ['SDK_CALLBACK_URL']
        cls.checkm_runner = CheckMUtil(cls.cfg, cls.ctx)

        suffix = int(time.time() * 1000)
        cls.wsName = "test_kb_Msuite_" + str(suffix)
        cls.ws_info = cls.wsClient.create_workspace({'workspace': cls.wsName})
        cls.au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        cls.setAPI = SetAPI(url=cls.cfg['srv-wiz-url'], token=cls.ctx['token'])
        cls.gfu = GenomeFileUtil(os.environ['SDK_CALLBACK_URL'],
                                 service_ver='dev')
        cls.mu = MetagenomeUtils(os.environ['SDK_CALLBACK_URL'])

        # stage an input and output directory
        """
        cls.input_dir = os.path.join(cls.scratch, 'input_1')
        cls.output_dir = os.path.join(cls.scratch, 'output_1')
        cls.all_seq_fasta = os.path.join(cls.scratch, 'all_seq.fna')
        shutil.copytree(os.path.join('data', 'example_out', 'input'), cls.input_dir)
        shutil.copytree(os.path.join('data', 'example_out', 'output'), cls.output_dir)
        shutil.copy(os.path.join('data', 'example_out', 'all_seq.fna'), cls.all_seq_fasta)
        """

        # prepare WS data
        cls.prepare_data()
Example #7
    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.srv_wiz_url = config['srv-wiz-url']
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(self.callback_url)
        self.ke_util = kb_ke_util(self.callback_url, service_ver="dev")
        self.gen_api = GenericsAPI(self.callback_url, service_ver="dev")

        self.ws = Workspace(self.ws_url, token=self.token)
        self.set_client = SetAPI(self.srv_wiz_url)
Example #8
 def __init__(self, config):
     self.ws_url = config["workspace-url"]
     self.callback_url = config['SDK_CALLBACK_URL']
     self.token = config['KB_AUTH_TOKEN']
     self.shock_url = config['shock-url']
     self.scratch = config['scratch']
     self.srv_wiz_url = config['srv-wiz-url']
     self.ws = Workspace(self.ws_url, token=self.token)
     self.bt = kb_Bowtie2(self.callback_url)
     self.rau = ReadsAlignmentUtils(self.callback_url)
     self.qualimap = kb_QualiMap(self.callback_url)
     self.ru = ReadsUtils(self.callback_url)
     self.dfu = DataFileUtil(self.callback_url)
     self.set_client = SetAPI(self.srv_wiz_url)
Example #9
 def __init__(self, config):
     self.ws_url = config["workspace-url"]
     self.callback_url = config['SDK_CALLBACK_URL']
     self.token = config['KB_AUTH_TOKEN']
     self.shock_url = config['shock-url']
     self.srv_wiz_url = config['srv-wiz-url']
     self.scratch = config['scratch']
     self.dfu = DataFileUtil(self.callback_url)
     self.gfu = GenomeFileUtil(self.callback_url)
     self.rau = ReadsAlignmentUtils(self.callback_url)
     self.au = AssemblyUtil(self.callback_url)
     self.eu = ExpressionUtils(self.callback_url, service_ver='dev')
     self.ws = Workspace(self.ws_url, token=self.token)
     self.set_client = SetAPI(self.srv_wiz_url)
Example #10
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('kb_kaiju'):
            cls.cfg[nameval[0]] = nameval[1]
        # Getting username from Auth profile for token
        authServiceUrl = cls.cfg['auth-service-url']
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({
            'token': token,
            'user_id': user_id,
            'provenance': [{
                'service': 'kb_kaiju',
                'method': 'please_never_use_it_in_production',
                'method_params': []
            }],
            'authenticated': 1
        })
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL)
        cls.serviceImpl = kb_kaiju(cls.cfg)
        cls.scratch = cls.cfg['scratch']
        cls.callback_url = os.environ['SDK_CALLBACK_URL']

        suffix = int(time.time() * 1000)
        cls.wsName = "test_kb_kaiju_" + str(suffix)
        cls.ws_info = cls.wsClient.create_workspace({'workspace': cls.wsName})

        cls.kaiju_runner = KaijuUtil(cls.cfg, cls.ctx)
        cls.ru = ReadsUtils(os.environ['SDK_CALLBACK_URL'])
        cls.setAPI = SetAPI(url=cls.cfg['srv-wiz-url'], token=cls.ctx['token'])

        # prepare WS data
        cls.prepare_data()
Example #11
    def __init__(self, config, provenance):
        self.config = config
        self.workspace_url = config['workspace-url']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.srv_wiz_url = config['srv-wiz-url']
        self.parallel_runner = KBParallel(self.callback_url)
        self.provenance = provenance
        self.star_utils = STARUtils(self.scratch, self.workspace_url,
                                    self.callback_url, self.srv_wiz_url,
                                    provenance)
        self.set_api_client = SetAPI(self.srv_wiz_url, service_ver='dev')
        self.qualimap = kb_QualiMap(self.callback_url, service_ver='dev')
        self.star_idx_dir = None
        self.star_out_dir = None

        # from the provenance, extract out the version to run by exact hash if possible
        self.my_version = 'release'
        if len(provenance) > 0:
            if 'subactions' in provenance[0]:
                self.my_version = self.get_version_from_subactions(
                    'kb_STAR', provenance[0]['subactions'])
        print('Running kb_STAR version = ' + self.my_version)
Example #12
    def download_short_unpaired(self, console, token, wsname,
                                short_unpaired_libraries):
        try:
            self.log(console, "Getting short unpaired reads.\n")
            ruClient = ReadsUtils(url=self.callbackURL, token=token)

            # first, unpack any ReadsSets into the actual SingleEndLibrary references
            reads_refs = []
            # object info
            try:
                wsClient = Workspace(self.workspaceURL, token=token)
            except Exception as e:
                raise ValueError("unable to instantiate wsClient. " + str(e))

            [
                OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
                WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
            ] = range(11)  # object_info tuple
            for lib in short_unpaired_libraries:
                try:
                    obj_id = {
                        'ref': lib if '/' in lib else (wsname + '/' + lib)
                    }
                    lib_obj_info = wsClient.get_object_info_new(
                        {'objects': [obj_id]})[0]
                    lib_obj_type = lib_obj_info[TYPE_I]
                    # remove trailing version
                    lib_obj_type = re.sub(r'-[0-9]+\.[0-9]+$', "", lib_obj_type)
                    lib_ref = str(lib_obj_info[WSID_I])+'/' + \
                        str(lib_obj_info[OBJID_I])+'/'+str(lib_obj_info[VERSION_I])
                    if lib_obj_type == 'KBaseSets.ReadsSet':
                        # unpack it
                        try:
                            setAPIClient = SetAPI(url=self.serviceWizardURL,
                                                  token=token)
                            self.log(console, 'getting reads set ' + lib_ref)
                            readsSet = setAPIClient.get_reads_set_v1({
                                'ref': lib_ref,
                                'include_item_info': 1
                            })
                        except Exception as e:
                            raise ValueError(
                                'SetAPI FAILURE: Unable to get read library set object: ('
                                + lib_ref + ')\n' + str(e))
                        for readsLibrary in readsSet['data']['items']:
                            reads_refs.append(readsLibrary['ref'])
                    else:
                        # use other reads objects "as is"
                        reads_refs.append(lib_ref)
                except Exception as e:
                    raise ValueError('Unable to get read library object: (' +
                                     str(lib) + ')' + str(e))

            result = ruClient.download_reads({
                'read_libraries': reads_refs,
                'interleaved': 'false'
            })
            # combine outputs
            short_unpaired_path = os.path.join(
                self.scratch, "short_unpaired_" + str(uuid.uuid4()) + ".fastq")

            self.log(console, "Combining short unpaired reads.\n")

            for reads_ref in reads_refs:
                files = result['files'][reads_ref]['files']

                if 'fwd' in files:
                    path = files['fwd']
                    if path.endswith('.gz'):
                        cmd = 'gzip -dc ' + path + ' >> ' + short_unpaired_path
                    else:
                        cmd = 'cat ' + path + ' >> ' + short_unpaired_path
                    self.log(console, "command: " + cmd)
                    cmdProcess = subprocess.Popen(cmd,
                                                  stdout=subprocess.PIPE,
                                                  stderr=subprocess.STDOUT,
                                                  shell=True)
                    cmdProcess.wait()
                    if cmdProcess.returncode != 0:
                        raise ValueError('Error running ' + cmd)
                    os.remove(path)
                else:
                    raise ValueError('File ' + reads_ref +
                                     ' missing forward reads file')

        except Exception as e:
            raise ValueError('Unable to download short unpaired reads\n' +
                             str(e))
        return short_unpaired_path
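The combining loop above depends on the shape of the ReadsUtils.download_reads result; roughly (the ref key and path are placeholders):

    result = {
        'files': {
            '123/4/5': {                          # keyed by reads ref
                'files': {
                    'fwd': '/kb/module/work/tmp/reads.fwd.fastq.gz'
                    # 'rev' appears for paired-end, non-interleaved downloads
                }
            }
        }
    }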
Example #13
    def process_batch_result(self, batch_result, validated_params, reads,
                             input_set_info):

        n_jobs = len(batch_result['results'])
        n_success = 0
        n_error = 0
        ran_locally = 0
        ran_njsw = 0

        # reads alignment set items
        items = []
        objects_created = []

        for k in range(0, len(batch_result['results'])):
            job = batch_result['results'][k]
            result_package = job['result_package']
            if job['is_error']:
                n_error += 1
            else:
                n_success += 1
                output_info = result_package['result'][0]['output_info']
                ra_ref = output_info['upload_results']['obj_ref']
                # Note: could add a label to the alignment here?
                items.append({'ref': ra_ref, 'label': reads[k]['condition']})
                objects_created.append({'ref': ra_ref})

            if result_package['run_context']['location'] == 'local':
                ran_locally += 1
            if result_package['run_context']['location'] == 'njsw':
                ran_njsw += 1

        # Save the alignment set
        alignment_set_data = {'description': '', 'items': items}
        alignment_set_save_params = {
            'data': alignment_set_data,
            'workspace': validated_params['output_workspace'],
            'output_object_name': str(input_set_info[1]) + validated_params['output_obj_name_suffix']
        }

        set_api = SetAPI(self.srv_wiz_url)
        save_result = set_api.save_reads_alignment_set_v1(
            alignment_set_save_params)
        print('Saved ReadsAlignmentSet=')
        pprint(save_result)
        objects_created.append({
            'ref': save_result['set_ref'],
            'description': 'Set of all reads alignments generated'
        })
        set_name = save_result['set_info'][1]

        # run qualimap
        qualimap_report = self.qualimap.run_bamqc(
            {'input_ref': save_result['set_ref']})
        qc_result_zip_info = qualimap_report['qc_result_zip_info']

        # create the report
        report_text = 'Ran on SampleSet or ReadsSet.\n\n'
        report_text += 'Created ReadsAlignmentSet: ' + str(set_name) + '\n\n'
        report_text += 'Total ReadsLibraries = ' + str(n_jobs) + '\n'
        report_text += '        Successful runs = ' + str(n_success) + '\n'
        report_text += '            Failed runs = ' + str(n_error) + '\n'
        report_text += '       Ran on main node = ' + str(ran_locally) + '\n'
        report_text += '   Ran on remote worker = ' + str(ran_njsw) + '\n\n'

        print('Report text=')
        print(report_text)

        kbr = KBaseReport(self.callback_url)
        report_info = kbr.create_extended_report({
            'message': report_text,
            'objects_created': objects_created,
            'report_object_name': 'kb_Bowtie2_' + str(uuid.uuid4()),
            'direct_html_link_index': 0,
            'html_links': [{
                'shock_id': qc_result_zip_info['shock_id'],
                'name': qc_result_zip_info['index_html_file_name'],
                'label': qc_result_zip_info['name']
            }],
            'workspace_name': validated_params['output_workspace']
        })

        result = {
            'report_info': {
                'report_name': report_info['name'],
                'report_ref': report_info['ref']
            }
        }
        result['batch_output_info'] = batch_result

        return result
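A caller of process_batch_result receives the report info plus the raw batch result passed in, roughly:

    result = {
        'report_info': {
            'report_name': 'kb_Bowtie2_<uuid>',   # illustrative
            'report_ref': '123/45/1'              # illustrative
        },
        'batch_output_info': batch_result         # the KBParallel result passed in
    }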
Example #14
class STARUtils:
    STAR_VERSION = 'STAR 2.5.3a'
    STAR_BIN = '/kb/deployment/bin/STAR'
    STAR_IDX_DIR = 'STAR_Genome_index'
    STAR_OUT_DIR = 'STAR_Output'
    PARAM_IN_WS = 'output_workspace'
    PARAM_IN_FASTA_FILES = 'genomeFastaFiles'
    PARAM_IN_OUTFILE_PREFIX = 'outFileNamePrefix'
    PARAM_IN_STARMODE = 'runMode'
    PARAM_IN_THREADN = 'runThreadN'
    PARAM_IN_READS_FILES = 'readFilesIn'
    PARAM_IN_READS = 'readsset_ref'
    PARAM_IN_GENOME = 'genome_ref'
    SET_READS = 'set_reads_refs'

    def __init__(self, scratch_dir, workspace_url, callback_url, srv_wiz_url, provenance):
        self.workspace_url = workspace_url
        self.callback_url = callback_url
        self.srv_wiz_url = srv_wiz_url
        self.au = AssemblyUtil(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url, service_ver='beta')
        self.scratch = scratch_dir
        self.working_dir = scratch_dir
        self.prog_runner = Program_Runner(self.STAR_BIN, self.scratch)
        self.provenance = provenance
        self.ws_client = Workspace(self.workspace_url)

        self.parallel_runner = KBParallel(self.callback_url)
        self.qualimap = kb_QualiMap(self.callback_url, service_ver='dev')
        self.set_api_client = SetAPI(self.srv_wiz_url, service_ver='dev')
        self.eu = ExpressionUtils(self.callback_url, service_ver='beta')

    def process_params(self, params):
        """
        process_params: checks params passed to the run_star method and sets default values
        """
        log('Start validating run_star parameters')
        # check for required parameters
        if params.get(self.PARAM_IN_WS, None) is None:
            raise ValueError(self.PARAM_IN_WS + ' parameter is required')

        if params.get(self.PARAM_IN_STARMODE, None) is None:
            params[self.PARAM_IN_STARMODE] = 'alignReads'
        if params.get(self.PARAM_IN_GENOME, None) is None:
            raise ValueError(self.PARAM_IN_GENOME +
                             ' parameter is required for generating genome index')

        if (params.get(self.PARAM_IN_STARMODE, None) is not None and
                params[self.PARAM_IN_STARMODE] != "genomeGenerate"):
            if params.get(self.PARAM_IN_READS, None) is None:
                raise ValueError(self.PARAM_IN_READS +
                                 ' parameter is required for reads mapping')
            if not valid_string(params[self.PARAM_IN_READS], is_ref=True):
                raise ValueError("Parameter readsset_ref must be a valid Workspace object reference, "
                                 "not {}".format(params.get(self.PARAM_IN_READS, None)))

        if params.get(self.PARAM_IN_THREADN, None) is not None:
            if not isinstance(params[self.PARAM_IN_THREADN], int):
                raise ValueError(self.PARAM_IN_THREADN + ' must be of type int')
        else:
            params[self.PARAM_IN_THREADN] = 2

        if "alignment_suffix" not in params or not valid_string(params["alignment_suffix"]):
            raise ValueError("Parameter alignment_suffix must be a valid Workspace object string, "
                             "not {}".format(params.get("alignment_suffix", None)))

        if params.get(self.PARAM_IN_OUTFILE_PREFIX, None) is not None:
            if params[self.PARAM_IN_OUTFILE_PREFIX].find('/') != -1:
                raise ValueError(self.PARAM_IN_OUTFILE_PREFIX + ' cannot contain subfolder(s).')
        else:
            params[self.PARAM_IN_OUTFILE_PREFIX] = 'star_'

        if params.get('create_report', None) is None:
            params['create_report'] = 0

        return self._setDefaultParameters(params)
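
        # Hypothetical minimal params dict that would pass the validation above
        # (workspace, refs, and suffix values are placeholders, not from the source):
        #   {
        #       'output_workspace': 'my_workspace',   # PARAM_IN_WS, required
        #       'genome_ref': '123/4/5',              # PARAM_IN_GENOME, required
        #       'readsset_ref': '123/6/7',            # PARAM_IN_READS, required for mapping runs
        #       'alignment_suffix': '_alignment',     # must pass valid_string()
        #       'runThreadN': 2                       # optional, defaults to 2
        #   }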


    def _setDefaultParameters(self, params_in):
        """set default for this group of parameters
        """
        params = copy.deepcopy(params_in)
        if params.get('outFilterType', None) is None:
            params['outFilterType'] = "\"BySJout\""
        if params.get('outFilterMultimapNmax', None) is None:
            params['outFilterMultimapNmax'] = 20
        if params.get('outSAMtype', None) is None:
            params['outSAMtype'] = 'BAM'
        if params.get('outSAMattrIHstart', None) is None:
            params['outSAMattrIHstart'] = 0
        if params.get('outSAMstrandField', None) is None:
            params['outSAMstrandField'] = 'intronMotif'
        if params.get('outFilterIntronMotifs', None) is None:
            params['outFilterIntronMotifs'] = 'RemoveNoncanonical'
        if params.get(self.SET_READS, None) is None:
            params[self.SET_READS] = self._get_reads_refs_from_setref(params)

        return params

    def _get_genome_gtf_file(self, gnm_ref, gtf_file_dir):
        """
        Get data from genome object ref and return the GTF filename (with path)
        for STAR indexing and mapping.
        STAR uses the reference annotation to guide assembly and alignment.
        """
        log("Converting genome {0} to GTF file in folder {1}".format(gnm_ref, gtf_file_dir))
        gfu = GenomeFileUtil(self.callback_url)
        try:
            gfu_ret = gfu.genome_to_gff({self.PARAM_IN_GENOME: gnm_ref,
                                         'is_gtf': 1,
                                         'target_dir': gtf_file_dir
                                      })
        except ValueError as egfu:
            log('GFU getting GTF file raised error:\n')
            pprint(egfu)
            return None
        else:#no exception raised
            return gfu_ret.get('file_path')


    def _construct_indexing_cmd(self, params):
        # STEP 1: construct the command for running `STAR --runMode genomeGenerate...`
        idx_cmd = [self.STAR_BIN]
        idx_cmd.append('--genomeDir')
        idx_cmd.append(params[self.STAR_IDX_DIR])
        idx_cmd.append('--' + self.PARAM_IN_STARMODE)
        idx_cmd.append('genomeGenerate')
        idx_cmd.append('--' + self.PARAM_IN_THREADN)
        idx_cmd.append(str(params[self.PARAM_IN_THREADN]))

        if params.get(self.PARAM_IN_FASTA_FILES, None) is not None:
            idx_cmd.append('--' + self.PARAM_IN_FASTA_FILES)
            for fasta_file in params[self.PARAM_IN_FASTA_FILES]:
                idx_cmd.append(fasta_file)

        # STEP 2: append the standard optional inputs
        if params.get('sjdbGTFfile', None) is not None:
            idx_cmd.append('--sjdbGTFfile')
            idx_cmd.append(params['sjdbGTFfile'])
        if (params.get('sjdbOverhang', None) is not None
                and params['sjdbOverhang'] > 0):
            idx_cmd.append('--sjdbOverhang')
            idx_cmd.append(str(params['sjdbOverhang']))

        #print ('STAR indexing CMD:')
        #print ' '.join(idx_cmd)
        return idx_cmd

    def _construct_mapping_cmd(self, params):
        if params.get(self.PARAM_IN_STARMODE, None) is None:
            params[self.PARAM_IN_STARMODE] = 'alignReads'

        # STEP 1: set the working folder housing the STAR output results as well as the reads info
        star_out_dir = ''
        if params.get('align_output', None) is None:
            star_out_dir = self.scratch
        else:
            star_out_dir = params['align_output']

        # STEP 2: construct the command for running STAR mapping
        mp_cmd = [self.STAR_BIN]
        mp_cmd.append('--genomeDir')
        mp_cmd.append(params[self.STAR_IDX_DIR])
        mp_cmd.append('--' + self.PARAM_IN_STARMODE)
        mp_cmd.append(params[self.PARAM_IN_STARMODE])
        mp_cmd.append('--' + self.PARAM_IN_THREADN)
        mp_cmd.append(str(params[self.PARAM_IN_THREADN]))

        if params.get(self.PARAM_IN_READS_FILES, None) is not None:
            # print('Input reads files:\n' + pformat(params[self.PARAM_IN_READS_FILES]))
            mp_cmd.append('--' + self.PARAM_IN_READS_FILES)
            for reads_file in params[self.PARAM_IN_READS_FILES]:
                mp_cmd.append(reads_file)
                readName, readsExtension = os.path.splitext(reads_file)
                # print('Reads file name-- {}/extension-- {}:'.format(readName, readsExtension))
                if readsExtension == '.gz':
                    mp_cmd.append('--readFilesCommand')
                    mp_cmd.append('gunzip')
                    mp_cmd.append('-c')

                if readsExtension == '.bz2':
                    mp_cmd.append('--readFilesCommand')
                    mp_cmd.append('bunzip2')
                    mp_cmd.append('-c')

        # STEP 3: appending the advanced optional inputs
        mp_cmd.append('--' + self.PARAM_IN_OUTFILE_PREFIX)
        mp_cmd.append(os.path.join(star_out_dir, params[self.PARAM_IN_OUTFILE_PREFIX]))

        if params.get('sjdbGTFfile', None) is not None:
            mp_cmd.append('--sjdbGTFfile')
            mp_cmd.append(params['sjdbGTFfile'])
        if (params.get('sjdbOverhang', None) is not None
                and params['sjdbOverhang'] > 0):
            mp_cmd.append('--sjdbOverhang')
            mp_cmd.append(str(params['sjdbOverhang']))

        if (params.get('outFilterType', None) is not None
                and isinstance(params['outFilterType'], str)):
            mp_cmd.append('--outFilterType')
            mp_cmd.append(params['outFilterType'])
        if (params.get('outFilterMultimapNmax', None) is not None
                and isinstance(params['outFilterMultimapNmax'], int)
                and params['outFilterMultimapNmax'] >= 0):
            mp_cmd.append('--outFilterMultimapNmax')
            mp_cmd.append(str(params['outFilterMultimapNmax']))

        #output sorted file:Aligned.sortedByCoord.out.bam
        #allowed values of --outSAMtype are BAM Unsorted or SortedByCoordinate or both
        if params.get('outSAMtype', None) is not None:
            mp_cmd.append('--outSAMtype')
            mp_cmd.append(params['outSAMtype'])
            if params.get('outSAMtype', None) == 'BAM':
                mp_cmd.append('SortedByCoordinate')

        # 'It is recommended to remove the non-canonical junctions for Cnks runs using
        # --outFilterIntronMotifs RemoveNoncanonical'
        if params.get('outFilterIntronMotifs', None) is not None:
            mp_cmd.append('--outFilterIntronMotifs')
            mp_cmd.append('RemoveNoncanonical')

        if (params.get('outSAMattrIHstart', None) is not None
                and isinstance(params['outSAMattrIHstart'], int)
                and params['outSAMattrIHstart'] >= 0):
            mp_cmd.append('--outSAMattrIHstart')
            mp_cmd.append(str(params['outSAMattrIHstart']))
        if (params.get('outSAMstrandField', None) is not None
                and isinstance(params['outSAMstrandField'], str)):
            mp_cmd.append('--outSAMstrandField')
            mp_cmd.append(params['outSAMstrandField'])

        quant_modes = ["TranscriptomeSAM", "GeneCounts", "Both"]
        if (params.get('quantMode', None) is not None
                and params.get('quantMode', None) in quant_modes):
            mp_cmd.append('--quantMode')
            if params['quantMode'] == "Both":
                mp_cmd.append("TranscriptomeSAM")
                mp_cmd.append("GeneCounts")
            else:
                mp_cmd.append(params['quantMode'])
        if (params.get('alignSJoverhangMin', None) is not None
                and isinstance(params['alignSJoverhangMin'], int)
                and params['alignSJoverhangMin'] > 0):
            mp_cmd.append('--alignSJoverhangMin')
            mp_cmd.append(str(params['alignSJoverhangMin']))
        if (params.get('alignSJDBoverhangMin', None) is not None
                and isinstance(params['alignSJDBoverhangMin'], int)
                and params['alignSJDBoverhangMin'] > 0):
            mp_cmd.append('--alignSJDBoverhangMin')
            mp_cmd.append(str(params['alignSJDBoverhangMin']))
        if (params.get('outFilterMismatchNmax', None) is not None
                and isinstance(params['outFilterMismatchNmax'], int)
                and params['outFilterMismatchNmax'] > 0):
            mp_cmd.append('--outFilterMismatchNmax')
            mp_cmd.append(str(params['outFilterMismatchNmax']))
        if (params.get('alignIntronMin', None) is not None
                and isinstance(params['alignIntronMin'], int)
                and params['alignIntronMin'] > 0):
            mp_cmd.append('--alignIntronMin')
            mp_cmd.append(str(params['alignIntronMin']))
        if (params.get('alignIntronMax', None) is not None
                and isinstance(params['alignIntronMax'], int)
                and params['alignIntronMax'] >= 0):
            mp_cmd.append('--alignIntronMax')
            mp_cmd.append(str(params['alignIntronMax']))
        if (params.get('alignMatesGapMax', None) is not None
                and isinstance(params['alignMatesGapMax'], int)
                and params['alignMatesGapMax'] >= 0):
            mp_cmd.append('--alignMatesGapMax')
            mp_cmd.append(str(params['alignMatesGapMax']))

        #print ('STAR mapping CMD:')
        #print ' '.join(mp_cmd)
        return mp_cmd

    def _exec_indexing(self, params):
        log('Running STAR index generating with params:\n' + pformat(params))

        idx_cmd = self._construct_indexing_cmd(params)

        exitCode = self.prog_runner.run(idx_cmd, self.scratch)

        return exitCode

    def _exec_mapping(self, params):
        log('Running STAR mapping with params:\n' + pformat(params))

        mp_cmd = self._construct_mapping_cmd(params)

        exitCode = self.prog_runner.run(mp_cmd, self.scratch)

        return exitCode

    def _exec_star_pipeline(self, params, rds_files, rds_name, idx_dir, out_dir):
        # build the parameters
        params_idx = self._get_indexing_params(params, idx_dir)
        params_mp = self._get_mapping_params(params, rds_files, rds_name, idx_dir, out_dir)

        # execute indexing and then mapping
        retVal = {}
        try:
            if params[self.PARAM_IN_STARMODE]=='genomeGenerate':
                ret = self._exec_indexing(params_idx)
            else:
                ret = 0
            while( ret != 0 ):
                time.sleep(1)
        except ValueError as eidx:
            log('STAR genome indexing raised error:\n')
            pprint(eidx)
        else:#no exception raised by genome indexing and STAR returns 0, then run mapping
            params_mp[self.PARAM_IN_STARMODE] = 'alignReads'
            try:
                ret = self._exec_mapping(params_mp)
                while( ret != 0 ):
                    time.sleep(1)
            except ValueError as emp:
                log('STAR mapping raised error:\n')
                pprint(emp)
            else:#no exception raised by STAR mapping and STAR returns 0, then move to saving and reporting  
                ret = {'star_idx': idx_dir, 'star_output': params_mp.get('align_output')}

        return ret


    def upload_STARalignment(self, input_params, reads_ref, reads_info, output_bam_file):
        """
        Uploads the alignment file + metadata.
        Returns the STAR alignment reference.
        """

        aligner_opts = dict()
        for k in input_params:
            aligner_opts[k] = str(input_params[k])
        pprint(reads_info)

        alignment_name = reads_ref['alignment_output_name']
        align_upload_params = {
            "destination_ref": "{}/{}".format(input_params[self.PARAM_IN_WS], alignment_name),
            "file_path": output_bam_file,
            "assembly_or_genome_ref": input_params[self.PARAM_IN_GENOME],
            "read_library_ref": reads_info['object_ref'],
            "library_type": reads_info['style'],
            "condition": reads_info['condition'],
            "aligned_using": 'STAR',
            "aligner_version":self.STAR_VERSION,
            "aligner_opts": aligner_opts
        }

        pprint(align_upload_params)

        ra_util = ReadsAlignmentUtils(self.callback_url, service_ver='beta')
        rau_upload_ret = ra_util.upload_alignment(align_upload_params)
        alignment_ref = rau_upload_ret["obj_ref"]
        print("STAR alignment uploaded as object {}".format(alignment_ref))
        return rau_upload_ret


    def generate_report_for_single_run(self, run_output_info, params):
        input_ref = run_output_info['upload_results']['obj_ref']
        index_dir = run_output_info['index_dir']
        output_dir = run_output_info['output_dir']
        output_files = self._generate_output_file_list(index_dir, output_dir)

        # first run qualimap
        qualimap_report = self.qualimap.run_bamqc({'input_ref': input_ref})
        qc_result_zip_info = qualimap_report['qc_result_zip_info']

        # create report
        report_text = 'Ran on a single reads library.\n\n'
        alignment_info = self.get_obj_infos(input_ref)[0]
        report_text += 'Created ReadsAlignment: ' + str(alignment_info[1]) + '\n'
        report_text += '                        ' + input_ref + '\n'
        kbr = KBaseReport(self.callback_url)
        report_info = kbr.create_extended_report({'message': report_text,
                                                  'file_links': output_files,
                                                  'objects_created': [{'ref': input_ref,
                                                                       'description': 'ReadsAlignment'}],
                                                  'report_object_name': 'kb_STAR_report_' + str(uuid.uuid4()),
                                                  'direct_html_link_index': 0,
                                                  'html_links': [{'shock_id': qc_result_zip_info['shock_id'],
                                                                  'name': qc_result_zip_info['index_html_file_name'],
                                                                  'label': qc_result_zip_info['name']}],
                                                  'html_window_height': 366,
                                                  'workspace_name': params['output_workspace']
                                                  })
        return report_info #{'report_name': report_info['name'], 'report_ref': report_info['ref']}

    def _get_reads_info(self, reads, readsSet_ref):
        '''
        _get_reads_info: fetches the detailed info for a reads object given its ref, and
        return an object of the following structure:
        {
            "style": "paired", "single", or "interleaved",
            "file_fwd": path_to_file,
            "name": name of the reads,
            "file_rev": path_to_file, only if paired end,
            "object_ref": reads reference for downstream convenience,
            "condition": the condition for the reads.
        }
        '''
        try:
            print("Fetching FASTA file from reads reference {}".format(reads['ref']))
            ret_reads_info = fetch_reads_from_reference(reads['ref'], self.callback_url)
        except ValueError:
            print("Incorrect object type for fetching a FASTA file!")
            raise

        if ret_reads_info.get("file_fwd", None) is None:
            raise RuntimeError("FASTA file fetched from reads {} doesn't seem to exist!".format(reads['ref']))
        else:
            if reads.get('condition', None) is not None:
                ret_reads_info['condition'] = reads['condition']
            else:
                ret_reads_info['condition'] = 'unspecified'
            if reads.get('object_ref', None) != readsSet_ref:
                ret_reads_info[self.PARAM_IN_READS] = readsSet_ref

        return ret_reads_info


    def _get_genome_fasta(self, gnm_ref):
        genome_fasta_files = list()
        if gnm_ref is not None:
            try:
                print("Fetching FASTA file from object {}".format(gnm_ref))
                genome_fasta_file = fetch_fasta_from_object(gnm_ref, self.workspace_url, self.callback_url)
                print("Done fetching FASTA file! Path = {}".format(genome_fasta_file.get("path", None)))
            except ValueError:
                print("Incorrect object type for fetching a FASTA file!")
                raise

            if genome_fasta_file.get("path", None) is None:
                raise RuntimeError("FASTA file fetched from object {} doesn't seem to exist!".format(gnm_ref))
            else:
                genome_fasta_files.append(genome_fasta_file["path"])
        return genome_fasta_files


    def convert_params(self, validated_params):
        """
        Convert input parameters with KBase ref format into STAR parameters,
        and add the advanced options.
        """
        params = copy.deepcopy(validated_params)
        params['runMode'] = 'genomeGenerate'

        if validated_params.get('create_report', None) is not None:
            params['create_report'] = validated_params['create_report']
        if validated_params.get('concurrent_local_tasks', None) is not None:
            params['concurrent_local_tasks'] = validated_params['concurrent_local_tasks']
        if validated_params.get('concurrent_njsw_tasks', None) is not None:
            params['concurrent_njsw_tasks'] = validated_params['concurrent_njsw_tasks']
        if validated_params.get('alignmentset_suffix', None) is not None:
            params['alignmentset_suffix'] = validated_params['alignmentset_suffix']

        # Add advanced options from validated_params to params
        sjdbGTFfile = validated_params.get("sjdbGTFfile", None)
        if sjdbGTFfile is not None:
            params['sjdbGTFfile'] = sjdbGTFfile
        else:
            params['sjdbGTFfile'] = self._get_genome_gtf_file(
                                        params[self.PARAM_IN_GENOME],
                                        os.path.join(self.scratch, self.STAR_IDX_DIR))
        if validated_params.get('sjdbOverhang', None) is not None :
            params['sjdbOverhang'] = validated_params['sjdbOverhang']
        else:
            params['sjdbOverhang'] = 100

        quant_modes = ["TranscriptomeSAM", "GeneCounts", "Both"]
        if (validated_params.get('quantMode', None) is not None
                and validated_params.get('quantMode', None) in quant_modes):
            params['quantMode'] = validated_params['quantMode']
        else:
            params['quantMode'] = 'Both'

        return params


    def _get_indexing_params(self, params, star_idx_dir):
        params_idx = {
            'runMode': 'genomeGenerate',
            'runThreadN': params[self.PARAM_IN_THREADN],
            self.STAR_IDX_DIR: star_idx_dir,
            'genomeFastaFiles': params[self.PARAM_IN_FASTA_FILES]
        }
        if params.get('sjdbGTFfile', None) is not None:
            params_idx['sjdbGTFfile'] = params['sjdbGTFfile']
        if params.get('sjdbOverhang', None) is not None :
            params_idx['sjdbOverhang'] = params['sjdbOverhang']

        return params_idx


    def _get_mapping_params(self, params, rds_files, rds_name, idx_dir, out_dir):
        ''' build the mapping parameters'''
        aligndir = out_dir
        if rds_name:
            aligndir = os.path.join(out_dir, rds_name)
            self._mkdir_p(aligndir)
            #print '**********STAR output directory created:{}'.format(aligndir)

        params_mp = copy.deepcopy(params)
        params_mp['runMode'] = 'alignReads'
        params_mp['readFilesIn'] = rds_files
        params_mp[self.STAR_IDX_DIR] = idx_dir
        params_mp['align_output'] = aligndir

        return params_mp


    def determine_input_info(self, validated_params):
        ''' get info on the readsset_ref object and determine if we run once or run on a set
        input info provides information on the input and tells us if we should
        run as a single_library or as a set:
             input_info = {'run_mode': '', 'info': [..], 'ref': '55/1/2'}
        '''
        info = self.get_obj_infos(validated_params[self.PARAM_IN_READS])[0]
        obj_type = self.get_type_from_obj_info(info)
        if obj_type in ['KBaseAssembly.PairedEndLibrary', 'KBaseAssembly.SingleEndLibrary',
                        'KBaseFile.PairedEndLibrary', 'KBaseFile.SingleEndLibrary']:
            return {'run_mode': 'single_library', 'info': info, 'ref': validated_params[self.PARAM_IN_READS]}
        if obj_type == 'KBaseRNASeq.RNASeqSampleSet':
            return {'run_mode': 'sample_set', 'info': info, 'ref': validated_params[self.PARAM_IN_READS]}
        if obj_type == 'KBaseSets.ReadsSet':
            return {'run_mode': 'sample_set', 'info': info, 'ref': validated_params[self.PARAM_IN_READS]}

        raise ValueError('Object type of readsset_ref is not valid, was: ' + str(obj_type))

    def determine_unique_reads_names(self, validated_params):
        infos = self.get_obj_infos(validated_params[self.PARAM_IN_READS])
        return get_unique_names(infos)

    def get_type_from_obj_info(self, info):
        return info[2].split('-')[0]

    def get_name_from_obj_info(self, info):
        return info[1]

    def get_obj_infos(self, ref):
        return self.ws_client.get_object_info3({'objects': [{'ref': ref}]})['infos']

    def get_object_names(self, ref_list):
        """
        From a list of workspace references, returns a mapping from ref -> name of the object.
        """
        obj_ids = list()
        for ref in ref_list:
            obj_ids.append({"ref": ref})
        info = self.ws_client.get_object_info3({"objects": obj_ids})
        name_map = dict()
        # we already have the refs as passed previously, so use those for mapping, as they're in
        # the same order as what's returned.
        for i in range(len(info["infos"])):
            name_map[ref_list[i]] = info["infos"][i][1]
        return name_map


    def _mkdir_p(self, dir):
        """
        _mkdir_p: make directory for given path
        """
        log('Creating a new dir: ' + dir)
        if not dir:
            return
        if not os.path.exists(dir):
            os.makedirs(dir)
        else:
            log('{} already exists, so skip creating.'.format(dir))


    def create_star_dirs(self, star_home):
        '''creating the directories for STAR'''
        # the index directory
        idxdir = os.path.join(star_home, self.STAR_IDX_DIR)
        self._mkdir_p(idxdir)
        # the output directory
        outdir = os.path.join(star_home, self.STAR_OUT_DIR)
        self._mkdir_p(outdir)

        return (idxdir, outdir)


    def _get_reads_refs_from_setref(self, params):
        readsSet_ref = params[self.PARAM_IN_READS]
        reads_refs = list()
        try:
            #print("Fetching reads ref(s) from sample/reads set ref {}".format(readsSet_ref))
            reads_refs = fetch_reads_refs_from_sampleset(
                                    readsSet_ref,
                                    self.workspace_url,
                                    self.callback_url,
                                    params)
            #print("\nDone fetching reads ref(s) from readsSet {}--\nDetails:\n".format(readsSet_ref))
            #pprint(reads_refs)
        except ValueError:
            print("Incorrect object type for fetching reads ref(s)!")
            raise

        return reads_refs

    def _generate_output_file_list(self, idx_dir, out_dir):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """

        log('start packing result files')

        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        star_index = os.path.join(output_directory, 'star_index.zip')
        star_output = os.path.join(output_directory, 'star_output.zip')
        self.zip_folder(idx_dir, star_index)
        self.zip_folder(out_dir, star_output)

        #star_index = self.zip_folder_withDFU(idx_dir, 'star_index')
        #star_output = self.zip_folder_withDFU(out_dir, 'star_output')

        output_files.append({'path': star_index,
                             'name': os.path.basename(star_index),
                             'label': os.path.basename(star_index),
                             'description': 'Index file(s) generated by STAR'})

        output_files.append({'path': star_output,
                             'name': os.path.basename(star_output),
                             'label': os.path.basename(star_output),
                             'description': 'Output file(s) generated by STAR'})

        return output_files


    def zip_folder_withDFU(self, folder_path, output_name):
        """Zip the contents of an entire folder (with that folder included
        in the archive). Empty subfolders will be included in the archive
        as well.
        """
        output_path = self.dfu.pack_file(
                {'file_path': folder_path + '/' + output_name,
                 'pack': 'zip'})['file_path']

        print "{} created successfully.".format(output_path)

        #with zipfile.ZipFile(output_path, "r") as f:
            #print 'Checking the zipped file......\n'
            #for info in f.infolist():
                #    print info.filename, info.date_time, info.file_size, info.compress_size
            #for fn in f.namelist():
                #print fn

        return output_path


    def zip_folder(self, folder_path, output_path):
        """Zip the contents of an entire folder (with that folder included in the archive). 
        Empty subfolders could be included in the archive as well if the commented portion is used.
        """
        with zipfile.ZipFile(output_path, 'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as ziph:
            for root, folders, files in os.walk(folder_path):
                # Include all subfolders, including empty ones.
                #for folder_name in folders:
                #    absolute_path = os.path.join(root, folder_name)
                #    relative_path = os.path.join(os.path.basename(root), folder_name)
                #    print "Adding {} to archive.".format(absolute_path)
                #    ziph.write(absolute_path, relative_path)
                for f in files:
                    absolute_path = os.path.join(root, f)
                    relative_path = os.path.join(os.path.basename(root), f)
                    #print "Adding {} to archive.".format(absolute_path)
                    ziph.write(absolute_path, relative_path)

        print "{} created successfully.".format(output_path)

        #with zipfile.ZipFile(output_path, "r") as f:
        #    print 'Checking the zipped file......\n'
        #    for info in f.infolist():
        #        print info.filename, info.date_time, info.file_size, info.compress_size


    def _generate_html_report(self, out_dir, obj_ref):
        """
        _generate_html_report: generate html summary report
        """

        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        star_obj = self.ws_client.get_objects2({'objects':
                                                 [{'ref': obj_ref}]})['data'][0]
        star_obj_info = star_obj['info']
        star_obj_data = star_obj['data']
        star_obj_type = star_obj_info[2]

        Overview_Content = ''
        if re.match('KBaseRNASeq.RNASeqAlignment-\d.\d', star_obj_type):
            Overview_Content += '<br/><table><tr><th>Generated Alignment Object</th>'
            Overview_Content += '<th></th></tr>'
            Overview_Content += '<tr><th>Alignment Name</th><th>Condition</th></tr>'
            Overview_Content += '<tr><td>{} ({})</td>'.format(star_obj_info[1],obj_ref)
            Overview_Content += '<td>{}</td></tr>'.format(star_obj_data['condition'])
            Overview_Content += '</table>'
        elif (re.match('KBaseRNASeq.RNASeqAlignmentSet-\d.\d', star_obj_type)
                or re.match('KBaseSets.ReadsAlignmentSet-\d.\d', star_obj_type)
                or re.match('KBaseSet.RNASeqAlignmentSet-\d.\d', star_obj_type)):
            Overview_Content += '<br/><table><tr><th>Generated AlignmentSet Object</th></tr>'
            Overview_Content += '<tr><td>{} ({})'.format(star_obj_info[1],obj_ref)
            Overview_Content += '</td></tr></table>'
            Overview_Content += '<p><br/></p>'
            Overview_Content += '<table><tr><th>Generated Alignment Objects</th>'
            Overview_Content += '<th></th></tr>'
            Overview_Content += self._fill_html_trs('Alignment Name', star_obj_data)
            Overview_Content += '</table>'
        elif re.match('KBaseRNASeq.RNASeqExpression-\d.\d', star_obj_type):
            Overview_Content += '<br/><table><tr><th>Generated Expression Object</th>'
            Overview_Content += '<th></th></tr>'
            Overview_Content += '<tr><th>Expression Name</th><th>Condition</th></tr>'
            Overview_Content += '<tr><td>{} ({})</td>'.format(star_obj_info[1], obj_ref)
            Overview_Content += '<td>{}</td></tr>'.format(star_obj_data['condition'])
            Overview_Content += '</table>'
        elif re.match('KBaseSets.ExpressionSet-\d.\d', star_obj_type):
            Overview_Content += '<br/><table><tr><th>Generated ExpressionSet Object</th></tr>'
            Overview_Content += '<tr><td>{} ({})'.format(star_obj_info[1], obj_ref)
            Overview_Content += '</td></tr></table>'
            Overview_Content += '<p><br/></p>'
            Overview_Content += '<table><tr><th>Generated Expression Objects</th>'
            Overview_Content += '<th></th></tr>'
            Overview_Content += self._fill_html_trs('Expression Name', star_obj_data)
            Overview_Content += '</table>'

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<p>Overview_Content</p>',
                                                          Overview_Content)
                result_file.write(report_template)

        html_report.append({'path': result_file_path,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for STAR'})
        return html_report

    def _fill_html_trs(self, col_caption, obj_data):
        '''
        _fill_html_trs: simply creates an HTML string that has rows (tr) of td cells for a table
        '''
        tr_html_str = '<tr><th>{}</th><th>Condition</th></tr>'.format(col_caption)

        for item in obj_data['items']:
            item_obj = self.ws_client.get_objects2({'objects':[{'ref': item['ref']}]})['data'][0]
            item_obj_info = item_obj['info']
            item_obj_data = item_obj['data']
            obj_name = item_obj_info[1]

            tr_html_str += '<tr><td>{} ({})</td>'.format(obj_name, item['ref'])
            tr_html_str += '<td>{}</td></tr>'.format(item_obj_data['condition'])

        return tr_html_str

    def _generate_star_report(self, obj_ref, report_text, html_links, workspace_name, index_dir, output_dir):
        """
        _generate_star_report: generate summary report
        """
        log('creating STAR report')

        output_files = self._generate_output_file_list(index_dir, output_dir)
        output_html_files = self._generate_html_report(output_dir, obj_ref)
        output_html_files += html_links

        star_obj = self.ws_client.get_objects2({'objects':[{'ref': obj_ref}]})['data'][0]
        star_obj_info = star_obj['info']
        star_obj_data = star_obj['data']

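        # Workspace object_info tuple layout: [0]=obj_id, [1]=name, [2]=type string, [4]=version, [6]=wsid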
        star_obj_type = star_obj_info[2]
        if re.match(r'KBaseRNASeq.RNASeqAlignment-\d+\.\d+', star_obj_type):
            objects_created = [{'ref': obj_ref,
                                'description': 'RNASeqAlignment generated by STAR'}]
        elif (re.match(r'KBaseRNASeq.RNASeqAlignmentSet-\d+\.\d+', star_obj_type)
                or re.match(r'KBaseSets.ReadsAlignmentSet-\d+\.\d+', star_obj_type)
                or re.match(r'KBaseSet.RNASeqAlignmentSet-\d+\.\d+', star_obj_type)):
            objects_created = [{'ref': obj_ref,
                'description': '{} generated by STAR'.format(re.sub(r"-\d+\.\d+", "", star_obj_type))}]
            items = star_obj_data['items']
            for item in items:
                objects_created.append({'ref': item['ref'],
                                        'description': 'Alignment generated by STAR'})
        elif re.match(r'KBaseRNASeq.RNASeqExpression-\d+\.\d+', star_obj_type):
            objects_created = [{'ref': obj_ref,
                                'description': 'Expression generated by STAR'}]
        elif re.match(r'KBaseSets.ExpressionSet-\d+\.\d+', star_obj_type):
            objects_created = [{'ref': obj_ref,
                                'description': 'ExpressionSet generated by STAR'}]
            items = star_obj_data['items']
            for item in items:
                objects_created.append({'ref': item['ref'],
                                        'description': 'Expression generated by STAR'})

        report_params = {'message': report_text,
                         'workspace_name': workspace_name,
                         'file_links': output_files,
                         'objects_created': objects_created,
                         'html_links': output_html_files,
                         'direct_html_link_index': 0,
                         'html_window_height': 366,
                         'report_object_name': 'kb_STAR_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        report_output = kbase_report_client.create_extended_report(report_params)

        return report_output

    def upload_alignment_set(self, alignment_items, alignmentset_name, ws_name):
        """
        Compiles and saves a set of alignment references (+ other stuff) into a
        KBaseRNASeq.RNASeqAlignmentSet.
        Returns the reference to the new alignment set.
        alignment_items: [{
            "ref": alignment_ref,
            "label": condition label.
        }]
        """
        print("Uploading completed alignment set")
        alignment_set = {
            "description": "Alignments using STAR, v.{}".format(self.STAR_VERSION),
            "items": alignment_items
        }
        set_info = self.set_api_client.save_reads_alignment_set_v1({
            "workspace": ws_name,
            "output_object_name": alignmentset_name,
            "data": alignment_set
        })
        return set_info
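
A minimal usage sketch for upload_alignment_set(); the instance name, alignment refs, labels, and workspace name below are placeholders, not values taken from this module:

# Hypothetical call; assumes `star_utils` is an instance of the class above.
alignment_items = [
    {'ref': '123/4/1', 'label': 'condition_A'},   # placeholder alignment refs
    {'ref': '123/5/1', 'label': 'condition_B'},
]
save_result = star_utils.upload_alignment_set(alignment_items,
                                              'my_STAR_alignment_set',
                                              'my_workspace')
print(save_result['set_ref'])  # ref of the newly saved alignment set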

    @classmethod
    def prepare_data(cls):

        workspace_id = cls.dfu.ws_name_to_id(cls.wsName)

        # upload genome object
        genbank_file_name = 'minimal.gbff'
        genbank_file_path = os.path.join(cls.scratch, genbank_file_name)
        shutil.copy(os.path.join('data', genbank_file_name), genbank_file_path)

        genome_object_name = 'test_Genome'
        cls.genome_ref = cls.gfu.genbank_to_genome({
            'file': {
                'path': genbank_file_path
            },
            'workspace_name':
            cls.wsName,
            'genome_name':
            genome_object_name
        })['genome_ref']
        print('TEST genome_ref=' + cls.genome_ref)

        # upload assembly object
        file_name = 'test.fna'
        fasta_path = os.path.join(cls.scratch, file_name)
        shutil.copy(os.path.join('data', file_name), fasta_path)
        assembly_name = 'test_assembly'
        cls.assembly_ref = cls.au.save_assembly_from_fasta({
            'file': {
                'path': fasta_path
            },
            'workspace_name':
            cls.wsName,
            'assembly_name':
            assembly_name
        })

        print('TEST assembly_ref=' + cls.assembly_ref)

        # upload reads object
        reads_file_name = 'Sample1.fastq'
        reads_file_path = os.path.join(cls.scratch, reads_file_name)
        shutil.copy(os.path.join('data', reads_file_name), reads_file_path)

        reads_object_name_1 = 'test_Reads_1'
        cls.reads_ref_1 = cls.ru.upload_reads({
            'fwd_file': reads_file_path,
            'wsname': cls.wsName,
            'sequencing_tech': 'Unknown',
            'interleaved': 0,
            'name': reads_object_name_1
        })['obj_ref']
        print('TEST reads_ref_1=' + cls.reads_ref_1)

        reads_object_name_2 = 'test_Reads_2'
        cls.reads_ref_2 = cls.ru.upload_reads({
            'fwd_file': reads_file_path,
            'wsname': cls.wsName,
            'sequencing_tech': 'Unknown',
            'interleaved': 0,
            'name': reads_object_name_2
        })['obj_ref']
        print('TEST reads_ref_2=' + cls.reads_ref_2)

        # upload alignment object
        alignment_file_name = 'accepted_hits.bam'
        alignment_file_path = os.path.join(cls.scratch, alignment_file_name)
        shutil.copy(os.path.join('data', alignment_file_name),
                    alignment_file_path)

        alignment_object_name_1 = 'test_Alignment_1'
        cls.condition_1 = 'test_condition_1'
        cls.alignment_ref_1 = cls.rau.upload_alignment({
            'file_path':
            alignment_file_path,
            'destination_ref':
            cls.wsName + '/' + alignment_object_name_1,
            'read_library_ref':
            cls.reads_ref_1,
            'condition':
            cls.condition_1,
            'library_type':
            'single_end',
            'assembly_or_genome_ref':
            cls.genome_ref
        })['obj_ref']
        print('TEST alignment_ref_1=' + cls.alignment_ref_1)

        alignment_object_name_2 = 'test_Alignment_2'
        cls.condition_2 = 'test_condition_2'
        cls.alignment_ref_2 = cls.rau.upload_alignment({
            'file_path':
            alignment_file_path,
            'destination_ref':
            cls.wsName + '/' + alignment_object_name_2,
            'read_library_ref':
            cls.reads_ref_2,
            'condition':
            cls.condition_2,
            'library_type':
            'single_end',
            'assembly_or_genome_ref':
            cls.genome_ref
        })['obj_ref']
        print('TEST alignment_ref_2=' + cls.alignment_ref_2)

        alignment_object_name_3 = 'test_Alignment_3'
        cls.condition_3 = 'test_condition_3'
        cls.alignment_ref_3 = cls.rau.upload_alignment({
            'file_path':
            alignment_file_path,
            'destination_ref':
            cls.wsName + '/' + alignment_object_name_3,
            'read_library_ref':
            cls.reads_ref_2,
            'condition':
            cls.condition_3,
            'library_type':
            'single_end',
            'assembly_or_genome_ref':
            cls.assembly_ref
        })['obj_ref']
        print('TEST alignment_ref_3=' + cls.alignment_ref_3)

        # upload sample_set object

        sample_set_object_name = 'test_Sample_Set'
        sample_set_data = {
            'sampleset_id': sample_set_object_name,
            'sample_ids': [cls.reads_ref_1, cls.reads_ref_2],
            'sampleset_desc': 'test sampleset object',
            'Library_type': 'SingleEnd',
            'condition': [cls.condition_1, cls.condition_2],
            'domain': 'Unknown',
            'num_samples': 2,
            'platform': 'Unknown'
        }
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': 'KBaseRNASeq.RNASeqSampleSet',
                'data': sample_set_data,
                'name': sample_set_object_name
            }]
        }

        dfu_oi = cls.dfu.save_objects(save_object_params)[0]
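        # save_objects returns an object_info tuple; build a "wsid/objid/version" ref string
        # from positions [6], [0] and [4].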
        cls.sample_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str(
            dfu_oi[4])
        print('TEST sample_set_ref=' + cls.sample_set_ref)

        # upload alignment_set object
        object_type = 'KBaseRNASeq.RNASeqAlignmentSet'
        alignment_set_object_name = 'test_Alignment_Set'
        alignment_set_data = {
            'genome_id':
            cls.genome_ref,
            'read_sample_ids': [reads_object_name_1, reads_object_name_2],
            'mapped_rnaseq_alignments': [{
                reads_object_name_1:
                alignment_object_name_1
            }, {
                reads_object_name_2:
                alignment_object_name_2
            }],
            'mapped_alignments_ids': [{
                reads_object_name_1: cls.alignment_ref_1
            }, {
                reads_object_name_2: cls.alignment_ref_2
            }],
            'sample_alignments': [cls.alignment_ref_1, cls.alignment_ref_2],
            'sampleset_id':
            cls.sample_set_ref
        }
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': alignment_set_data,
                'name': alignment_set_object_name
            }]
        }

        dfu_oi = cls.dfu.save_objects(save_object_params)[0]
        cls.old_alignment_set_ref = str(dfu_oi[6]) + '/' + str(
            dfu_oi[0]) + '/' + str(dfu_oi[4])
        print('TEST (legacy) KBaseRNASeq.alignment_set_ref=' +
              cls.old_alignment_set_ref)

        # Save the alignment set
        items = [{
            'ref': cls.alignment_ref_1,
            'label': 'c1'
        }, {
            'ref': cls.alignment_ref_2,
            'label': 'c2'
        }]
        alignment_set_data = {'description': '', 'items': items}
        alignment_set_save_params = {
            'data': alignment_set_data,
            'workspace': cls.wsName,
            'output_object_name': 'MyReadsAlignmentSet'
        }

        set_api = SetAPI(cls.srv_wiz_url)
        save_result = set_api.save_reads_alignment_set_v1(
            alignment_set_save_params)
        cls.new_alignment_set_ref = save_result['set_ref']
        print('TEST KBaseSet.alignment_set_ref=')
        print(cls.new_alignment_set_ref)
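
As a sanity check, the set saved above can be read back through the SetAPI; a minimal sketch meant to run at the end of prepare_data(), reusing set_api and cls.new_alignment_set_ref (the get_reads_alignment_set_v1 parameters shown are assumptions about that API):

        # Hypothetical verification of the saved set (runs in the same classmethod context).
        fetched = set_api.get_reads_alignment_set_v1({
            'ref': cls.new_alignment_set_ref,
            'include_item_info': 1
        })
        assert len(fetched['data']['items']) == 2  # the two alignments saved above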
Example #16
0
    def exec_megahit(self, ctx, params):
        """
        :param params: instance of type "ExecMegaHitParams" (exec_megahit()
           Actual execution of MEGAHIT Accepts ReadsSet or a ReadsLibrary as
           Input Creates Assembly object(s) as output. Will eventually also
           create AssemblySet object if input is a ReadsSet and not running a
           combined assembly Other vars same as run_megahit()) -> structure:
           parameter "workspace_name" of String, parameter "input_reads_ref"
           of String, parameter "output_contigset_name" of String, parameter
           "combined_assembly_flag" of Long, parameter
           "megahit_parameter_preset" of String, parameter "min_count" of
           Long, parameter "k_min" of Long, parameter "k_max" of Long,
           parameter "k_step" of Long, parameter "k_list" of list of Long,
           parameter "min_contig_len" of Long
        :returns: instance of type "ExecMegaHitOutput" -> structure:
           parameter "report_text" of String, parameter
           "output_contigset_ref" of list of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN exec_megahit
        console = []
        self.log(console, 'Running exec_megahit() with params=')
        self.log(console, "\n" + pformat(params))

        #SERVICE_VER = 'dev'  # DEBUG
        SERVICE_VER = 'release'

        ### STEP 0: init
        token = ctx['token']
        wsClient = workspaceService(self.workspaceURL, token=token)
        headers = {'Authorization': 'OAuth ' + token}
        env = os.environ.copy()
        env['KB_AUTH_TOKEN'] = token

        ### STEP 1: basic parameter checks + parsing
        required_params = [
            'workspace_name', 'input_reads_ref', 'output_contigset_name'
        ]
        for required_param in required_params:
            if required_param not in params or params[required_param] == None:
                raise ValueError("Must define required param: '" +
                                 required_param + "'")

        ### STEP 2: determine if input is a ReadsLibrary or ReadsSet
        input_reads_ref = params['input_reads_ref']
        input_reads_name = None
        try:
            [
                OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
                WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
            ] = range(11)  # object_info tuple

            input_reads_obj_info = wsClient.get_object_info_new(
                {'objects': [{
                    'ref': input_reads_ref
                }]})[0]
            input_reads_obj_type = re.sub(
                r'-[0-9]+\.[0-9]+$', "",
                input_reads_obj_info[TYPE_I])  # remove trailing version
            input_reads_name = input_reads_obj_info[NAME_I]

        except Exception as e:
            raise ValueError('Unable to get reads object from workspace: (' +
                             input_reads_ref + ')' + str(e))

        accepted_input_types = [
            "KBaseSets.ReadsSet", "KBaseFile.PairedEndLibrary"
        ]
        if input_reads_obj_type not in accepted_input_types:
            raise ValueError("Input reads of type '" + input_reads_obj_type +
                             "' not accepted.  Must be one of " +
                             ", ".join(accepted_input_types))

        if input_reads_obj_type == "KBaseSets.ReadsSet":
            required_param = 'combined_assembly_flag'
            if required_param not in params or params[required_param] == None:
                raise ValueError("Must define required param: '" +
                                 required_param + "'")

        ### STEP 3: get the list of library references
        if input_reads_obj_type == "KBaseFile.PairedEndLibrary":
            readsSet_ref_list = [input_reads_ref]
            readsSet_names_list = [input_reads_name]

        elif input_reads_obj_type == "KBaseSets.ReadsSet":
            readsSet_ref_list = []
            readsSet_names_list = []

            try:
                setAPI_Client = SetAPI(
                    url=self.serviceWizardURL,
                    token=ctx['token'])  # for dynamic service
                #setAPI_Client = SetAPI (url=self.callbackURL, token=ctx['token'])  # SDK local method
            except Exception as e:
                raise ValueError(
                    "SetAPI FAILURE: Unable to get SetAPI Client from serviceWizard: '"
                    + self.serviceWizardURL + "' token: '" + ctx['token'] +
                    "'" + str(e))
                #raise ValueError("SetAPI FAILURE: Unable to get SetAPI Client as local method callbackURL: '"+self.callbackURL+"' token: '"+ctx['token']+"'" + str(e))

            try:
                input_readsSet_obj = setAPI_Client.get_reads_set_v1({
                    'ref':
                    input_reads_ref,
                    'include_item_info':
                    1
                })
            except Exception as e:
                raise ValueError(
                    'SetAPI FAILURE: Unable to get read library set object from workspace: ('
                    + str(input_reads_ref) + ")\n" + str(e))

            for readsLibrary_obj in input_readsSet_obj['data']['items']:
                readsSet_ref_list.append(readsLibrary_obj['ref'])
                NAME_I = 1
                readsSet_names_list.append(readsLibrary_obj['info'][NAME_I])

        else:
            raise ValueError("Input reads of type '" + input_reads_obj_type +
                             "' not accepted.  Must be one of " +
                             ", ".join(accepted_input_types))

        ### STEP 4: If doing a combined assembly on a ReadsSet, download reads one at a time and combine
        if input_reads_obj_type == "KBaseSets.ReadsSet" and params[
                'combined_assembly_flag'] != 0:

            self.log(
                console,
                "MegaHit_Sets:run_megahit(): CREATING COMBINED INPUT FASTQ FILES"
            )

            # make dir
            timestamp = int(
                (datetime.utcnow() -
                 datetime.utcfromtimestamp(0)).total_seconds() * 1000)
            input_dir = os.path.join(self.scratch, 'input.' + str(timestamp))
            if self.mac_mode:  # on macs, we cannot run megahit in the shared host scratch space, so we need to move the file there
                input_dir = os.path.join(self.host_scratch,
                                         'input.' + str(timestamp))
            if not os.path.exists(input_dir):
                os.makedirs(input_dir)

            # connect to ReadsUtils Client
            try:
                readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                               token=ctx['token'])  # SDK local
            except Exception as e:
                raise ValueError("Unable to get readsUtils_Client\n" + str(e))

            # start combined file
            read_buf_size = 65536
            write_buf_size = 65536
            combined_input_fwd_path = os.path.join(input_dir,
                                                   'input_reads_fwd.fastq')
            combined_input_rev_path = os.path.join(input_dir,
                                                   'input_reads_rev.fastq')
            combined_input_fwd_handle = open(combined_input_fwd_path, 'w',
                                             write_buf_size)
            combined_input_rev_handle = open(combined_input_rev_path, 'w',
                                             write_buf_size)

            # add libraries, one at a time
            for this_input_reads_ref in readsSet_ref_list:
                self.log(
                    console,
                    "MegaHit_Sets:run_megahit(): DOWNLOADING FASTQ FILES FOR ReadsSet member: "
                    + str(this_input_reads_ref))
                try:
                    readsLibrary = readsUtils_Client.download_reads({
                        'read_libraries': [this_input_reads_ref],
                        'interleaved':
                        'false'
                    })
                except Exception as e:
                    raise ValueError(
                        'Unable to get reads object from workspace: (' +
                        this_input_reads_ref + ")\n" + str(e))

                this_input_fwd_path = readsLibrary['files'][
                    this_input_reads_ref]['files']['fwd']
                this_input_rev_path = readsLibrary['files'][
                    this_input_reads_ref]['files']['rev']

                # append fwd
                self.log(
                    console,
                    "MegaHit_Sets:run_megahit(): APPENDING FASTQ FILES FOR ReadsSet member: "
                    + str(this_input_reads_ref))
                this_input_path = this_input_fwd_path
                cat_file_handle = combined_input_fwd_handle
                with open(this_input_path, 'r',
                          read_buf_size) as this_input_handle:
                    while True:
                        read_data = this_input_handle.read(read_buf_size)
                        if read_data:
                            cat_file_handle.write(read_data)
                        else:
                            break
                os.remove(
                    this_input_path
                )  # create space since we no longer need the piece file

                # append rev
                this_input_path = this_input_rev_path
                cat_file_handle = combined_input_rev_handle
                with open(this_input_path, 'r',
                          read_buf_size) as this_input_handle:
                    while True:
                        read_data = this_input_handle.read(read_buf_size)
                        if read_data:
                            cat_file_handle.write(read_data)
                        else:
                            break
                os.remove(
                    this_input_path
                )  # create space since we no longer need the piece file

            combined_input_fwd_handle.close()
            combined_input_rev_handle.close()

        ### STEP 5: finally run MegaHit_Sets
        exec_megahit_single_library_params = params
        output_assemblyset_contigset_paths = []
        output_contigset_path = None

        # PairedEndLibrary
        if input_reads_obj_type == "KBaseFile.PairedEndLibrary":
            self.log(
                console,
                "MegaHit_Sets:run_megahit(): DOWNLOADING FASTQ FILES FOR ReadsLibrary: "
                + str(input_reads_ref))
            try:
                readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                               token=ctx['token'])  # SDK local
                readsLibrary = readsUtils_Client.download_reads({
                    'read_libraries': [input_reads_ref],
                    'interleaved':
                    'false'
                })
            except Exception as e:
                raise ValueError(
                    'Unable to get reads object from workspace: (' +
                    input_reads_ref + ")\n" + str(e))

            input_fwd_path = readsLibrary['files'][input_reads_ref]['files'][
                'fwd']
            input_rev_path = readsLibrary['files'][input_reads_ref]['files'][
                'rev']
            exec_megahit_single_library_params[
                'input_fwd_path'] = input_fwd_path
            exec_megahit_single_library_params[
                'input_rev_path'] = input_rev_path

            # the key line
            output_contigset_path = self.exec_megahit_single_library(
                exec_megahit_single_library_params)
            output_assemblyset_contigset_paths.append(output_contigset_path)

            os.remove(input_fwd_path)  # files can be really big
            os.remove(input_rev_path)

        # ReadsSet combined (already downloaded and combined fastqs)
        elif input_reads_obj_type == "KBaseSets.ReadsSet" and params[
                'combined_assembly_flag'] != 0:

            input_fwd_path = combined_input_fwd_path
            input_rev_path = combined_input_rev_path
            exec_megahit_single_library_params[
                'input_fwd_path'] = input_fwd_path
            exec_megahit_single_library_params[
                'input_rev_path'] = input_rev_path

            # the key line
            output_contigset_path = self.exec_megahit_single_library(
                exec_megahit_single_library_params)
            output_assemblyset_contigset_paths.append(output_contigset_path)

            os.remove(input_fwd_path)  # files can be really big
            os.remove(input_rev_path)

        # ReadsSet uncombined (still have to download)
        elif input_reads_obj_type == "KBaseSets.ReadsSet" and params[
                'combined_assembly_flag'] == 0:
            # connect to ReadsUtils Client
            try:
                readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                               token=ctx['token'])  # SDK local
            except Exception as e:
                raise ValueError("Unable to get readsUtils_Client\n" + str(e))

            # get libraries, one at a time, and run MegaHit_Sets
            output_assemblyset_contigset_paths = []
            for this_input_reads_ref in readsSet_ref_list:
                self.log(
                    console,
                    "MegaHit_Sets:run_megahit(): DOWNLOADING FASTQ FILES FOR ReadsSet member: "
                    + str(this_input_reads_ref))
                try:
                    readsLibrary = readsUtils_Client.download_reads({
                        'read_libraries': [this_input_reads_ref],
                        'interleaved':
                        'false'
                    })
                except Exception as e:
                    raise ValueError(
                        'Unable to get reads object from workspace: (' +
                        this_input_reads_ref + ")\n" + str(e))

                this_input_fwd_path = readsLibrary['files'][
                    this_input_reads_ref]['files']['fwd']
                this_input_rev_path = readsLibrary['files'][
                    this_input_reads_ref]['files']['rev']
                exec_megahit_single_library_params[
                    'input_fwd_path'] = this_input_fwd_path
                exec_megahit_single_library_params[
                    'input_rev_path'] = this_input_rev_path

                # the key line
                this_output_contigset_path = self.exec_megahit_single_library(
                    exec_megahit_single_library_params)
                output_assemblyset_contigset_paths.append(
                    this_output_contigset_path)

                os.remove(this_input_fwd_path)  # files can be really big
                os.remove(this_input_rev_path)

        # just in case we've confused ourselves
        else:
            raise ValueError("error in logic")

        ### STEP 6: save the resulting assembly
        assemblyUtil = AssemblyUtil(self.callbackURL,
                                    token=ctx['token'],
                                    service_ver=SERVICE_VER)
        output_contigset_refs = []
        output_contigset_names = []
        for i, this_output_contigset_path in enumerate(
                output_assemblyset_contigset_paths):
            if len(output_assemblyset_contigset_paths) == 1:
                assembly_name = params['output_contigset_name']
            else:
                assembly_name = readsSet_names_list[i] + '-' + params[
                    'output_contigset_name']

            this_output_data_ref = assemblyUtil.save_assembly_from_fasta({
                'file': {
                    'path': this_output_contigset_path
                },
                'workspace_name':
                params['workspace_name'],
                'assembly_name':
                assembly_name
            })

            output_contigset_refs.append(this_output_data_ref)
            output_contigset_names.append(assembly_name)

        ### STEP 7: generate the report text

        # compute a simple contig length distribution for the report
        report = ''
        for i, this_output_contigset_path in enumerate(
                output_assemblyset_contigset_paths):

            report += "MegaHit_Sets run for Read Library: " + readsSet_names_list[
                i] + "\n"
            report += "-------------------------------------------------------------\n"
            report += "\n"
            lengths = []
            for seq_record in SeqIO.parse(this_output_contigset_path, 'fasta'):
                lengths.append(len(seq_record.seq))

            # per-library summary, computed once after scanning all contigs in this ContigSet
            report += 'ContigSet saved to: ' + params[
                'workspace_name'] + '/' + output_contigset_names[i] + '\n'
            report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
            report += 'Avg Length: ' + str(
                sum(lengths) / float(len(lengths))) + ' bp.\n'

            bins = 10
            counts, edges = np.histogram(lengths, bins)
            report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
            for c in range(bins):
                report += '   ' + str(counts[c]) + '\t--\t' + str(
                    edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n'

        ### STEP 8: construct the output to send back
        output = {
            'report_text': report,
            'output_contigset_refs': output_contigset_refs
        }

        #END exec_megahit

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method exec_megahit return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
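
For context, a hedged sketch of how exec_megahit() might be called; `impl`, `ctx`, the workspace name, and the reads ref are placeholders, not objects defined in this snippet:

# Hypothetical invocation; all names and refs below are placeholders.
params = {
    'workspace_name': 'my_workspace',
    'input_reads_ref': '123/7/1',             # KBaseSets.ReadsSet or KBaseFile.PairedEndLibrary
    'output_contigset_name': 'megahit.contigs',
    'combined_assembly_flag': 1               # required when the input is a ReadsSet
}
output = impl.exec_megahit(ctx, params)[0]    # `impl` is the Impl class instance, `ctx` the call context
print(output['report_text'])
print(output['output_contigset_refs'])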
Example #17
0
    def KButil_Build_InSilico_Metagenomes_with_Grinder(self, ctx, params):
        """
        :param params: instance of type
           "KButil_Build_InSilico_Metagenomes_with_Grinder_Params"
           (KButil_Build_InSilico_Metagenomes_with_Grinder() ** **  Use
           Grinder to generate in silico shotgun metagenomes) -> structure:
           parameter "workspace_name" of type "workspace_name" (** The
           workspace object refs are of form: ** **    objects =
           ws.get_objects([{'ref':
           params['workspace_id']+'/'+params['obj_name']}]) ** ** "ref" means
           the entire name combining the workspace id and the object name **
           "id" is a numerical identifier of the workspace or object, and
           should just be used for workspace ** "name" is a string identifier
           of a workspace or object.  This is received from Narrative.),
           parameter "input_refs" of type "data_obj_ref", parameter
           "output_name" of type "data_obj_name", parameter "desc" of String,
           parameter "num_reads_per_lib" of Long, parameter
           "population_percs" of String, parameter "read_len_mean" of Long,
           parameter "read_len_stddev" of Double, parameter "pairs_flag" of
           Long, parameter "mate_orientation" of String, parameter
           "insert_len_mean" of Long, parameter "insert_len_stddev" of
           Double, parameter "mutation_dist" of String, parameter
           "mutation_ratio" of String, parameter "qual_good" of Long,
           parameter "qual_bad" of Long, parameter "len_bias_flag" of Long,
           parameter "random_seed" of Long
        :returns: instance of type
           "KButil_Build_InSilico_Metagenomes_with_Grinder_Output" ->
           structure: parameter "report_name" of type "data_obj_name",
           parameter "report_ref" of type "data_obj_ref"
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN KButil_Build_InSilico_Metagenomes_with_Grinder

        #### STEP 0: basic init
        ##
        console = []
        invalid_msgs = []
        report_text = ''
        self.log(console,
                 'Running KButil_Build_InSilico_Metagenomes_with_Grinder(): ')
        self.log(console, "\n" + pformat(params))

        # Auth
        token = ctx['token']
        headers = {'Authorization': 'OAuth ' + token}
        env = os.environ.copy()
        env['KB_AUTH_TOKEN'] = token

        # API Clients
        #SERVICE_VER = 'dev'  # DEBUG
        SERVICE_VER = 'release'
        wsClient = workspaceService(self.workspaceURL, token=token)
        readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                       token=ctx['token'])  # SDK local
        #setAPI_Client = SetAPI (url=self.callbackURL, token=ctx['token'])  # for SDK local.  local doesn't work for SetAPI
        setAPI_Client = SetAPI(url=self.serviceWizardURL,
                               token=ctx['token'])  # for dynamic service
        auClient = AssemblyUtil(self.callbackURL,
                                token=ctx['token'],
                                service_ver=SERVICE_VER)
        dfu = DFUClient(self.callbackURL)

        # param checks
        required_params = [
            'workspace_name', 'input_refs', 'output_name', 'num_reads_per_lib',
            'population_percs', 'read_len_mean', 'read_len_stddev',
            'pairs_flag', 'mate_orientation', 'insert_len_mean',
            'insert_len_stddev', 'mutation_dist', 'mutation_ratio',
            'qual_good', 'qual_bad', 'len_bias_flag', 'random_seed'
        ]
        for arg in required_params:
            if arg not in params or params[arg] == None or params[arg] == '':
                raise ValueError("Must define required param: '" + arg + "'")

        # cast to str unpredictable numerical params (mostly used in string context)
        numerical_params = [
            'num_reads_per_lib', 'read_len_mean', 'read_len_stddev',
            'pairs_flag', 'insert_len_mean', 'insert_len_stddev', 'qual_good',
            'qual_bad', 'len_bias_flag', 'random_seed'
        ]
        for arg in numerical_params:
            if arg not in params or params[arg] == None or params[arg] == '':
                continue
            params[arg] = str(params[arg])

        # load provenance
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        provenance[0]['input_ws_objects'] = []
        for input_ref in params['input_refs']:
            provenance[0]['input_ws_objects'].append(input_ref)

        # set the output paths
        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        html_output_dir = os.path.join(output_dir, 'html')
        if not os.path.exists(html_output_dir):
            os.makedirs(html_output_dir)

        #### STEP 1: Parse population_percs and write to file
        ##
        abundance_str = params['population_percs'].strip()
        abundance_file_path = os.path.join(output_dir, 'my_abundances.txt')
        abundance_config_num_libs = 0
        abundance_config_num_libs_set = False
        grinder_genome_ids = []
        header = []
        out_buf = []

        for row in abundance_str.split("\n"):
            cols = re.split(r'\s+', row)
            if cols[0].upper() == "GENOME":
                for col in cols:
                    if col == '':
                        continue
                    header.append(col)
                continue
            grinder_genome_ids.append(cols[0])
            self.log(console, "GRINDER GENOME ID: '" + cols[0] + "'")  # DEBUG
            out_row = []
            for col in cols:
                if col == '':
                    continue
                elif col == '%':
                    continue
                elif col.endswith('%'):
                    col = col.rstrip('%')
                out_row.append(col)
            out_buf.append("\t".join(out_row))
            num_samples = len(out_row) - 1  # first col is genome id
            if not abundance_config_num_libs_set:
                abundance_config_num_libs_set = True
                abundance_config_num_libs = num_samples
            elif num_samples != abundance_config_num_libs:
                invalid_msgs.append(
                    "inconsistent number of samples in population_percs input field"
                )
        # data validation
        if abundance_config_num_libs == 0:
            invalid_msgs.append(
                "unable to find sample percentages in population_percs input field"
            )
        sample_sums = []
        for row_i, abund_row_str in enumerate(out_buf):
            abund_row = abund_row_str.split()
            for sample_i, abund in enumerate(abund_row[1:]):
                if row_i == 0:
                    sample_sums.append(0)
                #self.log (console, "row_i: "+str(row_i)+" sample_i: "+str(sample_i))  # DEBUG
                sample_sums[sample_i] += float(abund)
        for sample_i, sample_sum in enumerate(sample_sums):
            if sample_sum < 99.5 or sample_sum > 100.5:
                self.log(
                    invalid_msgs, "Sample: " + str(sample_i + 1) + " " +
                    header[sample_i + 1] +
                    " proportions is not summing to 100.0. Summing to: " +
                    str(sample_sum))

        if len(invalid_msgs) == 0:
            with open(abundance_file_path, 'w') as abundance_fh:
                for out_line in out_buf:
                    abundance_fh.write(out_line + "\n")
            # DEBUG
            with open(abundance_file_path, 'r') as abundance_fh:
                for out_line in abundance_fh.readlines():
                    out_line = out_line.rstrip()
                    self.log(console, "ABUNDANCE_CONFIG: '" + out_line + "'")

        #### STEP 2: get genome scaffold sequences
        ##
        if len(invalid_msgs) == 0:
            genomes_src_db_file_path = os.path.join(output_dir, 'genomes.fna')
            read_buf_size = 65536
            write_buf_size = 65536
            accepted_input_types = ["KBaseGenomes.Genome"]
            genome_refs = params['input_refs']
            genome_obj_names = []
            genome_sci_names = []
            assembly_refs = []

            for i, input_ref in enumerate(genome_refs):
                # genome obj info
                try:
                    [
                        OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I,
                        SAVED_BY_I, WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I,
                        META_I
                    ] = range(11)  # object_info tuple
                    input_obj_info = wsClient.get_object_info_new(
                        {'objects': [{
                            'ref': input_ref
                        }]})[0]
                    input_obj_type = re.sub(
                        r'-[0-9]+\.[0-9]+$', "",
                        input_obj_info[TYPE_I])  # remove trailing version
                    genome_obj_names.append(input_obj_info[NAME_I])

                except Exception as e:
                    raise ValueError('Unable to get object from workspace: (' +
                                     input_ref + ')' + str(e))
                if input_obj_type not in accepted_input_types:
                    raise ValueError("Input object of type '" +
                                     input_obj_type +
                                     "' not accepted.  Must be one of " +
                                     ", ".join(accepted_input_types))

                # genome obj data
                try:
                    genome_obj = wsClient.get_objects([{
                        'ref': input_ref
                    }])[0]['data']
                    genome_sci_names.append(genome_obj['scientific_name'])
                except:
                    raise ValueError("unable to fetch genome: " + input_ref)

                # Get assembly_refs
                if ('contigset_ref' not in genome_obj or genome_obj['contigset_ref'] == None) \
                   and ('assembly_ref' not in genome_obj or genome_obj['assembly_ref'] == None):
                    msg = "Genome " + genome_obj_names[
                        i] + " (ref:" + input_ref + ") " + genome_sci_names[
                            i] + " MISSING BOTH contigset_ref AND assembly_ref.  Cannot process.  Exiting."
                    self.log(console, msg)
                    self.log(invalid_msgs, msg)
                    continue
                elif 'assembly_ref' in genome_obj and genome_obj[
                        'assembly_ref'] != None:
                    msg = "Genome " + genome_obj_names[
                        i] + " (ref:" + input_ref + ") " + genome_sci_names[
                            i] + " USING assembly_ref: " + str(
                                genome_obj['assembly_ref'])
                    self.log(console, msg)
                    assembly_refs.append(genome_obj['assembly_ref'])
                elif 'contigset_ref' in genome_obj and genome_obj[
                        'contigset_ref'] != None:
                    msg = "Genome " + genome_obj_names[
                        i] + " (ref:" + input_ref + ") " + genome_sci_names[
                            i] + " USING contigset_ref: " + str(
                                genome_obj['contigset_ref'])
                    self.log(console, msg)
                    assembly_refs.append(genome_obj['contigset_ref'])

        # get fastas for scaffolds
        if len(invalid_msgs) == 0:
            contig_file_paths = []

            for genome_i, input_ref in enumerate(genome_refs):
                contig_file = auClient.get_assembly_as_fasta({
                    'ref':
                    assembly_refs[genome_i]
                }).get('path')
                sys.stdout.flush()
                contig_file_path = dfu.unpack_file({'file_path':
                                                    contig_file})['file_path']
                contig_file_paths.append(contig_file_path)

            # reformat FASTA IDs for Grinder
            with open(genomes_src_db_file_path, 'w',
                      write_buf_size) as genomes_src_db_fh:
                for genome_i, contig_file_path in enumerate(contig_file_paths):
                    #self.log(console,str(genome_i)+" CONTIG_FILE: "+contig_file_path)  # DEBUG
                    #contig_ids = []
                    with open(contig_file_path, 'r',
                              read_buf_size) as contig_fh:
                        genome_seq = ''
                        contig_seq = ''
                        contig_seqs = []
                        for contig_line in contig_fh.readlines():
                            contig_line = contig_line.rstrip()
                            if contig_line.startswith('>'):
                                #contig_id = contig_line.strip()[1:].split(' ')[0]
                                #contig_ids.append(contig_id)
                                #genomes_src_db_fh.write(">"+grinder_genome_ids[genome_i]+"\n")
                                if contig_seq != '':
                                    contig_seqs.append(contig_seq)
                                    contig_seq = ''
                                    continue
                            else:
                                #genomes_src_db_fh.write(contig_line)
                                contig_seq += contig_line
                        if contig_seq != '':
                            contig_seqs.append(contig_seq)
                            contig_seq = ''

                    # write joined contigs to file
                    genome_seq = "NNNNNNNNNN".join(
                        contig_seqs
                    )  # NOTE: Using "-exclude_chars" grinder opt on N to avoid contig joins
                    genome_seq = genome_seq.upper(
                    )  # grinder might require upper case?
                    genomes_src_db_fh.write(">" +
                                            grinder_genome_ids[genome_i] +
                                            "\n")
                    genomes_src_db_fh.write(genome_seq + "\n")
                    genome_seq = ''
                    contig_seqs = []

                    # DEBUG
                    #for contig_id in contig_ids:
                    #    self.log(console, "\tCONTIG_ID: "+contig_id)  # DEBUG
            # DEBUG
            toggle = 0
            with open(genomes_src_db_file_path, 'r',
                      write_buf_size) as genomes_src_db_fh:
                for contig_line in genomes_src_db_fh.readlines():
                    contig_line = contig_line.rstrip()
                    if contig_line.startswith('>'):
                        self.log(console, 'GENOMES_SRC_DB: ' + contig_line)
                        genome_id = contig_line[1:]
                        toggle = 0
                    elif toggle == 0:
                        #elif genome_id == 'G3':
                        self.log(
                            console,
                            'GENOMES_SRC_DB: ' + contig_line[0:50] + '...')
                        toggle += 1

        #### STEP 3: Run Grinder
        ##
        if len(invalid_msgs) == 0:
            cmd = []
            cmd.append(self.GRINDER)
            # output
            cmd.append('-base_name')
            cmd.append(params['output_name'])
            cmd.append('-output_dir')
            cmd.append(output_dir)
            # contigs input
            cmd.append('-reference_file')
            cmd.append(genomes_src_db_file_path)
            # abundances
            cmd.append('-abundance_file')
            cmd.append(abundance_file_path)
            # library size
            cmd.append('-total_reads')
            cmd.append(str(params['num_reads_per_lib']))
            # num libraries (overridden by abundance file?)
            cmd.append('-num_libraries')
            cmd.append(str(abundance_config_num_libs))
            # read and insert lens
            cmd.append('-read_dist')
            cmd.append(str(params['read_len_mean']))
            cmd.append('normal')
            cmd.append(str(params['read_len_stddev']))
            if str(params['pairs_flag']) == '1':
                cmd.append('-insert_dist')
                cmd.append(str(params['insert_len_mean']))
                cmd.append('normal')
                cmd.append(str(params['insert_len_stddev']))
                # mate orientation
                cmd.append('-mate_orientation')
                cmd.append(params['mate_orientation'])
            # genome len bias
            cmd.append('-length_bias')
            cmd.append(str(params['len_bias_flag']))
            # mutation model
            cmd.append('-mutation_dist')
            cmd.append(str(params['mutation_dist']))
            cmd.append('-mutation_ratio')
            cmd.append(str(params['mutation_ratio']))
            # qual scores
            cmd.append('-fastq_output')
            cmd.append('1')
            cmd.append('-qual_levels')
            cmd.append(str(params['qual_good']))
            cmd.append(str(params['qual_bad']))
            # skip contig joins
            cmd.append('-exclude_chars')
            cmd.append('NX')
            # explicitly request bidirectional
            cmd.append('-unidirectional')
            cmd.append('0')
            # random seed
            if 'random_seed' in params and params[
                    'random_seed'] != None and params['random_seed'] != '':
                cmd.append('-random_seed')
                cmd.append(str(params['random_seed']))

            # RUN
            cmd_str = " ".join(cmd)
            self.log(console, "===========================================")
            self.log(console, "RUNNING: " + cmd_str)
            self.log(console, "===========================================")

            cmdProcess = subprocess.Popen(cmd_str,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.STDOUT,
                                          shell=True)
            outputlines = []
            while True:
                line = cmdProcess.stdout.readline()
                outputlines.append(line)
                if not line: break
                self.log(console, line.replace('\n', ''))

            cmdProcess.stdout.close()
            cmdProcess.wait()
            self.log(console,
                     'return code: ' + str(cmdProcess.returncode) + '\n')
            if cmdProcess.returncode != 0:
                raise ValueError('Error running kb_grinder, return code: ' +
                                 str(cmdProcess.returncode) + '\n')

            #report_text += "\n".join(outputlines)
            #report_text += "cmdstring: " + cmdstring + " stdout: " + stdout + " stderr " + stderr

            # capture output for report and paths to out files
            report_text_buf = []
            struct_file_paths = []
            struct_file_names = []
            fastq_file_paths = []
            for out_line in outputlines:
                out_line = out_line.rstrip()
                if 'Community structure' in out_line:
                    clean_line = out_line.lstrip()
                    struct_file_path = re.split(r'\s+', clean_line)[3]
                    struct_file_paths.append(struct_file_path)
                    struct_file_names.append(struct_file_path.split('/')[-1])
                    self.log(console, "STRUCT_FILE_NAME: '" +
                             struct_file_path.split('/')[-1])  # DEBUG
                elif 'FASTQ file' in out_line:
                    clean_line = out_line.lstrip()
                    fastq_file_paths.append(re.split(r'\s+', clean_line)[3])
                else:
                    report_text_buf.append(out_line)
            report_text += "\n".join(report_text_buf)

        #### STEP 4: Upload Read Libs and create reads set
        ##
        if len(invalid_msgs) == 0:
            lib_obj_refs = []
            lib_obj_names = []
            readsSet_items = []

            for sample_i, fastq_file_path in enumerate(fastq_file_paths):

                if not os.path.isfile(fastq_file_path) \
                   or os.path.getsize(fastq_file_path) == 0:
                    raise ValueError("empty read lib generated: " +
                                     fastq_file_path)
                else:

                    # lib obj name
                    if len(fastq_file_paths) == 1:
                        output_obj_name = params['output_name']
                    else:
                        if str(params['pairs_flag']) == '1':
                            output_obj_name = params[
                                'output_name'] + '-sample' + str(
                                    sample_i + 1) + ".PairedEndLib"
                        else:
                            output_obj_name = params[
                                'output_name'] + '-sample' + str(
                                    sample_i + 1) + ".SingleEndLib"
                    lib_obj_names.append(output_obj_name)

                    # upload lib and get obj ref
                    self.log(
                        console,
                        'Uploading generated reads library: ' + output_obj_name)
                    sequencing_tech = 'artificial reads'
                    if str(params['pairs_flag']) == '1':
                        interleaved = 1
                    else:
                        interleaved = 0
                    lib_obj_ref = readsUtils_Client.upload_reads({
                        'wsname':
                        str(params['workspace_name']),
                        'name':
                        output_obj_name,
                        'fwd_file':
                        fastq_file_path,
                        'interleaved':
                        interleaved,
                        'sequencing_tech':
                        sequencing_tech
                    })['obj_ref']
                    lib_obj_refs.append(lib_obj_ref)
                    os.remove(fastq_file_path)  # free up disk

                    # add to readsSet
                    readsSet_items.append({
                        'ref': lib_obj_ref,
                        'label': output_obj_name
                    })
            # create readsset
            readsSet_obj_ref = None
            if len(lib_obj_refs) > 1:
                readsSet_obj = {
                    'description':
                    "Grinder Metagenome from " + " ".join(genome_obj_names),
                    'items':
                    readsSet_items
                }
                readsSet_obj_name = params['output_name']
                readsSet_obj_ref = setAPI_Client.save_reads_set_v1({
                    'workspace_name':
                    params['workspace_name'],
                    'output_object_name':
                    readsSet_obj_name,
                    'data':
                    readsSet_obj
                })['set_ref']

        #### STEP 5: Build report
        ##
        reportName = 'kb_grinder_report_' + str(uuid.uuid4())
        reportObj = {
            'objects_created': [],
            'message': '',
            'direct_html': '',
            #'direct_html_link_index': 0,
            'file_links': [],
            'html_links': [],
            'workspace_name': params['workspace_name'],
            'report_object_name': reportName
        }

        # message
        if len(invalid_msgs) > 0:
            report_text = "\n".join(invalid_msgs)
        reportObj['message'] = report_text

        if len(invalid_msgs) == 0:
            # objs
            if readsSet_obj_ref != None:
                reportObj['objects_created'].append({
                    'ref':
                    readsSet_obj_ref,
                    'desc':
                    params['output_name'] + " ReadsSet"
                })
            for lib_obj_i, lib_obj_ref in enumerate(lib_obj_refs):
                reportObj['objects_created'].append({
                    'ref':
                    lib_obj_ref,
                    'desc':
                    lib_obj_names[lib_obj_i]
                })
            # downloadable data
            for data_i, data_path in enumerate(struct_file_paths):
                try:
                    upload_ret = dfu.file_to_shock({
                        'file_path': data_path,
                        #'pack': 'zip'})
                        'make_handle': 0
                    })
                except:
                    raise ValueError('error uploading ' + data_path +
                                     ' file to shock')
                reportObj['file_links'].append({
                    'shock_id':
                    upload_ret['shock_id'],
                    'name':
                    struct_file_names[data_i],
                    'label':
                    struct_file_names[data_i]
                })

            # html report
            """
            try:
                html_upload_ret = dfu.file_to_shock({'file_path': html_output_dir,
                                                     'make_handle': 0,
                                                     'pack': 'zip'})
            except:
                raise ValueError ('error uploading html report to shock')
            reportObj['direct_html_link_index'] = 0
            reportObj['html_links'] = [{'shock_id': html_upload_ret['shock_id'],
                                        'name': html_file,
                                        'label': params['output_name']+' HTML'
                                    }
                                   ]
            """

        # save report object
        #
        SERVICE_VER = 'release'
        reportClient = KBaseReport(self.callbackURL,
                                   token=ctx['token'],
                                   service_ver=SERVICE_VER)
        #report_info = report.create({'report':reportObj, 'workspace_name':params['workspace_name']})
        report_info = reportClient.create_extended_report(reportObj)

        returnVal = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }
        #END KButil_Build_InSilico_Metagenomes_with_Grinder

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError(
                'Method KButil_Build_InSilico_Metagenomes_with_Grinder return value '
                + 'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
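
For reference, a hedged sketch of the parameter dictionary this method expects; every value is a placeholder chosen only to illustrate the shape of the input (percentages per sample must sum to roughly 100):

# Hypothetical params for KButil_Build_InSilico_Metagenomes_with_Grinder; all values are placeholders.
grinder_params = {
    'workspace_name': 'my_workspace',
    'input_refs': ['123/10/1', '123/11/1'],   # KBaseGenomes.Genome refs
    'output_name': 'insilico_metagenome',
    'desc': 'two-genome test community',
    'num_reads_per_lib': 10000,
    'population_percs': 'GENOME\tsample1\tsample2\nG1\t60%\t40%\nG2\t40%\t60%',
    'read_len_mean': 150,
    'read_len_stddev': 10.0,
    'pairs_flag': 1,
    'mate_orientation': 'FR',
    'insert_len_mean': 500,
    'insert_len_stddev': 50.0,
    'mutation_dist': 'poly4 3e-3 3.3e-8',
    'mutation_ratio': '80 20',
    'qual_good': 30,
    'qual_bad': 2,
    'len_bias_flag': 0,
    'random_seed': 1
}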
Example #18
0
class QualiMapRunner:

    QUALIMAP_PATH = '/kb/module/qualimap-bin/qualimap'
    JAVA_MEM_DEFAULT_SIZE = '16G'
    LARGE_BAM_FILE_SIZE = 20 * 1024 * 1024 * 1024  # 20 GB
    TIMEOUT = 72 * 60 * 60  # 72 hours

    def _get_file_size(self, file_path):
        file_size = os.path.getsize(file_path)
        print('File size: {} -- {}'.format(file_size, file_path))
        return file_size

    def _large_file(self, file_path):

        filename, file_extension = os.path.splitext(file_path)
        multiplier = 0

        if file_extension == '.txt':
            total_file_size = 0
            with open(file_path, 'r') as f:
                for line in f:
                    bam_file_path = line.split('\t')[1]
                    total_file_size += self._get_file_size(bam_file_path)
            print('Total file size: {}'.format(total_file_size))
            multiplier = int(total_file_size) / int(self.LARGE_BAM_FILE_SIZE)
        else:
            multiplier = int(self._get_file_size(file_path)) / int(
                self.LARGE_BAM_FILE_SIZE)

        print('setting number of windows multiplier to: {}'.format(multiplier))

        return multiplier

    def _timeout_handler(self, signum, frame):
        print('Signal handler called with signal', signum)
        raise ValueError('QualiMap takes too long')

    def __init__(self, scratch_dir, callback_url, workspace_url, srv_wiz_url):
        self.scratch_dir = scratch_dir
        self.rau = ReadsAlignmentUtils(callback_url)
        self.kbr = KBaseReport(callback_url)
        self.dfu = DataFileUtil(callback_url)
        self.gfu = GenomeFileUtil(callback_url)
        self.set_api = SetAPI(srv_wiz_url)
        self.ws = Workspace(workspace_url)
        self.valid_commands = ['bamqc', 'multi-bamqc']

    def run_app(self, params):
        self.validate_params(params)
        print('Validated Params = ')
        pprint(params)
        run_info = self.get_run_info(params)

        if run_info.get('mode') not in ['single', 'multi']:
            raise ValueError(
                'Error in fetching the type to determine run settings.')

        run_error = False
        try:
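            # SIGALRM watchdog (see _timeout_handler above): if QualiMap exceeds TIMEOUT
            # seconds the handler raises, dropping execution into the except branch below;
            # signal.alarm(0) disarms the timer once the run completes.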
            signal.signal(signal.SIGALRM, self._timeout_handler)
            signal.alarm(self.TIMEOUT)
            if run_info['mode'] == 'single':
                result = self.run_bamqc(params['input_ref'],
                                        run_info['input_info'])
            elif run_info['mode'] == 'multi':
                result = self.run_multi_sample_qc(params['input_ref'],
                                                  run_info['input_info'])
            signal.alarm(0)
        except Exception:
            signal.alarm(0)  # disarm the watchdog on failure as well
            run_error = True

            workdir = os.path.join(self.scratch_dir,
                                   'qualimap_' + str(int(time.time() * 10000)))
            os.makedirs(workdir)

            with open(os.path.join(workdir, 'qualimapReport.html'),
                      'w') as report:
                report.write('<html><body><p></p></body></html>')

            package_info = self.package_output_folder(
                workdir, 'QualiMap_report',
                'EMPTY HTML report directory for QualiMap BAM QC',
                'qualimapReport.html')

            result = {
                'qc_result_folder_path': workdir,
                'qc_result_zip_info': package_info,
                'shock_id': None
            }
            error_msg = 'Running QualiMap returned an error:\n{}\n'.format(
                traceback.format_exc())
            error_msg += 'Generating simple report instead\n'
            print(error_msg)

        if params['create_report']:
            result = self.create_report(result, params['output_workspace'],
                                        run_error, params['input_ref'])

        return result

    def create_report(self,
                      result,
                      output_workspace,
                      run_error=None,
                      input_ref=None):

        if run_error:
            objects_created = []
            info = self.get_obj_info(input_ref)
            obj_type = self.get_type_from_obj_info(info)
            if obj_type in ['KBaseRNASeq.RNASeqAlignment']:
                objects_created.append({
                    'ref': input_ref,
                    'description': 'Alignment'
                })

            if obj_type in [
                    'KBaseRNASeq.RNASeqAlignmentSet',
                    'KBaseSets.ReadsAlignmentSet'
            ]:
                objects_created.append({
                    'ref': input_ref,
                    'description': 'AlignmentSet'
                })
                reads_alignment_info = self.get_alignments_from_set(input_ref)
                for alignment in reads_alignment_info:
                    alignment_ref = alignment.get('ref')
                    objects_created.append({
                        'ref': alignment_ref,
                        'description': 'Alignment'
                    })

            report_info = self.kbr.create_extended_report({
                'message':
                ' ',
                'objects_created':
                objects_created,
                'report_object_name':
                'qualimap_report' + str(uuid.uuid4()),
                'workspace_name':
                output_workspace
            })
            result['report_name'] = report_info['name']
            result['report_ref'] = report_info['ref']
            return result

        qc_result_zip_info = result['qc_result_zip_info']
        report_info = self.kbr.create_extended_report({
            'message':
            '',
            'objects_created': [],
            'direct_html_link_index':
            0,
            'html_links': [{
                'shock_id': qc_result_zip_info['shock_id'],
                'name': qc_result_zip_info['index_html_file_name'],
                'label': qc_result_zip_info['name']
            }],
            'report_object_name':
            'qualimap_report' + str(uuid.uuid4()),
            'workspace_name':
            output_workspace
        })
        result['report_name'] = report_info['name']
        result['report_ref'] = report_info['ref']
        return result

    def get_gtf_file(self, input_ref, set_op=False):

        print('Start fetching GFF file from genome')

        if set_op:
            set_data = self.set_api.get_reads_alignment_set_v1({
                'ref':
                input_ref,
                'include_item_info':
                1
            })
            input_ref = set_data['data']['items'][0]['ref']

        obj_data = self.dfu.get_objects({"object_refs":
                                         [input_ref]})['data'][0]['data']

        genome_ref = obj_data.get('genome_id')

        if not genome_ref:
            raise ValueError(
                'Alignment is not associated with a Genome object')

        result_directory = os.path.join(self.scratch_dir, str(uuid.uuid4()))
        os.makedirs(result_directory)

        genome_gtf_file = self.gfu.genome_to_gff({
            'genome_ref': genome_ref,
            'is_gtf': True,
            'target_dir': result_directory
        })['file_path']

        return genome_gtf_file

    def run_bamqc(self, input_ref, input_info):
        # download the input and setup a working dir
        alignment_info = self.rau.download_alignment({'source_ref': input_ref})
        bam_file_path = self.find_my_bam_file(
            alignment_info['destination_dir'])
        try:
            gtf_file = self.get_gtf_file(input_ref)
        except Exception:
            # fall back to running QualiMap without gene annotations
            gtf_file = ''

        workdir = os.path.join(self.scratch_dir,
                               'qualimap_' + str(int(time.time() * 10000)))

        options = [
            '-bam', bam_file_path, '-c', '-outdir', workdir, '-outformat',
            'html'
        ]

        if gtf_file:
            options += ['-gff', gtf_file]

        options.append('--java-mem-size={}'.format(
            self.JAVA_MEM_DEFAULT_SIZE))  # always use large mem
        multiplier = self._large_file(bam_file_path)
        if multiplier:
            window_size = multiplier * 400
            print('using larger window size: {} and Java memory: {}'.format(
                window_size, self.JAVA_MEM_DEFAULT_SIZE))
            # pass the flag and its value as separate argv entries (shell=False)
            options.extend(['-nw', str(window_size)])  # increase size of windows

        self.run_cli_command('bamqc', options)

        package_info = self.package_output_folder(
            workdir, 'QualiMap_report',
            'HTML report directory for QualiMap BAM QC', 'qualimapReport.html')

        return {
            'qc_result_folder_path': workdir,
            'qc_result_zip_info': package_info
        }

    def run_multi_sample_qc(self, input_ref, input_info):
        # download the input and setup a working dir
        reads_alignment_info = self.get_alignments_from_set(input_ref)
        try:
            gtf_file = self.get_gtf_file(input_ref, set_op=True)
        except Exception:
            # fall back to running QualiMap without gene annotations
            gtf_file = ''
        suffix = 'qualimap_' + str(int(time.time() * 10000))
        workdir = os.path.join(self.scratch_dir, suffix)
        os.makedirs(workdir)

        input_file_path = self.create_multi_qualimap_cfg(
            reads_alignment_info, workdir)

        options = [
            '-d', input_file_path, '-r', '-c', '-outdir', workdir,
            '-outformat', 'html'
        ]

        if gtf_file:
            options += ['-gff', gtf_file]

        multiplier = self._large_file(input_file_path)
        if multiplier:
            window_size = multiplier * 400
            print('using larger window size: {} and Java memory: {}'.format(
                window_size, self.JAVA_MEM_DEFAULT_SIZE))
            # pass the flag and its value as separate argv entries (shell=False)
            options.extend(['-nw', str(window_size)])  # increase size of windows
            options.append('--java-mem-size={}'.format(
                self.JAVA_MEM_DEFAULT_SIZE))

        self.run_cli_command('multi-bamqc', options)

        package_info = self.package_output_folder(
            workdir, 'QualiMap_report',
            'HTML report directory for QualiMap Multi-sample BAM QC',
            'multisampleBamQcReport.html')

        return {
            'qc_result_folder_path': workdir,
            'qc_result_zip_info': package_info
        }

    def get_alignments_from_set(self, alignment_set_ref):
        set_data = self.set_api.get_reads_alignment_set_v1({
            'ref':
            alignment_set_ref,
            'include_item_info':
            1
        })
        items = set_data['data']['items']

        reads_alignment_data = []
        for alignment in items:
            alignment_info = self.rau.download_alignment(
                {'source_ref': alignment['ref']})
            bam_file_path = self.find_my_bam_file(
                alignment_info['destination_dir'])
            label = None
            if 'label' in alignment:
                label = alignment['label']
            reads_alignment_data.append({
                'bam_file_path': bam_file_path,
                'ref': alignment['ref'],
                'label': label,
                'info': alignment['info']
            })
        return reads_alignment_data

    def create_multi_qualimap_cfg(self, reads_alignment_info, workdir):
        # Group by labels if there is at least one defined
        use_labels = False
        for alignment in reads_alignment_info:
            if alignment['label']:
                use_labels = True
                break

        # write the file
        input_file_path = os.path.join(workdir, 'multi_input.txt')
        input_file = open(input_file_path, 'w')
        name_lookup = {}
        for alignment in reads_alignment_info:
            name = alignment['info'][1]
            if name in name_lookup:
                name_lookup[name] += 1
                name = name + '_' + str(name_lookup[name])
            else:
                name_lookup[name] = 1

            input_file.write(name + '\t' + alignment['bam_file_path'])
            if use_labels:
                if alignment['label']:
                    input_file.write('\t' + alignment['label'])
                else:
                    input_file.write('\tunlabeled')
            input_file.write('\n')
        input_file.close()
        return input_file_path
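
    # Illustrative only (not in the original code): the multi_input.txt written above is
    # the tab-delimited config consumed by QualiMap multi-bamqc -- one alignment per line
    # as "<sample name>\t<bam path>", plus an optional third "<label>" column whenever at
    # least one alignment in the set carries a label, e.g.:
    #   sample_A    /path/to/sample_A.bam    condition_1
    #   sample_B    /path/to/sample_B.bam    unlabeled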

    def get_run_info(self, params):
        info = self.get_obj_info(params['input_ref'])
        obj_type = self.get_type_from_obj_info(info)
        if obj_type in ['KBaseRNASeq.RNASeqAlignment']:
            return {'mode': 'single', 'input_info': info}
        if obj_type in [
                'KBaseRNASeq.RNASeqAlignmentSet', 'KBaseSets.ReadsAlignmentSet'
        ]:
            return {'mode': 'multi', 'input_info': info}
        raise ValueError('Object type of input_ref is not valid, was: ' +
                         str(obj_type))

    def validate_params(self, params):
        if 'input_ref' not in params:
            raise ValueError(
                'required parameter field "input_ref" was not set')

        create_report = False
        if 'create_report' in params:
            if int(params['create_report']) == 1:
                if 'output_workspace' not in params:
                    raise ValueError(
                        'If "create_report" was set, then "output_workspace" is required'
                    )
                if not params['output_workspace']:
                    raise ValueError(
                        'If "create_report" was set, then "output_workspace" is required'
                    )
                create_report = True
        params['create_report'] = create_report

    def run_cli_command(self, command, options, cwd=None):
        if command not in self.valid_commands:
            raise ValueError('Invalid QualiMap command: ' + str(command))
        command = [self.QUALIMAP_PATH, command] + options
        print('Running: ' + ' '.join(command))

        if not cwd:
            cwd = self.scratch_dir

        p = subprocess.Popen(command, cwd=cwd, shell=False)
        exitCode = p.wait()

        if (exitCode == 0):
            print('Success, exit code was: ' + str(exitCode))
        else:
            raise ValueError('Error running command: ' + ' '.join(command) +
                             '\n' + 'Exit Code: ' + str(exitCode))

    def find_my_bam_file(self, dirpath):
        bam_path = None
        for f in os.listdir(dirpath):
            fullpath = os.path.join(dirpath, f)
            if os.path.isfile(fullpath) and f.lower().endswith('.bam'):
                if bam_path is not None:
                    raise ValueError(
                        'Error! Too many BAM files were downloaded for this alignment!'
                    )
                bam_path = fullpath
        if bam_path is None:
            raise ValueError(
                'Error! No BAM files were downloaded for this alignment!')
        return bam_path

    def package_output_folder(self, folder_path, zip_file_name,
                              zip_file_description, index_html_file):
        ''' Simple utility for packaging a folder and saving to shock '''
        output = self.dfu.file_to_shock({
            'file_path': folder_path,
            'make_handle': 0,
            'pack': 'zip'
        })
        return {
            'shock_id': output['shock_id'],
            'name': zip_file_name,
            'description': zip_file_description,
            'index_html_file_name': index_html_file
        }

    def get_type_from_obj_info(self, info):
        return info[2].split('-')[0]

    def get_obj_info(self, ref):
        return self.ws.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0]
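
# Hedged usage sketch (not part of the example above): driving QualiMapRunner from an
# SDK method. The URLs, scratch path, and the '1/2/3' object reference are placeholders.
#
# runner = QualiMapRunner(scratch_dir='/kb/module/work/tmp',
#                         callback_url=os.environ['SDK_CALLBACK_URL'],
#                         workspace_url='https://kbase.us/services/ws',
#                         srv_wiz_url='https://kbase.us/services/service_wizard')
# result = runner.run_app({'input_ref': '1/2/3',        # alignment or alignment set ref
#                          'create_report': 1,
#                          'output_workspace': 'my_workspace'})
# print(result['report_name'], result['report_ref'])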
Example #19
class DataStagingUtils(object):
    def __init__(self, config, ctx):
        self.ctx = ctx
        self.scratch = os.path.abspath(config['scratch'])
        self.ws_url = config['workspace-url']
        self.serviceWizardURL = config['srv-wiz-url']
        self.callbackURL = config['SDK_CALLBACK_URL']
        if not os.path.exists(self.scratch):
            os.makedirs(self.scratch)

        self.SE_flag = 'SE'
        self.PE_flag = 'PE'

        SERVICE_VER = 'release'

        # readsUtils_Client
        try:
            self.readsUtils_Client = ReadsUtils(self.callbackURL,
                                                token=self.ctx['token'],
                                                service_ver=SERVICE_VER)
        except Exception as e:
            raise ValueError(
                'Unable to instantiate readsUtils_Client with callbackURL: ' +
                self.callbackURL + ' ERROR: ' + str(e))

        # setAPI_Client
        try:
            #setAPI_Client = SetAPI (url=self.callbackURL, token=self.ctx['token'])  # for SDK local.  local doesn't work for SetAPI
            self.setAPI_Client = SetAPI(
                url=self.serviceWizardURL,
                token=self.ctx['token'])  # for dynamic service
        except Exception as e:
            raise ValueError(
                'Unable to instantiate setAPI_Client with serviceWizardURL: ' +
                self.serviceWizardURL + ' ERROR: ' + str(e))

    def expand_input(self, input_refs):
        '''
        Expand input based on an input data reference for Kaiju

        input_refs can be a list of references to a PairedEndLibrary, a SingleEndLibrary, or a ReadsSet
        '''
        # config
        #SERVICE_VER = 'dev'
        SERVICE_VER = 'release'

        # expand any sets and build a non-redundant list of reads input objs
        ws = Workspace(self.ws_url)
        expanded_input = []
        input_ref_seen = dict()
        SE_types = [
            'KBaseFile.SingleEndLibrary', 'KBaseAssembly.SingleEndLibrary'
        ]
        PE_types = [
            'KBaseFile.PairedEndLibrary', 'KBaseAssembly.PairedEndLibrary'
        ]

        [
            OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
            WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
        ] = range(11)  # object_info tuple
        for input_ref in input_refs:
            input_info = ws.get_object_info3({'objects': [{
                'ref': input_ref
            }]})['infos'][0]
            obj_name = input_info[NAME_I]
            type_name = input_info[TYPE_I].split('-')[0]

            # ReadsSet
            if type_name in ['KBaseSets.ReadsSet']:
                try:
                    input_readsSet_obj = self.setAPI_Client.get_reads_set_v1({
                        'ref':
                        input_ref,
                        'include_item_info':
                        1
                    })

                except Exception as e:
                    raise ValueError(
                        'SetAPI FAILURE: Unable to get read library set object from workspace: ('
                        + str(input_ref) + ")\n" + str(e))

                for readsLibrary_obj in input_readsSet_obj['data']['items']:
                    this_reads_ref = readsLibrary_obj['ref']
                    if this_reads_ref in input_ref_seen:
                        continue
                    input_ref_seen[this_reads_ref] = True

                    this_reads_name = readsLibrary_obj['info'][NAME_I]
                    reads_item_type = readsLibrary_obj['info'][TYPE_I]
                    reads_item_type = re.sub(
                        '-[0-9]+\.[0-9]+$', "",
                        reads_item_type)  # remove trailing version
                    if reads_item_type in PE_types:
                        this_reads_type = self.PE_flag
                    elif reads_item_type in SE_types:
                        this_reads_type = self.SE_flag
                    else:
                        raise ValueError("Can't handle read item type '" +
                                         reads_item_type + "' obj_name: '" +
                                         this_reads_name + " in Set: '" +
                                         str(input_ref) + "'")
                    expanded_input.append({
                        'ref': this_reads_ref,
                        'name': this_reads_name,
                        'type': this_reads_type
                    })
            # SingleEnd Library
            elif type_name in SE_types:
                this_reads_ref = input_ref
                if this_reads_ref in input_ref_seen:
                    continue
                input_ref_seen[this_reads_ref] = True
                this_reads_name = obj_name
                this_reads_type = self.SE_flag
                expanded_input.append({
                    'ref': this_reads_ref,
                    'name': this_reads_name,
                    'type': this_reads_type
                })
            # PairedEnd Library
            elif type_name in PE_types:
                this_reads_ref = input_ref
                if this_reads_ref in input_ref_seen:
                    continue
                input_ref_seen[this_reads_ref] = True
                this_reads_name = obj_name
                this_reads_type = self.PE_flag
                expanded_input.append({
                    'ref': this_reads_ref,
                    'name': this_reads_name,
                    'type': this_reads_type
                })
            else:
                raise ValueError("Illegal type in input_refs: " +
                                 str(obj_name) + " (" + str(input_ref) +
                                 ") is of type: '" + str(type_name) + "'")

        return expanded_input

    def stage_input(self,
                    input_item=None,
                    subsample_percent=10,
                    subsample_replicates=1,
                    subsample_seed=1,
                    fasta_file_extension='fastq'):
        '''
        Stage input based on an input data reference for Kaiju

        input_refs can be a list of references to a PairedEndLibrary, a SingleEndLibrary, or a ReadsSet

        This method creates a directory in the scratch area with the set of Fasta/Fastq files, names
        will have the fasta_file_extension parameter tacked on.

            ex:

            staged_input = stage_input({'ref':<ref>,'name':<name>,'type':<type>}, subsample_percent, subsample_replicates, subsample_seed, 'fastq')

            staged_input
            {"input_dir": '...'}
        '''
        # init
        staged_input = dict()
        replicate_input = []

        # config
        #SERVICE_VER = 'dev'
        SERVICE_VER = 'release'

        # generate a folder in scratch to hold the input
        suffix = str(int(time.time() * 1000))
        input_dir = os.path.join(self.scratch, 'input_reads_' + suffix)
        if not os.path.exists(input_dir):
            os.makedirs(input_dir)

        #
        # Download reads
        #

        # Paired End Lib
        if input_item['type'] == self.PE_flag:
            try:
                readsLibrary = self.readsUtils_Client.download_reads({
                    'read_libraries': [input_item['ref']],
                    'interleaved':
                    'false'
                })
            except Exception as e:
                raise ValueError(
                    'Unable to get read library object from workspace: (' +
                    str(input_item['ref']) + ")\n" + str(e))

            input_fwd_file_path = readsLibrary['files'][
                input_item['ref']]['files']['fwd']
            input_rev_file_path = readsLibrary['files'][
                input_item['ref']]['files']['rev']
            fwd_filename = os.path.join(
                input_dir, input_item['name'] + '.fwd.' + fasta_file_extension)
            rev_filename = os.path.join(
                input_dir, input_item['name'] + '.rev.' + fasta_file_extension)
            if input_fwd_file_path != fwd_filename:
                shutil.move(input_fwd_file_path, fwd_filename)
            if input_rev_file_path != rev_filename:
                shutil.move(input_rev_file_path, rev_filename)
            input_item['fwd_file'] = fwd_filename
            input_item['rev_file'] = rev_filename

            if not os.path.isfile(fwd_filename):
                raise ValueError('Error generating reads file ' + fwd_filename)
            if not os.path.isfile(rev_filename):
                raise ValueError('Error generating reads file ' + rev_filename)
            # make sure fasta file isn't empty
            min_fasta_len = 1
            if not self._fasta_seq_len_at_least(fwd_filename, min_fasta_len):
                raise ValueError('Reads Library is empty in filename: ' +
                                 str(fwd_filename))
            if not self._fasta_seq_len_at_least(rev_filename, min_fasta_len):
                raise ValueError('Reads Library is empty in filename: ' +
                                 str(rev_filename))

        # Single End Lib
        elif input_item['type'] == self.SE_flag:
            try:
                readsLibrary = self.readsUtils_Client.download_reads(
                    {'read_libraries': [input_item['ref']]})
            except Exception as e:
                raise ValueError(
                    'Unable to get read library object from workspace: (' +
                    str(input_item['ref']) + ")\n" + str(e))

            input_fwd_file_path = readsLibrary['files'][
                input_item['ref']]['files']['fwd']
            fwd_filename = os.path.join(
                input_dir, input_item['name'] + '.fwd.' + fasta_file_extension)
            if input_fwd_file_path != fwd_filename:
                shutil.move(input_fwd_file_path, fwd_filename)
            input_item['fwd_file'] = fwd_filename

            if not os.path.isfile(fwd_filename):
                raise ValueError('Error generating reads file ' + fwd_filename)
            # make sure fasta file isn't empty
            min_fasta_len = 1
            if not self._fasta_seq_len_at_least(fwd_filename, min_fasta_len):
                raise ValueError('Reads Library is empty in filename: ' +
                                 str(fwd_filename))

        else:
            raise ValueError("No type set for input library " +
                             str(input_item['name']) + " (" +
                             str(input_item['ref']) + ")")

        #
        # Subsample
        #

        if subsample_percent == 100:
            replicate_input = [input_item]
        else:
            replicate_input = self._randomly_subsample_reads(
                input_item,
                subsample_percent=subsample_percent,
                subsample_replicates=subsample_replicates,
                subsample_seed=subsample_seed)
            # free up disk
            os.remove(input_item['fwd_file'])
            if input_item['type'] == self.PE_flag:
                os.remove(input_item['rev_file'])

        # return input file info
        #staged_input['input_dir'] = input_dir
        #staged_input['folder_suffix'] = suffix
        staged_input['replicate_input'] = replicate_input
        return staged_input

    def _randomly_subsample_reads(self,
                                  input_item=None,
                                  subsample_percent=100,
                                  subsample_replicates=1,
                                  subsample_seed=1):

        replicate_files = []
        split_num = subsample_replicates

        # for now can only do percentage instead of raw cnt of reads per subsample
        use_reads_num = False
        use_reads_perc = True
        reads_num = 0  # not used.  subsample_percent used instead

        # init randomizer
        random.seed(subsample_seed)

        # Paired End
        #
        if input_item['type'] == self.PE_flag:
            print("SUBSAMPLING PE library " + input_item['name'])  # DEBUG

            # file paths
            input_fwd_path = re.sub("\.fastq$", "", input_item['fwd_file'])
            input_fwd_path = re.sub("\.FASTQ$", "", input_fwd_path)
            input_rev_path = re.sub("\.fastq$", "", input_item['rev_file'])
            input_rev_path = re.sub("\.FASTQ$", "", input_rev_path)
            output_fwd_paired_file_path_base = input_fwd_path + "_fwd_paired"
            output_rev_paired_file_path_base = input_rev_path + "_rev_paired"

            # set up for file io
            total_paired_reads = 0
            total_unpaired_fwd_reads = 0
            total_unpaired_rev_reads = 0
            total_paired_reads_by_set = []
            fwd_ids = dict()
            paired_ids = dict()
            paired_ids_list = []
            paired_lib_i = dict()
            paired_buf_size = 100000
            recs_beep_n = 1000000

            # read fwd file to get fwd ids
            #            rec_cnt = 0  # DEBUG
            print("GETTING IDS")  # DEBUG
            with open(input_item['fwd_file'], 'r') as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        read_id = line.rstrip('\n')
                        read_id = re.sub("[ \t]+.*$", "", read_id)
                        read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$",
                                         "", read_id)
                        fwd_ids[read_id] = True

                        # DEBUG
#                        if rec_cnt % 100 == 0:
#                            print ("read_id: '"+str(read_id)+"'")
#                        rec_cnt += 1

            # read reverse file to determine paired ids
            print("DETERMINING PAIRED IDS")  # DEBUG
            with open(input_item['rev_file'], 'r') as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        read_id = line.rstrip('\n')
                        read_id = re.sub("[ \t]+.*$", "", read_id)
                        read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$",
                                         "", read_id)
                        if read_id in fwd_ids:
                            paired_ids[read_id] = True
                            paired_ids_list.append(read_id)

                        # DEBUG
#                        if rec_cnt % 100 == 0:
#                            print ("read_id: '"+str(read_id)+"'")
#                        rec_cnt += 1
            total_paired_reads = len(paired_ids_list)
            print("TOTAL PAIRED READS CNT: " +
                  str(total_paired_reads))  # DEBUG

            # Determine sublibrary sizes
            if use_reads_num:
                reads_per_lib = reads_num
                if reads_per_lib > total_paired_reads // split_num:
                    raise ValueError(
                        "must specify reads_num <= total_paired_reads_cnt / split_num.  You have reads_num:"
                        + str(reads_num) + " > total_paired_reads_cnt:" +
                        str(total_paired_reads) + " / split_num:" +
                        str(split_num) + ".  Instead try reads_num <= " +
                        str(total_paired_reads // split_num))
            elif use_reads_perc:
                reads_per_lib = int(
                    (subsample_percent / 100.0) * total_paired_reads)
                if reads_per_lib > total_paired_reads // split_num:
                    raise ValueError(
                        "must specify reads_perc <= 1 / split_num.  You have reads_perc:"
                        + str(subsample_percent) + " > 1 / split_num:" +
                        str(split_num) + ".  Instead try reads_perc <= " +
                        str(int(100 * 1 / split_num)))
            else:
                raise ValueError(
                    "error in logic reads_num vs. reads_perc logic")

            # Determine random membership in each sublibrary
            print("GETTING RANDOM SUBSAMPLES")  # DEBUG
            for i, read_id in enumerate(
                    random.sample(paired_ids_list, reads_per_lib * split_num)):
                lib_i = i % split_num
                paired_lib_i[read_id] = lib_i

            # split fwd paired
            print("WRITING FWD SPLIT PAIRED")  # DEBUG
            paired_output_reads_file_handles = []
            for lib_i in range(split_num):
                paired_output_reads_file_handles.append(
                    open(
                        output_fwd_paired_file_path_base + "-" + str(lib_i) +
                        ".fastq", 'w', paired_buf_size))
                total_paired_reads_by_set.append(0)

            rec_buf = []
            last_read_id = None
            paired_cnt = 0
            capture_type_paired = False

            with open(input_item['fwd_file'], 'r') as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        if last_read_id != None:
                            if capture_type_paired:
                                lib_i = paired_lib_i[last_read_id]
                                paired_output_reads_file_handles[
                                    lib_i].writelines(rec_buf)
                                paired_cnt += 1
                                total_paired_reads_by_set[lib_i] += 1
                                if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                                    print("\t" + str(paired_cnt) +
                                          " recs processed")
                            else:
                                #unpaired_fwd_buf.extend(rec_buf)
                                pass
                            rec_buf = []
                        read_id = line.rstrip('\n')
                        read_id = re.sub("[ \t]+.*$", "", read_id)
                        read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$",
                                         "", read_id)
                        last_read_id = read_id
                        if read_id in paired_lib_i:
                            capture_type_paired = True
                        else:
                            total_unpaired_fwd_reads += 1
                            capture_type_paired = False
                    rec_buf.append(line)
                # last rec
                if len(rec_buf) > 0:
                    if capture_type_paired:
                        lib_i = paired_lib_i[last_read_id]
                        paired_output_reads_file_handles[lib_i].writelines(
                            rec_buf)
                        paired_cnt += 1
                        if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                            print("\t" + str(paired_cnt) + " recs processed")
                    else:
                        #unpaired_fwd_buf.extend(rec_buf)
                        pass
                    rec_buf = []

            for output_handle in paired_output_reads_file_handles:
                output_handle.close()

            print("\t" + str(paired_cnt) + " FWD recs processed")

            # split rev paired
            print("WRITING REV SPLIT PAIRED")  # DEBUG
            paired_output_reads_file_handles = []
            for lib_i in range(split_num):
                paired_output_reads_file_handles.append(
                    open(
                        output_rev_paired_file_path_base + "-" + str(lib_i) +
                        ".fastq", 'w', paired_buf_size))

            rec_buf = []
            last_read_id = None
            paired_cnt = 0
            capture_type_paired = False

            with open(input_item['rev_file'], 'r') as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        if last_read_id != None:
                            if capture_type_paired:
                                lib_i = paired_lib_i[last_read_id]
                                paired_output_reads_file_handles[
                                    lib_i].writelines(rec_buf)
                                paired_cnt += 1
                                if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                                    print("\t" + str(paired_cnt) +
                                          " recs processed")
                            else:
                                #unpaired_fwd_buf.extend(rec_buf)
                                pass
                            rec_buf = []
                        read_id = line.rstrip('\n')
                        read_id = re.sub("[ \t]+.*$", "", read_id)
                        read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$",
                                         "", read_id)
                        last_read_id = read_id
                        if read_id in paired_lib_i:
                            capture_type_paired = True
                        else:
                            total_unpaired_rev_reads += 1
                            capture_type_paired = False
                    rec_buf.append(line)
                # last rec
                if len(rec_buf) > 0:
                    if capture_type_paired:
                        lib_i = paired_lib_i[last_read_id]
                        paired_output_reads_file_handles[lib_i].writelines(
                            rec_buf)
                        paired_cnt += 1
                        if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                            print("\t" + str(paired_cnt) + " recs processed")
                    else:
                        #unpaired_fwd_buf.extend(rec_buf)
                        pass
                    rec_buf = []

            for output_handle in paired_output_reads_file_handles:
                output_handle.close()

            print("\t" + str(paired_cnt) + " REV recs processed")

            # summary
            report = 'SUMMARY FOR SUBSAMPLE OF READ LIBRARY: ' + input_item[
                'name'] + "\n"
            report += "TOTAL PAIRED READS: " + str(total_paired_reads) + "\n"
            report += "TOTAL UNPAIRED FWD READS (discarded): " + str(
                total_unpaired_fwd_reads) + "\n"
            report += "TOTAL UNPAIRED REV READS (discarded): " + str(
                total_unpaired_rev_reads) + "\n"
            report += "\n"
            for lib_i in range(split_num):
                report += "PAIRED READS IN SET " + str(lib_i) + ": " + str(
                    total_paired_reads_by_set[lib_i]) + "\n"
            print(report)

            # make replicate objects to return
            #        for replicate_i,replicate_item in enumerate(replicate_files):
            #            replicate_input.append({'fwd_file': replicate_item['fwd_file'],
            #                                    'type': input_item['type'],
            #                                    'name': input_item['name']+"-"+str(replicate_i)
            #                                })
            #            if input_item['type'] == self.PE_flag:
            #                replicate_input[replicate_i]['rev_file'] = replicate_item['rev_file']

            print("MAKING REPLICATE OBJECT")  # DEBUG
            paired_obj_refs = []
            for lib_i in range(split_num):
                output_fwd_paired_file_path = output_fwd_paired_file_path_base + "-" + str(
                    lib_i) + ".fastq"
                output_rev_paired_file_path = output_rev_paired_file_path_base + "-" + str(
                    lib_i) + ".fastq"
                if (not os.path.isfile(output_fwd_paired_file_path)
                        or os.path.getsize(output_fwd_paired_file_path) == 0
                        or not os.path.isfile(output_rev_paired_file_path)
                        or os.path.getsize(output_rev_paired_file_path) == 0):

                    raise ValueError("failed to create paired output")
                else:
                    zero_pad = '0' * (len(str(split_num)) -
                                      len(str(lib_i + 1)))
                    replicate_files.append({
                        'fwd_file':
                        output_fwd_paired_file_path,
                        'rev_file':
                        output_rev_paired_file_path,
                        'ref':
                        input_item[
                            'ref'],  # note: this is for the src, not the subsample which is not saved
                        'type':
                        input_item['type'],
                        'name':
                        input_item['name'] + '-' + zero_pad + str(lib_i + 1)
                    })

        # SingleEndLibrary
        #
        elif input_item['type'] == self.SE_flag:
            print("SUBSAMPLING SE library " + input_item['name'])

            # file paths
            input_fwd_path = re.sub("\.fastq$", "", input_item['fwd_file'])
            input_fwd_path = re.sub("\.FASTQ$", "", input_fwd_path)
            output_fwd_paired_file_path_base = input_fwd_path + "_fwd_paired"

            # get "paired" ids
            print("DETERMINING IDS")  # DEBUG
            paired_ids = dict()
            paired_ids_list = []
            paired_lib_i = dict()
            paired_buf_size = 100000
            recs_beep_n = 100000

            with open(input_item['fwd_file'], 'r') as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        read_id = line.rstrip('\n')
                        read_id = re.sub("[ \t]+.*$", "", read_id)
                        read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$",
                                         "", read_id)
                        if read_id in paired_ids:
                            raise ValueError("repeat read_id: " + read_id)
                        paired_ids[read_id] = True
                        paired_ids_list.append(read_id)
                        # DEBUG
#                        if rec_cnt % 100 == 0:
#                            print ("read_id: '"+str(read_id)+"'")
#                        rec_cnt += 1
            total_paired_reads = len(paired_ids_list)
            print("TOTAL READS CNT: " + str(total_paired_reads))  # DEBUG

            # Determine sublibrary sizes
            if use_reads_num:
                reads_per_lib = reads_num
                if reads_per_lib > total_paired_reads // split_num:
                    raise ValueError(
                        "must specify reads_num <= total_paired_reads_cnt / split_num.  You have reads_num:"
                        + str(reads_num) + " > total_paired_reads_cnt:" +
                        str(total_paired_reads) + " / split_num:" +
                        str(split_num) + ".  Instead try reads_num <= " +
                        str(total_paired_reads // split_num))
            elif use_reads_perc:
                reads_per_lib = int(
                    (subsample_percent / 100.0) * total_paired_reads)
                if reads_per_lib > total_paired_reads // split_num:
                    raise ValueError(
                        "must specify reads_perc <= 1 / split_num.  You have reads_perc:"
                        + str(subsample_percent) + " > 1 / split_num:" +
                        str(split_num) + ".  Instead try reads_perc <= " +
                        str(int(100 * 1 / split_num)))
            else:
                raise ValueError(
                    "error in logic reads_num vs. reads_perc logic")

            # Determine random membership in each sublibrary
            print("GETTING RANDOM SUBSAMPLES")  # DEBUG
            for i, read_id in enumerate(
                    random.sample(paired_ids_list, reads_per_lib * split_num)):
                lib_i = i % split_num
                paired_lib_i[read_id] = lib_i

            # set up for file io
            total_paired_reads = 0
            total_paired_reads_by_set = []
            paired_buf_size = 1000000

            # split reads
            print("WRITING SPLIT SINGLE END READS")  # DEBUG
            paired_output_reads_file_handles = []
            for lib_i in range(split_num):
                paired_output_reads_file_handles.append(
                    open(
                        output_fwd_paired_file_path_base + "-" + str(lib_i) +
                        ".fastq", 'w', paired_buf_size))
                total_paired_reads_by_set.append(0)

            rec_buf = []
            last_read_id = None
            paired_cnt = 0
            recs_beep_n = 1000000
            with open(input_item['fwd_file'], 'r') as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        total_paired_reads += 1
                        if last_read_id != None:
                            try:
                                lib_i = paired_lib_i[last_read_id]
                                total_paired_reads_by_set[lib_i] += 1
                                paired_output_reads_file_handles[
                                    lib_i].writelines(rec_buf)
                                paired_cnt += 1
                            except KeyError:
                                # read was not selected for any sublibrary
                                pass
                            if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                                print("\t" + str(paired_cnt) +
                                      " recs processed")
                            rec_buf = []
                        read_id = line.rstrip('\n')
                        read_id = re.sub("[ \t]+.*$", "", read_id)
                        read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$",
                                         "", read_id)
                        last_read_id = read_id
                    rec_buf.append(line)
                # last rec
                if len(rec_buf) > 0:
                    if last_read_id != None:
                        try:
                            lib_i = paired_lib_i[last_read_id]
                            total_paired_reads_by_set[lib_i] += 1
                            paired_output_reads_file_handles[lib_i].writelines(
                                rec_buf)
                            paired_cnt += 1
                        except KeyError:
                            # read was not selected for any sublibrary
                            pass
                    if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                        print("\t" + str(paired_cnt) + " recs processed")
                    rec_buf = []

            for output_handle in paired_output_reads_file_handles:
                output_handle.close()

            # summary
            report = 'SUMMARY FOR SUBSAMPLE OF READ LIBRARY: ' + input_item[
                'name'] + "\n"
            report += "TOTAL READS: " + str(total_paired_reads) + "\n"
            for lib_i in range(split_num):
                report += "SINGLE END READS IN SET " + str(lib_i) + ": " + str(
                    total_paired_reads_by_set[lib_i]) + "\n"
            print(report)

            # make replicate objects to return
            print("MAKING REPLICATE OBJECTS")  # DEBUG
            paired_obj_refs = []
            for lib_i in range(split_num):
                output_fwd_paired_file_path = output_fwd_paired_file_path_base + "-" + str(
                    lib_i) + ".fastq"
                if (not os.path.isfile(output_fwd_paired_file_path)
                        or os.path.getsize(output_fwd_paired_file_path) == 0):

                    raise ValueError("failed to create paired output")
                else:
                    zero_pad = '0' * (len(str(split_num)) -
                                      len(str(lib_i + 1)))
                    replicate_files.append({
                        'fwd_file':
                        output_fwd_paired_file_path,
                        'ref':
                        input_item[
                            'ref'],  # note: this is for the src, not the subsample which is not saved
                        'type':
                        input_item['type'],
                        'name':
                        input_item['name'] + '-' + zero_pad + str(lib_i + 1)
                    })

        else:
            raise ValueError("unknown ReadLibrary type:" +
                             str(input_item['type']) + " for readslibrary: " +
                             input_item['name'])

        return replicate_files

    def _fasta_seq_len_at_least(self, fasta_path, min_fasta_len=1):
        '''
        counts the number of non-header, non-whitespace characters in a FASTA file
        '''
        seq_len = 0
        with open(fasta_path, 'r') as fasta_handle:
            for line in fasta_handle:
                line = line.strip()
                if line.startswith('>'):
                    continue
                line = line.replace(' ', '')
                seq_len += len(line)
                if seq_len >= min_fasta_len:
                    return True
        return False
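
# Hedged usage sketch (not part of the example above): expanding a ReadsSet reference and
# staging one library with subsampling. The config values and the '1/2/3' reference are
# placeholders for illustration only.
#
# staging = DataStagingUtils({'scratch': '/kb/module/work/tmp',
#                             'workspace-url': 'https://kbase.us/services/ws',
#                             'srv-wiz-url': 'https://kbase.us/services/service_wizard',
#                             'SDK_CALLBACK_URL': os.environ['SDK_CALLBACK_URL']},
#                            ctx)
# expanded = staging.expand_input(['1/2/3'])              # e.g. a KBaseSets.ReadsSet ref
# staged = staging.stage_input(input_item=expanded[0],
#                              subsample_percent=10,
#                              subsample_replicates=3,
#                              subsample_seed=1,
#                              fasta_file_extension='fastq')
# replicates = staged['replicate_input']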
Example #20
class QualiMapRunner:

    QUALIMAP_PATH = '/kb/module/qualimap-bin/qualimap'

    def __init__(self, scratch_dir, callback_url, workspace_url, srv_wiz_url):
        self.scratch_dir = scratch_dir
        self.rau = ReadsAlignmentUtils(callback_url)
        self.kbr = KBaseReport(callback_url)
        self.dfu = DataFileUtil(callback_url)
        self.set_api = SetAPI(srv_wiz_url)
        self.ws = Workspace(workspace_url)
        self.valid_commands = ['bamqc', 'multi-bamqc']

    def run_app(self, params):
        self.validate_params(params)
        print('Validated Params = ')
        pprint(params)
        run_info = self.get_run_info(params)

        if run_info['mode'] == 'single':
            result = self.run_bamqc(params['input_ref'],
                                    run_info['input_info'])
        elif run_info['mode'] == 'multi':
            result = self.run_multi_sample_qc(params['input_ref'],
                                              run_info['input_info'])
        else:
            raise ValueError(
                'Error in fetching the type to determine run settings.')

        if params['create_report']:
            result = self.create_report(result, params['output_workspace'])

        return result

    def create_report(self, result, output_workspace):
        qc_result_zip_info = result['qc_result_zip_info']
        report_info = self.kbr.create_extended_report({
            'message':
            '',
            'objects_created': [],
            'direct_html_link_index':
            0,
            'html_links': [{
                'shock_id': qc_result_zip_info['shock_id'],
                'name': qc_result_zip_info['index_html_file_name'],
                'label': qc_result_zip_info['name']
            }],
            'report_object_name':
            'qualimap_report' + str(uuid.uuid4()),
            'workspace_name':
            output_workspace
        })
        result['report_name'] = report_info['name']
        result['report_ref'] = report_info['ref']
        return result

    def run_bamqc(self, input_ref, input_info):
        # download the input and setup a working dir
        alignment_info = self.rau.download_alignment({'source_ref': input_ref})
        bam_file_path = self.find_my_bam_file(
            alignment_info['destination_dir'])
        workdir = os.path.join(self.scratch_dir,
                               'qualimap_' + str(int(time.time() * 10000)))

        options = [
            '-bam', bam_file_path, '-outdir', workdir, '-outformat', 'html'
        ]
        self.run_cli_command('bamqc', options)

        package_info = self.package_output_folder(
            workdir, 'QualiMap_report',
            'HTML report directory for QualiMap BAM QC', 'qualimapReport.html')

        return {
            'qc_result_folder_path': workdir,
            'qc_result_zip_info': package_info
        }

    def run_multi_sample_qc(self, input_ref, input_info):
        # download the input and setup a working dir
        reads_alignment_info = self.get_alignments_from_set(input_ref)
        suffix = 'qualimap_' + str(int(time.time() * 10000))
        workdir = os.path.join(self.scratch_dir, suffix)
        os.makedirs(workdir)

        input_file_path = self.create_multi_qualimap_cfg(
            reads_alignment_info, workdir)

        options = [
            '-d', input_file_path, '-r', '-outdir', workdir, '-outformat',
            'html'
        ]
        self.run_cli_command('multi-bamqc', options)

        package_info = self.package_output_folder(
            workdir, 'QualiMap_report',
            'HTML report directory for QualiMap Multi-sample BAM QC',
            'multisampleBamQcReport.html')

        return {
            'qc_result_folder_path': workdir,
            'qc_result_zip_info': package_info
        }

    def get_alignments_from_set(self, alignment_set_ref):
        set_data = self.set_api.get_reads_alignment_set_v1({
            'ref':
            alignment_set_ref,
            'include_item_info':
            1
        })
        items = set_data['data']['items']

        reads_alignment_data = []
        for alignment in items:
            alignment_info = self.rau.download_alignment(
                {'source_ref': alignment['ref']})
            bam_file_path = self.find_my_bam_file(
                alignment_info['destination_dir'])
            label = None
            if 'label' in alignment:
                label = alignment['label']
            reads_alignment_data.append({
                'bam_file_path': bam_file_path,
                'ref': alignment['ref'],
                'label': label,
                'info': alignment['info']
            })
        return reads_alignment_data

    def create_multi_qualimap_cfg(self, reads_alignment_info, workdir):
        # Group by labels if there is at least one defined
        use_labels = False
        for alignment in reads_alignment_info:
            if alignment['label']:
                use_labels = True
                break

        # write the file
        input_file_path = os.path.join(workdir, 'multi_input.txt')
        input_file = open(input_file_path, 'w')
        name_lookup = {}
        for alignment in reads_alignment_info:
            name = alignment['info'][1]
            if name in name_lookup:
                name_lookup[name] += 1
                name = name + '_' + str(name_lookup[name])
            else:
                name_lookup[name] = 1

            input_file.write(name + '\t' + alignment['bam_file_path'])
            if use_labels:
                if alignment['label']:
                    input_file.write('\t' + alignment['label'])
                else:
                    input_file.write('\tunlabeled')
            input_file.write('\n')
        input_file.close()
        return input_file_path

    def get_run_info(self, params):
        info = self.get_obj_info(params['input_ref'])
        obj_type = self.get_type_from_obj_info(info)
        if obj_type in ['KBaseRNASeq.RNASeqAlignment']:
            return {'mode': 'single', 'input_info': info}
        if obj_type in [
                'KBaseRNASeq.RNASeqAlignmentSet', 'KBaseSets.ReadsAlignmentSet'
        ]:
            return {'mode': 'multi', 'input_info': info}
        raise ValueError('Object type of input_ref is not valid, was: ' +
                         str(obj_type))

    def validate_params(self, params):
        if 'input_ref' not in params:
            raise ValueError(
                'required parameter field "input_ref" was not set')

        create_report = False
        if 'create_report' in params:
            if int(params['create_report']) == 1:
                if 'output_workspace' not in params:
                    raise ValueError(
                        'If "create_report" was set, then "output_workspace" is required'
                    )
                if not params['output_workspace']:
                    raise ValueError(
                        'If "create_report" was set, then "output_workspace" is required'
                    )
                create_report = True
        params['create_report'] = create_report

    def run_cli_command(self, command, options, cwd=None):
        if command not in self.valid_commands:
            raise ValueError('Invalid QualiMap command: ' + str(command))
        command = [self.QUALIMAP_PATH, command] + options
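        # the assembled command looks like (hypothetical path):
        #   ['/path/to/qualimap', 'multi-bamqc', '-d', <cfg file>, '-r', '-outdir', <workdir>, '-outformat', 'html']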
        print('Running: ' + ' '.join(command))

        if not cwd:
            cwd = self.scratch_dir

        p = subprocess.Popen(command, cwd=cwd, shell=False)
        exitCode = p.wait()

        if (exitCode == 0):
            print('Success, exit code was: ' + str(exitCode))
        else:
            raise ValueError('Error running command: ' + ' '.join(command) +
                             '\n' + 'Exit Code: ' + str(exitCode))

    def find_my_bam_file(self, dirpath):
        bam_path = None
        for f in os.listdir(dirpath):
            fullpath = os.path.join(dirpath, f)
            if os.path.isfile(fullpath) and f.lower().endswith('.bam'):
                if bam_path is not None:
                    raise ValueError(
                        'Error! Too many BAM files were downloaded for this alignment!'
                    )
                bam_path = fullpath
        if bam_path is None:
            raise ValueError(
                'Error! No BAM files were downloaded for this alignment!')
        return bam_path

    def package_output_folder(self, folder_path, zip_file_name,
                              zip_file_description, index_html_file):
        ''' Simple utility for packaging a folder and saving to shock '''
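        # 'pack': 'zip' tells DataFileUtil to zip the folder contents before uploading to Shock;
        # 'index_html_file_name' records which file inside the zip should be opened first,
        # presumably consumed downstream when the HTML report link is built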
        output = self.dfu.file_to_shock({
            'file_path': folder_path,
            'make_handle': 0,
            'pack': 'zip'
        })
        return {
            'shock_id': output['shock_id'],
            'name': zip_file_name,
            'description': zip_file_description,
            'index_html_file_name': index_html_file
        }

    def get_type_from_obj_info(self, info):
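        # info[2] is the full type string, e.g. 'KBaseRNASeq.RNASeqAlignment-1.6';
        # splitting on '-' drops the trailing type version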
        return info[2].split('-')[0]

    def get_obj_info(self, ref):
        return self.ws.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0]
Example #21
0
    def exec_remove_adapters(self, ctx, params):
        """
        :param params: instance of type "RemoveAdaptersParams" -> structure:
           parameter "output_workspace" of String, parameter
           "output_object_name" of String, parameter "input_reads" of type
           "ws_ref" (@ref ws), parameter "five_prime" of type
           "FivePrimeOptions" (unfortunately, we have to name the fields
           uniquely between 3' and 5' options due to the current
           implementation of grouped parameters) -> structure: parameter
           "adapter_sequence_5P" of String, parameter "anchored_5P" of type
           "boolean" (@range (0, 1)), parameter "three_prime" of type
           "ThreePrimeOptions" -> structure: parameter "adapter_sequence_3P"
           of String, parameter "anchored_3P" of type "boolean" (@range (0,
           1)), parameter "error_tolerance" of Double, parameter
           "min_overlap_length" of Long, parameter "min_read_length" of Long,
           parameter "discard_untrimmed" of type "boolean" (@range (0, 1))
        :returns: instance of type "exec_RemoveAdaptersResult" -> structure:
           parameter "report" of String, parameter "output_reads_ref" of
           String
        """
        # ctx is the context object
        # return variables are: result
        #BEGIN exec_remove_adapters
        console = []
        self.log(console, 'Running exec_remove_adapters() with parameters: ')
        self.log(console, "\n" + pformat(params))
        self.log(console, "-----------------------------------------------\n")
        report = ''
        returnVal = dict()
        returnVal['output_reads_ref'] = None

        token = ctx['token']
        wsClient = workspaceService(self.config['workspace-url'], token=token)
        ws = Workspace(self.config['workspace-url'], token=token)
        #setAPI_Client = SetAPI (url=self.config['SDK_CALLBACK_URL'], token=token) # for SDK local, doesn't work for SetAPI
        setAPI_Client = SetAPI(url=self.config['service-wizard-url'],
                               token=token)  # for dynamic service
        headers = {'Authorization': 'OAuth ' + token}
        env = os.environ.copy()
        env['KB_AUTH_TOKEN'] = token

        # 0. param checks
        required_params = [
            'output_workspace', 'input_reads', 'output_object_name'
        ]
        for arg in required_params:
            if arg not in params or params[arg] == None or params[arg] == '':
                raise ValueError("Must define required param: '" + arg + "'")

        # 1. load provenance
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        # add additional info to provenance here, in this case the input data object reference
        provenance[0]['input_ws_objects'] = [str(params['input_reads'])]

        # 2. Determine whether the input object is a read library, ReadsSet, or RNASeqSampleSet
        #
        try:
            # object_info tuple
            [
                OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
                WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
            ] = range(11)

            input_reads_obj_info = wsClient.get_object_info_new(
                {'objects': [{
                    'ref': params['input_reads']
                }]})[0]
            input_reads_obj_type = input_reads_obj_info[TYPE_I]
            input_reads_obj_type = re.sub(
                r'-[0-9]+\.[0-9]+$', "",
                input_reads_obj_type)  # remove trailing version
            #input_reads_obj_version = input_reads_obj_info[VERSION_I]  # this is object version, not type version
        except Exception as e:
            raise ValueError(
                'Unable to get read library object from workspace: (' +
                str(params['input_reads']) + ')' + str(e))

        acceptable_types = [
            "KBaseSets.ReadsSet", "KBaseRNASeq.RNASeqSampleSet",
            "KBaseFile.PairedEndLibrary", "KBaseFile.SingleEndLibrary",
            "KBaseAssembly.PairedEndLibrary", "KBaseAssembly.SingleEndLibrary"
        ]
        if input_reads_obj_type not in acceptable_types:
            raise ValueError("Input reads of type: '" + input_reads_obj_type +
                             "'.  Must be one of " +
                             ", ".join(acceptable_types))

        # 3. Retrieve the set details
        #
        readsSet_ref_list = []
        readsSet_names_list = []
        readsSet_types_list = []
        if "KBaseSets.ReadsSet" in input_reads_obj_type:
            try:
                input_readsSet_obj = setAPI_Client.get_reads_set_v1({
                    'ref':
                    params['input_reads'],
                    'include_item_info':
                    1
                })

            except Exception as e:
                raise ValueError(
                    'SetAPI FAILURE: Unable to get read library set object from workspace: ('
                    + str(params['input_reads']) + ")\n" + str(e))
            for readsLibrary_obj in input_readsSet_obj['data']['items']:
                readsSet_ref_list.append(readsLibrary_obj['ref'])
                NAME_I = 1
                TYPE_I = 2
                readsSet_names_list.append(readsLibrary_obj['info'][NAME_I])
                this_type = readsLibrary_obj['info'][TYPE_I]
                this_type = re.sub(r'-[0-9]+\.[0-9]+$', "",
                                   this_type)  # remove trailing version
                readsSet_types_list.append(this_type)

        elif "KBaseRNASeq.RNASeqSampleSet" in input_reads_obj_type:
            sample_set = ws.get_objects2(
                {"objects": [{
                    "ref": params['input_reads']
                }]})["data"][0]["data"]
            sample_refs = list()
            for i in range(len(sample_set["sample_ids"])):
                readsSet_ref_list.append(sample_set["sample_ids"][i])
                sample_refs.append({"ref": sample_set["sample_ids"][i]})

            info = ws.get_object_info3({"objects": sample_refs})
            for j in range(len(info["infos"])):
                NAME_I = 1
                TYPE_I = 2
                readsSet_names_list.append(info["infos"][j][NAME_I])
                sample_type = info["infos"][j][TYPE_I]
                sample_type = re.sub(r'-[0-9]+\.[0-9]+$', "",
                                     sample_type)  # remove trailing version
                readsSet_types_list.append(sample_type)
        else:
            readsSet_ref_list = [params['input_reads']]
            readsSet_names_list = [params['output_object_name']]
            readsSet_types_list = [input_reads_obj_type]

        # 4. Iterate through readsLibrary members of the set
        #
        report = ''
        cutadapt_readsSet_ref = None
        cutadapt_readsLib_refs = []

        for reads_item_i, input_reads_library_ref in enumerate(
                readsSet_ref_list):
            exec_remove_adapters_OneLibrary_params = {
                'output_workspace': params['output_workspace'],
                'input_reads': input_reads_library_ref,
                'reads_type': readsSet_types_list[reads_item_i]
            }
            if (input_reads_obj_type != "KBaseSets.ReadsSet"
                    and input_reads_obj_type != "KBaseRNASeq.RNASeqSampleSet"):
                exec_remove_adapters_OneLibrary_params[
                    'output_object_name'] = params['output_object_name']
            else:
                exec_remove_adapters_OneLibrary_params[
                    'output_object_name'] = readsSet_names_list[
                        reads_item_i] + "_cutadapt"

            optional_params = [
                'error_tolerance', 'min_overlap_length',
                'min_read_length', 'discard_untrimmed'
            ]
            optional_g_params = {
                'five_prime': ['adapter_sequence_5P', 'anchored_5P'],
                'three_prime': ['adapter_sequence_3P', 'anchored_3P']
            }
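            # copy only the optional scalar and grouped (5'/3') fields the caller actually
            # provided, so unset options are simply not passed to exec_remove_adapters_OneLibrary()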
            for arg in optional_params:
                if arg in params and params[arg] != None:
                    exec_remove_adapters_OneLibrary_params[arg] = params[arg]

            for group in optional_g_params.keys():
                if group in params and params[group] != None:
                    exec_remove_adapters_OneLibrary_params[group] = dict()
                    for arg in optional_g_params[group]:
                        if arg in params[group] and params[group][arg] != None:
                            exec_remove_adapters_OneLibrary_params[group][
                                arg] = params[group][arg]

            msg = "\n\nRUNNING exec_remove_adapters_OneLibrary() ON LIBRARY: " + str(
                input_reads_library_ref) + " " + str(
                    readsSet_names_list[reads_item_i]) + "\n"
            msg += "----------------------------------------------------------------------------\n"
            report += msg
            self.log(console, msg)

            # RUN
            exec_remove_adapters_OneLibrary_retVal = self.exec_remove_adapters_OneLibrary(
                ctx, exec_remove_adapters_OneLibrary_params)[0]

            report += exec_remove_adapters_OneLibrary_retVal['report'] + "\n\n"
            cutadapt_readsLib_refs.append(
                exec_remove_adapters_OneLibrary_retVal['output_reads_ref'])

        # 5. Conclude
        # Just one Library
        if (input_reads_obj_type != "KBaseSets.ReadsSet"
                and input_reads_obj_type != "KBaseRNASeq.RNASeqSampleSet"):

            # create return output object
            result = {
                'report': report,
                'output_reads_ref': cutadapt_readsLib_refs[0],
            }
        # ReadsSet or SampleSet
        else:
            # save cutadapt readsSet
            some_cutadapt_output_created = False
            items = []
            for i, lib_ref in enumerate(cutadapt_readsLib_refs):

                if lib_ref == None:
                    #items.append(None)  # can't have 'None' items in ReadsSet
                    continue
                else:
                    some_cutadapt_output_created = True
                    try:
                        label = input_readsSet_obj['data']['items'][i]['label']
                    except:
                        NAME_I = 1
                        label = ws.get_object_info3(
                            {'objects': [{
                                'ref': lib_ref
                            }]})['infos'][0][NAME_I]
                    label = label + "_cutadapt"

                    items.append({
                        'ref': lib_ref,
                        'label': label
                        #'data_attachment': ,
                        #'info':
                    })
            if some_cutadapt_output_created:
                reads_desc_ext = " + Cutadapt"
                #reads_name_ext = "_cutadapt"
                descText = ""
                reads_name_ext = ""
                try:
                    descText = input_readsSet_obj['data']['description']
                except:
                    NAME_I = 1
                    descText = ws.get_object_info3(
                        {'objects': [{
                            'ref': params['input_reads']
                        }]})['infos'][0][NAME_I]
                descText = descText + reads_desc_ext

                output_readsSet_obj = {'description': descText, 'items': items}
                output_readsSet_name = str(
                    params['output_object_name']) + reads_name_ext
                cutadapt_readsSet_ref = setAPI_Client.save_reads_set_v1({
                    'workspace_name':
                    params['output_workspace'],
                    'output_object_name':
                    output_readsSet_name,
                    'data':
                    output_readsSet_obj
                })['set_ref']
            else:
                raise ValueError("No cutadapt output created")

            # create return output object
            result = {
                'report': report,
                'output_reads_ref': cutadapt_readsSet_ref
            }
        #END exec_remove_adapters

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method exec_remove_adapters return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]
Example #22
0
    def stage_input(self, input_ref, fasta_file_extension):
        '''
        Stage input based on an input data reference for CheckM

        input_ref can be a reference to an Assembly (or ContigSet), AssemblySet, Genome, GenomeSet, or BinnedContigs object

        This method creates a directory in the scratch area with the set of Fasta files; file
        names will have the fasta_file_extension parameter tacked on.

            ex:

            staged_input = stage_input('124/15/1', 'fna')

            staged_input
            {"input_dir": '...'}
        '''
        # config
        #SERVICE_VER = 'dev'
        SERVICE_VER = 'release'

        # generate a folder in scratch to hold the input
        suffix = str(int(time.time() * 1000))
        input_dir = os.path.join(self.scratch, 'bins_' + suffix)
        all_seq_fasta = os.path.join(
            self.scratch,
            'all_sequences_' + suffix + '.' + fasta_file_extension)
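        # all per-bin/assembly sequences are concatenated into all_seq_fasta at the end of this
        # method (see the cat_fasta_files call below)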
        if not os.path.exists(input_dir):
            os.makedirs(input_dir)

        # 2) based on type, download the files
        ws = Workspace(self.ws_url)
        input_info = ws.get_object_info3({'objects': [{
            'ref': input_ref
        }]})['infos'][0]
        # 0 obj_id objid - the numerical id of the object.
        # 1 obj_name name - the name of the object.
        # 2 type_string type - the type of the object.
        # 3 timestamp save_date - the save date of the object.
        # 4 obj_ver ver - the version of the object.
        # 5 username saved_by - the user that saved or copied the object.
        # 6 ws_id wsid - the workspace containing the object.
        # 7 ws_name workspace - the workspace containing the object.
        # 8 string chsum - the md5 checksum of the object.
        # 9 int size - the size of the object in bytes.
        # 10 usermeta meta - arbitrary user-supplied metadata about
        #     the object.
        [
            OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
            WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
        ] = range(11)  # object_info tuple
        obj_name = input_info[NAME_I]
        type_name = input_info[TYPE_I].split('-')[0]

        # auClient
        try:
            auClient = AssemblyUtil(self.callbackURL,
                                    token=self.ctx['token'],
                                    service_ver=SERVICE_VER)
        except Exception as e:
            raise ValueError(
                'Unable to instantiate auClient with callbackURL: ' +
                self.callbackURL + ' ERROR: ' + str(e))

        # setAPI_Client
        try:
            #setAPI_Client = SetAPI (url=self.callbackURL, token=self.ctx['token'])  # for SDK local.  local doesn't work for SetAPI
            setAPI_Client = SetAPI(
                url=self.serviceWizardURL,
                token=self.ctx['token'])  # for dynamic service
        except Exception as e:
            raise ValueError(
                'Unable to instantiate setAPI_Client with serviceWizardURL: ' +
                self.serviceWizardURL + ' ERROR: ' + str(e))

        # mguClient
        try:
            mguClient = MetagenomeUtils(self.callbackURL,
                                        token=self.ctx['token'],
                                        service_ver=SERVICE_VER)
        except Exception as e:
            raise ValueError(
                'Unable to instantiate mguClient with callbackURL: ' +
                self.callbackURL + ' ERROR: ' + str(e))

        # Standard Single Assembly
        #
        if type_name in [
                'KBaseGenomeAnnotations.Assembly', 'KBaseGenomes.ContigSet'
        ]:
            # create file data
            filename = os.path.join(input_dir,
                                    obj_name + '.' + fasta_file_extension)
            auClient.get_assembly_as_fasta({
                'ref': input_ref,
                'filename': filename
            })
            if not os.path.isfile(filename):
                raise ValueError(
                    'Error generating fasta file from an Assembly or ContigSet with AssemblyUtil'
                )
            # make sure fasta file isn't empty
            min_fasta_len = 1
            if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                raise ValueError(
                    'Assembly or ContigSet is empty in filename: ' +
                    str(filename))

        # AssemblySet
        #
        elif type_name == 'KBaseSets.AssemblySet':

            # read assemblySet
            try:
                assemblySet_obj = setAPI_Client.get_assembly_set_v1({
                    'ref':
                    input_ref,
                    'include_item_info':
                    1
                })
            except Exception as e:
                raise ValueError('Unable to get object from workspace: (' +
                                 input_ref + ')' + str(e))
            assembly_refs = []
            assembly_names = []
            for assembly_item in assemblySet_obj['data']['items']:
                this_assembly_ref = assembly_item['ref']
                # assembly obj info
                try:
                    this_assembly_info = ws.get_object_info_new(
                        {'objects': [{
                            'ref': this_assembly_ref
                        }]})[0]
                    this_assembly_name = this_assembly_info[NAME_I]
                except Exception as e:
                    raise ValueError('Unable to get object from workspace: (' +
                                     this_assembly_ref + '): ' + str(e))
                assembly_refs.append(this_assembly_ref)
                assembly_names.append(this_assembly_name)

            # create file data (name for file is what's reported in results)
            for ass_i, assembly_ref in enumerate(assembly_refs):
                this_name = assembly_names[ass_i]
                filename = os.path.join(input_dir,
                                        this_name + '.' + fasta_file_extension)
                auClient.get_assembly_as_fasta({
                    'ref': assembly_ref,
                    'filename': filename
                })
                if not os.path.isfile(filename):
                    raise ValueError(
                        'Error generating fasta file from an Assembly or ContigSet with AssemblyUtil'
                    )
                # make sure fasta file isn't empty
                min_fasta_len = 1
                if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                    raise ValueError(
                        'Assembly or ContigSet is empty in filename: ' +
                        str(filename))

        # Binned Contigs
        #
        elif type_name == 'KBaseMetagenomes.BinnedContigs':

            # download the bins as fasta and set the input folder name
            bin_file_dir = mguClient.binned_contigs_to_file({
                'input_ref': input_ref,
                'save_to_shock': 0
            })['bin_file_directory']
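            # binned_contigs_to_file writes one fasta file per bin into bin_file_directory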
            os.rename(bin_file_dir, input_dir)
            self.set_fasta_file_extensions(input_dir, fasta_file_extension)
            # make sure no fasta file is empty
            for (dirpath, dirnames, filenames) in os.walk(input_dir):
                for fasta_file in filenames:
                    fasta_path = os.path.join(input_dir, fasta_file)
                    min_fasta_len = 1
                    if not self.fasta_seq_len_at_least(fasta_path,
                                                       min_fasta_len):
                        raise ValueError(
                            'Binned Assembly is empty for fasta_path: ' +
                            str(fasta_path))
                break

        # Genome and GenomeSet
        #
        elif type_name == 'KBaseGenomes.Genome' or type_name == 'KBaseSearch.GenomeSet':
            genome_obj_names = []
            genome_sci_names = []
            genome_assembly_refs = []

            if type_name == 'KBaseGenomes.Genome':
                genomeSet_refs = [input_ref]
            else:  # get genomeSet_refs from GenomeSet object
                genomeSet_refs = []
                try:
                    genomeSet_object = ws.get_objects2(
                        {'objects': [{
                            'ref': input_ref
                        }]})['data'][0]['data']
                except Exception as e:
                    raise ValueError('Unable to fetch ' + str(input_ref) +
                                     ' object from workspace: ' + str(e))
                    #to get the full stack trace: traceback.format_exc()

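                # a KBaseSearch.GenomeSet stores its members as a mapping:
                #   elements: { genome_id: {'ref': <genome object ref>, ...}, ... }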
                # iterate through genomeSet members
                for genome_id in genomeSet_object['elements'].keys():
                    if 'ref' not in genomeSet_object['elements'][genome_id] or \
                       genomeSet_object['elements'][genome_id]['ref'] == None or \
                       genomeSet_object['elements'][genome_id]['ref'] == '':
                        raise ValueError(
                            'genome_ref not found for genome_id: ' +
                            str(genome_id) + ' in genomeSet: ' +
                            str(input_ref))
                    else:
                        genomeSet_refs.append(
                            genomeSet_object['elements'][genome_id]['ref'])

            # genome obj data
            for i, this_input_ref in enumerate(genomeSet_refs):
                try:
                    objects = ws.get_objects2(
                        {'objects': [{
                            'ref': this_input_ref
                        }]})['data']
                    genome_obj = objects[0]['data']
                    genome_obj_info = objects[0]['info']
                    genome_obj_names.append(genome_obj_info[NAME_I])
                    genome_sci_names.append(genome_obj['scientific_name'])
                except:
                    raise ValueError("unable to fetch genome: " +
                                     this_input_ref)

                # Get genome_assembly_ref
                if ('contigset_ref' not in genome_obj or genome_obj['contigset_ref'] == None) \
                   and ('assembly_ref' not in genome_obj or genome_obj['assembly_ref'] == None):
                    msg = "Genome " + genome_obj_names[
                        i] + " (ref:" + input_ref + ") " + genome_sci_names[
                            i] + " MISSING BOTH contigset_ref AND assembly_ref.  Cannot process.  Exiting."
                    raise ValueError(msg)
                elif 'assembly_ref' in genome_obj and genome_obj[
                        'assembly_ref'] != None:
                    msg = "Genome " + genome_obj_names[
                        i] + " (ref:" + input_ref + ") " + genome_sci_names[
                            i] + " USING assembly_ref: " + str(
                                genome_obj['assembly_ref'])
                    print(msg)
                    genome_assembly_refs.append(genome_obj['assembly_ref'])
                elif 'contigset_ref' in genome_obj and genome_obj[
                        'contigset_ref'] != None:
                    msg = "Genome " + genome_obj_names[
                        i] + " (ref:" + input_ref + ") " + genome_sci_names[
                            i] + " USING contigset_ref: " + str(
                                genome_obj['contigset_ref'])
                    print(msg)
                    genome_assembly_refs.append(genome_obj['contigset_ref'])

            # create file data (name for file is what's reported in results)
            for ass_i, assembly_ref in enumerate(genome_assembly_refs):
                this_name = genome_obj_names[ass_i]
                filename = os.path.join(input_dir,
                                        this_name + '.' + fasta_file_extension)
                auClient.get_assembly_as_fasta({
                    'ref': assembly_ref,
                    'filename': filename
                })
                if not os.path.isfile(filename):
                    raise ValueError(
                        'Error generating fasta file from an Assembly or ContigSet with AssemblyUtil'
                    )
                # make sure fasta file isn't empty
                min_fasta_len = 1
                if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                    raise ValueError(
                        'Assembly or ContigSet is empty in filename: ' +
                        str(filename))

        # Unknown type slipped through
        #
        else:
            raise ValueError(
                'Cannot stage fasta file input directory from type: ' +
                type_name)

        # create summary fasta file with all bins
        self.cat_fasta_files(input_dir, fasta_file_extension, all_seq_fasta)

        return {
            'input_dir': input_dir,
            'folder_suffix': suffix,
            'all_seq_fasta': all_seq_fasta
        }