Example #1
    def get_fasta_file(self, genome_ref):
        ws = Workspace(self.ws_url)
        # test if genome references an assembly type
        # do get_objects2 without data. get list of refs
        genome_obj_info = ws.get_objects2({
            'objects': [{
                'ref': genome_ref
            }],
            'no_data': 1
        })
        # get the list of genome refs from the returned info.
        # if there are no refs (or something funky with the return), this will be an empty list.
        # this WILL fail if data is an empty list. But it shouldn't be, and we know because
        # we have a real genome reference, or get_objects2 would fail.
        genome_obj_refs = genome_obj_info.get('data', [{}])[0].get('refs', [])

        # see which of those are of an appropriate type (ContigSet or Assembly), if any.
        assembly_ref = list()
        ref_params = [{'ref': x} for x in genome_obj_refs]
        ref_info = ws.get_object_info3({'objects': ref_params})
        for idx, info in enumerate(ref_info.get('infos')):
            if "KBaseGenomeAnnotations.Assembly" in info[
                    2] or "KBaseGenomes.ContigSet" in info[2]:
                assembly_ref.append(";".join(ref_info.get('paths')[idx]))
        # now just get the file.
        au = AssemblyUtil(self.callback_url)
        fasta_file = au.get_assembly_as_fasta({'ref': assembly_ref[0]})
        return fasta_file["path"]
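Several of these examples index directly into the Workspace object-info tuple (info[1], info[2], info[8]). A minimal sketch of those positions, assuming a KBase SDK environment where the Workspace client is importable (the import path varies by SDK version); the URL and reference are placeholders:

# Sketch only: the service URL and "12345/6/7" reference are placeholders.
from installed_clients.WorkspaceClient import Workspace  # import path may vary by SDK version

ws = Workspace("https://kbase.us/services/ws")
info = ws.get_object_info3({"objects": [{"ref": "12345/6/7"}]})["infos"][0]
obj_name = info[1]   # object name
obj_type = info[2]   # versioned type string, e.g. "KBaseGenomes.Genome-17.0"
checksum = info[8]   # MD5 of the object data, used as a cache key in Examples #4, #5, #7 and #11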
Example #2
def fetch_fasta_from_genome(genome_ref, ws_url, callback_url):
    """
    Returns an assembly or contigset as FASTA.
    """
    if not check_ref_type(genome_ref, ['KBaseGenomes.Genome'], ws_url):
        raise ValueError("The given genome_ref {} is not a KBaseGenomes.Genome type!")
    # test if genome references an assembly type
    # do get_objects2 without data. get list of refs
    ws = Workspace(ws_url)
    genome_obj_info = ws.get_objects2({
        'objects': [{'ref': genome_ref}],
        'no_data': 1
    })
    # get the list of genome refs from the returned info.
    # if there are no refs (or something funky with the return), this will be an empty list.
    # this WILL fail if data is an empty list. But it shouldn't be, and we know because
    # we have a real genome reference, or get_objects2 would fail.
    genome_obj_refs = genome_obj_info.get('data', [{}])[0].get('refs', [])

    # see which of those are of an appropriate type (ContigSet or Assembly), if any.
    assembly_ref = list()
    ref_params = [{'ref': genome_ref + ";" + x} for x in genome_obj_refs]
    ref_info = ws.get_object_info3({'objects': ref_params})
    for idx, info in enumerate(ref_info.get('infos')):
        if "KBaseGenomeAnnotations.Assembly" in info[2] or "KBaseGenomes.ContigSet" in info[2]:
            assembly_ref.append(";".join(ref_info.get('paths')[idx]))

    if len(assembly_ref) == 1:
        return fetch_fasta_from_assembly(assembly_ref[0], ws_url, callback_url)
    else:
        raise ValueError("Expected exactly one assembly associated with the given genome ref {}, "
                         "but found {}. Unable to continue.".format(genome_ref, len(assembly_ref)))
Example #3
    def test_build_hisat2_index_from_assembly_ok(self):
        manager = Hisat2IndexManager(self.wsURL, self.callback_url,
                                     self.scratch)
        ws = Workspace(self.wsURL)
        genome_obj_info = ws.get_objects2({
            'objects': [{
                'ref': self.genome_ref
            }],
            'no_data': 1
        })
        # get the list of genome refs from the returned info.
        # if there are no refs (or something funky with the return), this will be an empty list.
        # this WILL fail if data is an empty list. But it shouldn't be, and we know because
        # we have a real genome reference, or get_objects2 would fail.
        genome_obj_refs = genome_obj_info.get('data', [{}])[0].get('refs', [])

        # see which of those are of an appropriate type (ContigSet or Assembly), if any.
        assembly_ref = list()
        ref_params = [{'ref': x} for x in genome_obj_refs]
        ref_info = ws.get_object_info3({'objects': ref_params})
        for idx, info in enumerate(ref_info.get('infos')):
            if "KBaseGenomeAnnotations.Assembly" in info[
                    2] or "KBaseGenomes.ContigSet" in info[2]:
                assembly_ref.append(";".join(ref_info.get('paths')[idx]))
        assembly_ref = assembly_ref[0]
        idx_prefix = manager.get_hisat2_index(assembly_ref)
        self.assertIn("kb_hisat2_idx", idx_prefix)
Example #4
    def check_binnedcontigs_cache(self, ref, token):
        ws = Workspace(self.ws_url, token=token)
        info = ws.get_object_info3({"objects": [{"ref": ref}]})['infos'][0]
        inner_chsum = info[8]
        index_file = os.path.join(self.metagenome_index_dir,
                                  inner_chsum + self.BIN_SUFFIX + ".tsv.gz")
        if not os.path.isfile(index_file):
            if self.debug:
                print("    Loading WS object...")
                t1 = time.time()

            included = [
                "/bins/[*]/bid", "/bins/[*]/gc", "/bins/[*]/n_contigs",
                "/bins/[*]/sum_contig_len", "/bins/[*]/cov"
            ]

            binnedcontigs = ws.get_objects2(
                {'objects': [{
                    'ref': ref,
                    'included': included
                }]})['data'][0]['data']
            self.save_binnedcontigs_tsv(binnedcontigs["bins"], inner_chsum)
            if self.debug:
                print("    (time=" + str(time.time() - t1) + ")")
        return inner_chsum
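The 'included' list above subsets the returned object server-side, so only the listed subpaths of a potentially large BinnedContigs object are downloaded; "[*]" selects every element of a list. A minimal sketch, assuming a Workspace client `ws` as in the earlier examples and a placeholder reference:

# Sketch only: 'ws' is a Workspace client; the ref is a placeholder.
partial = ws.get_objects2({
    'objects': [{
        'ref': '12345/6/7',
        'included': ['/bins/[*]/bid', '/bins/[*]/gc']
    }]
})['data'][0]['data']
for b in partial['bins']:
    print(b['bid'], b.get('gc'))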
Example #5
    def check_contigs_in_bin_cache(self, binnedcontigs_ref, bin_id, token):
        ws = Workspace(self.ws_url, token=token)

        info = ws.get_object_info3({"objects": [{
            "ref": binnedcontigs_ref
        }]})['infos'][0]

        # base64 encode the string so it is safe for filenames and still unique per contig id
        b64key = base64.urlsafe_b64encode(
            bin_id.encode("utf-8")).decode('utf-8')

        inner_chsum = info[8] + '_' + b64key
        index_file = os.path.join(
            self.metagenome_index_dir,
            inner_chsum + self.CONTIGS_SUFFIX + ".tsv.gz")
        if not os.path.isfile(index_file):
            if self.debug:
                t1 = time.time()

            # get the position in the array of the bin
            binnedcontigs = ws.get_objects2({
                'objects': [{
                    'ref': binnedcontigs_ref,
                    'included': ["/bins/[*]/bid"]
                }]
            })['data'][0]['data']
            pos = -1
            for i, b in enumerate(binnedcontigs['bins']):
                if b['bid'] == bin_id:
                    pos = i
                    break
            if pos < 0:
                raise ValueError('No Bin with ID: "' + bin_id + '" found.')

            # get the contigs map
            selection = ws.get_objects2({
                'objects': [{
                    'ref': binnedcontigs_ref,
                    'included': [
                        '/bins/' + str(pos) + '/contigs',
                        '/bins/' + str(pos) + '/bid'
                    ]
                }]
            })['data'][0]['data']
            if selection['bins'][0]['bid'] != bin_id:
                raise ValueError(
                    'Something went wrong - bin IDs do not match on the second ws get_objects2 call'
                )

            contigs = selection['bins'][0]['contigs']
            self.save_contigs_in_bin_tsv(contigs, inner_chsum)
            if self.debug:
                print("    (time=" + str(time.time() - t1) + ")")
        return inner_chsum
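The urlsafe base64 step above makes an arbitrary bin id safe to embed in a filename while staying unique per id. The same transformation in isolation, with a hypothetical bin id:

import base64

b64key = base64.urlsafe_b64encode("Bin.001".encode("utf-8")).decode("utf-8")
print(b64key)  # QmluLjAwMQ==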
Example #6
def get_object_name(ref, ws_url):
    """
    From an object reference, get the object's name.
    """
    if not check_reference(ref):
        raise ValueError(
            "This must be a valid object reference to find the object's name.")
    ws = Workspace(ws_url)
    info = ws.get_object_info3({"objects": [{"ref": ref}]}).get("infos")[0]
    return info[1]
Example #7
    def check_assembly_cache(self, ref, token):
        ws = Workspace(self.ws_url, token=token)
        info = ws.get_object_info3({"objects": [{"ref": ref}]})['infos'][0]
        inner_chsum = info[8]
        index_file = os.path.join(
            self.assembly_index_dir,
            inner_chsum + self.ASSEMBLY_SUFFIX + ".tsv.gz")
        if not os.path.isfile(index_file):
            if self.debug:
                print("    Loading WS object...")
                t1 = time.time()

            if 'KBaseGenomeAnnotations.Assembly' in info[2]:
                included = ["/contigs"]
                assembly_data = ws.get_objects2(
                    {'objects': [{
                        'ref': ref,
                        'included': included
                    }]})['data'][0]['data']
                contigs = list(assembly_data['contigs'].values())
                self.save_assembly_tsv(contigs, inner_chsum)

            elif 'KBaseGenomes.ContigSet' in info[2]:
                included = [
                    "/contigs/[*]/id", "/contigs/[*]/length",
                    "/contigs/[*]/md5", "/contigs/[*]/description"
                ]
                cs_data = ws.get_objects2(
                    {'objects': [{
                        'ref': ref,
                        'included': included
                    }]})['data'][0]['data']
                contigs = []
                for c in cs_data['contigs']:
                    this_contig_data = {'contig_id': ''}
                    if 'id' in c:
                        this_contig_data['contig_id'] = c['id']
                    if 'md5' in c:
                        this_contig_data['md5'] = c['md5']
                    if 'length' in c:
                        this_contig_data['length'] = c['length']
                    if 'description' in c:
                        this_contig_data['description'] = c['description']
                    contigs.append(this_contig_data)

                self.save_assembly_tsv(contigs, inner_chsum)
            else:
                raise ValueError(
                    'The "ref" is not an Assembly or ContigSet data object.  It was a '
                    + info[2])

            if self.debug:
                print("    (time=" + str(time.time() - t1) + ")")
        return inner_chsum
Example #8
def check_reference_type(ref, allowed_types, ws_url):
    """
    Tests whether the given object reference is one of the allowed types.
    Specifically, it tests whether any of the allowed_types strings is a substring
    of the ref object's type string.
    E.g. if ".Genome" is an allowed type, it'll pass if the object is a "KBaseGenomes.Genome"
    """
    ws = Workspace(ws_url)
    info = ws.get_object_info3({"objects": [{"ref": ref}]}).get("infos")[0]
    passes = False
    for t in allowed_types:
        if t in info[2]:
            passes = True
    return passes
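Since info[2] is a versioned type string such as "KBaseGenomes.Genome-17.0", the substring test above also tolerates version suffixes. A hedged usage sketch; the reference and URL are placeholders:

# Placeholders throughout; assumes a reachable workspace service.
ok = check_reference_type("12345/6/7", [".Genome", ".Assembly"],
                          "https://kbase.us/services/ws")
if not ok:
    raise ValueError("expected a Genome or Assembly object")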
Example #9
def get_object_type(ref, ws_url):
    """
    Fetches and returns the typed object name of ref from the given workspace url.
    If that object doesn't exist, or there's another Workspace error, this raises a
    RuntimeError exception.
    """
    ws = Workspace(ws_url)
    info = ws.get_object_info3({"objects": [{"ref": ref}]})
    obj_info = info.get("infos", [[]])[0]
    if len(obj_info) == 0:
        raise RuntimeError(
            "An error occurred while fetching type info from the Workspace. "
            "No information returned for reference {}".format(ref))
    return obj_info[2]
Example #10
def get_object_names(ref_list, ws_url):
    """
    From a list of workspace references, returns a mapping from ref -> name of the object.
    """
    ws = Workspace(ws_url)
    obj_ids = [{"ref": ref} for ref in ref_list]
    info = ws.get_object_info3({"objects": obj_ids})
    name_map = dict()
    # might be in a data palette, so we can't just use the ref.
    # we already have the refs as passed previously, so use those for mapping, as they're in
    # the same order as what's returned.
    for i in range(len(info["infos"])):
        name_map[ref_list[i]] = info["infos"][i][1]
    return name_map
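The same ref-to-name mapping can be written more compactly with zip(), since get_object_info3 returns infos in input order. A sketch, not part of the original module, assuming the same Workspace client import as the examples above:

def get_object_names_zip(ref_list, ws_url):
    ws = Workspace(ws_url)
    infos = ws.get_object_info3({"objects": [{"ref": r} for r in ref_list]})["infos"]
    return {ref: info[1] for ref, info in zip(ref_list, infos)}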
Example #11
    def check_object_cache(self, ref, search_object, info_included,
                           index_dir, object_suffix, debug):
        ws = Workspace(self.ws_url, token=self.token)
        info = ws.get_object_info3({"objects": [{"ref": ref}]})['infos'][0]
        inner_chsum = info[8]
        index_file = os.path.join(index_dir,
                                  inner_chsum + object_suffix + ".tsv.gz")
        if not os.path.isfile(index_file):
            if debug:
                print("    Loading WS object...")
                t1 = time.time()

            included = self.build_info_included(search_object, info_included)
            obj_data = ws.get_objects2({'objects': [{'ref': ref,
                                                     'included': included}]})['data'][0]['data']
            self.save_object_tsv(obj_data[search_object], inner_chsum, info_included,
                                 index_dir, object_suffix)
            if debug:
                print("    (time=" + str(time.time() - t1) + ")")
        return inner_chsum
Example #12
class Bowtie2Aligner(object):
    def __init__(self, scratch_dir, workspace_url, callback_url, srv_wiz_url,
                 provenance):
        self.scratch_dir = scratch_dir
        self.workspace_url = workspace_url
        self.callback_url = callback_url
        self.srv_wiz_url = srv_wiz_url
        self.provenance = provenance

        # from the provenance, extract out the version to run by exact hash if possible
        self.my_version = 'release'
        if len(provenance) > 0:
            if 'subactions' in provenance[0]:
                self.my_version = self.get_version_from_subactions(
                    'kb_Bowtie2', provenance[0]['subactions'])
        print('Running kb_Bowtie2 version = ' + self.my_version)

        self.ws = Workspace(self.workspace_url)
        self.bowtie2 = Bowtie2Runner(self.scratch_dir)
        self.parallel_runner = KBParallel(self.callback_url)
        self.qualimap = kb_QualiMap(self.callback_url)

    def get_version_from_subactions(self, module_name, subactions):
        # go through each subaction looking for this module by name
        if not subactions:
            return 'release'  # default to release if we can't find anything
        for sa in subactions:
            if 'name' in sa:
                if sa['name'] == module_name:
                    # local-docker-image implies that we are running in kb-test, so return 'dev'
                    if sa['commit'] == 'local-docker-image':
                        return 'dev'
                    # to check that it is a valid hash, make sure it is the right
                    # length and made up of valid hash characters
                    if re.match('[a-fA-F0-9]{40}$', sa['commit']):
                        return sa['commit']
        # again, default to setting this to release
        return 'release'

    def align(self, params):
        validated_params = self.validate_params(params)
        input_info = self.determine_input_info(validated_params)
        # input info provides information on the input and tells us if we should
        # run as a single_library or as a set:
        #     input_info = {'run_mode': '', 'info': [..], 'ref': '55/1/2'}

        assembly_or_genome_ref = validated_params['assembly_or_genome_ref']

        if input_info['run_mode'] == 'single_library':
            if 'output_alignment_name' not in validated_params:
                suffix = '_alignment'
                if 'output_alignment_suffix' in validated_params:
                    suffix = validated_params['output_alignment_suffix']
                validated_params['output_alignment_name'] = input_info['info'][1] + suffix
            single_lib_result = self.single_reads_lib_run(
                input_info,
                assembly_or_genome_ref,
                validated_params,
                create_report=validated_params['create_report'])

            return single_lib_result

        if input_info['run_mode'] == 'sample_set':
            reads = self.fetch_reads_refs_from_sampleset(
                input_info['ref'], input_info['info'], validated_params)
            self.build_bowtie2_index(assembly_or_genome_ref,
                                     validated_params['output_workspace'])

            print('Running on set of reads=')
            pprint(reads)

            tasks = []
            for r in reads:
                tasks.append(
                    self.build_single_execution_task(
                        r['ref'], params, r['alignment_output_name'],
                        r['condition']))

            batch_run_params = {
                'tasks': tasks,
                'runner': 'parallel',
                'max_retries': 2
            }
            if validated_params['concurrent_local_tasks'] is not None:
                batch_run_params['concurrent_local_tasks'] = validated_params['concurrent_local_tasks']
            if validated_params['concurrent_njsw_tasks'] is not None:
                batch_run_params['concurrent_njsw_tasks'] = validated_params['concurrent_njsw_tasks']
            results = self.parallel_runner.run_batch(batch_run_params)
            print('Batch run results=')
            pprint(results)
            batch_result = self.process_batch_result(results, validated_params,
                                                     reads, input_info['info'])
            return batch_result

        raise ValueError('Improper run mode')

    def build_single_execution_task(self, reads_lib_ref, params, output_name,
                                    condition):
        task_params = copy.deepcopy(params)

        task_params['input_ref'] = reads_lib_ref
        task_params['output_alignment_name'] = output_name
        task_params['create_report'] = 0
        task_params['condition_label'] = condition

        return {
            'module_name': 'kb_Bowtie2',
            'function_name': 'align_reads_to_assembly_app',
            'version': self.my_version,
            'parameters': task_params
        }

    def single_reads_lib_run(self,
                             read_lib_info,
                             assembly_or_genome_ref,
                             validated_params,
                             create_report=False,
                             bowtie2_index_info=None):
        ''' run on one reads library '''

        # download reads and prepare any bowtie2 index files
        input_configuration = self.prepare_single_run(
            read_lib_info, assembly_or_genome_ref, bowtie2_index_info,
            validated_params['output_workspace'])

        # run the actual program
        run_output_info = self.run_bowtie2_align_cli(input_configuration,
                                                     validated_params)

        # process the result and save the output
        upload_results = self.save_read_alignment_output(
            run_output_info, input_configuration, validated_params)
        run_output_info['upload_results'] = upload_results

        report_info = None
        if create_report:
            report_info = self.create_report_for_single_run(
                run_output_info, input_configuration, validated_params)

        self.clean(run_output_info)

        return {'output_info': run_output_info, 'report_info': report_info}

    def build_bowtie2_index(self, assembly_or_genome_ref, ws_for_cache):
        bowtie2IndexBuilder = Bowtie2IndexBuilder(self.scratch_dir,
                                                  self.workspace_url,
                                                  self.callback_url,
                                                  self.srv_wiz_url,
                                                  self.provenance)

        return bowtie2IndexBuilder.get_index({
            'ref': assembly_or_genome_ref,
            'ws_for_cache': ws_for_cache
        })

    def prepare_single_run(self, input_info, assembly_or_genome_ref,
                           bowtie2_index_info, ws_for_cache):
        ''' Given a reads ref and an assembly, setup the bowtie2 index '''
        # first setup the bowtie2 index of the assembly
        input_configuration = {'bowtie2_index_info': bowtie2_index_info}
        if not bowtie2_index_info:
            bowtie2IndexBuilder = Bowtie2IndexBuilder(self.scratch_dir,
                                                      self.workspace_url,
                                                      self.callback_url,
                                                      self.srv_wiz_url,
                                                      self.provenance)

            index_result = bowtie2IndexBuilder.get_index({
                'ref': assembly_or_genome_ref,
                'ws_for_cache': ws_for_cache
            })
            input_configuration['bowtie2_index_info'] = index_result

        # next download the reads
        read_lib_ref = input_info['ref']
        read_lib_info = input_info['info']
        reads_params = {
            'read_libraries': [read_lib_ref],
            'interleaved': 'false',
            'gzipped': None
        }
        ru = ReadsUtils(self.callback_url)
        reads = ru.download_reads(reads_params)['files']

        input_configuration['reads_lib_type'] = self.get_type_from_obj_info(
            read_lib_info).split('.')[1]
        input_configuration['reads_files'] = reads[read_lib_ref]
        input_configuration['reads_lib_ref'] = read_lib_ref

        return input_configuration

    def run_bowtie2_align_cli(self, input_configuration, validated_params):
        # pprint('======== input_configuration =====')
        # pprint(input_configuration)
        options = []
        run_output_info = {}

        # set the bowtie2 index location
        bt2_index_dir = input_configuration['bowtie2_index_info']['output_dir']
        bt2_index_basename = input_configuration['bowtie2_index_info']['index_files_basename']
        options.extend(['-x', bt2_index_basename])

        # set the input reads
        if input_configuration['reads_lib_type'] == 'SingleEndLibrary':
            options.extend(
                ['-U', input_configuration['reads_files']['files']['fwd']])
            run_output_info['library_type'] = 'single_end'
        elif input_configuration['reads_lib_type'] == 'PairedEndLibrary':
            options.extend(
                ['-1', input_configuration['reads_files']['files']['fwd']])
            options.extend(
                ['-2', input_configuration['reads_files']['files']['rev']])
            run_output_info['library_type'] = 'paired_end'

        # setup the output file name
        output_dir = os.path.join(
            self.scratch_dir,
            'bowtie2_alignment_output_' + str(int(time.time() * 10000)))
        output_sam_file = os.path.join(output_dir, 'reads_alignment.sam')
        os.makedirs(output_dir)
        options.extend(['-S', output_sam_file])
        run_output_info['output_sam_file'] = output_sam_file
        run_output_info['output_dir'] = output_dir

        # parse all the other parameters
        if 'quality_score' in validated_params:
            options.append('--' + str(validated_params['quality_score']))

        if 'alignment_type' in validated_params:
            options.append('--' + str(validated_params['alignment_type']))

        if 'preset_options' in validated_params:
            if ('alignment_type' in validated_params
                    and validated_params['alignment_type'] == 'local'):
                options.append('--' + str(validated_params['preset_options'] + '-local'))
            else:
                options.append('--' + str(validated_params['preset_options']))

        if 'trim5' in validated_params:
            options.extend(['--trim5', str(validated_params['trim5'])])
        if 'trim3' in validated_params:
            options.extend(['--trim3', str(validated_params['trim3'])])
        if 'np' in validated_params:
            options.extend(['--np', str(validated_params['np'])])

        if 'minins' in validated_params:
            options.extend(['--minins', str(validated_params['minins'])])
        if 'maxins' in validated_params:
            options.extend(['--maxins', str(validated_params['maxins'])])

        # unfortunately, bowtie2 expects the index files to be in the current directory, and
        # you cannot configure it otherwise.  So run bowtie out of the index directory, but
        # place the output SAM file somewhere else
        self.bowtie2.run('bowtie2', options, cwd=bt2_index_dir)

        return run_output_info

    def save_read_alignment_output(self, run_output_info, input_configuration,
                                   validated_params):
        rau = ReadsAlignmentUtils(self.callback_url)
        destination_ref = (validated_params['output_workspace'] + '/' +
                           validated_params['output_alignment_name'])
        condition = 'unknown'
        if 'condition_label' in validated_params:
            condition = validated_params['condition_label']
        upload_params = {
            'file_path': run_output_info['output_sam_file'],
            'destination_ref': destination_ref,
            'read_library_ref': input_configuration['reads_lib_ref'],
            'assembly_or_genome_ref':
            validated_params['assembly_or_genome_ref'],
            'condition': condition
        }
        upload_results = rau.upload_alignment(upload_params)
        return upload_results

    def clean(self, run_output_info):
        ''' Not really necessary on a single run, but if we are running multiple local subjobs, we
        should clean up files that have already been saved back up to kbase '''
        pass

    def create_report_for_single_run(self, run_output_info,
                                     input_configuration, validated_params):
        # first run qualimap
        qualimap_report = self.qualimap.run_bamqc(
            {'input_ref': run_output_info['upload_results']['obj_ref']})
        qc_result_zip_info = qualimap_report['qc_result_zip_info']

        # create report
        report_text = 'Ran on a single reads library.\n\n'
        alignment_info = self.get_obj_info(
            run_output_info['upload_results']['obj_ref'])
        report_text += 'Created ReadsAlignment: ' + str(alignment_info[1]) + '\n'
        report_text += '                        ' + run_output_info['upload_results']['obj_ref'] + '\n'
        kbr = KBaseReport(self.callback_url)
        report_info = kbr.create_extended_report({
            'message': report_text,
            'objects_created': [{
                'ref': run_output_info['upload_results']['obj_ref'],
                'description': 'ReadsAlignment'
            }],
            'report_object_name': 'kb_Bowtie2_' + str(uuid.uuid4()),
            'direct_html_link_index': 0,
            'html_links': [{
                'shock_id': qc_result_zip_info['shock_id'],
                'name': qc_result_zip_info['index_html_file_name'],
                'label': qc_result_zip_info['name']
            }],
            'workspace_name': validated_params['output_workspace']
        })
        return {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

    def process_batch_result(self, batch_result, validated_params, reads,
                             input_set_info):

        n_jobs = len(batch_result['results'])
        n_success = 0
        n_error = 0
        ran_locally = 0
        ran_njsw = 0

        # reads alignment set items
        items = []
        objects_created = []

        for k in range(0, len(batch_result['results'])):
            job = batch_result['results'][k]
            result_package = job['result_package']
            if job['is_error']:
                n_error += 1
            else:
                n_success += 1
                output_info = result_package['result'][0]['output_info']
                ra_ref = output_info['upload_results']['obj_ref']
                # Note: could add a label to the alignment here?
                items.append({'ref': ra_ref, 'label': reads[k]['condition']})
                objects_created.append({'ref': ra_ref})

            if result_package['run_context']['location'] == 'local':
                ran_locally += 1
            if result_package['run_context']['location'] == 'njsw':
                ran_njsw += 1

        # Save the alignment set
        alignment_set_data = {'description': '', 'items': items}
        alignment_set_save_params = {
            'data': alignment_set_data,
            'workspace': validated_params['output_workspace'],
            'output_object_name': str(input_set_info[1]) + validated_params['output_obj_name_suffix']
        }

        set_api = SetAPI(self.srv_wiz_url)
        save_result = set_api.save_reads_alignment_set_v1(
            alignment_set_save_params)
        print('Saved ReadsAlignment=')
        pprint(save_result)
        objects_created.append({
            'ref': save_result['set_ref'],
            'description': 'Set of all reads alignments generated'
        })
        set_name = save_result['set_info'][1]

        # run qualimap
        qualimap_report = self.qualimap.run_bamqc(
            {'input_ref': save_result['set_ref']})
        qc_result_zip_info = qualimap_report['qc_result_zip_info']

        # create the report
        report_text = 'Ran on SampleSet or ReadsSet.\n\n'
        report_text += 'Created ReadsAlignmentSet: ' + str(set_name) + '\n\n'
        report_text += 'Total ReadsLibraries = ' + str(n_jobs) + '\n'
        report_text += '        Successful runs = ' + str(n_success) + '\n'
        report_text += '            Failed runs = ' + str(n_error) + '\n'
        report_text += '       Ran on main node = ' + str(ran_locally) + '\n'
        report_text += '   Ran on remote worker = ' + str(ran_njsw) + '\n\n'

        print('Report text=')
        print(report_text)

        kbr = KBaseReport(self.callback_url)
        report_info = kbr.create_extended_report({
            'message': report_text,
            'objects_created': objects_created,
            'report_object_name': 'kb_Bowtie2_' + str(uuid.uuid4()),
            'direct_html_link_index': 0,
            'html_links': [{
                'shock_id': qc_result_zip_info['shock_id'],
                'name': qc_result_zip_info['index_html_file_name'],
                'label': qc_result_zip_info['name']
            }],
            'workspace_name': validated_params['output_workspace']
        })

        result = {
            'report_info': {
                'report_name': report_info['name'],
                'report_ref': report_info['ref']
            }
        }
        result['batch_output_info'] = batch_result

        return result

    def validate_params(self, params):
        validated_params = {}

        required_string_fields = [
            'input_ref', 'assembly_or_genome_ref', 'output_obj_name_suffix',
            'output_workspace'
        ]
        for field in required_string_fields:
            if field in params and params[field]:
                validated_params[field] = params[field]
            else:
                raise ValueError('"' + field +
                                 '" field required to run bowtie2 aligner app')

        optional_fields = [
            'quality_score', 'alignment_type', 'preset_options', 'trim5',
            'trim3', 'condition_label', 'np', 'minins', 'maxins',
            'output_alignment_suffix', 'output_alignment_name'
        ]
        for field in optional_fields:
            if field in params:
                if params[field] is not None:
                    validated_params[field] = params[field]

        validated_params['create_report'] = True
        if 'create_report' in params and params['create_report'] is not None:
            if int(params['create_report']) == 1:
                validated_params['create_report'] = True
            elif int(params['create_report']) == 0:
                validated_params['create_report'] = False
            else:
                raise ValueError(
                    '"create_report" field, if present, should be set to a boolean value: 0 or 1'
                )

        validated_params['concurrent_local_tasks'] = None
        validated_params['concurrent_njsw_tasks'] = None

        if 'concurrent_local_tasks' in params and params['concurrent_local_tasks'] is not None:
            validated_params['concurrent_local_tasks'] = int(params['concurrent_local_tasks'])
        if 'concurrent_njsw_tasks' in params and params['concurrent_njsw_tasks'] is not None:
            validated_params['concurrent_njsw_tasks'] = int(params['concurrent_njsw_tasks'])

        return validated_params

    def fetch_reads_refs_from_sampleset(self, ref, info, validated_params):
        """
        Note: adapted from kbaseapps/kb_hisat2 - file_util.py

        From the given object ref, return a list of all reads objects that are a part of that
        object. E.g., if ref is a ReadsSet, return a list of all PairedEndLibrary or SingleEndLibrary
        refs that are a member of that ReadsSet. This is returned as a list of dictionaries as follows:
        {
            "ref": reads object reference,
            "condition": condition string associated with that reads object
        }
        Only "ref" is guaranteed; the other keys may or may not be present, depending on the
        object type of the initial ref. E.g. an RNASeqSampleSet might have condition info for
        each reads object, but a single PairedEndLibrary may not have that info.
        If ref is already a Reads library, just returns a list with ref as a single element.
        """
        obj_type = self.get_type_from_obj_info(info)
        refs = list()
        refs_for_ws_info = list()
        if "KBaseSets.ReadsSet" in obj_type or "KBaseRNASeq.RNASeqSampleSet" in obj_type:
            print("Looking up reads references in ReadsSet object")
            set_api = SetAPI(self.srv_wiz_url)
            reads_set = set_api.get_reads_set_v1({
                'ref': ref,
                'include_item_info': 0,
                'include_set_item_ref_paths': 1
            })

            for reads in reads_set["data"]["items"]:
                refs.append({
                    'ref': reads['ref_path'],
                    'condition': reads['label']
                })
                refs_for_ws_info.append({'ref': reads['ref_path']})
        else:
            raise ValueError("Unable to fetch reads reference from object {} "
                             "which is a {}".format(ref, obj_type))

        # get object info so we can name things properly
        infos = self.ws.get_object_info3({'objects': refs_for_ws_info})['infos']

        name_ext = '_alignment'
        if 'output_alignment_suffix' in validated_params \
                and validated_params['output_alignment_suffix'] is not None:
            ext = validated_params['output_alignment_suffix'].replace(' ', '')
            if ext:
                name_ext = ext

        unique_name_lookup = {}
        for k in range(0, len(refs)):
            refs[k]['info'] = infos[k]
            name = infos[k][1]
            if name not in unique_name_lookup:
                unique_name_lookup[name] = 1
            else:
                unique_name_lookup[name] += 1
                name = name + '_' + str(unique_name_lookup[name])
            name = name + name_ext
            refs[k]['alignment_output_name'] = name

        return refs

    def determine_input_info(self, validated_params):
        ''' get info on the input_ref object and determine if we run once or run on a set '''
        info = self.get_obj_info(validated_params['input_ref'])
        obj_type = self.get_type_from_obj_info(info)
        if obj_type in [
                'KBaseAssembly.PairedEndLibrary',
                'KBaseAssembly.SingleEndLibrary', 'KBaseFile.PairedEndLibrary',
                'KBaseFile.SingleEndLibrary'
        ]:
            return {
                'run_mode': 'single_library',
                'info': info,
                'ref': validated_params['input_ref']
            }
        if obj_type == 'KBaseRNASeq.RNASeqSampleSet':
            return {
                'run_mode': 'sample_set',
                'info': info,
                'ref': validated_params['input_ref']
            }
        if obj_type == 'KBaseSets.ReadsSet':
            return {
                'run_mode': 'sample_set',
                'info': info,
                'ref': validated_params['input_ref']
            }

        raise ValueError('Object type of input_ref is not valid, was: ' +
                         str(obj_type))

    def get_type_from_obj_info(self, info):
        return info[2].split('-')[0]

    def get_obj_info(self, ref):
        return self.ws.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0]
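A hedged sketch of the minimal params dict Bowtie2Aligner.align() accepts, based on the required fields enforced in validate_params above; every value below is a placeholder:

params = {
    'input_ref': '1/2/3',                 # reads library or reads/sample set
    'assembly_or_genome_ref': '1/4/5',    # reference to align against
    'output_obj_name_suffix': '_alignment',
    'output_workspace': 'my_workspace',
    'create_report': 1,
}
result = aligner.align(params)  # aligner: a constructed Bowtie2Aligner instance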
Example #13
class vConTACTUtils:

    def __init__(self, config):
        self.scratch = os.path.abspath(config['scratch'])
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.token = os.environ['KB_AUTH_TOKEN']
        self.ws = Workspace(config['workspace-url'], token=self.token)
        self.genome_api = GenomeAnnotationAPI(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)

    def vcontact_help(self):
        command = "vcontact --help"
        self._run_command(command)

    def execute(self, command: list):
        """
        :param command: Command suitable for running in subprocess, must use a ['ls', '-l'] format
        :return: Response from command
        """
        # logger.info('Running command: {}'.format(command))
        print('Running command: {}'.format(' '.join(command)))
        res = subprocess.run(command, shell=False, encoding='utf-8', check=True)

        return res

    def run_vcontact(self, params):

        # Determine KBase "inputs" for vConTACT2
        genome = params['genome']

        obj_type = self.ws.get_object_info3({'objects': [{'ref': genome}]})['infos'][0][2]

        if 'assembly' in obj_type.lower():  # If KBaseGenomeAnnotations.Assembly

            # Assembly requires annotation
            genome_fp = self.au.get_assembly_as_fasta({'ref': genome})['path']
            proteins_fp = os.path.join(self.scratch, 'proteins.faa')
            proteins_gbk = os.path.join(self.scratch, 'proteins.gbk')
            gene2genome_fp = os.path.join(self.scratch, 'gene2genome.csv')

            prodigal_cmd = ['prodigal', '-a', proteins_fp, '-o', proteins_gbk, '-f', 'gbk',
                            '-i', genome_fp, '-p', 'meta']
            res = self.execute(prodigal_cmd)

            records = {}
            with open(proteins_fp, 'r') as proteins_fh:
                for record in SeqIO.parse(proteins_fh, 'fasta'):

                    records[len(records)] = {
                        'protein_id': record.id,
                        'contig_id': record.id.rsplit('_', 1)[0],
                        'keywords': 'None'
                    }

            g2g_df = pd.DataFrame.from_dict(records, orient='index')
            g2g_df.to_csv(gene2genome_fp, index=False)

            # Pass filepaths to the app and run
            params['gene2genome'] = gene2genome_fp
            params['sequences'] = proteins_fp

        elif 'kbasegenomes' in obj_type.lower(): # If KBaseGenomes.Genome
            genome_data = self.genome_api.get_genome_v1({"genomes": [{"ref": genome}]})

            # Convert genome data into "reasonable" parse form and write to scratch filesystem
            gene2genome, sequences = self.genome_to_inputs(genome_data)
            gene2genome_fp, sequences_fp = self.write_inputs(gene2genome, sequences)

            # Pass filepaths to the app and run
            params['gene2genome'] = gene2genome_fp
            params['sequences'] = sequences_fp

        elif 'binnedcontigs' in obj_type.lower():  # If KBaseMetagenomes.BinnedContigs
            raise NotImplementedError(
                "KBaseMetagenomes.BinnedContigs hasn't been enabled yet. Check back later.")
        else:
            raise ValueError('Unable to identify object type {} for vConTACT2 input'.format(obj_type))

        print('Available database files')
        print(os.listdir('/miniconda/lib/python3.7/site-packages/vcontact2/data/'))

        # Just iterate through all parameters
        mappings = {
            'gene2genome': '--proteins-fp',
            'sequences': '--raw-proteins',
            'db': '--db',
            'pcs_mode': '--pcs-mode',
            'vcs_mode': '--vcs-mode',
            'blast_evalue': '--blast-evalue',
            'pc_max_overlap': '--max-overlap',
            'pc_penalty': '--penalty',
            'pc_haircut': '--haircut',
            'pc_inflation': '--pc-inflation',
            'vc_inflation': '--vc-inflation',
            'vc_density': '--min-density',
            'vc_min_size': '--min-size',
            'vc_max_overlap': '--vc-overlap',
            'vc_penalty': '--vc-penalty',
            'vc_haircut': '--vc-haircut',
            'merge_method': '--merge-method',
            'similarity': '--similarity',
            'seed_method': '--seed-method',
            'min_significance': '--sig',
            'max_significance': '--max-sig',
            'module_inflation': '--mod-inflation',
            'mod_significance': '--mod-sig',
            'module_min_shared': '--mod-shared-min',
            'link_significance': '--link-sig',
            'link_proportion': '--link-prop'
        }

        bool_args = ['optimize', 'permissive']

        # Should create build_command?
        command = 'vcontact2 --output-dir outdir'
        # Binaries
        command += ' --diamond-bin /usr/local/bin/diamond --c1-bin /usr/local/bin/cluster_one-1.0.jar'

        for param, cmd in mappings.items():
            if param in params and params[param] is not None:
                command += ' {} {}'.format(cmd, params[param])

        self._run_command(command)

        report = self._generate_report(params)

        return report

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """

        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if (exitCode == 0):
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)
            raise ValueError(error_msg)

    def genome_to_inputs(self, genome):
        """
        genome_to_inputs: convert genome annotation data (~json) to file inputs required by vConTACT
        :param genome:
        :return:
        """

        records = []
        gene2genome = OrderedDict()

        genome_data = genome['genomes'][0]

        for item in genome_data['data']['features']:
            if 'id' not in item:
                print('This feature does not have a valid id')
                continue
            elif 'dna_sequence' not in item or 'protein_translation' not in item:
                print('This feature {} does not have a valid DNA sequence.'.format(item['id']))
                continue
            else:
                # Create FASTA file
                if item['type'] == 'gene':
                    desc = (item['functions'] if item.get('functions', None)
                                              else item.get('function', ''))
                    gene_record = SeqRecord(Seq(item['protein_translation']), id=item['id'],
                                            description=desc)
                    records.append(gene_record)

                    # Build gene2genome
                    gene2genome.update({
                        item['id']: {
                            # 'contig_id': genome_data['data']['contig_ids'][0],
                            'contig_id': item['location'][0][0],
                            'protein_id': item['id'],
                            'keywords': item.get('function', '')
                        }
                    })

        return gene2genome, records

    def write_inputs(self, mapping, sequences):

        fasta_for_proteins_fp = os.path.join(self.scratch, 'vConTACT_proteins.fasta')
        with open(fasta_for_proteins_fp, 'w') as fasta_for_proteins_fh:
            SeqIO.write(sequences, fasta_for_proteins_fh, 'fasta')

        genes_to_genomes_mapping_fp = os.path.join(self.scratch, 'vConTACT_gene2genome.csv')
        with open(genes_to_genomes_mapping_fp, 'w') as genes_to_genomes_mapping_fh:
            fields = ['contig_id', 'protein_id', 'keywords']
            writer = csv.DictWriter(genes_to_genomes_mapping_fh, fieldnames=fields)
            writer.writeheader()

            for gene in mapping.keys():
                writer.writerow(mapping[gene])

        return genes_to_genomes_mapping_fp, fasta_for_proteins_fp

    def _generate_report(self, params):
        """
        _generate_report: generate summary report

        This will contain ALL the logic to generate the report, including areas that should/will be re-factored later

        """

        # Get a DataFileUtil client for uploading the report HTML
        self.dfu = dfu(self.callback_url)

        # Get filepath of summary file
        summary_fp = os.path.join(os.getcwd(), 'outdir', 'genome_by_genome_overview.csv')

        summary_df = pd.read_csv(summary_fp, header=0, index_col=0)
        html = summary_df.to_html(index=False, classes='my_class table-striped" id = "my_id')

        # Need to file write below
        direct_html = html_template.substitute(html_table=html)

        # Find header so it can be copied to footer, as dataframe.to_html doesn't include footer
        start_header = Literal("<thead>")
        end_header = Literal("</thead>")

        text = start_header + SkipTo(end_header)

        new_text = ''
        for data, start_pos, end_pos in text.scanString(direct_html):
            new_text = ''.join(data).replace(' style="text-align: right;"', '').replace('thead>',
                                                                                        'tfoot>\n  ') + '\n</tfoot>'

        # Get start and end positions to insert new text
        end_tbody = Literal("</tbody>")
        end_table = Literal("</table>")

        insertion_pos = end_tbody + SkipTo(end_table)

        final_html = ''
        for data, start_pos, end_pos in insertion_pos.scanString(direct_html):
            final_html = direct_html[:start_pos + 8] + '\n' + new_text + direct_html[start_pos + 8:]

        output_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_dir)
        result_fp = os.path.join(output_dir, 'index.html')

        with open(result_fp, 'w') as result_fh:
            result_fh.write(final_html)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_dir,
            'pack': 'zip'
        })['shock_id']

        html_report = [{
            'shock_id': report_shock_id,
            'name': os.path.basename(result_fp),
            'label': os.path.basename(result_fp),
            'description': 'HTML summary report for vConTACT2'
        }]

        report_params = {'message': 'Basic message to show in the report',
                         'workspace_name': params['workspace_name'],
                         'html_links': html_report,
                         'direct_html_link_index': 0,
                         'report_object_name': 'vConTACT_report_{}'.format(str(uuid.uuid4())),
                         # Don't use until have files to attach to report
                         # 'file_links': [{}],
                         # Don't use until data objects that are created as result of running app
                         # 'objects_created': [{'ref': matrix_obj_ref,
                         #                      'description': 'Imported Matrix'}],
                         }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        # https://stackoverflow.com/a/600612/643675
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise
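On Python 3.2+ the race-free equivalent of _mkdir_p above is a single standard-library call; a sketch with a placeholder path:

import os

os.makedirs('/tmp/vcontact_output', exist_ok=True)  # no-op if the directory already exists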
Example #14
class STARUtils:
    STAR_VERSION = 'STAR 2.5.3a'
    STAR_BIN = '/kb/deployment/bin/STAR'
    STAR_IDX_DIR = 'STAR_Genome_index'
    STAR_OUT_DIR = 'STAR_Output'
    PARAM_IN_WS = 'output_workspace'
    PARAM_IN_FASTA_FILES = 'genomeFastaFiles'
    PARAM_IN_OUTFILE_PREFIX = 'outFileNamePrefix'
    PARAM_IN_STARMODE = 'runMode'
    PARAM_IN_THREADN = 'runThreadN'
    PARAM_IN_READS_FILES = 'readFilesIn'
    PARAM_IN_READS = 'readsset_ref'
    PARAM_IN_GENOME = 'genome_ref'
    SET_READS = 'set_reads_refs'

    def __init__(self, scratch_dir, workspace_url, callback_url, srv_wiz_url, provenance):
        self.workspace_url = workspace_url
        self.callback_url = callback_url
        self.srv_wiz_url = srv_wiz_url
        self.au = AssemblyUtil(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url, service_ver='beta')
        self.scratch = scratch_dir
        self.working_dir = scratch_dir
        self.prog_runner = Program_Runner(self.STAR_BIN, self.scratch)
        self.provenance = provenance
        self.ws_client = Workspace(self.workspace_url)

        self.parallel_runner = KBParallel(self.callback_url)
        self.qualimap = kb_QualiMap(self.callback_url, service_ver='dev')
        self.set_api_client = SetAPI(self.srv_wiz_url, service_ver='dev')
        self.eu = ExpressionUtils(self.callback_url, service_ver='beta')

    def process_params(self, params):
        """
        process_params: checks params passed to run_star method and set default values
        """
        log('Start validating run_star parameters')
        # check for required parameters
        if params.get(self.PARAM_IN_WS, None) is None:
            raise ValueError(self.PARAM_IN_WS + ' parameter is required')

        if params.get(self.PARAM_IN_STARMODE, None) is None:
            params[self.PARAM_IN_STARMODE] = 'alignReads'
        if params.get(self.PARAM_IN_GENOME, None) is None:
            raise ValueError(self.PARAM_IN_GENOME +
                             ' parameter is required for generating genome index')

        if (params.get(self.PARAM_IN_STARMODE, None) is not None and
                params[self.PARAM_IN_STARMODE] != "genomeGenerate"):
            if params.get(self.PARAM_IN_READS, None) is None:
                raise ValueError(self.PARAM_IN_READS +
                                 ' parameter is required for reads mapping')
            if not valid_string(params[self.PARAM_IN_READS], is_ref=True):
                raise ValueError("Parameter readsset_ref must be a valid Workspace object reference, "
                                 "not {}".format(params.get(self.PARAM_IN_READS, None)))

        if params.get(self.PARAM_IN_THREADN, None) is not None:
            if not isinstance(params[self.PARAM_IN_THREADN], int):
                raise ValueError(self.PARAM_IN_THREADN + ' must be of type int')
        else:
            params[self.PARAM_IN_THREADN] = 2

	if "alignment_suffix" not in params or not valid_string(params["alignment_suffix"]):
            raise ValueError("Parameter alignment_suffix must be a valid Workspace object string, "
                      "not {}".format(params.get("alignment_suffix", None)))

        if params.get(self.PARAM_IN_OUTFILE_PREFIX, None) is not None:
            if params[self.PARAM_IN_OUTFILE_PREFIX].find('/') != -1:
                raise ValueError(self.PARAM_IN_OUTFILE_PREFIX + ' cannot contain subfolder(s).')
        else:
            params[self.PARAM_IN_OUTFILE_PREFIX] = 'star_'

        if params.get('create_report', None) is None:
            params['create_report'] = 0

        return self._setDefaultParameters(params)


    def _setDefaultParameters(self, params_in):
        """set default for this group of parameters
        """
        params = copy.deepcopy(params_in)
        if params.get('outFilterType', None) is None:
            params['outFilterType'] = "\"BySJout\""
        if params.get('outFilterMultimapNmax', None) is None:
            params['outFilterMultimapNmax'] = 20
        if params.get('outSAMtype', None) is None:
            params['outSAMtype'] = 'BAM'
        if params.get('outSAMattrIHstart', None) is None:
            params['outSAMattrIHstart'] = 0
        if params.get('outSAMstrandField', None) is None:
            params['outSAMstrandField'] = 'intronMotif'
        if params.get('outFilterIntronMotifs', None) is None:
            params['outFilterIntronMotifs'] = 'RemoveNoncanonical'
        if params.get(self.SET_READS, None) is None:
            params[self.SET_READS] = self._get_reads_refs_from_setref(params)

        return params

    def _get_genome_gtf_file(self, gnm_ref, gtf_file_dir):
        """
        Get data from genome object ref and return the GTF filename (with path)
        for STAR indexing and mapping.
        STAR uses the reference annotation to guide assembly and for creating alignment
        """
        log("Converting genome {0} to GFF file in folder {1}".format(gnm_ref, gtf_file_dir))
        gfu = GenomeFileUtil(self.callback_url)
        try:
            gfu_ret = gfu.genome_to_gff({self.PARAM_IN_GENOME: gnm_ref,
                                         'is_gtf': 1,
                                         'target_dir': gtf_file_dir
                                      })
        except ValueError as egfu:
            log('GFU getting GTF file raised error:\n')
            pprint(egfu)
            return None
        else:  # no exception raised
            return gfu_ret.get('file_path')


    def _construct_indexing_cmd(self, params):
        # STEP 1: construct the command for running `STAR --runMode genomeGenerate...`
        idx_cmd = [self.STAR_BIN]
        idx_cmd.append('--genomeDir')
        idx_cmd.append(params[self.STAR_IDX_DIR])
        idx_cmd.append('--' + self.PARAM_IN_STARMODE)
        idx_cmd.append('genomeGenerate')
        idx_cmd.append('--' + self.PARAM_IN_THREADN)
        idx_cmd.append(str(params[self.PARAM_IN_THREADN]))

        if params.get(self.PARAM_IN_FASTA_FILES, None) is not None:
            idx_cmd.append('--' + self.PARAM_IN_FASTA_FILES)
            for fasta_file in params[self.PARAM_IN_FASTA_FILES]:
                idx_cmd.append(fasta_file)

        # STEP 2: append the standard optional inputs
        if params.get('sjdbGTFfile', None) is not None:
            idx_cmd.append('--sjdbGTFfile')
            idx_cmd.append(params['sjdbGTFfile'])
        if (params.get('sjdbOverhang', None) is not None
                and params['sjdbOverhang'] > 0):
            idx_cmd.append('--sjdbOverhang')
            idx_cmd.append(str(params['sjdbOverhang']))

        # print('STAR indexing CMD:')
        # print(' '.join(idx_cmd))
        return idx_cmd

    def _construct_mapping_cmd(self, params):
        if params.get(self.PARAM_IN_STARMODE, None) is None:
            params[self.PARAM_IN_STARMODE] = 'alignReads'

        # STEP 1: set the working folder housing the STAR output results as well as the reads info
        star_out_dir = ''
        if params.get('align_output', None) is None:
            star_out_dir = self.scratch
        else:
            star_out_dir = params['align_output']

        # STEP 2: construct the command for running STAR mapping
        mp_cmd = [self.STAR_BIN]
        mp_cmd.append('--genomeDir')
        mp_cmd.append(params[self.STAR_IDX_DIR])
        mp_cmd.append('--' + self.PARAM_IN_STARMODE)
        mp_cmd.append(params[self.PARAM_IN_STARMODE])
        mp_cmd.append('--' + self.PARAM_IN_THREADN)
        mp_cmd.append(str(params[self.PARAM_IN_THREADN]))

        if params.get(self.PARAM_IN_READS_FILES, None) is not None:
            # print('Input reads files:\n' + pformat(params[self.PARAM_IN_READS_FILES]))
            mp_cmd.append('--' + self.PARAM_IN_READS_FILES)
            for reads_file in params[self.PARAM_IN_READS_FILES]:
                mp_cmd.append(reads_file)
                readName, readsExtension = os.path.splitext(reads_file)
                # print('Reads file name-- {}/extension-- {}:'.format(readName, readsExtension))
                if readsExtension == '.gz':
                    mp_cmd.append('--readFilesCommand')
                    mp_cmd.append('gunzip')
                    mp_cmd.append('-c')

                if readsExtension == '.bz2':
                    mp_cmd.append('--readFilesCommand')
                    mp_cmd.append('bunzip2')
                    mp_cmd.append('-c')

        # STEP 3: appending the advanced optional inputs
        mp_cmd.append('--' + self.PARAM_IN_OUTFILE_PREFIX)
        mp_cmd.append(os.path.join(star_out_dir, params[self.PARAM_IN_OUTFILE_PREFIX]))

        if params.get('sjdbGTFfile', None) is not None:
            mp_cmd.append('--sjdbGTFfile')
            mp_cmd.append(params['sjdbGTFfile'])
        if (params.get('sjdbOverhang', None) is not None
                and params['sjdbOverhang'] > 0):
            mp_cmd.append('--sjdbOverhang')
            mp_cmd.append(str(params['sjdbOverhang']))

        if (params.get('outFilterType', None) is not None
                and isinstance(params['outFilterType'], str)):
            mp_cmd.append('--outFilterType')
            mp_cmd.append(params['outFilterType'])
        if (params.get('outFilterMultimapNmax', None) is not None
                and isinstance(params['outFilterMultimapNmax'], int)
                and params['outFilterMultimapNmax'] >= 0):
            mp_cmd.append('--outFilterMultimapNmax')
            mp_cmd.append(str(params['outFilterMultimapNmax']))

        #output sorted file:Aligned.sortedByCoord.out.bam
        #allowed values of --outSAMtype are BAM Unsorted or SortedByCoordinate or both
        if params.get('outSAMtype', None) is not None:
            mp_cmd.append('--outSAMtype')
            mp_cmd.append(params['outSAMtype'])
            if params.get('outSAMtype', None) == 'BAM':
                mp_cmd.append('SortedByCoordinate')

        # 'It is recommended to remove the non-canonical junctions for Cufflinks runs using
        # --outFilterIntronMotifs RemoveNoncanonical'
        if params.get('outFilterIntronMotifs', None) is not None:
            mp_cmd.append('--outFilterIntronMotifs')
            mp_cmd.append('RemoveNoncanonical')

        if (params.get('outSAMattrIHstart', None) is not None
                and isinstance(params['outSAMattrIHstart'], int)
                and params['outSAMattrIHstart'] >= 0):
            mp_cmd.append('--outSAMattrIHstart')
            mp_cmd.append(str(params['outSAMattrIHstart']))
        if (params.get('outSAMstrandField', None) is not None
                and isinstance(params['outSAMstrandField'], str)):
            mp_cmd.append('--outSAMstrandField')
            mp_cmd.append(params['outSAMstrandField'])

        quant_modes = ["TranscriptomeSAM", "GeneCounts", "Both"]
        if (params.get('quantMode', None) is not None
                and params.get('quantMode', None) in quant_modes):
            mp_cmd.append('--quantMode')
            if params['quantMode'] == "Both":
                mp_cmd.append("TranscriptomeSAM")
                mp_cmd.append("GeneCounts")
            else:
                mp_cmd.append(params['quantMode'])
        if (params.get('alignSJoverhangMin', None) is not None
                and isinstance(params['alignSJoverhangMin'], int)
                and params['alignSJoverhangMin'] > 0):
            mp_cmd.append('--alignSJoverhangMin')
            mp_cmd.append(str(params['alignSJoverhangMin']))
        if (params.get('alignSJDBoverhangMin', None) is not None
                and isinstance(params['alignSJDBoverhangMin'], int)
                and params['alignSJDBoverhangMin'] > 0):
            mp_cmd.append('--alignSJDBoverhangMin')
            mp_cmd.append(str(params['alignSJDBoverhangMin']))
        if (params.get('outFilterMismatchNmax', None) is not None
                and isinstance(params['outFilterMismatchNmax'], int)
                and params['outFilterMismatchNmax'] > 0):
            mp_cmd.append('--outFilterMismatchNmax')
            mp_cmd.append(str(params['outFilterMismatchNmax']))
        if (params.get('alignIntronMin', None) is not None
                and isinstance(params['alignIntronMin'], int)
                and params['alignIntronMin'] > 0):
            mp_cmd.append('--alignIntronMin')
            mp_cmd.append(str(params['alignIntronMin']))
        if (params.get('alignIntronMax', None) is not None
                and isinstance(params['alignIntronMax'], int)
                and params['alignIntronMax'] >= 0):
            mp_cmd.append('--alignIntronMax')
            mp_cmd.append(str(params['alignIntronMax']))
        if (params.get('alignMatesGapMax', None) is not None
                and isinstance(params['alignMatesGapMax'], int)
                and params['alignMatesGapMax'] >= 0):
            mp_cmd.append('--alignMatesGapMax')
            mp_cmd.append(str(params['alignMatesGapMax']))

        #print ('STAR mapping CMD:')
        #print ' '.join(mp_cmd)
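        # Illustrative example of an assembled mapping command (file names and values
        # are hypothetical; assumes PARAM_IN_OUTFILE_PREFIX maps to STAR's
        # --outFileNamePrefix):
        #   STAR --genomeDir /path/to/star_idx --runMode alignReads --runThreadN 4
        #        --readFilesIn reads_1.fastq.gz reads_2.fastq.gz --readFilesCommand gunzip -c
        #        --outFileNamePrefix /path/to/out/sample_ --outSAMtype BAM SortedByCoordinate
        #        --quantMode TranscriptomeSAM GeneCounts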
        return mp_cmd

    def _exec_indexing(self, params):
        log('Running STAR index generating with params:\n' + pformat(params))

        idx_cmd = self._construct_indexing_cmd(params)

        exitCode = self.prog_runner.run(idx_cmd, self.scratch)

        return exitCode

    def _exec_mapping(self, params):
        log('Running STAR mapping with params:\n' + pformat(params))

        mp_cmd = self._construct_mapping_cmd(params)

        exitCode = self.prog_runner.run(mp_cmd, self.scratch)

        return exitCode

    def _exec_star_pipeline(self, params, rds_files, rds_name, idx_dir, out_dir):
        # build the parameters
        params_idx = self._get_indexing_params(params, idx_dir)
        params_mp = self._get_mapping_params(params, rds_files, rds_name, idx_dir, out_dir)

        # execute indexing and then mapping
        ret = None
        try:
            if params[self.PARAM_IN_STARMODE] == 'genomeGenerate':
                ret = self._exec_indexing(params_idx)
            else:
                ret = 0
            # the original busy-wait loop here would spin forever on a nonzero
            # exit code; signal failure explicitly instead
            if ret != 0:
                raise ValueError('STAR genome indexing failed with exit code {}'.format(ret))
        except ValueError as eidx:
            log('STAR genome indexing raised error:\n')
            pprint(eidx)
        else:  # no exception raised by genome indexing and STAR returned 0, so run mapping
            params_mp[self.PARAM_IN_STARMODE] = 'alignReads'
            try:
                ret = self._exec_mapping(params_mp)
                if ret != 0:
                    raise ValueError('STAR mapping failed with exit code {}'.format(ret))
            except ValueError as emp:
                log('STAR mapping raised error:\n')
                pprint(emp)
            else:  # no exception raised by STAR mapping and STAR returned 0, so move on to saving and reporting
                ret = {'star_idx': idx_dir, 'star_output': params_mp.get('align_output')}

        return ret


    def upload_STARalignment(self, input_params, reads_ref, reads_info, output_bam_file):
        """
        Uploads the alignment file + metadata.
        Returns the STAR alignment reference.
        """

        aligner_opts = dict()
        for k in input_params:
            aligner_opts[k] = str(input_params[k])
        pprint(reads_info)

        alignment_name = reads_ref['alignment_output_name']
        align_upload_params = {
            "destination_ref": "{}/{}".format(input_params[self.PARAM_IN_WS], alignment_name),
            "file_path": output_bam_file,
            "assembly_or_genome_ref": input_params[self.PARAM_IN_GENOME],
            "read_library_ref": reads_info['object_ref'],
            "library_type": reads_info['style'],
            "condition": reads_info['condition'],
            "aligned_using": 'STAR',
            "aligner_version":self.STAR_VERSION,
            "aligner_opts": aligner_opts
        }

        pprint(align_upload_params)

        ra_util = ReadsAlignmentUtils(self.callback_url, service_ver='beta')
        rau_upload_ret = ra_util.upload_alignment(align_upload_params)
        alignment_ref = rau_upload_ret["obj_ref"]
        print("STAR alignment uploaded as object {}".format(alignment_ref))
        return rau_upload_ret


    def generate_report_for_single_run(self, run_output_info, params):
        input_ref = run_output_info['upload_results']['obj_ref']
        index_dir = run_output_info['index_dir']
        output_dir = run_output_info['output_dir']
        output_files = self._generate_output_file_list(index_dir, output_dir)

        # first run qualimap
        qualimap_report = self.qualimap.run_bamqc({'input_ref': input_ref})
        qc_result_zip_info = qualimap_report['qc_result_zip_info']

        # create report
        report_text = 'Ran on a single reads library.\n\n'
        alignment_info = self.get_obj_infos(input_ref)[0]
        report_text += 'Created ReadsAlignment: ' + str(alignment_info[1]) + '\n'
        report_text += '                        ' + input_ref + '\n'
        kbr = KBaseReport(self.callback_url)
        report_info = kbr.create_extended_report({'message': report_text,
                                                  'file_links': output_files,
                                                  'objects_created': [{'ref': input_ref,
                                                                       'description': 'ReadsAlignment'}],
                                                  'report_object_name': 'kb_STAR_report_' + str(uuid.uuid4()),
                                                  'direct_html_link_index': 0,
                                                  'html_links': [{'shock_id': qc_result_zip_info['shock_id'],
                                                                  'name': qc_result_zip_info['index_html_file_name'],
                                                                  'label': qc_result_zip_info['name']}],
                                                  'html_window_height': 366,
                                                  'workspace_name': params['output_workspace']
                                                  })
        return report_info #{'report_name': report_info['name'], 'report_ref': report_info['ref']}

    def _get_reads_info(self, reads, readsSet_ref):
        '''
        _get_reads_info: fetches the detailed info for each reads object whose ref is in the list reads_refs
        return an object of the following structure:
        {
            "style": "paired", "single", or "interleaved",
            "file_fwd": path_to_file,
            "name": name of the reads,
            "file_rev": path_to_file, only if paired end,
            "object_ref": reads reference for downstream convenience,
            "condition": the condition for the reads.
        }
        '''
        try:
            print("Fetching FASTA file from reads reference {}".format(reads['ref']))
            ret_reads_info = fetch_reads_from_reference(reads['ref'], self.callback_url)
        except ValueError:
            print("Incorrect object type for fetching a FASTA file!")
            raise

        if ret_reads_info.get("file_fwd", None) is None:
            raise RuntimeError("FASTA file fetched from reads {} doesn't seem to exist!".format(reads['ref']))
        else:
            if reads.get('condition', None) is not None:
                ret_reads_info['condition'] = reads['condition']
            else:
                ret_reads_info['condition'] = 'unspecified'
            if reads.get('object_ref', None) != readsSet_ref:
                ret_reads_info[self.PARAM_IN_READS] = readsSet_ref

        return ret_reads_info


    def _get_genome_fasta(self, gnm_ref):
        genome_fasta_files = list()
        if gnm_ref is not None:
            try:
                print("Fetching FASTA file from object {}".format(gnm_ref))
                genome_fasta_file = fetch_fasta_from_object(gnm_ref, self.workspace_url, self.callback_url)
                print("Done fetching FASTA file! Path = {}".format(genome_fasta_file.get("path", None)))
            except ValueError:
                print("Incorrect object type for fetching a FASTA file!")
                raise

            if genome_fasta_file.get("path", None) is None:
                raise RuntimeError("FASTA file fetched from object {} doesn't seem to exist!".format(gnm_ref))
            else:
                genome_fasta_files.append(genome_fasta_file["path"])
        return genome_fasta_files


    def convert_params(self, validated_params):
        """
        Convert input parameters with KBase ref format into STAR parameters,
        and add the advanced options.
        """
        params = copy.deepcopy(validated_params)
        params['runMode'] = 'genomeGenerate'

        if validated_params.get('create_report', None) is not None:
            params['create_report'] = validated_params['create_report']
        if validated_params.get('concurrent_local_tasks', None) is not None:
            params['concurrent_local_tasks'] = validated_params['concurrent_local_tasks']
        if validated_params.get('concurrent_njsw_tasks', None) is not None:
            params['concurrent_njsw_tasks'] = validated_params['concurrent_njsw_tasks']
        if validated_params.get('alignmentset_suffix', None) is not None:
            params['alignmentset_suffix'] = validated_params['alignmentset_suffix']

        # Add advanced options from validated_params to params
        sjdbGTFfile = validated_params.get("sjdbGTFfile", None)
        if sjdbGTFfile is not None:
            params['sjdbGTFfile'] = sjdbGTFfile
        else:
            params['sjdbGTFfile'] = self._get_genome_gtf_file(
                                        params[self.PARAM_IN_GENOME],
                                        os.path.join(self.scratch, self.STAR_IDX_DIR))
        if validated_params.get('sjdbOverhang', None) is not None:
            params['sjdbOverhang'] = validated_params['sjdbOverhang']
        else:
            params['sjdbOverhang'] = 100

        quant_modes = ["TranscriptomeSAM", "GeneCounts", "Both"]
        if (validated_params.get('quantMode', None) is not None
                and validated_params.get('quantMode', None) in quant_modes):
            params['quantMode'] = validated_params['quantMode']
        else:
            params['quantMode'] = 'Both'
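
        # Net effect (sketch): a caller supplying only the required refs would get
        # back params with runMode='genomeGenerate', sjdbOverhang=100,
        # quantMode='Both', and sjdbGTFfile pointing at a GTF converted from the
        # genome object via GenomeFileUtil (the path depends on self.scratch).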

        return params


    def _get_indexing_params(self, params, star_idx_dir):
        params_idx = {
            'runMode': 'genomeGenerate',
            'runThreadN': params[self.PARAM_IN_THREADN],
            self.STAR_IDX_DIR: star_idx_dir,
            'genomeFastaFiles': params[self.PARAM_IN_FASTA_FILES]
        }
        if params.get('sjdbGTFfile', None) is not None:
            params_idx['sjdbGTFfile'] = params['sjdbGTFfile']
        if params.get('sjdbOverhang', None) is not None:
            params_idx['sjdbOverhang'] = params['sjdbOverhang']

        return params_idx


    def _get_mapping_params(self, params, rds_files, rds_name, idx_dir, out_dir):
        ''' build the mapping parameters'''
        aligndir = out_dir
        if rds_name:
            aligndir = os.path.join(out_dir, rds_name)
            self._mkdir_p(aligndir)
            #print '**********STAR output directory created:{}'.format(aligndir)

        params_mp = copy.deepcopy(params)
        params_mp['runMode'] = 'alignReads'
        params_mp['readFilesIn'] = rds_files
        params_mp[self.STAR_IDX_DIR] = idx_dir
        params_mp['align_output'] = aligndir

        return params_mp


    def determine_input_info(self, validated_params):
        ''' get info on the readsset_ref object and determine if we run once or run on a set
        input info provides information on the input and tells us if we should
        run as a single_library or as a set:
             input_info = {'run_mode': '', 'info': [..], 'ref': '55/1/2'}
        '''
        info = self.get_obj_infos(validated_params[self.PARAM_IN_READS])[0]
        obj_type = self.get_type_from_obj_info(info)
        if obj_type in ['KBaseAssembly.PairedEndLibrary', 'KBaseAssembly.SingleEndLibrary',
                        'KBaseFile.PairedEndLibrary', 'KBaseFile.SingleEndLibrary']:
            return {'run_mode': 'single_library', 'info': info, 'ref': validated_params[self.PARAM_IN_READS]}
        if obj_type == 'KBaseRNASeq.RNASeqSampleSet':
            return {'run_mode': 'sample_set', 'info': info, 'ref': validated_params[self.PARAM_IN_READS]}
        if obj_type == 'KBaseSets.ReadsSet':
            return {'run_mode': 'sample_set', 'info': info, 'ref': validated_params[self.PARAM_IN_READS]}

        raise ValueError('Object type of readsset_ref is not valid, was: ' + str(obj_type))

    def determine_unique_reads_names(self, validated_params):
        infos = self.get_obj_infos(validated_params[self.PARAM_IN_READS])
        return get_unique_names(infos)

    def get_type_from_obj_info(self, info):
        return info[2].split('-')[0]

    def get_name_from_obj_info(self, info):
        return info[1]

    def get_obj_infos(self, ref):
        return self.ws_client.get_object_info3({'objects': [{'ref': ref}]})['infos']

    def get_object_names(self, ref_list):
        """
        From a list of workspace references, returns a mapping from ref -> name of the object.
        """
        obj_ids = list()
        for ref in ref_list:
            obj_ids.append({"ref": ref})
        info = self.ws_client.get_object_info3({"objects": obj_ids})
        name_map = dict()
        # we already have the refs as passed previously, so use those for mapping, as they're in
        # the same order as what's returned.
        for i in range(len(info["infos"])):
            name_map[ref_list[i]] = info["infos"][i][1]
        return name_map


    def _mkdir_p(self, dir):
        """
        _mkdir_p: make directory for given path
        """
        log('Creating a new dir: ' + dir)
        if not dir:
            return
        if not os.path.exists(dir):
            os.makedirs(dir)
        else:
            log('{} has existed, so skip creating.'.format(dir))


    def create_star_dirs(self, star_home):
        '''creating the directories for STAR'''
        # the index directory
        idxdir = os.path.join(star_home, self.STAR_IDX_DIR)
        self._mkdir_p(idxdir)
        # the output directory
        outdir = os.path.join(star_home, self.STAR_OUT_DIR)
        self._mkdir_p(outdir)

        return (idxdir, outdir)


    def _get_reads_refs_from_setref(self, params):
        readsSet_ref = params[self.PARAM_IN_READS]
        reads_refs = list()
        try:
            #print("Fetching reads ref(s) from sample/reads set ref {}".format(readsSet_ref))
            reads_refs = fetch_reads_refs_from_sampleset(
                                    readsSet_ref,
                                    self.workspace_url,
                                    self.callback_url,
                                    params)
            #print("\nDone fetching reads ref(s) from readsSet {}--\nDetails:\n".format(readsSet_ref))
            #pprint(reads_refs)
        except ValueError:
            print("Incorrect object type for fetching reads ref(s)!")
            raise

        return reads_refs

    def _generate_output_file_list(self, idx_dir, out_dir):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """

        log('start packing result files')

        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        star_index = os.path.join(output_directory, 'star_index.zip')
        star_output = os.path.join(output_directory, 'star_output.zip')
        self.zip_folder(idx_dir, star_index)
        self.zip_folder(out_dir, star_output)

        #star_index = self.zip_folder_withDFU(idx_dir, 'star_index')
        #star_output = self.zip_folder_withDFU(out_dir, 'star_output')

        output_files.append({'path': star_index,
                             'name': os.path.basename(star_index),
                             'label': os.path.basename(star_index),
                             'description': 'Index file(s) generated by STAR'})

        output_files.append({'path': star_output,
                             'name': os.path.basename(star_output),
                             'label': os.path.basename(star_output),
                             'description': 'Output file(s) generated by STAR'})

        return output_files


    def zip_folder_withDFU(self, folder_path, output_name):
        """Zip the contents of an entire folder (with that folder included
        in the archive). Empty subfolders will be included in the archive
        as well.
        """
        output_path = self.dfu.pack_file(
                {'file_path': folder_path + '/' + output_name,
                 'pack': 'zip'})['file_path']

        print "{} created successfully.".format(output_path)

        #with zipfile.ZipFile(output_path, "r") as f:
            #print 'Checking the zipped file......\n'
            #for info in f.infolist():
                #    print info.filename, info.date_time, info.file_size, info.compress_size
            #for fn in f.namelist():
                #print fn

        return output_path


    def zip_folder(self, folder_path, output_path):
        """Zip the contents of an entire folder (with that folder included in the archive). 
        Empty subfolders could be included in the archive as well if the commented portion is used.
        """
        with zipfile.ZipFile(output_path, 'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as ziph:
            for root, folders, files in os.walk(folder_path):
                # Include all subfolders, including empty ones.
                #for folder_name in folders:
                #    absolute_path = os.path.join(root, folder_name)
                #    relative_path = os.path.join(os.path.basename(root), folder_name)
                #    print "Adding {} to archive.".format(absolute_path)
                #    ziph.write(absolute_path, relative_path)
                for f in files:
                    absolute_path = os.path.join(root, f)
                    relative_path = os.path.join(os.path.basename(root), f)
                    #print "Adding {} to archive.".format(absolute_path)
                    ziph.write(absolute_path, relative_path)

        print "{} created successfully.".format(output_path)

        #with zipfile.ZipFile(output_path, "r") as f:
        #    print 'Checking the zipped file......\n'
        #    for info in f.infolist():
        #        print info.filename, info.date_time, info.file_size, info.compress_size
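
        # Hypothetical usage sketch (paths are illustrative only):
        #   self.zip_folder(os.path.join(self.scratch, 'star_output'),
        #                   os.path.join(self.scratch, 'star_output.zip'))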


    def _generate_html_report(self, out_dir, obj_ref):
        """
        _generate_html_report: generate html summary report
        """

        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        star_obj = self.ws_client.get_objects2({'objects':
                                                 [{'ref': obj_ref}]})['data'][0]
        star_obj_info = star_obj['info']
        star_obj_data = star_obj['data']
        star_obj_type = star_obj_info[2]

        Overview_Content = ''
        if re.match(r'KBaseRNASeq.RNASeqAlignment-\d.\d', star_obj_type):
            Overview_Content += '<br/><table><tr><th>Generated Alignment Object</th>'
            Overview_Content += '<th></th></tr>'
            Overview_Content += '<tr><th>Alignment Name</th><th>Condition</th></tr>'
            Overview_Content += '<tr><td>{} ({})</td>'.format(star_obj_info[1],obj_ref)
            Overview_Content += '<td>{}</td></tr>'.format(star_obj_data['condition'])
            Overview_Content += '</table>'
        elif (re.match(r'KBaseRNASeq.RNASeqAlignmentSet-\d.\d', star_obj_type)
                or re.match(r'KBaseSets.ReadsAlignmentSet-\d.\d', star_obj_type)
                or re.match(r'KBaseSet.RNASeqAlignmentSet-\d.\d', star_obj_type)):
            Overview_Content += '<br/><table><tr><th>Generated AlignmentSet Object</th></tr>'
            Overview_Content += '<tr><td>{} ({})'.format(star_obj_info[1],obj_ref)
            Overview_Content += '</td></tr></table>'
            Overview_Content += '<p><br/></p>'
            Overview_Content += '<table><tr><th>Generated Alignment Objects</th>'
            Overview_Content += '<th></th></tr>'
            Overview_Content += self._fill_html_trs('Alignment Name', star_obj_data)
            Overview_Content += '</table>'
        elif re.match(r'KBaseRNASeq.RNASeqExpression-\d.\d', star_obj_type):
            Overview_Content += '<br/><table><tr><th>Generated Expression Object</th>'
            Overview_Content += '<th></th></tr>'
            Overview_Content += '<tr><th>Expression Name</th><th>Condition</th></tr>'
            Overview_Content += '<tr><td>{} ({})</td>'.format(star_obj_info[1], obj_ref)
            Overview_Content += '<td>{}</td></tr>'.format(star_obj_data['condition'])
            Overview_Content += '</table>'
        elif re.match(r'KBaseSets.ExpressionSet-\d.\d', star_obj_type):
            Overview_Content += '<br/><table><tr><th>Generated ExpressionSet Object</th></tr>'
            Overview_Content += '<tr><td>{} ({})'.format(star_obj_info[1], obj_ref)
            Overview_Content += '</td></tr></table>'
            Overview_Content += '<p><br/></p>'
            Overview_Content += '<table><tr><th>Generated Expression Objects</th>'
            Overview_Content += '<th></th></tr>'
            Overview_Content += self._fill_html_trs('Expression Name', star_obj_data)
            Overview_Content += '</table>'

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<p>Overview_Content</p>',
                                                          Overview_Content)
                result_file.write(report_template)

        html_report.append({'path': result_file_path,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for STAR'})
        return html_report

    def _fill_html_trs(self, col_caption, obj_data):
        '''
        _fill_html_trs: simply creates an HTML string with table rows (tr) of td cells
        '''
        tr_html_str = '<tr><th>{}</th><th>Condition</th></tr>'.format(col_caption)

        for item in obj_data['items']:
            item_obj = self.ws_client.get_objects2({'objects':[{'ref': item['ref']}]})['data'][0]
            item_obj_info = item_obj['info']
            item_obj_data = item_obj['data']
            obj_name = item_obj_info[1]

            tr_html_str += '<tr><td>{} ({})</td>'.format(obj_name, item['ref'])
            tr_html_str += '<td>{}</td></tr>'.format(item_obj_data['condition'])

        return tr_html_str

    def _generate_star_report(self, obj_ref, report_text, html_links, workspace_name, index_dir, output_dir):
        """
        _generate_star_report: generate summary report
        """
        log('creating STAR report')

        output_files = self._generate_output_file_list(index_dir, output_dir)
        output_html_files = self._generate_html_report(output_dir, obj_ref)
        output_html_files += html_links

        star_obj = self.ws_client.get_objects2({'objects':[{'ref': obj_ref}]})['data'][0]
        star_obj_info = star_obj['info']
        star_obj_data = star_obj['data']

        star_obj_type = star_obj_info[2]
        if re.match(r'KBaseRNASeq.RNASeqAlignment-\d+.\d+', star_obj_type):
            objects_created = [{'ref': obj_ref,
                                'description': 'RNASeqAlignment generated by STAR'}]
        elif (re.match(r'KBaseRNASeq.RNASeqAlignmentSet-\d+.\d+', star_obj_type)
                or re.match(r'KBaseSets.ReadsAlignmentSet-\d+.\d+', star_obj_type)
                or re.match(r'KBaseSet.RNASeqAlignmentSet-\d+.\d+', star_obj_type)):
            objects_created = [{'ref': obj_ref,
                                'description': '{} generated by STAR'.format(
                                    re.sub(r"-\d+.\d+", "", star_obj_type))}]
            items = star_obj_data['items']
            for item in items:
                objects_created.append({'ref': item['ref'],
                                        'description': 'Alignment generated by STAR'})
        elif re.match(r'KBaseRNASeq.RNASeqExpression-\d+.\d+', star_obj_type):
            objects_created = [{'ref': obj_ref,
                                'description': 'Expression generated by STAR'}]
        elif re.match(r'KBaseSets.ExpressionSet-\d+.\d+', star_obj_type):
            objects_created = [{'ref': obj_ref,
                                'description': 'ExpressionSet generated by STAR'}]
            items = star_obj_data['items']
            for item in items:
                objects_created.append({'ref': item['ref'],
                                        'description': 'Expression generated by STAR'})

        report_params = {'message': report_text,
                         'workspace_name': workspace_name,
                         'file_links': output_files,
                         'objects_created': objects_created,
                         'html_links': output_html_files,
                         'direct_html_link_index': 0,
                         'html_window_height': 366,
                         'report_object_name': 'kb_STAR_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        report_output = kbase_report_client.create_extended_report(report_params)

        return report_output

    def upload_alignment_set(self, alignment_items, alignmentset_name, ws_name):
        """
        Compiles and saves a set of alignment references (+ other stuff) into a
        KBaseRNASeq.RNASeqAlignmentSet.
        Returns the reference to the new alignment set.
        alignment_items: [{
            "ref": alignment_ref,
            "label": condition label.
        }]
        """
        print("Uploading completed alignment set")
        alignment_set = {
            "description": "Alignments using STAR, v.{}".format(self.STAR_VERSION),
            "items": alignment_items
        }
        set_info = self.set_api_client.save_reads_alignment_set_v1({
            "workspace": ws_name,
            "output_object_name": alignmentset_name,
            "data": alignment_set
        })
        return set_info
Example #15
0
class StringTieUtil:
    STRINGTIE_TOOLKIT_PATH = '/kb/deployment/bin/StringTie'
    GFFREAD_TOOLKIT_PATH = '/kb/deployment/bin/gffread'

    OPTIONS_MAP = {
        'output_transcripts': '-o',
        'gene_abundances_file': '-A',
        'num_threads': '-p',
        'fr_firststrand': '--rf',
        'fr_secondstrand': '--fr',
        'cov_refs_file': '-C',
        'junction_base': '-a',
        'junction_coverage': '-j',
        'disable_trimming': '-t',
        'min_locus_gap_sep_value': '-g',
        'ballgown_mode': '-B',
        'skip_reads_with_no_ref': '-e',
        'maximum_fraction': '-M',
        'label': '-l',
        'gtf_file': '-G',
        'min_length': '-m',
        'min_read_coverage': '-c',
        'min_isoform_abundance': '-f'
    }

    BOOLEAN_OPTIONS = [
        'disable_trimming', 'ballgown_mode', 'skip_reads_with_no_ref'
    ]

    def _validate_run_stringtie_params(self, params):
        """
        _validate_run_stringtie_params:
                validates params passed to run_stringtie method
        """

        log('start validating run_stringtie params')

        # check for required parameters
        for p in [
                'alignment_object_ref', 'workspace_name', 'expression_suffix',
                'expression_set_suffix'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _generate_command(self, params):
        """
        _generate_command: generate stringtie command
        """

        command = self.STRINGTIE_TOOLKIT_PATH + '/stringtie '

        for key, option in self.OPTIONS_MAP.items():
            option_value = params.get(key)
            if key in self.BOOLEAN_OPTIONS and option_value:
                option_value = ' '
            if option_value:
                command += '{} {} '.format(option, option_value)

        command += '{} '.format(params.get('input_file'))

        log('generated stringtie command: {}'.format(command))
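
        # Illustrative example of a generated command (paths hypothetical; flags
        # follow OPTIONS_MAP above, input file appended last):
        #   /kb/deployment/bin/StringTie/stringtie -o transcripts.gtf -A genes.fpkm_tracking
        #       -p 4 -G annotation.gtf alignment_sorted.bam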

        return command

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """

        log('start executing command:\n{}'.format(command))

        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if exitCode == 0:
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)
            raise ValueError(error_msg)

    def _run_gffread(self, gff_path, gtf_path):
        """
        _run_gffread: run gffread script

        ref: http://ccb.jhu.edu/software/stringtie/gff.shtml
        """

        log('converting gff to gtf')

        command = self.GFFREAD_TOOLKIT_PATH + '/gffread '
        command += "-E {0} -T -o {1}".format(gff_path, gtf_path)

        self._run_command(command)

    def _get_input_file(self, alignment_ref):
        """
        _get_input_file: get the input SAM/BAM file from the Alignment object
        """

        log('getting bam file from alignment')

        bam_file_dir = self.rau.download_alignment(
            {'source_ref': alignment_ref})['destination_dir']

        files = os.listdir(bam_file_dir)
        bam_file_list = [
            file for file in files if re.match(r'.*\_sorted\.bam', file)
        ]
        if not bam_file_list:
            bam_file_list = [
                file for file in files if re.match(r'.*(?<!sorted)\.bam', file)
            ]
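
        # Note (illustrative filenames): the first pattern prefers a name like
        # 'sample_sorted.bam'; this fallback matches e.g. 'sample.bam' while the
        # negative lookbehind excludes names ending in 'sorted.bam'.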

        if not bam_file_list:
            raise ValueError('Cannot find .bam file from alignment {}'.format(
                alignment_ref))

        bam_file_name = bam_file_list[0]

        bam_file = os.path.join(bam_file_dir, bam_file_name)

        return bam_file

    def _get_gtf_file(self, alignment_ref, result_directory):
        """
        _get_gtf_file: get the reference annotation file (in GTF or GFF3 format)
        """

        alignment_data = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_ref
            }]})['data'][0]['data']

        genome_ref = alignment_data.get('genome_id')

        # annotation_file = self._create_gtf_file(genome_ref, result_directory)
        annotation_file = self._create_gtf_annotation_from_genome(
            genome_ref, result_directory)

        gene_name_annotation_file = annotation_file.split('.gtf')[0] + '_append_name.gtf'

        with open(gene_name_annotation_file, 'w') as output_file:
            with open(annotation_file, 'r') as input_file:
                for line in input_file:
                    if ('gene_id \"' in line) and ('gene_name \"' not in line):
                        line = line.replace("\n", "")
                        gene_id = line.split('gene_id \"')[1].split('"')[0]
                        line += ' gene_name \"{}\";\n'.format(gene_id)
                        output_file.write(line)
                    else:
                        output_file.write(line)
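
        # Illustrative transformation performed above (line content hypothetical):
        #   in : ... gene_id "b0001"; transcript_id "b0001_t";
        #   out: ... gene_id "b0001"; transcript_id "b0001_t"; gene_name "b0001";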

        return gene_name_annotation_file

    def _create_gtf_annotation_from_genome(self, genome_ref, result_directory):
        """
        _create_gtf_annotation_from_genome: create reference annotation file from genome
        """
        ref = self.ws.get_object_subset([{
            'ref': genome_ref,
            'included': ['contigset_ref', 'assembly_ref']
        }])
        contig_id = None  # so the check below works even if neither ref is present
        if 'contigset_ref' in ref[0]['data']:
            contig_id = ref[0]['data']['contigset_ref']
        elif 'assembly_ref' in ref[0]['data']:
            contig_id = ref[0]['data']['assembly_ref']
        if contig_id is None:
            raise ValueError(
                "Genome at {0} does not have reference to the assembly object".
                format(genome_ref))
        print(contig_id)
        log("Generating GFF file from Genome")
        try:
            ret = self.au.get_assembly_as_fasta({'ref': contig_id})
            fa_output_file = ret['path']

            shutil.copy(fa_output_file, result_directory)
            fa_output_name = os.path.basename(fa_output_file)
            fa_output_file = os.path.join(result_directory, fa_output_name)

            mapping_filename = c_mapping.create_sanitized_contig_ids(
                fa_output_file)

            # get the GFF
            ret = self.gfu.genome_to_gff({
                'genome_ref': genome_ref,
                'target_dir': result_directory
            })
            genome_gff_file = ret['file_path']
            c_mapping.replace_gff_contig_ids(genome_gff_file,
                                             mapping_filename,
                                             to_modified=True)
            gtf_ext = ".gtf"

            if not genome_gff_file.endswith(gtf_ext):
                gtf_path = os.path.splitext(genome_gff_file)[0] + '.gtf'
                self._run_gffread(genome_gff_file, gtf_path)
            else:
                gtf_path = genome_gff_file

            log("gtf file : " + gtf_path)
        except Exception:
            raise ValueError(
                "Generating GTF file from Genome Annotation object failed: {}"
                .format("".join(traceback.format_exc())))

        return gtf_path

    def _create_gtf_file(self, genome_ref, result_directory):
        """
        _create_gtf_file: create reference annotation file from genome
        """

        log('start generating reference annotation file')

        genome_gtf_file = self.gfu.genome_to_gff({
            'genome_ref': genome_ref,
            'target_dir': result_directory,
            'is_gtf': True
        })['file_path']

        return genome_gtf_file

    def _save_expression(self, result_directory, alignment_ref, workspace_name,
                         gtf_file, expression_suffix):
        """
        _save_expression: save Expression object to workspace
        """

        log('start saving Expression object')

        alignment_data_object = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_ref
            }]})['data'][0]

        alignment_name = alignment_data_object['info'][1]
        if re.match('.*_*[Aa]lignment', alignment_name):
            expression_obj_name = re.sub('_*[Aa]lignment', expression_suffix,
                                         alignment_name)
        else:
            expression_obj_name = alignment_name + expression_suffix
        destination_ref = workspace_name + '/' + expression_obj_name
        upload_expression_params = {
            'destination_ref': destination_ref,
            'source_dir': result_directory,
            'alignment_ref': alignment_ref,
            'tool_used': 'StringTie',
            'tool_version': '1.3.3'
        }

        expression_ref = self.eu.upload_expression(
            upload_expression_params)['obj_ref']

        return expression_ref

    def _save_expression_set(self, alignment_expression_map, alignment_set_ref,
                             workspace_name, expression_set_suffix):
        """
        _save_expression_set: save ExpressionSet object to workspace
        """

        log('start saving ExpressionSet object')

        items = []
        for alignment_expression in alignment_expression_map:
            items.append({
                'ref': alignment_expression.get('expression_obj_ref'),
                'label': alignment_expression.get('alignment_label')
            })

        expression_set_data = {
            'description': 'ExpressionSet using StringTie',
            'items': items
        }

        alignment_set_data_object = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_set_ref
            }]})['data'][0]

        alignment_set_name = alignment_set_data_object['info'][1]
        if re.match('.*_*[Aa]lignment_*[Ss]et', alignment_set_name):
            expression_set_name = re.sub('_*[Aa]lignment_*[Ss]et',
                                         expression_set_suffix,
                                         alignment_set_name)
        else:
            expression_set_name = alignment_set_name + expression_set_suffix

        expression_set_save_params = {
            'data': expression_set_data,
            'workspace': workspace_name,
            'output_object_name': expression_set_name
        }

        save_result = self.set_client.save_expression_set_v1(
            expression_set_save_params)
        expression_set_ref = save_result['set_ref']

        return expression_set_ref

    def _save_expression_matrix(self, expressionset_ref, workspace_name):
        """
        _save_expression_matrix: save FPKM and TPM ExpressionMatrix
        """

        log('start saving ExpressionMatrix object')

        expression_set_name = self.ws.get_object_info(
            [{
                "ref": expressionset_ref
            }], includeMetadata=None)[0][1]

        output_obj_name_prefix = re.sub('_*[Ee]xpression_*[Ss]et', '',
                                        expression_set_name)

        upload_expression_matrix_params = {
            'expressionset_ref': expressionset_ref,
            'output_obj_name': output_obj_name_prefix,
            'workspace_name': workspace_name
        }

        expression_matrix_refs = self.eu.get_expressionMatrix(
            upload_expression_matrix_params)

        return expression_matrix_refs

    def _generate_output_file_list(self, result_directory):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """

        log('start packing result files')

        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'stringtie_result.zip')

        with zipfile.ZipFile(result_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if not (file.endswith('.DS_Store')
                            or os.path.basename(root) == 'merge_result'):
                        zip_file.write(
                            os.path.join(root, file),
                            os.path.join(os.path.basename(root), file))

        output_files.append({
            'path': result_file,
            'name': os.path.basename(result_file),
            'label': os.path.basename(result_file),
            'description': 'File(s) generated by StringTie App'
        })

        result_dirs = os.listdir(result_directory)
        if 'merge_result' in result_dirs:
            merge_file = os.path.join(result_directory, 'merge_result',
                                      'stringtie_merge.gtf')
            output_files.append({
                'path': merge_file,
                'name': os.path.basename(merge_file),
                'label': os.path.basename(merge_file),
                'description': 'merge file generated by StringTie App'
            })

            filtered_merge_file = os.path.join(result_directory,
                                               'merge_result',
                                               'filtered_stringtie_merge.gtf')
            output_files.append({
                'path': filtered_merge_file,
                'name': os.path.basename(filtered_merge_file),
                'label': os.path.basename(filtered_merge_file),
                'description': 'filtered merge file generated by StringTie App'
            })

        return output_files

    def _generate_merge_html_report(self, result_directory):
        """
        _generate_merge_html_report: generate html summary report for merge results
        """

        log('start generating merge html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        result_dirs = os.listdir(result_directory)

        Overview_Content = ''
        Overview_Content += '<br/><table><tr><th>Generated Files</th>'
        Overview_Content += '<th></th></tr>'
        Overview_Content += '<tr><th>Directory</th><th>File Name</th></tr>'
        for result_dir in result_dirs:
            result_files = os.listdir(
                os.path.join(result_directory, result_dir))
            result_files.sort()
            first_file = True
            for file_name in result_files:
                if first_file:
                    Overview_Content += '<tr><td>{}</td>'.format(result_dir)
                    Overview_Content += '<td>{}</td></tr>'.format(file_name)
                    first_file = False
                else:
                    Overview_Content += '<tr><td>{}</td>'.format('')
                    Overview_Content += '<td>{}</td></tr>'.format(file_name)
        Overview_Content += '</table>'

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', Overview_Content)
                result_file.write(report_template)

        html_report.append({
            'path': result_file_path,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report for StringTie App'
        })
        return html_report

    def _generate_html_report(self, result_directory, obj_ref):
        """
        _generate_html_report: generate html summary report
        """

        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        expression_object = self.ws.get_objects2(
            {'objects': [{
                'ref': obj_ref
            }]})['data'][0]
        expression_info = expression_object['info']
        expression_data = expression_object['data']

        expression_object_type = expression_info[2]
        Overview_Content = ''
        if re.match(r'KBaseRNASeq.RNASeqExpression-\d.\d',
                    expression_object_type):
            Overview_Content += '<br/><table><tr><th>Generated Expression Object</th>'
            Overview_Content += '<th></th></tr>'
            Overview_Content += '<tr><th>Expression Name</th><th>Condition</th></tr>'
            Overview_Content += '<tr><td>{} ({})</td>'.format(
                expression_info[1], obj_ref)
            Overview_Content += '<td>{}</td></tr>'.format(
                expression_data['condition'])
            Overview_Content += '</table>'
        elif re.match(r'KBaseSets.ExpressionSet-\d.\d', expression_object_type):
            Overview_Content += '<br/><table><tr><th>Generated ExpressionSet Object</th></tr>'
            Overview_Content += '<tr><td>{} ({})'.format(
                expression_info[1], obj_ref)
            Overview_Content += '</td></tr></table>'
            Overview_Content += '<p><br/></p>'
            Overview_Content += '<table><tr><th>Generated Expression Objects</th>'
            Overview_Content += '<th></th></tr>'
            Overview_Content += '<tr><th>Expression Name</th><th>Condition</th></tr>'
            for item in expression_data['items']:
                item_expression_object = self.ws.get_objects2(
                    {'objects': [{
                        'ref': item['ref']
                    }]})['data'][0]
                item_expression_info = item_expression_object['info']
                item_expression_data = item_expression_object['data']
                expression_name = item_expression_info[1]
                Overview_Content += '<tr><td>{} ({})</td>'.format(
                    expression_name, item['ref'])
                Overview_Content += '<td>{}</td>'.format(
                    item_expression_data['condition'])
                Overview_Content += '</tr>'
            Overview_Content += '</table>'
        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', Overview_Content)
                result_file.write(report_template)

        html_report.append({
            'path': result_file_path,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report for StringTie App'
        })
        return html_report

    def _generate_merge_report(self, workspace_name, result_directory):
        """
        _generate_merge_report: generate summary report
        """

        log('creating merge report')

        output_files = self._generate_output_file_list(result_directory)
        output_html_files = self._generate_merge_html_report(result_directory)

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'file_links': output_files,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 366,
            'report_object_name': 'kb_stringtie_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _generate_report(self,
                         obj_ref,
                         workspace_name,
                         result_directory,
                         exprMatrix_FPKM_ref=None,
                         exprMatrix_TPM_ref=None):
        """
        _generate_report: generate summary report
        """

        log('creating report')

        output_files = self._generate_output_file_list(result_directory)
        output_html_files = self._generate_html_report(result_directory,
                                                       obj_ref)

        expression_object = self.ws.get_objects2(
            {'objects': [{
                'ref': obj_ref
            }]})['data'][0]
        expression_info = expression_object['info']
        expression_data = expression_object['data']

        expression_object_type = expression_info[2]
        if re.match(r'KBaseRNASeq.RNASeqExpression-\d+.\d+',
                    expression_object_type):
            objects_created = [{
                'ref': obj_ref,
                'description': 'Expression generated by StringTie'
            }]
        elif re.match(r'KBaseSets.ExpressionSet-\d+.\d+',
                      expression_object_type):
            objects_created = [{
                'ref': obj_ref,
                'description': 'ExpressionSet generated by StringTie'
            }]
            items = expression_data['items']
            for item in items:
                objects_created.append({
                    'ref': item['ref'],
                    'description': 'Expression generated by StringTie'
                })
            objects_created.append({
                'ref': exprMatrix_FPKM_ref,
                'description': 'FPKM ExpressionMatrix generated by StringTie'
            })
            objects_created.append({
                'ref': exprMatrix_TPM_ref,
                'description': 'TPM ExpressionMatrix generated by StringTie'
            })

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'file_links': output_files,
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 366,
            'report_object_name': 'kb_stringtie_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _process_alignment_object(self, params):
        """
        _process_alignment_object: process KBaseRNASeq.RNASeqAlignment type input object
        """

        try:

            log('start processing RNASeqAlignment object\n')
            log('params:\n{}'.format(json.dumps(params, indent=1)))
            alignment_ref = params.get('alignment_ref')

            alignment_set_object = self.ws.get_objects2(
                {'objects': [{
                    'ref': alignment_ref
                }]})['data'][0]

            alignment_info = alignment_set_object['info']
            alignment_data = alignment_set_object['data']

            alignment_name = alignment_info[1]
            alignment_label = alignment_data['condition']

            result_directory = os.path.join(
                self.scratch, alignment_name + '_' + str(uuid.uuid4()))
            self._mkdir_p(result_directory)

            # input files
            params['input_file'] = self._get_input_file(alignment_ref)
            if not params.get('gtf_file'):
                params['gtf_file'] = self._get_gtf_file(
                    alignment_ref, result_directory)
            else:
                shutil.copy(params.get('gtf_file'), result_directory)
            log('using {} as reference annotation file.'.format(
                params.get('gtf_file')))

            # output files
            self.output_transcripts = 'transcripts.gtf'
            params['output_transcripts'] = os.path.join(
                result_directory, self.output_transcripts)

            self.gene_abundances_file = 'genes.fpkm_tracking'
            params['gene_abundances_file'] = os.path.join(
                result_directory, self.gene_abundances_file)

            command = self._generate_command(params)
            self._run_command(command)

            if params.get('exchange_gene_ids'):
                self._exchange_gene_ids(result_directory)

            if ('generate_ws_object' in params
                    and not params.get('generate_ws_object')):
                log('skip generating expression object')
                expression_obj_ref = ''
            else:
                expression_obj_ref = self._save_expression(
                    result_directory, alignment_ref,
                    params.get('workspace_name'), params['gtf_file'],
                    params['expression_suffix'])

            returnVal = {
                'result_directory': result_directory,
                'expression_obj_ref': expression_obj_ref,
                'alignment_ref': alignment_ref,
                'annotation_file': params['gtf_file'],
                'alignment_label': alignment_label
            }
        except Exception:
            log('caught exception in worker')
            exctype, value = sys.exc_info()[:2]

            # return the exception info instead of raising so the
            # multiprocessing pool in _process_alignment_set_object can
            # collect and report it
            returnVal = {'exception': '{}: {}'.format(exctype, value)}

        return returnVal

    def _process_alignment_set_object(self, params):
        """
        _process_alignment_set_object: process KBaseRNASeq.RNASeqAlignmentSet type input object
        """

        log('start processing AlignmentSet object\nparams:\n{}'.format(
            json.dumps(params, indent=1)))

        alignment_set_ref = params.get('alignment_set_ref')
        alignment_set_object = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_set_ref
            }]})['data'][0]

        alignment_set_info = alignment_set_object['info']
        alignment_set_data = alignment_set_object['data']

        alignment_set_type = alignment_set_info[2]

        mul_processor_params = []
        if re.match(r'KBaseRNASeq.RNASeqAlignmentSet-\d+\.\d+',
                    alignment_set_type):
            mapped_alignment_ids = alignment_set_data['mapped_alignments_ids']
            for i in mapped_alignment_ids:
                for sample_name, alignment_id in i.items():
                    alignment_upload_params = params.copy()
                    alignment_upload_params['alignment_ref'] = alignment_id
                    mul_processor_params.append(alignment_upload_params)
        elif re.match(r'KBaseSets.ReadsAlignmentSet-\d+\.\d+', alignment_set_type):
            items = alignment_set_data['items']
            for item in items:
                alignment_ref = item['ref']
                alignment_upload_params = params.copy()
                alignment_upload_params['alignment_ref'] = alignment_ref
                mul_processor_params.append(alignment_upload_params)

        cpus = min(params.get('num_threads', 1), multiprocessing.cpu_count())
        pool = Pool(ncpus=cpus)
        log('running _process_alignment_object with {} cpus'.format(cpus))
        alignment_expression_map = pool.map(self._process_alignment_object,
                                            mul_processor_params)

        for proc_alignment_return in alignment_expression_map:
            if 'exception' in proc_alignment_return:
                error_msg = 'Caught exception in worker\n'
                error_msg += 'Exception: {}'.format(
                    proc_alignment_return['exception'])
                raise ValueError(error_msg)

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        for proc_alignment_return in alignment_expression_map:
            alignment_ref = proc_alignment_return.get('alignment_ref')
            alignment_name = self.ws.get_object_info(
                [{"ref": alignment_ref}], includeMetadata=None)[0][1]
            self._run_command('cp -R {} {}'.format(
                proc_alignment_return.get('result_directory'),
                os.path.join(result_directory, alignment_name)))
        if ('generate_ws_object' in params
                and not params.get('generate_ws_object')):
            log('skip generating expression set object')
            expression_obj_ref = ''
            expression_matrix_refs = {}
        else:
            expression_obj_ref = self._save_expression_set(
                alignment_expression_map, alignment_set_ref,
                params.get('workspace_name'), params['expression_set_suffix'])
            expression_matrix_refs = self._save_expression_matrix(
                expression_obj_ref, params.get('workspace_name'))

        annotation_file_name = os.path.basename(
            alignment_expression_map[0]['annotation_file'])
        annotation_file_path = os.path.join(result_directory,
                                            os.listdir(result_directory)[0],
                                            annotation_file_name)

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_obj_ref,
            'annotation_file': annotation_file_path,
            'exprMatrix_FPKM_ref': expression_matrix_refs.get('exprMatrix_FPKM_ref'),
            'exprMatrix_TPM_ref': expression_matrix_refs.get('exprMatrix_TPM_ref')
        }

        return returnVal

    def _run_merge_option(self, result_directory, params, annotation_file):

        log('start running stringtie merge')
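        # The assembled command has (schematically) the form:
        #   <STRINGTIE_TOOLKIT_PATH>/stringtie --merge -G <annotation_file> \
        #       [mapped options] <dir_1>/transcripts.gtf <dir_2>/transcripts.gtf ...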

        result_dirs = os.listdir(result_directory)

        merge_directory = os.path.join(result_directory, 'merge_result')
        self._mkdir_p(merge_directory)

        option_params = params.copy()

        option_params.pop('num_threads', None)
        option_params.pop('ballgown_mode', None)
        option_params.pop('skip_reads_with_no_ref', None)
        option_params.pop('junction_coverage', None)
        option_params.pop('junction_base', None)
        option_params.pop('min_read_coverage', None)
        option_params.pop('min_locus_gap_sep_value', None)

        output_merge = 'stringtie_merge.gtf'
        option_params['output_transcripts'] = os.path.join(
            merge_directory, output_merge)

        command = self.STRINGTIE_TOOLKIT_PATH + '/stringtie '
        command += '--merge '
        command += '-G {} '.format(annotation_file)

        for key, option in self.OPTIONS_MAP.items():
            option_value = option_params.get(key)
            if key in self.BOOLEAN_OPTIONS and option_value:
                option_value = ' '
            if option_value:
                command += '{} {} '.format(option, option_value)

        for result_dir in result_dirs:
            gtf_file = os.path.join(result_directory, result_dir,
                                    'transcripts.gtf')
            command += '{} '.format(gtf_file)

        self._run_command(command)

    def _filter_merge_file(self, gtf_file):
        """
        _filter_merge_file: remove lines with no gene_name
        """

        log('start filtering merged gtf file')

        dir_name = os.path.dirname(gtf_file)

        filtered_file_name = 'filtered_stringtie_merge.gtf'
        filtered_file_path = os.path.join(dir_name, filtered_file_name)

        with open(filtered_file_path, 'w') as output_file:
            with open(gtf_file, 'r') as input_file:
                for line in input_file:
                    if line.startswith('#') or 'gene_name' in line:
                        output_file.write(line)
                    else:
                        log('skipping line:\n{}'.format(line))

        return filtered_file_path

    def _exchange_gene_ids(self, result_directory):
        """
        _exchange_gene_ids: exchange gene_ids with gene_name
        """

        log('starting exchanging gene_ids with gene_name')

        result_files = os.listdir(result_directory)

        for result_file_name in result_files:
            if result_file_name == 'transcripts.gtf':
                log('updating transcripts.gtf gene_ids')
                os.rename(
                    os.path.join(result_directory, 'transcripts.gtf'),
                    os.path.join(result_directory, 'original_transcripts.gtf'))
                original_transcript_path = os.path.join(
                    result_directory, 'original_transcripts.gtf')
                exchange_transcript_path = os.path.join(
                    result_directory, result_file_name)

                with open(exchange_transcript_path, 'w') as output_file:
                    with open(original_transcript_path, 'r') as input_file:
                        for line in input_file:
                            # prefer ref_gene_name over gene_name when both exist
                            gene_id = None
                            gene_name = None
                            if 'gene_id "' in line:
                                gene_id = line.split('gene_id "')[1].split('"')[0]
                            if 'ref_gene_name "' in line:
                                gene_name = line.split('ref_gene_name "')[1].split('"')[0]
                            elif 'gene_name "' in line:
                                gene_name = line.split('gene_name "')[1].split('"')[0]
                            if gene_id and gene_name:
                                line = line.replace(gene_id, gene_name)
                            output_file.write(line)
            elif result_file_name == 't_data.ctab':
                log('updating t_data.ctab gene_ids')
                os.rename(
                    os.path.join(result_directory, 't_data.ctab'),
                    os.path.join(result_directory, 'original_t_data.ctab'))
                original_tdata_path = os.path.join(result_directory,
                                                   'original_t_data.ctab')
                exchange_tdata_path = os.path.join(result_directory,
                                                   result_file_name)

                first_line = True
                with open(exchange_tdata_path, 'w') as output_file:
                    with open(original_tdata_path, 'r') as input_file:
                        for line in input_file:
                            if first_line:
                                gene_id_index = line.split('\t').index(
                                    'gene_id')
                                gene_name_index = line.split('\t').index(
                                    'gene_name')
                                first_line = False
                                output_file.write(line)
                            else:
                                line_list = line.split('\t')
                                # need strictly more columns than the largest
                                # index accessed below
                                if len(line_list) > max(gene_id_index,
                                                        gene_name_index):
                                    line_list[gene_id_index] = line_list[gene_name_index]
                                    line = '\t'.join(line_list)
                                output_file.write(line)

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.srv_wiz_url = config['srv-wiz-url']
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.eu = ExpressionUtils(self.callback_url, service_ver='dev')
        self.ws = Workspace(self.ws_url, token=self.token)
        self.set_client = SetAPI(self.srv_wiz_url)

    def run_stringtie_app(self, params):
        """
        run_stringtie_app: run StringTie app
        (http://ccb.jhu.edu/software/stringtie/index.shtml?t=manual)

        required params:
        alignment_object_ref: Alignment or AlignmentSet object reference
        workspace_name: the name of the workspace it gets saved to
        expression_set_suffix: suffix appended to the expression set object name
        expression_suffix: suffix appended to the expression object name
        mode: one of ['normal', 'merge', 'novel_isoform']

        optional params:
        num_threads: number of processing threads
        junction_base: minimum number of bases on each side of a junction that
                       spliced reads must cover for the junction to be kept
        junction_coverage: minimum junction coverage
        disable_trimming: disables trimming at the ends of the assembled transcripts
        min_locus_gap_sep_value: minimum locus gap separation value
        ballgown_mode: enables the output of Ballgown input table files
        skip_reads_with_no_ref: reads with no reference will be skipped
        maximum_fraction: maximum fraction of multiple-location-mapped reads
        label: prefix for the name of the output transcripts
        min_length: minimum length allowed for the predicted transcripts
        min_read_coverage: minimum input transcript coverage
        min_isoform_abundance: minimum isoform abundance

        return:
        result_directory: folder path that holds all files generated by run_stringtie_app
        expression_obj_ref: generated Expression/ExpressionSet object reference
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """
        log('--->\nrunning StringTieUtil.run_stringtie\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_stringtie_params(params)

        alignment_object_ref = params.get('alignment_object_ref')
        alignment_object_info = self.ws.get_object_info3(
            {"objects": [{"ref": alignment_object_ref}]})['infos'][0]
        alignment_object_type = alignment_object_info[2]

        if re.match(r'KBaseRNASeq.RNASeqAlignment-\d+\.\d+',
                    alignment_object_type):
            params.update({'alignment_ref': alignment_object_ref})
            returnVal = self._process_alignment_object(params)
            report_output = self._generate_report(
                returnVal.get('expression_obj_ref'),
                params.get('workspace_name'),
                returnVal.get('result_directory'))
            returnVal.update(report_output)
        elif (re.match(r'KBaseRNASeq.RNASeqAlignmentSet-\d+\.\d+',
                       alignment_object_type)
              or re.match(r'KBaseSets.ReadsAlignmentSet-\d+\.\d+',
                          alignment_object_type)):
            params.update({'alignment_set_ref': alignment_object_ref})
            if params.get('mode') in ['merge', 'novel_isoform']:

                log('running StringTie the 1st time')
                if params.get('mode') == 'novel_isoform':
                    params.update({'ballgown_mode': 0})
                    params.update({'skip_reads_with_no_ref': 0})
                elif params.get('mode') == 'merge':
                    params.update({'ballgown_mode': 1})
                    params.update({'skip_reads_with_no_ref': 1})

                params['generate_ws_object'] = False
                params['exchange_gene_ids'] = False
                returnVal = self._process_alignment_set_object(params)
                first_run_result_dir = returnVal.get('result_directory')
                annotation_file = returnVal['annotation_file']

                log('running StringTie merge')
                self._run_merge_option(first_run_result_dir, params,
                                       annotation_file)

                merge_file = os.path.join(first_run_result_dir, 'merge_result',
                                          'stringtie_merge.gtf')

                log('running StringTie again with the merged gtf')
                if params.get('mode') == 'novel_isoform':
                    params.update({'gtf_file': merge_file})
                elif params.get('mode') == 'merge':
                    filtered_merge_file = self._filter_merge_file(merge_file)
                    params.update({'gtf_file': filtered_merge_file})
                    params.update({'generate_ws_object': True})
                    params.update({'exchange_gene_ids': 1})

                params.update({'ballgown_mode': 1})
                params.update({'skip_reads_with_no_ref': 1})
                returnVal = self._process_alignment_set_object(params)

                self._run_command('cp -R {} {}'.format(
                    os.path.join(first_run_result_dir, 'merge_result'),
                    returnVal.get('result_directory')))

                if params.get('mode') == 'novel_isoform':
                    report_output = self._generate_merge_report(
                        params.get('workspace_name'),
                        returnVal.get('result_directory'))
                elif params.get('mode') == 'merge':
                    report_output = self._generate_report(
                        returnVal.get('expression_obj_ref'),
                        params.get('workspace_name'),
                        returnVal.get('result_directory'),
                        returnVal.get('exprMatrix_FPKM_ref'),
                        returnVal.get('exprMatrix_TPM_ref'))
            else:
                params.update({'ballgown_mode': 1})
                params.update({'skip_reads_with_no_ref': 1})
                params.update({'exchange_gene_ids': 0})

                returnVal = self._process_alignment_set_object(params)

                report_output = self._generate_report(
                    returnVal.get('expression_obj_ref'),
                    params.get('workspace_name'),
                    returnVal.get('result_directory'),
                    returnVal.get('exprMatrix_FPKM_ref'),
                    returnVal.get('exprMatrix_TPM_ref'))
            returnVal.update(report_output)
        else:
            error_msg = 'Invalid input object type\nObject info:\n{}'.format(
                alignment_object_info)
            raise ValueError(error_msg)

        return returnVal
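
# A minimal usage sketch (not part of the original source). It assumes the
# class above is StringTieUtil, as its log messages suggest, and that the code
# runs inside a KBase SDK job; every URL, token and object reference below is
# a hypothetical placeholder.
if __name__ == '__main__':
    config = {
        'workspace-url': 'https://kbase.us/services/ws',
        'SDK_CALLBACK_URL': os.environ.get('SDK_CALLBACK_URL', ''),
        'KB_AUTH_TOKEN': os.environ.get('KB_AUTH_TOKEN', ''),
        'shock-url': 'https://kbase.us/services/shock-api',
        'srv-wiz-url': 'https://kbase.us/services/service_wizard',
        'scratch': '/kb/module/work/tmp',
    }
    stringtie = StringTieUtil(config)
    result = stringtie.run_stringtie_app({
        'alignment_object_ref': '1/2/3',      # hypothetical ws/obj/ver ref
        'workspace_name': 'my_workspace',
        'expression_set_suffix': '_expression_set',
        'expression_suffix': '_expression',
        'mode': 'normal',
        'num_threads': 2,
    })
    print(result['report_name'], result['report_ref'])
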
Beispiel #16
0
class GenomeBrowserMaker:
    def __init__(self, callback_url, workspace_url, scratch_dir):
        self.callback_url = callback_url
        self.scratch_dir = scratch_dir
        self.workspace_url = workspace_url
        self.ws = Workspace(self.workspace_url)
        self.jbrowse_dir = os.path.abspath(
            os.path.join(os.sep, 'kb', 'module', 'jbrowse'))
        self.jbrowse_bin = os.path.join(self.jbrowse_dir, 'bin')
        self.out_dir = os.path.join(self.scratch_dir, 'browser_data')

    def _get_assembly_ref(self, genome_ref):
        # test if genome references an assembly type
        # do get_objects2 without data. get list of refs
        genome_obj_info = self.ws.get_objects2({
            'objects': [{
                'ref': genome_ref
            }],
            'no_data': 1
        })
        # get the list of genome refs from the returned info.
        # if there are no refs (or something funky with the return), this will be an empty list.
        # this WILL fail if data is an empty list. But it shouldn't be, and we know because
        # we have a real genome reference, or get_objects2 would fail.
        genome_obj_refs = genome_obj_info.get('data', [{}])[0].get('refs', [])

        # see which of those are of an appropriate type (ContigSet or Assembly), if any.
        assembly_ref = list()
        ref_params = [{'ref': x} for x in genome_obj_refs]
        ref_info = self.ws.get_object_info3({'objects': ref_params})
        for idx, info in enumerate(ref_info.get('infos')):
            if "KBaseGenomeAnnotations.Assembly" in info[
                    2] or "KBaseGenomes.ContigSet" in info[2]:
                assembly_ref.append(";".join(ref_info.get('paths')[idx]))

        return assembly_ref

    def create_browser_data_from_files(self, fasta_file, gff_file,
                                       alignment_files):
        """
        fasta_file = string, path to file
        gff_file = string, path to file
        alignment_files = dict,
            keys = alignment "name" (not necessarily file name),
            values = path to file
        """
        print('Starting create_browser_data_from_files')
        # STEP 1: run the refseq creation
        print('Converting FASTA to JBrowse refseq track...')
        refseq_cmd = [
            os.path.join(self.jbrowse_bin, 'prepare-refseqs.pl'), '--fasta',
            fasta_file, '--out', self.out_dir
        ]
        print('refseq command:')
        print(refseq_cmd)
        p = subprocess.Popen(refseq_cmd, shell=False)
        retcode = p.wait()
        if retcode != 0:
            raise RuntimeError(
                'Failed to build reference sequence track from FASTA file! Return code: {}'
                .format(retcode))
        print('Done creating refseq track!')

        # STEP 2: run the feature track creation
        print('Converting GFF to annotation track...')
        track_cmd = [
            os.path.join(self.jbrowse_bin, 'flatfile-to-json.pl'), '--gff',
            gff_file, '--trackLabel', 'FeatureAnnotations', '--trackType',
            'CanvasFeatures', '--out', self.out_dir
        ]
        print('track command:')
        print(track_cmd)
        p = subprocess.Popen(track_cmd, shell=False)
        retcode = p.wait()
        if retcode != 0:
            raise RuntimeError(
                'Failed to build feature annotation track from GFF file! Return code: {}'
                .format(retcode))
        print('Done creating annotation track!')

        # STEP 3: run the BAM track creation
        print('Converting BAM files to alignment tracks...')
        for alignment_name in alignment_files:
            print('Converting BAM file {}'.format(alignment_name))
            # 1. get the actual file name for the bam file
            align_file_fullpath = alignment_files[alignment_name]
            align_filename = os.path.basename(align_file_fullpath)
            # 2. move it (copy?) to the right location
            shutil.copy2(align_file_fullpath,
                         os.path.join(self.out_dir, align_filename))

            # 2.a. TODO: check the header inside the file that it's the correct chromosome...
            # 3. make a BAI file with samtools
            align_file_fullpath = os.path.join(self.out_dir, align_filename)
            sam_bai_cmd = [
                'samtools', 'index', align_file_fullpath,
                "{}.bai".format(align_file_fullpath)
            ]
            print("Building BAM index with command: {}".format(
                " ".join(sam_bai_cmd)))
            p = subprocess.Popen(sam_bai_cmd, shell=False)
            retcode = p.wait()
            if retcode != 0:
                raise RuntimeError(
                    'Failed to make index file from BAM file! Return code: {}'.
                    format(retcode))
            # 4. add the filename to the trackList with add-bam-track.pl
            track_cmd = [
                os.path.join(self.jbrowse_bin, 'add-bam-track.pl'), '--label',
                alignment_name, '--bam_url', align_filename, '--in',
                os.path.join(self.out_dir, 'trackList.json')
            ]
            p = subprocess.Popen(track_cmd, shell=False)
            retcode = p.wait()
            if retcode != 0:
                raise RuntimeError(
                    'Failed to build alignment track from BAM file! Return code: {}'
                    .format(retcode))
            print('Done creating alignment track {}'.format(alignment_name))
        print('Done creating all alignment tracks!')
        print('Done running create_browser_data_from_files')
        # Final step: return the directory where the JBrowse data lives.
        return {
            'gff_file': gff_file,
            'fasta_file': fasta_file,
            'data_dir': self.out_dir
        }

    def get_genome_data_files(self, genome_ref):
        genome_files = {"assembly": None, "gff": None}
        print('Fetching assembly or contig information from genome...')
        assembly_ref = self._get_assembly_ref(genome_ref)
        if len(assembly_ref) > 1:
            raise ValueError(
                'This genome, {}, appears to reference {} Assemblies or ContigSets, with these object references: {}'
                .format(genome_ref, len(assembly_ref), assembly_ref))
        elif len(assembly_ref) == 0:
            raise ValueError(
                'There was no Assembly or ContigSet found as a reference to this genome. Unable to build browser data.'
            )
        print('Done! Found valid assembly data.')

        print('Converting sequence data to FASTA file...')
        au = AssemblyUtil(self.callback_url)
        fasta_file = au.get_assembly_as_fasta({'ref': assembly_ref[0]})
        print('Done! FASTA file created: {}'.format(fasta_file))

        if "path" not in fasta_file:
            raise IOError(
                'FASTA file was not apparently generated from the given genome fasta_file. fasta_file object missing key "path": {}'
                .format(fasta_file))
        genome_files["assembly"] = fasta_file.get('path', None)

        print('Converting genome annotation data to gff file...')
        gfu = GenomeFileUtil(self.callback_url)
        gff_file = gfu.genome_to_gff({'genome_ref': genome_ref})
        print('Done! GFF file created: {}'.format(gff_file))
        if "file_path" not in gff_file:
            raise IOError(
                'GFF file was not apparently generated from the given genome. gff_file object missing key "file_path": {}'
                .format(gff_file))
        genome_files["gff"] = gff_file.get('file_path', None)

        return genome_files

    def get_alignment_data_files(self, alignment_refs):
        """
        Returns a dictionary of data files. Key = object name, value = path to the file.
        """
        alignment_files = dict()
        ru = ReadsAlignmentUtils(self.callback_url, service_ver='dev')
        for ref in alignment_refs:
            ref_name = get_object_name(ref, self.workspace_url)
            align_file = ru.download_alignment({
                "source_ref": ref,
                "downloadBAI": 0
            })
            for f in os.listdir(align_file["destination_dir"]):
                if f.endswith("bam"):
                    alignment_files[ref_name] = os.path.join(
                        align_file["destination_dir"], f)
        return alignment_files

    def get_browser_data_files(self, genome_ref=None, alignment_refs=None):
        """
        if genome_ref is not None, this figures out the assembly ref, then returns paths to
        both the GFF file and FASTA for the assembly.

        if the alignment_refs list is not None, it fetches the BAM files from each alignment in
        the list.

        Returns a dictionary of the following:
        {
            assembly: string (file path) or None,
            gff: string (file path) or None,
            alignments: {
                name1 (from alignment obj ref): path1,
                name2: path2,
                ...etc
            }
        }

        if genome_ref and alignment_refs are both none, this doesn't do much, and returns a boring
        empty dict.
        """
        files = dict()
        if genome_ref is not None:
            files.update(self.get_genome_data_files(genome_ref))

        files["alignment_refs"] = dict()
        if alignment_refs is not None:
            files["alignment_refs"] = self.get_alignment_data_files(
                alignment_refs)
        return files

    def create_browser_data(self, genome_ref, alignment_refs=None):
        # parameter checking
        # make sure that genome_ref is an object reference, and its a genome, and we have
        # permission to it.
        if not genome_ref:
            raise ValueError('genome_ref parameter is required')
        if not check_reference(genome_ref):
            raise ValueError(
                'genome_ref must be a reference of the format ws/oid or ws/oid/ver'
            )
        if not check_reference_type(genome_ref, ['.Genome'],
                                    self.workspace_url):
            raise ValueError(
                'genome_ref must reference a KBaseGenomes.Genome object')
        if alignment_refs is not None:
            for ref in alignment_refs:
                if not check_reference(ref):
                    raise ValueError(
                        'all alignment_refs must be a reference of the format ws/oid or ws/oid/ver'
                    )

        files = self.get_browser_data_files(genome_ref=genome_ref,
                                            alignment_refs=alignment_refs)
        return self.create_browser_data_from_files(
            files.get("assembly", None), files.get("gff", None),
            files.get("alignment_refs", dict()))

    def package_jbrowse_data(self, data_dir, output_dir):
        """
        Packages the necessary parts of JBrowse (in the standard location:
        /kb/module/jbrowse) along with the data_dir into output_dir.
        output_dir should be an ABSOLUTE path. (e.g. /kb/module/work/tmp/minimal_jbrowse)

        NOTE: this is the big piece necessary for JBrowse compatibility with specific
        versions. It's currently made for JBrowse 1.12.3.
        """
        # mkdir output_dir
        os.makedirs(output_dir)

        # cp pieces of the JBrowse dir into output_dir
        dirs_from_jbrowse = ['browser', 'css', 'img', 'plugins', 'src']
        files_from_jbrowse = [
            'bower.json', 'compat_121.html', 'index.html', 'jbrowse_conf.json',
            'jbrowse.conf', 'LICENSE', 'MYMETA.json', 'MYMETA.yml',
            'package.json'
        ]
        for d in dirs_from_jbrowse:
            jb_path = os.path.join(self.jbrowse_dir, d)
            shutil.copytree(jb_path, os.path.join(output_dir, d))
        for f in files_from_jbrowse:
            jb_path = os.path.join(self.jbrowse_dir, f)
            shutil.copy2(jb_path, os.path.join(output_dir, f))

        # get rid of the src/util tree
        shutil.rmtree(os.path.join(output_dir, 'src', 'util'))

        # mv data_dir -> output_dir/data
        shutil.move(data_dir, os.path.join(output_dir, 'data'))
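

# A minimal usage sketch (not part of the original source). It assumes a KBase
# SDK runtime; the URLs and object references below are hypothetical
# placeholders.
def _example_make_browser():
    maker = GenomeBrowserMaker(callback_url='http://localhost:5000',
                               workspace_url='https://kbase.us/services/ws',
                               scratch_dir='/kb/module/work/tmp')
    browser = maker.create_browser_data('1/2/3',          # genome ref
                                        alignment_refs=['1/4/1'])
    maker.package_jbrowse_data(browser['data_dir'],
                               '/kb/module/work/tmp/minimal_jbrowse')

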
class CufflinksUtils:
    CUFFLINKS_TOOLKIT_PATH = '/opt/cufflinks/'
    GFFREAD_TOOLKIT_PATH = '/opt/cufflinks/'

    def __init__(self, config):
        """

        :param config:
        :param logger:
        :param directory: Working directory
        :param urls: Service urls
        """
        # BEGIN_CONSTRUCTOR
        self.ws_url = config["workspace-url"]
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.srv_wiz_url = config['srv-wiz-url']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)
        self.set_api = SetAPI(self.srv_wiz_url, service_ver='dev')
        self.eu = ExpressionUtils(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)

        self.scratch = os.path.join(config['scratch'], str(uuid.uuid4()))
        self._mkdir_p(self.scratch)

        self.tool_used = "Cufflinks"
        self.tool_version = os.environ['VERSION']
        # END_CONSTRUCTOR

    def parse_FPKMtracking_calc_TPM(self, filename):
        """
        Generates log2-normalized FPKM and TPM expression dicts from the FPKM
        column of a Cufflinks genes.fpkm_tracking file.
        :param filename: path to a genes.fpkm_tracking file
        :return: (fpkm_dict, tpm_dict), both keyed by gene id
        """
        fpkm_dict = {}
        tpm_dict = {}
        gene_col = 0
        fpkm_col = 9
        sum_fpkm = 0.0
        with open(filename) as f:
            next(f)
            for line in f:
                larr = line.split("\t")
                gene_id = larr[gene_col]
                if gene_id != "":
                    fpkm = float(larr[fpkm_col])
                    sum_fpkm = sum_fpkm + fpkm
                    fpkm_dict[gene_id] = math.log(fpkm + 1, 2)
                    tpm_dict[gene_id] = fpkm

        if sum_fpkm == 0.0:
            log("Warning: Unable to calculate TPM values as sum of FPKM values is 0")
        else:
            for g in tpm_dict:
                tpm_dict[g] = math.log((tpm_dict[g] / sum_fpkm) * 1e6 + 1, 2)

        return fpkm_dict, tpm_dict

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_run_cufflinks_params(self, params):
        """
        _validate_run_cufflinks_params:
                Raises an exception if params are invalid
        """

        log('Start validating run_cufflinks params')

        # check for required parameters
        for p in ['alignment_object_ref', 'workspace_name', 'genome_ref']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """

        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if (exitCode == 0):
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)

            raise ValueError(error_msg)

    def _run_gffread(self, gff_path, gtf_path):
        """
        _run_gffread: run gffread script

        ref: http://cole-trapnell-lab.github.io/cufflinks/file_formats/#the-gffread-utility
        """
        log('converting gff to gtf')
        command = self.GFFREAD_TOOLKIT_PATH + '/gffread '
        command += "-E {0} -T -o {1}".format(gff_path, gtf_path)

        self._run_command(command)

    def _create_gtf_annotation_from_genome(self, genome_ref):
        """
         Create reference annotation file from genome
        """
        ref = self.ws.get_object_subset([{
            'ref': genome_ref,
            'included': ['contigset_ref', 'assembly_ref']
        }])
        contig_id = None
        if 'contigset_ref' in ref[0]['data']:
            contig_id = ref[0]['data']['contigset_ref']
        elif 'assembly_ref' in ref[0]['data']:
            contig_id = ref[0]['data']['assembly_ref']
        if contig_id is None:
            raise ValueError(
                "Genome at {0} does not have reference to the assembly object".format(
                    genome_ref))
        log('contig/assembly reference: {}'.format(contig_id))
        log("Generating GFF file from Genome")
        try:
            ret = self.au.get_assembly_as_fasta({'ref': contig_id})
            output_file = ret['path']
            mapping_filename = c_mapping.create_sanitized_contig_ids(
                output_file)
            os.remove(output_file)
            # get the GFF
            ret = self.gfu.genome_to_gff({'genome_ref': genome_ref})
            genome_gff_file = ret['file_path']
            c_mapping.replace_gff_contig_ids(genome_gff_file,
                                             mapping_filename,
                                             to_modified=True)
            gtf_ext = ".gtf"

            if not genome_gff_file.endswith(gtf_ext):
                gtf_path = os.path.splitext(genome_gff_file)[0] + '.gtf'
                self._run_gffread(genome_gff_file, gtf_path)
            else:
                gtf_path = genome_gff_file

            log("gtf file : " + gtf_path)
        except Exception:
            raise ValueError(
                "Generating GTF file from Genome Annotation object failed: {}".format(
                    "".join(traceback.format_exc())))
        return gtf_path

    def _get_gtf_file(self, alignment_ref):
        """
        _get_gtf_file: get the reference annotation file (in GTF or GFF3 format)
        """
        result_directory = self.scratch
        alignment_data = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_ref
            }]})['data'][0]['data']

        genome_ref = alignment_data.get('genome_id')

        genome_data = self.ws.get_objects2({'objects': [{
            'ref': genome_ref
        }]})['data'][0]['data']

        gff_handle_ref = genome_data.get('gff_handle_ref')

        if gff_handle_ref:
            log('getting reference annotation file from genome')
            annotation_file = self.dfu.shock_to_file({
                'handle_id': gff_handle_ref,
                'file_path': result_directory,
                'unpack': 'unpack'
            })['file_path']
        else:
            annotation_file = self._create_gtf_annotation_from_genome(
                genome_ref)

        return annotation_file

    def _get_gtf_file_from_genome_ref(self, genome_ref):
        """
        _get_gtf_file_from_genome_ref: get the reference annotation file (in GTF or GFF3 format)
        """
        result_directory = self.scratch

        genome_data = self.ws.get_objects2({'objects': [{
            'ref': genome_ref
        }]})['data'][0]['data']

        gff_handle_ref = genome_data.get('gff_handle_ref')

        if gff_handle_ref:
            log('getting reference annotation file from genome')
            annotation_file = self.dfu.shock_to_file({
                'handle_id': gff_handle_ref,
                'file_path': result_directory,
                'unpack': 'unpack'
            })['file_path']
        else:
            annotation_file = self._create_gtf_annotation_from_genome(
                genome_ref)

        return annotation_file

    def _get_input_file(self, alignment_ref):
        """
        _get_input_file: get input BAM file from Alignment object
        """

        bam_file_dir = self.rau.download_alignment(
            {'source_ref': alignment_ref})['destination_dir']

        files = os.listdir(bam_file_dir)
        bam_file_list = [
            file for file in files if re.match(r'.*\_sorted\.bam', file)
        ]
        if not bam_file_list:
            bam_file_list = [
                file for file in files if re.match(r'.*(?<!sorted)\.bam', file)
            ]

        if not bam_file_list:
            raise ValueError('Cannot find .bam file from alignment {}'.format(
                alignment_ref))

        bam_file_name = bam_file_list[0]

        bam_file = os.path.join(bam_file_dir, bam_file_name)

        return bam_file

    def _generate_command(self, params):
        """
        _generate_command: generate cufflinks command
        """
        cufflinks_command = '/opt/cufflinks/cufflinks'
        cufflinks_command += (' -q --no-update-check -p ' +
                              str(params.get('num_threads', 1)))
        if params.get('max_intron_length') is not None:
            cufflinks_command += (' --max-intron-length ' +
                                  str(params['max_intron_length']))
        if params.get('min_intron_length') is not None:
            cufflinks_command += (' --min-intron-length ' +
                                  str(params['min_intron_length']))
        if params.get('overhang_tolerance') is not None:
            cufflinks_command += (' --overhang-tolerance ' +
                                  str(params['overhang_tolerance']))

        cufflinks_command += " -o {0} -G {1} {2}".format(
            params['result_directory'], params['gtf_file'],
            params['input_file'])

        log('Generated cufflinks command: {}'.format(cufflinks_command))

        return cufflinks_command

    def _process_rnaseq_alignment_object(self, params):
        """
        _process_rnaseq_alignment_object: process KBaseRNASeq.RNASeqAlignment type input object
        """
        log('start processing RNASeqAlignment object\nparams:\n{}'.format(
            json.dumps(params, indent=1)))
        alignment_ref = params.get('alignment_ref')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        params['result_directory'] = str(result_directory)

        # input files
        params['input_file'] = self._get_input_file(alignment_ref)
        if not params.get('gtf_file'):
            params['gtf_file'] = self._get_gtf_file(alignment_ref)

        if '/' not in params['genome_ref']:
            params['genome_ref'] = (params['workspace_name'] + '/' +
                                    params['genome_ref'])

        command = self._generate_command(params)
        self._run_command(command)

        expression_obj_ref = self._save_rnaseq_expression(
            result_directory, alignment_ref, params.get('workspace_name'),
            params.get('genome_ref'), params['gtf_file'],
            params['expression_suffix'])

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_obj_ref,
            'alignment_ref': alignment_ref
        }

        expression_name = self.ws.get_object_info(
            [{"ref": expression_obj_ref}], includeMetadata=None)[0][1]

        widget_params = {
            "output": expression_name,
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal

    def _process_kbasesets_alignment_object(self, params):
        """
        _process_kbasesets_alignment_object: process an alignment input object for
        KBaseSets-style expression saving
        """
        log('start processing KBaseSets object\nparams:\n{}'.format(
            json.dumps(params, indent=1)))
        alignment_ref = params.get('alignment_ref')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        params['result_directory'] = str(result_directory)

        # input files
        params['input_file'] = self._get_input_file(alignment_ref)
        if not params.get('gtf_file'):
            params['gtf_file'] = self._get_gtf_file(alignment_ref)

        command = self._generate_command(params)
        self._run_command(command)

        expression_obj_ref = self._save_kbasesets_expression(
            result_directory, alignment_ref, params.get('workspace_name'),
            params.get('genome_ref'), params.get('gtf_file'),
            params.get('expression_suffix'))

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_obj_ref,
            'alignment_ref': alignment_ref
        }

        expression_name = self.ws.get_object_info(
            [{"ref": expression_obj_ref}], includeMetadata=None)[0][1]

        widget_params = {
            "output": expression_name,
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal

    def _generate_html_report(self, result_directory, obj_ref):
        """
        _generate_html_report: generate html summary report
        """
        log('Start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        expression_object = self.ws.get_objects2(
            {'objects': [{
                'ref': obj_ref
            }]})['data'][0]

        expression_object_type = expression_object.get('info')[2]

        Overview_Content = ''
        if re.match(r'KBaseRNASeq.RNASeqExpression-\d+\.\d+',
                    expression_object_type):
            Overview_Content += '<p>Generated Expression Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
        elif re.match(r'KBaseRNASeq.RNASeqExpressionSet-\d+\.\d+',
                      expression_object_type):
            Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
            Overview_Content += '<br><p>Generated Expression Object:</p>'
            for expression_ref in expression_object['data']['sample_expression_ids']:
                expression_name = self.ws.get_object_info(
                    [{"ref": expression_ref}], includeMetadata=None)[0][1]
                Overview_Content += '<p>{}</p>'.format(expression_name)
        elif re.match(r'KBaseSets.ExpressionSet-\d+\.\d+', expression_object_type):
            Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
            Overview_Content += '<br><p>Generated Expression Object:</p>'
            for expression_ref in expression_object['data']['items']:
                expression_name = self.ws.get_object_info(
                    [{"ref": expression_ref['ref']}], includeMetadata=None)[0][1]
                condition = expression_ref['label']
                Overview_Content += '<p>condition:{0}; expression_name: {1}</p>'.format(
                    condition, expression_name)

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__),
                                   'report_template.html'), 'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', Overview_Content)
                result_file.write(report_template)

        html_report.append({
            'path': result_file_path,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report for Cufflinks App'
        })
        return html_report

    def _save_rnaseq_expression(self, result_directory, alignment_ref,
                                workspace_name, genome_ref, gtf_file,
                                expression_suffix):
        """
        _save_rnaseq_expression: save Expression object to workspace
        """
        log('start saving Expression object')
        alignment_object_name = self.ws.get_object_info(
            [{"ref": alignment_ref}], includeMetadata=None)[0][1]

        # set expression name
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_ref = self.eu.upload_expression({
            'destination_ref': workspace_name + '/' + expression_name,
            'source_dir': result_directory,
            'alignment_ref': alignment_ref,
            'tool_used': self.tool_used,
            'tool_version': self.tool_version
        })['obj_ref']

        return expression_ref

    def _save_kbasesets_expression(self, result_directory, alignment_ref,
                                   workspace_name, genome_ref, gtf_file,
                                   expression_suffix):
        """
        _save_kbasesets_expression: save Expression object to workspace using ExpressionUtils
        and SetAPI
        """
        log('start saving Expression object')

        alignment_info = self.ws.get_object_info3(
            {'objects': [{"ref": alignment_ref}]})
        alignment_object_name = alignment_info['infos'][0][1]

        # set expression name
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_ref = self.eu.upload_expression({
            'destination_ref': workspace_name + '/' + expression_name,
            'source_dir': result_directory,
            'alignment_ref': alignment_ref,
            'tool_used': self.tool_used,
            'tool_version': self.tool_version
        })['obj_ref']

        return expression_ref

    def _save_rnaseq_expression_set(self, alignment_expression_map,
                                    alignment_set_ref, workspace_name,
                                    expression_set_name):
        """
        _save_rnaseq_expression_set: save ExpressionSet object to workspace
        """
        log('start saving ExpressionSet object')
        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_set_data = self._generate_expression_set_data(
            alignment_expression_map, alignment_set_ref, expression_set_name)

        object_type = 'KBaseRNASeq.RNASeqExpressionSet'
        save_object_params = {
            'id': workspace_id,
            'objects': [{
                'type': object_type,
                'data': expression_set_data,
                'name': expression_set_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        expression_set_ref = '{}/{}/{}'.format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        return expression_set_ref

    def _save_kbasesets_expression_set(self, alignment_expression_map,
                                       alignment_set_ref, workspace_name,
                                       expression_set_name):
        """
        _save_kbasesets_expression_set: save ExpressionSet object to workspace
        """
        log('start saving ExpressionSet object')
        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_set_data = self._generate_expression_set_data(
            alignment_expression_map, alignment_set_ref, expression_set_name)

        object_type = 'KBaseRNASeq.RNASeqExpressionSet'
        save_object_params = {
            'id': workspace_id,
            'objects': [{
                'type': object_type,
                'data': expression_set_data,
                'name': expression_set_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        expression_set_ref = '{}/{}/{}'.format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        return expression_set_ref

    def _generate_report(self,
                         obj_ref,
                         workspace_name,
                         result_directory,
                         exprMatrix_FPKM_ref=None,
                         exprMatrix_TPM_ref=None):
        """
        _generate_report: generate summary report
        """

        log('creating report')

        output_files = self._generate_output_file_list(result_directory)
        output_html_files = self._generate_html_report(result_directory,
                                                       obj_ref)

        expression_object = self.ws.get_objects2(
            {'objects': [{
                'ref': obj_ref
            }]})['data'][0]
        expression_info = expression_object['info']
        expression_data = expression_object['data']

        expression_object_type = expression_info[2]
        if re.match(r'KBaseRNASeq.RNASeqExpression-\d+\.\d+',
                    expression_object_type):
            objects_created = [{
                'ref': obj_ref,
                'description': 'Expression generated by Cufflinks'
            }]
        elif re.match(r'KBaseRNASeq.RNASeqExpressionSet-\d+\.\d+',
                      expression_object_type):
            objects_created = [{
                'ref': obj_ref,
                'description': 'Expression generated by Cufflinks'
            }]
        elif re.match(r'KBaseSets.ExpressionSet-\d+\.\d+',
                      expression_object_type):
            objects_created = [{
                'ref': obj_ref,
                'description': 'ExpressionSet generated by Cufflinks'
            }]
            items = expression_data['items']
            for item in items:
                objects_created.append({
                    'ref': item['ref'],
                    'description': 'Expression generated by Cufflinks'
                })
            objects_created.append({
                'ref': exprMatrix_FPKM_ref,
                'description': 'FPKM ExpressionMatrix generated by Cufflinks'
            })
            objects_created.append({
                'ref': exprMatrix_TPM_ref,
                'description': 'TPM ExpressionMatrix generated by Cufflinks'
            })
        else:
            # guard against unbound objects_created for unexpected types
            objects_created = []

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'file_links': output_files,
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 366,
            'report_object_name': 'kb_cufflinks_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _parse_FPKMtracking(self, filename, metric):
        result = {}
        pos1 = 0
        if metric == 'FPKM':
            pos2 = 7
        elif metric == 'TPM':
            pos2 = 8
        else:
            raise ValueError('Unsupported metric: {}'.format(metric))

        with open(filename) as f:
            next(f)
            for line in f:
                larr = line.split("\t")
                if larr[pos1] != "":
                    try:
                        result[larr[pos1]] = math.log(float(larr[pos2]) + 1, 2)
                    except ValueError:
                        result[larr[pos1]] = math.log(1, 2)

        return result

    def _generate_output_file_list(self, result_directory):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """
        log('Start packing result files')
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'cufflinks_result.zip')

        with zipfile.ZipFile(result_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if not file.endswith('.DS_Store'):
                        zip_file.write(
                            os.path.join(root, file),
                            os.path.join(os.path.basename(root), file))

        output_files.append({
            'path': result_file,
            'name': os.path.basename(result_file),
            'label': os.path.basename(result_file),
            'description': 'File(s) generated by Cufflinks App'
        })

        return output_files

    def _generate_expression_data(self, result_directory, alignment_ref,
                                  gtf_file, workspace_name, expression_suffix):
        """
        _generate_expression_data: generate Expression object with cufflinks output files
        """
        alignment_data_object = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_ref
            }]})['data'][0]

        # set expression name
        alignment_object_name = alignment_data_object['info'][1]
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_data = {
            'id': expression_name,
            'type': 'RNA-Seq',
            'numerical_interpretation': 'FPKM',
            'processing_comments': 'log2 Normalized',
            'tool_used': self.tool_used,
            'tool_version': self.tool_version
        }
        alignment_data = alignment_data_object['data']

        condition = alignment_data.get('condition')
        expression_data.update({'condition': condition})

        genome_id = alignment_data.get('genome_id')
        expression_data.update({'genome_id': genome_id})

        read_sample_id = alignment_data.get('read_sample_id')
        expression_data.update(
            {'mapped_rnaseq_alignment': {
                read_sample_id: alignment_ref
            }})

        exp_dict, tpm_exp_dict = self.parse_FPKMtracking_calc_TPM(
            os.path.join(result_directory, 'genes.fpkm_tracking'))

        expression_data.update({'expression_levels': exp_dict})

        expression_data.update({'tpm_expression_levels': tpm_exp_dict})

        handle = self.dfu.file_to_shock({
            'file_path': result_directory,
            'pack': 'zip',
            'make_handle': True
        })['handle']
        expression_data.update({'file': handle})

        return expression_data

    def _generate_expression_set_data(self, alignment_expression_map,
                                      alignment_set_ref, expression_set_name):
        """
        _generate_expression_set_data: generate ExpressionSet object with cufflinks output files
        """
        alignment_set_data_object = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_set_ref
            }]})['data'][0]

        alignment_set_data = alignment_set_data_object['data']

        expression_set_data = {
            'tool_used': self.tool_used,
            'tool_version': self.tool_version,
            'id': expression_set_name,
            'alignmentSet_id': alignment_set_ref,
            'genome_id': alignment_set_data.get('genome_id'),
            'sampleset_id': alignment_set_data.get('sampleset_id')
        }

        sample_expression_ids = []
        mapped_expression_objects = []
        mapped_expression_ids = []

        for alignment_expression in alignment_expression_map:
            alignment_ref = alignment_expression.get('alignment_ref')
            expression_ref = alignment_expression.get('expression_obj_ref')
            sample_expression_ids.append(expression_ref)
            mapped_expression_ids.append({alignment_ref: expression_ref})
            alignment_name = self.ws.get_object_info(
                [{
                    "ref": alignment_ref
                }], includeMetadata=None)[0][1]
            expression_name = self.ws.get_object_info(
                [{
                    "ref": expression_ref
                }], includeMetadata=None)[0][1]
            mapped_expression_objects.append({alignment_name: expression_name})

        expression_set_data['sample_expression_ids'] = sample_expression_ids
        expression_set_data[
            'mapped_expression_objects'] = mapped_expression_objects
        expression_set_data['mapped_expression_ids'] = mapped_expression_ids

        return expression_set_data

    def _process_alignment_set_object(self, params, alignment_object_type):
        """
        _process_alignment_set_object: process KBaseRNASeq.RNASeqAlignmentSet type input object
                                        and KBaseSets.ReadsAlignmentSet type object
        """
        log('start processing KBaseRNASeq.RNASeqAlignmentSet object or KBaseSets.ReadsAlignmentSet object'
            '\nparams:\n{}'.format(json.dumps(params, indent=1)))

        alignment_set_ref = params.get('alignment_set_ref')

        if re.match(r'^KBaseRNASeq.RNASeqAlignmentSet-\d*',
                    alignment_object_type):
            params['gtf_file'] = self._get_gtf_file(alignment_set_ref)
        else:
            if '/' not in params['genome_ref']:
                params['genome_ref'] = params['workspace_name'] + '/' + params[
                    'genome_ref']

            params['gtf_file'] = self._get_gtf_file_from_genome_ref(
                params['genome_ref'])

        alignment_set = self.set_api.get_reads_alignment_set_v1({
            'ref':
            alignment_set_ref,
            'include_item_info':
            0,
            'include_set_item_ref_paths':
            1
        })
        mul_processor_params = []
        for alignment in alignment_set["data"]["items"]:
            alignment_ref = alignment['ref_path']
            alignment_upload_params = params.copy()
            alignment_upload_params['alignment_ref'] = alignment_ref
            mul_processor_params.append(alignment_upload_params)
            # use the following when you want to run the cmd sequentially
            # self._process_kbasesets_alignment_object(mul_processor_params[0])

        cpus = min(params.get('num_threads', multiprocessing.cpu_count()),
                   multiprocessing.cpu_count())
        pool = Pool(ncpus=cpus)
        log('running _process_alignment_object with {} cpus'.format(cpus))
        alignment_expression_map = pool.map(
            self._process_kbasesets_alignment_object, mul_processor_params)

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        expression_items = list()
        for proc_alignment_return in alignment_expression_map:
            expression_obj_ref = proc_alignment_return.get(
                'expression_obj_ref')
            alignment_ref = proc_alignment_return.get('alignment_ref')
            alignment_info = self.ws.get_object_info3({
                'objects': [{
                    "ref": alignment_ref
                }],
                'includeMetadata':
                1
            })
            condition = alignment_info['infos'][0][10]['condition']
            expression_items.append({
                "ref": expression_obj_ref,
                "label": condition,
            })
            expression_name = self.ws.get_object_info(
                [{
                    "ref": expression_obj_ref
                }], includeMetadata=None)[0][1]
            self._run_command('cp -R {} {}'.format(
                proc_alignment_return.get('result_directory'),
                os.path.join(result_directory, expression_name)))

        expression_set = {
            "description": "generated by kb_cufflinks",
            "items": expression_items
        }

        expression_set_info = self.set_api.save_expression_set_v1({
            "workspace":
            params['workspace_name'],
            "output_object_name":
            params['expression_set_name'],
            "data":
            expression_set
        })

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_set_info['set_ref']
        }

        widget_params = {
            "output": params.get('expression_set_name'),
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal
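
The fan-out above follows a simple pattern: copy the shared params once per alignment, then map a worker over the copies. A sketch with the standard library (the code above appears to use a pathos-style Pool, given the ncpus keyword; the worker and names here are hypothetical):

import multiprocessing

def _worker(item_params):
    # Stand-in for _process_kbasesets_alignment_object.
    return {'alignment_ref': item_params['alignment_ref']}

def fan_out(base_params, alignment_refs, requested_threads=4):
    per_item = []
    for ref in alignment_refs:
        item_params = dict(base_params)  # per-item copy of shared params
        item_params['alignment_ref'] = ref
        per_item.append(item_params)
    cpus = min(requested_threads, multiprocessing.cpu_count())
    with multiprocessing.Pool(processes=cpus) as pool:
        return pool.map(_worker, per_item)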

    def _generate_output_object_name(self, params, alignment_object_type,
                                     alignment_object_name):
        """
        Generates the output object name based on input object type and name and stores it in
        params with key equal to 'expression' or 'expression_set' based on whether the input
        object is an alignment or alignment_set.

        :param params: module input params
        :param alignment_object_type: input alignment object type
        :param alignment_object_name: input alignment object name
        :param alignment_object_data: input alignment object data
        """
        expression_set_suffix = params['expression_set_suffix']
        expression_suffix = params['expression_suffix']

        if re.match(r'^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type):
            if re.match('.*_[Aa]lignment$', alignment_object_name):
                params['expression_name'] = re.sub('_[Aa]lignment$',
                                                   expression_suffix,
                                                   alignment_object_name)
            else:  # assume user specified suffix
                params[
                    'expression_name'] = alignment_object_name + expression_suffix
        if re.match(r'^KBaseRNASeq.RNASeqAlignmentSet-\d*',
                    alignment_object_type):
            if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name):
                # set expression set name
                params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$',
                                                       expression_set_suffix,
                                                       alignment_object_name)
            else:  # assume user specified suffix
                params[
                    'expression_set_name'] = alignment_object_name + expression_set_suffix
        if re.match(r'^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type):
            if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name):

                # set expression set name
                params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$',
                                                       expression_set_suffix,
                                                       alignment_object_name)
            else:  # assume user specified suffix
                params[
                    'expression_set_name'] = alignment_object_name + expression_set_suffix
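
In isolation, the renaming convention above strips a recognized suffix and substitutes the new one, otherwise appends; a sketch with hypothetical names:

import re

def expression_name_for(alignment_name, suffix='_expression'):
    # 'sample1_alignment' -> 'sample1_expression'
    # 'sample1'           -> 'sample1_expression'
    if re.match(r'.*_[Aa]lignment$', alignment_name):
        return re.sub(r'_[Aa]lignment$', suffix, alignment_name)
    return alignment_name + suffix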

    def _save_expression_matrix(self, expressionset_ref, workspace_name):
        """
        _save_expression_matrix: save FPKM and TPM ExpressionMatrix
        """

        log('start saving ExpressionMatrix object')

        expression_set_name = self.ws.get_object_info(
            [{
                "ref": expressionset_ref
            }], includeMetadata=None)[0][1]

        output_obj_name_prefix = re.sub('_*[Ee]xpression_*[Ss]et', '',
                                        expression_set_name)

        upload_expression_matrix_params = {
            'expressionset_ref': expressionset_ref,
            'output_obj_name': output_obj_name_prefix,
            'workspace_name': workspace_name
        }

        expression_matrix_refs = self.eu.get_expressionMatrix(
            upload_expression_matrix_params)

        return expression_matrix_refs

    def run_cufflinks_app(self, params):
        log('--->\nrunning CufflinksUtil.run_cufflinks_app\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_cufflinks_params(params)

        alignment_object_ref = params.get('alignment_object_ref')
        alignment_object_info = self.ws.get_object_info3(
            {"objects": [{
                "ref": alignment_object_ref
            }]})['infos'][0]

        alignment_object_type = alignment_object_info[2]
        alignment_object_name = alignment_object_info[1]

        # get output object name
        self._generate_output_object_name(params, alignment_object_type,
                                          alignment_object_name)

        log('--->\nalignment object type: \n' +
            '{}'.format(alignment_object_type))

        if re.match(r'^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type):
            params.update({'alignment_ref': alignment_object_ref})
            returnVal = self._process_rnaseq_alignment_object(params)
            report_output = self._generate_report(
                returnVal.get('expression_obj_ref'),
                params.get('workspace_name'),
                returnVal.get('result_directory'))
            returnVal.update(report_output)
        elif re.match(r'^KBaseRNASeq.RNASeqAlignmentSet-\d*', alignment_object_type) or \
             re.match(r'^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type):
            params.update({'alignment_set_ref': alignment_object_ref})
            returnVal = self._process_alignment_set_object(
                params, alignment_object_type)
            expression_matrix_refs = self._save_expression_matrix(
                returnVal['expression_obj_ref'], params.get('workspace_name'))
            returnVal.update(expression_matrix_refs)

            report_output = self._generate_report(
                returnVal['expression_obj_ref'], params.get('workspace_name'),
                returnVal['result_directory'],
                expression_matrix_refs['exprMatrix_FPKM_ref'],
                expression_matrix_refs['exprMatrix_TPM_ref'])
            returnVal.update(report_output)
        else:
            raise ValueError(
                'Not an RNASeqAlignment type\nObject info:\n{}'.format(
                    alignment_object_info))

        return returnVal
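
The branching above dispatches on the workspace type string; reduced to its skeleton (a sketch, not the module's API):

import re

def dispatch_by_type(type_string):
    if re.match(r'^KBaseRNASeq.RNASeqAlignment-\d*', type_string):
        return 'single_alignment'
    if re.match(r'^KBaseRNASeq.RNASeqAlignmentSet-\d*', type_string) or \
       re.match(r'^KBaseSets.ReadsAlignmentSet-\d*', type_string):
        return 'alignment_set'
    raise ValueError('Not an RNASeqAlignment type: ' + type_string)

Note that the first pattern does not prefix-match 'RNASeqAlignmentSet-' types, because the pattern requires a hyphen immediately after 'RNASeqAlignment', so the branch order is safe.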
Beispiel #18
0
    def expand_input(self, input_refs):
        '''
        Expand input based on an input data reference for Kaiju

        input_refs can be a list of references to a PairedEndLibrary, a SingleEndLibrary, or a ReadsSet
        '''
        # config
        #SERVICE_VER = 'dev'
        SERVICE_VER = 'release'

        # expand any sets and build a non-redundant list of reads input objs
        ws = Workspace(self.ws_url)
        expanded_input = []
        input_ref_seen = dict()
        SE_types = [
            'KBaseFile.SingleEndLibrary', 'KBaseAssembly.SingleEndLibrary'
        ]
        PE_types = [
            'KBaseFile.PairedEndLibrary', 'KBaseAssembly.PairedEndLibrary'
        ]

        [
            OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
            WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
        ] = range(11)  # object_info tuple
        for input_ref in input_refs:
            input_info = ws.get_object_info3({'objects': [{
                'ref': input_ref
            }]})['infos'][0]
            obj_name = input_info[NAME_I]
            type_name = input_info[TYPE_I].split('-')[0]

            # ReadsSet
            if type_name in ['KBaseSets.ReadsSet']:
                try:
                    input_readsSet_obj = self.setAPI_Client.get_reads_set_v1({
                        'ref':
                        input_ref,
                        'include_item_info':
                        1
                    })

                except Exception as e:
                    raise ValueError(
                        'SetAPI FAILURE: Unable to get read library set object from workspace: ('
                        + str(input_ref) + ")\n" + str(e))

                for readsLibrary_obj in input_readsSet_obj['data']['items']:
                    this_reads_ref = readsLibrary_obj['ref']
                    if this_reads_ref in input_ref_seen:
                        continue
                    input_ref_seen[this_reads_ref] = True

                    this_reads_name = readsLibrary_obj['info'][NAME_I]
                    reads_item_type = readsLibrary_obj['info'][TYPE_I]
                    reads_item_type = re.sub(
                        r'-[0-9]+\.[0-9]+$', "",
                        reads_item_type)  # remove trailing version
                    if reads_item_type in PE_types:
                        this_reads_type = self.PE_flag
                    elif reads_item_type in SE_types:
                        this_reads_type = self.SE_flag
                    else:
                        raise ValueError("Can't handle read item type '" +
                                         reads_item_type + "' obj_name: '" +
                                         this_reads_name + " in Set: '" +
                                         str(input_ref) + "'")
                    expanded_input.append({
                        'ref': this_reads_ref,
                        'name': this_reads_name,
                        'type': this_reads_type
                    })
            # SingleEnd Library
            elif type_name in SE_types:
                this_reads_ref = input_ref
                if this_reads_ref in input_ref_seen:
                    continue
                input_ref_seen[this_reads_ref] = True
                this_reads_name = obj_name
                this_reads_type = self.SE_flag
                expanded_input.append({
                    'ref': this_reads_ref,
                    'name': this_reads_name,
                    'type': this_reads_type
                })
            # PairedEnd Library
            elif type_name in PE_types:
                this_reads_ref = input_ref
                if this_reads_ref in input_ref_seen:
                    continue
                input_ref_seen[this_reads_ref] = True
                this_reads_name = obj_name
                this_reads_type = self.PE_flag
                expanded_input.append({
                    'ref': this_reads_ref,
                    'name': this_reads_name,
                    'type': this_reads_type
                })
            else:
                raise ValueError("Illegal type in input_refs: " +
                                 str(obj_name) + " (" + str(input_ref) +
                                 ") is of type: '" + str(type_name) + "'")

        return expanded_input
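
The expansion above boils down to: flatten set members and bare libraries into one list while remembering every ref already emitted. A toy sketch (set_members is a hypothetical lookup standing in for the SetAPI call):

def expand_refs(input_refs, set_members):
    seen = set()
    expanded = []
    for ref in input_refs:
        for member in set_members.get(ref, [ref]):  # a bare library is itself
            if member not in seen:
                seen.add(member)
                expanded.append(member)
    return expanded

# expand_refs(['set/1', 'lib/9'], {'set/1': ['lib/7', 'lib/9']})
# -> ['lib/7', 'lib/9']  (the duplicate 'lib/9' is emitted once)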
    def stage_input(self, input_ref, fasta_file_extension):
        '''
        Stage input based on an input data reference for CheckM

        input_ref can be a reference to an Assembly, BinnedContigs, or (not yet implemented) a Genome

        This method creates a directory in the scratch area with the set of Fasta files, names
        will have the fasta_file_extension parameter tacked on.

            ex:

            staged_input = stage_input('124/15/1', 'fna')

            staged_input
            {"input_dir": '...'}
        '''
        # config
        #SERVICE_VER = 'dev'
        SERVICE_VER = 'release'

        # generate a folder in scratch to hold the input
        suffix = str(int(time.time() * 1000))
        input_dir = os.path.join(self.scratch, 'bins_' + suffix)
        all_seq_fasta = os.path.join(
            self.scratch,
            'all_sequences_' + suffix + '.' + fasta_file_extension)
        if not os.path.exists(input_dir):
            os.makedirs(input_dir)

        # 2) based on type, download the files
        ws = Workspace(self.ws_url)
        input_info = ws.get_object_info3({'objects': [{
            'ref': input_ref
        }]})['infos'][0]
        # 0 obj_id objid - the numerical id of the object.
        # 1 obj_name name - the name of the object.
        # 2 type_string type - the type of the object.
        # 3 timestamp save_date - the save date of the object.
        # 4 obj_ver ver - the version of the object.
        # 5 username saved_by - the user that saved or copied the object.
        # 6 ws_id wsid - the workspace containing the object.
        # 7 ws_name workspace - the workspace containing the object.
        # 8 string chsum - the md5 checksum of the object.
        # 9 int size - the size of the object in bytes.
        # 10 usermeta meta - arbitrary user-supplied metadata about
        #     the object.
        [
            OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
            WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
        ] = range(11)  # object_info tuple
        obj_name = input_info[NAME_I]
        type_name = input_info[TYPE_I].split('-')[0]

        # auClient
        try:
            auClient = AssemblyUtil(self.callbackURL,
                                    token=self.ctx['token'],
                                    service_ver=SERVICE_VER)
        except Exception as e:
            raise ValueError(
                'Unable to instantiate auClient with callbackURL: ' +
                self.callbackURL + ' ERROR: ' + str(e))

        # setAPI_Client
        try:
            #setAPI_Client = SetAPI (url=self.callbackURL, token=self.ctx['token'])  # for SDK local.  local doesn't work for SetAPI
            setAPI_Client = SetAPI(
                url=self.serviceWizardURL,
                token=self.ctx['token'])  # for dynamic service
        except Exception as e:
            raise ValueError(
                'Unable to instantiate setAPI_Client with serviceWizardURL: ' +
                self.serviceWizardURL + ' ERROR: ' + str(e))

        # mguClient
        try:
            mguClient = MetagenomeUtils(self.callbackURL,
                                        token=self.ctx['token'],
                                        service_ver=SERVICE_VER)
        except Exception as e:
            raise ValueError(
                'Unable to instantiate mguClient with callbackURL: ' +
                self.callbackURL + ' ERROR: ' + str(e))

        # Standard Single Assembly
        #
        if type_name in [
                'KBaseGenomeAnnotations.Assembly', 'KBaseGenomes.ContigSet'
        ]:
            # create file data
            filename = os.path.join(input_dir,
                                    obj_name + '.' + fasta_file_extension)
            auClient.get_assembly_as_fasta({
                'ref': input_ref,
                'filename': filename
            })
            if not os.path.isfile(filename):
                raise ValueError(
                    'Error generating fasta file from an Assembly or ContigSet with AssemblyUtil'
                )
            # make sure fasta file isn't empty
            min_fasta_len = 1
            if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                raise ValueError(
                    'Assembly or ContigSet is empty in filename: ' +
                    str(filename))

        # AssemblySet
        #
        elif type_name == 'KBaseSets.AssemblySet':

            # read assemblySet
            try:
                assemblySet_obj = setAPI_Client.get_assembly_set_v1({
                    'ref':
                    input_ref,
                    'include_item_info':
                    1
                })
            except Exception as e:
                raise ValueError('Unable to get object from workspace: (' +
                                 input_ref + ')' + str(e))
            assembly_refs = []
            assembly_names = []
            for assembly_item in assemblySet_obj['data']['items']:
                this_assembly_ref = assembly_item['ref']
                # assembly obj info
                try:
                    this_assembly_info = ws.get_object_info_new(
                        {'objects': [{
                            'ref': this_assembly_ref
                        }]})[0]
                    this_assembly_name = this_assembly_info[NAME_I]
                except Exception as e:
                    raise ValueError('Unable to get object from workspace: (' +
                                     this_assembly_ref + '): ' + str(e))
                assembly_refs.append(this_assembly_ref)
                assembly_names.append(this_assembly_name)

            # create file data (name for file is what's reported in results)
            for ass_i, assembly_ref in enumerate(assembly_refs):
                this_name = assembly_names[ass_i]
                filename = os.path.join(input_dir,
                                        this_name + '.' + fasta_file_extension)
                auClient.get_assembly_as_fasta({
                    'ref': assembly_ref,
                    'filename': filename
                })
                if not os.path.isfile(filename):
                    raise ValueError(
                        'Error generating fasta file from an Assembly or ContigSet with AssemblyUtil'
                    )
                # make sure fasta file isn't empty
                min_fasta_len = 1
                if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                    raise ValueError(
                        'Assembly or ContigSet is empty in filename: ' +
                        str(filename))

        # Binned Contigs
        #
        elif type_name == 'KBaseMetagenomes.BinnedContigs':

            # download the bins as fasta and set the input folder name
            bin_file_dir = mguClient.binned_contigs_to_file({
                'input_ref': input_ref,
                'save_to_shock': 0
            })['bin_file_directory']
            os.rename(bin_file_dir, input_dir)
            # make sure fasta file isn't empty
            self.set_fasta_file_extensions(input_dir, fasta_file_extension)
            for (dirpath, dirnames, filenames) in os.walk(input_dir):
                for fasta_file in filenames:
                    fasta_path = os.path.join(input_dir, fasta_file)
                    min_fasta_len = 1
                    if not self.fasta_seq_len_at_least(fasta_path,
                                                       min_fasta_len):
                        raise ValueError(
                            'Binned Assembly is empty for fasta_path: ' +
                            str(fasta_path))
                break

        # Genome and GenomeSet
        #
        elif type_name == 'KBaseGenomes.Genome' or type_name == 'KBaseSearch.GenomeSet':
            genome_obj_names = []
            genome_sci_names = []
            genome_assembly_refs = []

            if type_name == 'KBaseGenomes.Genome':
                genomeSet_refs = [input_ref]
            else:  # get genomeSet_refs from GenomeSet object
                genomeSet_refs = []
                try:
                    genomeSet_object = ws.get_objects2(
                        {'objects': [{
                            'ref': input_ref
                        }]})['data'][0]['data']
                except Exception as e:
                    raise ValueError('Unable to fetch ' + str(input_ref) +
                                     ' object from workspace: ' + str(e))
                    #to get the full stack trace: traceback.format_exc()

                # iterate through genomeSet members
                for genome_id in genomeSet_object['elements'].keys():
                    if 'ref' not in genomeSet_object['elements'][genome_id] or \
                       genomeSet_object['elements'][genome_id]['ref'] is None or \
                       genomeSet_object['elements'][genome_id]['ref'] == '':
                        raise ValueError(
                            'genome_ref not found for genome_id: ' +
                            str(genome_id) + ' in genomeSet: ' +
                            str(input_ref))
                    else:
                        genomeSet_refs.append(
                            genomeSet_object['elements'][genome_id]['ref'])

            # genome obj data
            for i, this_input_ref in enumerate(genomeSet_refs):
                try:
                    objects = ws.get_objects2(
                        {'objects': [{
                            'ref': this_input_ref
                        }]})['data']
                    genome_obj = objects[0]['data']
                    genome_obj_info = objects[0]['info']
                    genome_obj_names.append(genome_obj_info[NAME_I])
                    genome_sci_names.append(genome_obj['scientific_name'])
                except Exception:
                    raise ValueError("unable to fetch genome: " +
                                     this_input_ref)

                # Get genome_assembly_ref
                if ('contigset_ref' not in genome_obj or genome_obj['contigset_ref'] is None) \
                   and ('assembly_ref' not in genome_obj or genome_obj['assembly_ref'] is None):
                    msg = "Genome " + genome_obj_names[
                        i] + " (ref:" + input_ref + ") " + genome_sci_names[
                            i] + " MISSING BOTH contigset_ref AND assembly_ref.  Cannot process.  Exiting."
                    raise ValueError(msg)
                elif 'assembly_ref' in genome_obj and genome_obj[
                        'assembly_ref'] is not None:
                    msg = "Genome " + genome_obj_names[
                        i] + " (ref:" + input_ref + ") " + genome_sci_names[
                            i] + " USING assembly_ref: " + str(
                                genome_obj['assembly_ref'])
                    print(msg)
                    genome_assembly_refs.append(genome_obj['assembly_ref'])
                elif 'contigset_ref' in genome_obj and genome_obj[
                        'contigset_ref'] is not None:
                    msg = "Genome " + genome_obj_names[
                        i] + " (ref:" + input_ref + ") " + genome_sci_names[
                            i] + " USING contigset_ref: " + str(
                                genome_obj['contigset_ref'])
                    print(msg)
                    genome_assembly_refs.append(genome_obj['contigset_ref'])

            # create file data (name for file is what's reported in results)
            for ass_i, assembly_ref in enumerate(genome_assembly_refs):
                this_name = genome_obj_names[ass_i]
                filename = os.path.join(input_dir,
                                        this_name + '.' + fasta_file_extension)
                auClient.get_assembly_as_fasta({
                    'ref': assembly_ref,
                    'filename': filename
                })
                if not os.path.isfile(filename):
                    raise ValueError(
                        'Error generating fasta file from an Assembly or ContigSet with AssemblyUtil'
                    )
                # make sure fasta file isn't empty
                min_fasta_len = 1
                if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                    raise ValueError(
                        'Assembly or ContigSet is empty in filename: ' +
                        str(filename))

        # Unknown type slipped through
        #
        else:
            raise ValueError(
                'Cannot stage fasta file input directory from type: ' +
                type_name)

        # create summary fasta file with all bins
        self.cat_fasta_files(input_dir, fasta_file_extension, all_seq_fasta)

        return {
            'input_dir': input_dir,
            'folder_suffix': suffix,
            'all_seq_fasta': all_seq_fasta
        }
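
Each branch above repeats the same fetch-then-guard steps; factored into one helper, the pattern looks like this (seq_len_checker stands in for self.fasta_seq_len_at_least; a sketch, not a refactor of the module):

import os

def download_and_validate(au_client, ref, filename, seq_len_checker, min_len=1):
    # Fetch FASTA with AssemblyUtil, then fail loudly on a missing or empty file.
    au_client.get_assembly_as_fasta({'ref': ref, 'filename': filename})
    if not os.path.isfile(filename):
        raise ValueError('Error generating fasta file with AssemblyUtil')
    if not seq_len_checker(filename, min_len):
        raise ValueError('Assembly or ContigSet is empty in filename: ' + str(filename))
    return filename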
Beispiel #20
0
class EditAlignmentSet:
    """
     Contains a set of functions for editing alignment sets.
    """

    PARAM_IN_WS_NAME_ID = 'workspace_name'
    PARAM_IN_OBJ_NAME_ID = 'output_object_name'
    PARAM_IN_ALIGNSET_REF = 'alignment_set_ref'
    PARAM_IN_ALIGNS_ADD = 'alignments_to_add'
    PARAM_IN_ALIGNS_RM = 'alignments_to_remove'

    def __init__(self, config, logger=None):
        self.config = config
        self.logger = logger
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = os.path.join(config['scratch'],
                                    'EAS_' + str(uuid.uuid4()))
        self.ws_url = config['workspace-url']
        self.ws_client = Workspace(self.ws_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.setAPI = SetAPI(self.callback_url)

    def _process_params(self, params):
        """
        validates params passed to gen expression matrix method
        """
        for p in [
                self.PARAM_IN_ALIGNSET_REF, self.PARAM_IN_OBJ_NAME_ID,
                self.PARAM_IN_WS_NAME_ID
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        ws_name_id = params.get(self.PARAM_IN_WS_NAME_ID)
        if not isinstance(ws_name_id, int):
            try:
                ws_name_id = self.dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                prefix = str(se).split('.')[0]
                raise ValueError(prefix)

        alignments_to_add = params.get(self.PARAM_IN_ALIGNS_ADD)
        alignments_to_remove = params.get(self.PARAM_IN_ALIGNS_RM)

        if alignments_to_add is None and alignments_to_remove is None:
            raise ValueError(
                'Either "alignments_to_remove" or "alignments_to_add" should be given'
            )

        return ws_name_id

    def _get_type_from_obj_info(self, info):
        return info[2].split('-')[0]

    def _get_obj_info(self, ref):
        return self.ws_client.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0]

    def _get_set_items(self, alignment_set_ref):

        obj_info = self._get_obj_info(alignment_set_ref)
        obj_type = self._get_type_from_obj_info(obj_info)

        if obj_type in ['KBaseSets.ReadsAlignmentSet']:
            set_data = self.setAPI.get_reads_alignment_set_v1(
                {'ref': alignment_set_ref})
            items = set_data['data']['items']
        elif obj_type in ['KBaseRNASeq.RNASeqAlignmentSet']:
            alignmentset_obj = self.ws_client.get_objects2(
                {'objects': [{
                    'ref': alignment_set_ref
                }]})['data'][0]
            """
            Add each alignment object to align_item and add it to items list
            """
            items = list()
            for alignment_ref in alignmentset_obj['data']['sample_alignments']:
                align_item = dict()
                align_item['ref'] = alignment_ref
                items.append(align_item)
        else:
            raise ValueError(
                '"alignment_set_ref" should be of type KBaseSets.ReadsAlignmentSet or '
                + 'KBaseRNASeq.RNASeqAlignmentSet')

        return items

    def _add_alignments(self, alignment_set_items, alignment_refs_list):

        for alignment_ref in alignment_refs_list:

            found = False
            for set_item in alignment_set_items:
                if set_item.get('ref') == alignment_ref:
                    print('{} already in the input Alignment Set. Not added'.
                          format(alignment_ref))
                    found = True
                    break

            if not found:
                alignment_set_items.append({'ref': alignment_ref})
        return alignment_set_items

    def _remove_alignments(self, input_alignment_set, alignment_set_items,
                           alignments_to_remove):

        for input_item in input_alignment_set:
            if not (input_item.get('ref') in alignments_to_remove):
                alignment_set_items.append(input_item)

        return alignment_set_items
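
The two helpers above implement list-with-set semantics over dicts keyed by 'ref'; in isolation:

def add_refs(items, refs_to_add):
    # Append only refs not already present (mirrors _add_alignments).
    present = {item['ref'] for item in items}
    result = list(items)
    for ref in refs_to_add:
        if ref not in present:
            result.append({'ref': ref})
            present.add(ref)
    return result

def remove_refs(items, refs_to_remove):
    # Keep only items whose ref is not flagged (mirrors _remove_alignments).
    flagged = set(refs_to_remove)
    return [item for item in items if item['ref'] not in flagged]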

    def _save_alignment_set(self, ws_name, obj_name, set_data):

        res = self.setAPI.save_reads_alignment_set_v1({
            "workspace": ws_name,
            "output_object_name": obj_name,
            "data": set_data
        })
        return res.get('set_ref')

    def edit_alignment_set(self, params):

        ws_name_id = self._process_params(params)
        obj_name = params.get(self.PARAM_IN_OBJ_NAME_ID)

        alignment_set_ref = params.get(self.PARAM_IN_ALIGNSET_REF)

        print('INPUT ALIGNMENT SET REF: ' + alignment_set_ref)

        input_alignment_set = self._get_set_items(alignment_set_ref)

        alignments_to_remove = params.get(self.PARAM_IN_ALIGNS_RM, None)
        alignments_to_add = params.get(self.PARAM_IN_ALIGNS_ADD, None)

        set_items = list()
        if alignments_to_remove is not None:
            set_items = self._remove_alignments(input_alignment_set, set_items,
                                                alignments_to_remove)
        if alignments_to_add is not None:
            set_items = self._add_alignments(set_items, alignments_to_add)

        set_data = {
            'description': 'Edited from {}'.format(alignment_set_ref),
            'items': set_items
        }

        output_alignment_set_ref = self._save_alignment_set(
            ws_name_id, obj_name, set_data)
        return output_alignment_set_ref


class FeatureSetBuilder:
    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_upload_featureset_from_diff_expr_params(self, params):
        """
        _validate_upload_featureset_from_diff_expr_params:
                validates params passed to upload_featureset_from_diff_expr method
        """

        log('start validating upload_featureset_from_diff_expr params')

        # check for required parameters
        for p in [
                'diff_expression_ref', 'workspace_name', 'p_cutoff',
                'q_cutoff', 'fold_change_cutoff'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        p = params.get('fold_scale_type')
        if p and p != 'logarithm':
            raise ValueError(
                '"fold_scale_type" parameter must be set to "logarithm", if used'
            )

    def _generate_report(self, up_feature_set_ref_list,
                         down_feature_set_ref_list,
                         filtered_expression_matrix_ref_list, workspace_name):
        """
        _generate_report: generate summary report
        """

        log('start creating report')

        output_html_files = self._generate_html_report(
            up_feature_set_ref_list, down_feature_set_ref_list)

        objects_created = list()
        for up_feature_set_ref in up_feature_set_ref_list:
            objects_created += [{
                'ref': up_feature_set_ref,
                'description': 'Up-regulated FeatureSet Object'
            }]
        for down_feature_set_ref in down_feature_set_ref_list:
            objects_created += [{
                'ref': down_feature_set_ref,
                'description': 'Down-regulated FeatureSet Object'
            }]

        for filtered_expression_matrix_ref in filtered_expression_matrix_ref_list:
            objects_created += [{
                'ref':
                filtered_expression_matrix_ref,
                'description':
                'Filtered ExpressionMatrix Object'
            }]

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 333,
            'report_object_name':
            'kb_FeatureSetUtils_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _generate_html_report(self, up_feature_set_ref_list,
                              down_feature_set_ref_list):
        """
        _generate_html_report: generate html summary report
        """

        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        upper_feature_content = ''
        for up_feature_set_ref in up_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2(
                {'objects': [{
                    'ref': up_feature_set_ref
                }]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']

            feature_set_name = feature_set_info[1]

            elements = feature_set_data.get('elements')
            feature_ids = elements.keys()

            upper_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(
                feature_set_name, len(feature_ids))

        lower_feature_content = ''
        for down_feature_set_ref in down_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2(
                {'objects': [{
                    'ref': down_feature_set_ref
                }]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']

            feature_set_name = feature_set_info[1]

            elements = feature_set_data.get('elements')
            feature_ids = elements.keys()

            lower_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(
                feature_set_name, len(feature_ids))

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<tr><td>Upper_FeatureSet</td></tr>',
                    upper_feature_content)

                report_template = report_template.replace(
                    '<tr><td>Lower_FeatureSet</td></tr>',
                    lower_feature_content)

                result_file.write(report_template)

        html_report.append({
            'path': result_file_path,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report'
        })
        return html_report

    def _process_diff_expression(self, diff_expression_set_ref,
                                 result_directory, condition_label_pair):
        """
        _process_diff_expression: process differential expression object info
        """

        log('start processing differential expression object')

        diff_expr_set_data = self.ws.get_objects2(
            {'objects': [{
                'ref': diff_expression_set_ref
            }]})['data'][0]['data']

        set_items = diff_expr_set_data['items']

        diff_expr_matrix_file_name = 'gene_results.csv'
        diff_expr_matrix_file = os.path.join(result_directory,
                                             diff_expr_matrix_file_name)

        with open(diff_expr_matrix_file, 'w') as csvfile:
            fieldnames = ['gene_id', 'log2_fold_change', 'p_value', 'q_value']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

        genome_id = None  # set when a matching condition pair is found below
        for set_item in set_items:
            diff_expression_ref = set_item['ref']

            diff_expression_data = self.ws.get_objects2(
                {'objects': [{
                    'ref': diff_expression_ref
                }]})['data'][0]['data']

            label_string = set_item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_1 = label_list[0]
            condition_2 = label_list[1]

            if condition_1 in condition_label_pair and condition_2 in condition_label_pair:
                genome_id = diff_expression_data['genome_ref']
                matrix_data = diff_expression_data['data']

                with open(diff_expr_matrix_file, 'a') as csvfile:

                    row_ids = matrix_data.get('row_ids')
                    row_values = matrix_data.get('values')
                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

                    for pos, row_id in enumerate(row_ids):
                        row_value = row_values[pos]
                        writer.writerow({
                            'gene_id': row_id,
                            'log2_fold_change': row_value[0],
                            'p_value': row_value[1],
                            'q_value': row_value[2]
                        })

        return diff_expr_matrix_file, genome_id

    def _generate_feature_set(self, feature_ids, genome_id, workspace_name,
                              feature_set_name):
        """
        _generate_feature_set: generate FeatureSet object

        KBaseCollections.FeatureSet type:
        typedef structure {
            string description;
            list<feature_id> element_ordering;
            mapping<feature_id, list<genome_ref>> elements;
        } FeatureSet;
        """

        log('start saving KBaseCollections.FeatureSet object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        elements = {feature_id: [genome_id] for feature_id in feature_ids}
        feature_set_data = {
            'description': 'Generated FeatureSet from DifferentialExpression',
            'element_ordering': feature_ids,
            'elements': elements
        }

        object_type = 'KBaseCollections.FeatureSet'
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': feature_set_data,
                'name': feature_set_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = str(dfu_oi[6]) + '/' + str(
            dfu_oi[0]) + '/' + str(dfu_oi[4])

        return feature_set_obj_ref
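
The ref assembled above follows the workspace convention wsid/objid/version, built from the standard 11-element object_info tuple (indices 6, 0 and 4):

def ref_from_object_info(info):
    # info: object_info tuple as returned by DataFileUtil.save_objects
    return '{}/{}/{}'.format(info[6], info[0], info[4])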

    def _process_matrix_file(self, diff_expr_matrix_file, comp_p_value,
                             comp_q_value, comp_fold_change_cutoff):
        """
        _process_matrix_file: filter matrix file by given cutoffs
        """

        log('start processing matrix file')

        up_feature_ids = []
        down_feature_ids = []

        if comp_fold_change_cutoff < 0:
            comp_fold_change_cutoff = -comp_fold_change_cutoff

        with open(diff_expr_matrix_file, 'r') as file:
            reader = csv.DictReader(file)

            for row in reader:
                feature_id = row['gene_id']
                row_p_value = row['p_value']
                row_q_value = row['q_value']
                row_fold_change_cutoff = row['log2_fold_change']

                null_value = set(['NA', 'null', ''])
                col_value = set(
                    [row_p_value, row_q_value, row_fold_change_cutoff])

                if not col_value.intersection(null_value):
                    p_value_condition = float(row_p_value) <= comp_p_value
                    q_value_condition = float(row_q_value) <= comp_q_value

                    up_matches_condition = (p_value_condition
                                            and q_value_condition
                                            and (float(row_fold_change_cutoff)
                                                 >= comp_fold_change_cutoff))

                    down_matches_condition = (p_value_condition
                                              and q_value_condition and
                                              (float(row_fold_change_cutoff) <=
                                               -comp_fold_change_cutoff))

                    if up_matches_condition:
                        up_feature_ids.append(feature_id)
                    elif down_matches_condition:
                        down_feature_ids.append(feature_id)

        return list(set(up_feature_ids)), list(set(down_feature_ids))
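
Condensed, the rule above classifies a gene as 'up' when both p and q pass their cutoffs and the log2 fold change is at or above the (absolute) cutoff, and 'down' when it is at or below the negated cutoff; a worked sketch:

def classify(p, q, log2_fc, p_cut, q_cut, fc_cut):
    fc_cut = abs(fc_cut)  # the cutoff is treated as a magnitude, as above
    if p <= p_cut and q <= q_cut:
        if log2_fc >= fc_cut:
            return 'up'
        if log2_fc <= -fc_cut:
            return 'down'
    return None

# classify(0.01, 0.04, 1.5, 0.05, 0.05, 1.0)  -> 'up'
# classify(0.01, 0.04, -2.0, 0.05, 0.05, 1.0) -> 'down'
# classify(0.20, 0.04, 1.5, 0.05, 0.05, 1.0)  -> None (p fails)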

    def _filter_expression_matrix(self, expression_matrix_ref, feature_ids,
                                  workspace_name,
                                  filtered_expression_matrix_suffix):
        """
        _filter_expression_matrix: generated filtered expression matrix
        """

        log('start saving KBaseFeatureValues.ExpressionMatrix object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_matrix_obj = self.dfu.get_objects(
            {'object_refs': [expression_matrix_ref]})['data'][0]

        expression_matrix_info = expression_matrix_obj['info']
        expression_matrix_data = expression_matrix_obj['data']

        expression_matrix_name = expression_matrix_info[1]

        if re.match('.*_*[Ee]xpression_*[Mm]atrix', expression_matrix_name):
            filtered_expression_matrix_name = re.sub(
                '_*[Ee]xpression_*[Mm]atrix',
                filtered_expression_matrix_suffix, expression_matrix_name)
        else:
            filtered_expression_matrix_name = expression_matrix_name + \
                filtered_expression_matrix_suffix

        filtered_expression_matrix_data = expression_matrix_data.copy()

        data = filtered_expression_matrix_data['data']

        row_ids = data['row_ids']
        values = data['values']
        filtered_data = data.copy()

        filtered_row_ids = list()
        filtered_values = list()
        for pos, row_id in enumerate(row_ids):
            if row_id in feature_ids:
                filtered_row_ids.append(row_id)
                filtered_values.append(values[pos])

        filtered_data['row_ids'] = filtered_row_ids
        filtered_data['values'] = filtered_values

        filtered_expression_matrix_data['data'] = filtered_data

        object_type = 'KBaseFeatureValues.ExpressionMatrix'
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': filtered_expression_matrix_data,
                'name': filtered_expression_matrix_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        filtered_expression_matrix_ref = str(dfu_oi[6]) + '/' + str(
            dfu_oi[0]) + '/' + str(dfu_oi[4])

        return filtered_expression_matrix_ref
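
The row filtering above is a parallel-list select; in isolation (using a set for membership, which avoids the linear scan the list lookup above performs per row):

def filter_rows(row_ids, values, keep_ids):
    keep = set(keep_ids)
    kept = [(rid, val) for rid, val in zip(row_ids, values) if rid in keep]
    return [rid for rid, _ in kept], [val for _, val in kept]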

    def _xor(self, a, b):
        return bool(a) != bool(b)

    def _check_input_labels(self, condition_pairs, available_condition_labels):
        """
        _check_input_labels: check input condition pairs
        """
        checked = True
        for condition_pair in condition_pairs:

            label_string = condition_pair['label_string'][0].strip()
            label_list = [x.strip() for x in label_string.split(',')]
            first_label = label_list[0]
            second_label = label_list[1]

            if first_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(
                    first_label)
                error_msg += 'Available conditions: {}'.format(
                    available_condition_labels)
                raise ValueError(error_msg)

            if second_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(
                    second_label)
                error_msg += 'Available conditions: {}'.format(
                    available_condition_labels)
                raise ValueError(error_msg)

            if first_label == second_label:
                raise ValueError('Input conditions are the same')

        return checked

    def _get_condition_labels(self, diff_expression_set_ref):
        """
        _get_condition_labels: get all possible condition label pairs
        """
        log('getting all possible condition pairs')

        condition_label_pairs = list()
        available_condition_labels = set()
        diff_expression_set_obj = self.ws.get_objects2(
            {'objects': [{
                'ref': diff_expression_set_ref
            }]})['data'][0]
        diff_expression_set_data = diff_expression_set_obj['data']
        items = diff_expression_set_data.get('items')
        for item in items:
            label_string = item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_label_pairs.append(label_list)
            available_condition_labels.update(label_list)

        log('all possible condition pairs:\n{}'.format(condition_label_pairs))

        return condition_label_pairs, available_condition_labels

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.ws = Workspace(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.scratch = config['scratch']

    def upload_featureset_from_diff_expr(self, params):
        """
        upload_featureset_from_diff_expr: create FeatureSet from RNASeqDifferentialExpression
                                          based on given threshold cutoffs

        required params:
        diff_expression_ref: DifferentialExpressionMatrixSet object reference
        expression_matrix_ref: ExpressionMatrix object reference
        p_cutoff: p value cutoff
        q_cutoff: q value cutoff
        fold_scale_type: one of ["linear", "log2+1", "log10+1"]
        fold_change_cutoff: fold change cutoff
        feature_set_suffix: Result FeatureSet object name suffix
        filtered_expression_matrix_suffix: Result ExpressionMatrix object name suffix
        workspace_name: the name of the workspace it gets saved to

        return:
        result_directory: folder path that holds all files generated
        up_feature_set_ref_list: list of generated up-regulated FeatureSet object references
        down_feature_set_ref_list: list of generated down-regulated FeatureSet object references
        filtered_expression_matrix_ref_list: list of generated filtered ExpressionMatrix object refs
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """

        self._validate_upload_featureset_from_diff_expr_params(params)

        diff_expression_set_ref = params.get('diff_expression_ref')
        diff_expression_set_info = self.ws.get_object_info3(
            {"objects": [{
                "ref": diff_expression_set_ref
            }]})['infos'][0]
        diff_expression_set_name = diff_expression_set_info[1]

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        (available_condition_label_pairs, available_condition_labels
         ) = self._get_condition_labels(diff_expression_set_ref)

        run_all_combinations = params.get('run_all_combinations')
        condition_pairs = params.get('condition_pairs')
        if not self._xor(run_all_combinations, condition_pairs):
            error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' "
            error_msg += "or provide partial condition pairs. Don't do both or neither"
            raise ValueError(error_msg)

        if run_all_combinations:
            condition_label_pairs = available_condition_label_pairs
        else:
            if self._check_input_labels(condition_pairs,
                                        available_condition_labels):
                condition_label_pairs = list()
                for condition_pair in condition_pairs:
                    label_string = condition_pair['label_string'][0].strip()
                    condition_labels = [x.strip()
                                        for x in label_string.split(',')]
                    condition_label_pairs.append(condition_labels)

        up_feature_set_ref_list = list()
        down_feature_set_ref_list = list()
        filtered_expression_matrix_ref_list = list()

        for condition_label_pair in condition_label_pairs:
            condition_string = '_' + '_'.join(condition_label_pair)
            diff_expr_matrix_file, genome_id = self._process_diff_expression(
                diff_expression_set_ref, result_directory,
                condition_label_pair)

            up_feature_ids, down_feature_ids = self._process_matrix_file(
                diff_expr_matrix_file, params.get('p_cutoff'),
                params.get('q_cutoff'), params.get('fold_change_cutoff'))
            filtered_expression_matrix_suffix = condition_string + \
                params.get('filtered_expression_matrix_suffix')
            if params.get('expression_matrix_ref'):
                filtered_expression_matrix_ref = self._filter_expression_matrix(
                    params.get('expression_matrix_ref'),
                    up_feature_ids + down_feature_ids,
                    params.get('workspace_name'),
                    filtered_expression_matrix_suffix)
                filtered_expression_matrix_ref_list.append(
                    filtered_expression_matrix_ref)

            feature_set_suffix = params.get('feature_set_suffix')
            up_feature_set_name = diff_expression_set_name + \
                condition_string + '_up' + feature_set_suffix
            up_feature_set_ref = self._generate_feature_set(
                up_feature_ids, genome_id, params.get('workspace_name'),
                up_feature_set_name)
            up_feature_set_ref_list.append(up_feature_set_ref)

            down_feature_set_name = diff_expression_set_name + \
                condition_string + '_down' + feature_set_suffix
            down_feature_set_ref = self._generate_feature_set(
                down_feature_ids, genome_id, params.get('workspace_name'),
                down_feature_set_name)
            down_feature_set_ref_list.append(down_feature_set_ref)

        returnVal = {
            'result_directory': result_directory,
            'up_feature_set_ref_list': up_feature_set_ref_list,
            'down_feature_set_ref_list': down_feature_set_ref_list,
            'filtered_expression_matrix_ref_list':
                filtered_expression_matrix_ref_list
        }

        report_output = self._generate_report(
            up_feature_set_ref_list, down_feature_set_ref_list,
            filtered_expression_matrix_ref_list, params.get('workspace_name'))
        returnVal.update(report_output)

        return returnVal
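
# Usage sketch (added; the refs, cutoffs and names below are hypothetical
# placeholders, and the enclosing class is assumed to be the FeatureSetBuilder
# shown in Example #24 below — a live KBase environment is required to run this).
def _example_upload_featureset(builder):
    """Illustrative call only; requires live KBase services to execute."""
    return builder.upload_featureset_from_diff_expr({
        'diff_expression_ref': '123/4/5',    # DifferentialExpressionMatrixSet
        'expression_matrix_ref': '123/6/7',  # optional ExpressionMatrix
        'p_cutoff': 0.05,
        'q_cutoff': 0.05,
        'fold_change_cutoff': 1.0,
        'feature_set_suffix': '_feature_set',
        'filtered_expression_matrix_suffix': '_filtered_expression_matrix',
        'workspace_name': 'my_workspace',
        'run_all_combinations': 1,
    })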
Example #22
class BallgownUtil:

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)
        self.fv = KBaseFeatureValues(self.callback_url)
        self.deu = DifferentialExpressionUtils(self.callback_url, service_ver='dev')
        self.ws = Workspace(self.ws_url, token=self.token)
        self.scratch = config['scratch']
        self.config = config

    def _xor(self, a, b):
        return bool(a) != bool(b)
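
    # Added note: exactly one of the two inputs must be truthy, e.g.
    # _xor(1, None) -> True, _xor(None, None) -> False, _xor(1, ['pair']) -> False.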

    def _validate_run_ballgown_app_params(self, params):
        """
        _validate_run_ballgown_app_params:
                validates params passed to run_ballgown_app method
        """

        log('start validating run_ballgown_app params')

        # check for required parameters
        for p in ['expressionset_ref', 'diff_expression_matrix_set_suffix',
                  'workspace_name']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        run_all_combinations = params.get('run_all_combinations')
        condition_pair_subset = params.get('condition_pair_subset')

        if not self._xor(run_all_combinations, condition_pair_subset):
            error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' "
            error_msg += "or provide subset of condition pairs. Don't provide both, or neither."
            raise ValueError(error_msg)

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise
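
    # Added note: on Python 3 the same behaviour is available via
    # os.makedirs(path, exist_ok=True); the errno.EEXIST check above is the
    # Python 2 compatible form.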

    def _generate_html_report(self, result_directory, params, diff_expression_matrix_set_ref):
        """
        _generate_html_report: generate html summary report
        """

        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        for file in glob.glob(os.path.join(result_directory, '*.tsv')):
            shutil.copy(file, output_directory)

        # volcano_plot exists only if there are two condition groups
        for file in glob.glob(os.path.join(result_directory, '*.png')):
            shutil.copy(file, output_directory)

        diff_expr_set = self.ws.get_objects2(
            {'objects': [{'ref': diff_expression_matrix_set_ref['diffExprMatrixSet_ref']}]}
        )['data'][0]
        diff_expr_set_data = diff_expr_set['data']
        diff_expr_set_info = diff_expr_set['info']
        diff_expr_set_name = diff_expr_set_info[1]

        overview_content = ''
        overview_content += '<br/><table><tr><th>Generated DifferentialExpressionMatrixSet'
        overview_content += ' Object</th></tr>'
        overview_content += '<tr><td>{} ({})'.format(diff_expr_set_name,
                                                     diff_expression_matrix_set_ref[
                                                         'diffExprMatrixSet_ref'])
        overview_content += '</td></tr></table>'

        overview_content += '<p><br/></p>'

        overview_content += '<br/><table><tr><th>Generated DifferentialExpressionMatrix'
        overview_content += ' Object</th><th></th><th></th><th></th></tr>'
        overview_content += '<tr><th>Differential Expression Matrix Name</th>'
        overview_content += '<th>Condition 1</th>'
        overview_content += '<th>Condition 2</th>'
        overview_content += '</tr>'

        for item in diff_expr_set_data['items']:
            item_diffexprmatrix_object = self.ws.get_objects2(
                {'objects': [{'ref': item['ref']}]})['data'][0]
            item_diffexprmatrix_info = item_diffexprmatrix_object['info']
            item_diffexprmatrix_data = item_diffexprmatrix_object['data']
            diffexprmatrix_name = item_diffexprmatrix_info[1]

            overview_content += '<tr><td>{} ({})</td>'.format(diffexprmatrix_name,
                                                              item['ref'])
            condition_mapping = item_diffexprmatrix_data.get('condition_mapping')
            # wrap in list() so this also works on Python 3 dict views
            overview_content += '<td>{}</td>'.format(
                list(condition_mapping.keys())[0])
            overview_content += '<td>{}</td>'.format(
                list(condition_mapping.values())[0])
            overview_content += '</tr>'
        overview_content += '</table>'

        # visualization
        image_content = ''
        for image in glob.glob(output_directory + "/*.png"):
            image = os.path.basename(image)
            caption = image.replace('.png', '')
            image_content += '<p style="text-align:center"><img align="center" src="{}" ' \
                             'width="600" height="400"><br>' \
                             '<p align="center">{}</p></p>'.format(image, caption)

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<p>Overview_Content</p>',
                                                          overview_content)
                report_template = report_template.replace('<p>Image Gallery</p>',
                                                          image_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({'file_path': output_directory,
                                                  'pack': 'zip'})['shock_id']

        html_report.append({'shock_id': report_shock_id,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for Ballgown App'})
        return html_report

    def _generate_output_file_list(self, result_directory):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """
        log('Start packing result files')
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'ballgown_result.zip')

        with zipfile.ZipFile(result_file, 'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if not (file.endswith('.zip') or
                            file.endswith('.png') or
                            file.endswith('.DS_Store')):
                        zip_file.write(os.path.join(root, file), file)

        output_files.append({'path': result_file,
                             'name': os.path.basename(result_file),
                             'label': os.path.basename(result_file),
                             'description': 'File(s) generated by Ballgown App'})

        return output_files

    def _generate_report(self, params, result_directory, diff_expression_matrix_set_ref):
        """
        _generate_report: generate summary report
        """
        log('creating report')

        output_files = self._generate_output_file_list(result_directory)

        output_html_files = self._generate_html_report(
            result_directory, params, diff_expression_matrix_set_ref)

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'file_links': output_files,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 333,
            'report_object_name': 'kb_ballgown_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def get_sample_dir_group_file(self, mapped_expression_ids, condition_labels):

        ngroups = 0
        group_name_indices = {}
        group_counts = {}

        for group in condition_labels:
            if group not in group_name_indices:
                group_name_indices[group] = ngroups
                ngroups += 1
            group_counts[group] = group_counts.get(group, 0) + 1

        # checks for proper ballgown execution:
        if ngroups < 2:
            raise Exception("At least two condition groups are needed for this analysis. ")
        for group in condition_labels:
            if group_counts[group] < 2:
                raise Exception(
                    "Condition group {0} has fewer than 2 members; ballgown will not run. "
                    "At least two replicates per group are needed for this analysis.".format(group))

        group_file_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(group_file_dir)

        condition_labels_uniqued = list(set(condition_labels))
        sgf_name = os.path.join(group_file_dir, 'sample_dir_group_file_' +
                                condition_labels_uniqued[0] + '_' +
                                condition_labels_uniqued[1])
        try:
            sgf = open(sgf_name, "w")
        except OSError:
            raise Exception(
                "Can't open file {0} for writing: {1}".format(
                    sgf_name, traceback.format_exc()))

        index = 0  # condition label index
        for ii in mapped_expression_ids:
            for alignment_id, expression_id in ii.items():
                expression_object = self.ws.get_objects2(
                    {'objects': [{'ref': expression_id}]})['data'][0]
                handle_id = expression_object['data']['file']['hid']
                expression_name = expression_object['info'][1]

                expression_dir = os.path.join(group_file_dir, expression_name)
                self._mkdir_p(expression_dir)

                print('expression_name: ' + str(expression_dir) + ' ' +
                      str(group_name_indices[condition_labels[index]]))
                sgf.write("{0}  {1}\n".format(expression_dir,
                                              group_name_indices[condition_labels[index]]))

                self.dfu.shock_to_file({'handle_id': handle_id,
                                        'file_path': expression_dir,
                                        'unpack': 'unpack'})

                required_files = [
                    'e2t.ctab',
                    'e_data.ctab',
                    'i2t.ctab',
                    'i_data.ctab',
                    't_data.ctab']
                for file in glob.glob(expression_dir + '/*'):
                    if not os.path.basename(file) in required_files:
                        os.remove(file)

            index += 1

        sgf.close()  # flush buffered rows before ballgown reads the file
        return sgf_name

    def _cleanup(self, directory=None):
        """
        Clean up after the job.  At the moment this just means removing the working
        directory, but later could mean other things.
        """

        try:
            # ignore_errors=True also removes non-empty directories
            shutil.rmtree(directory, ignore_errors=True)
        except IOError:
            log("Unable to remove working directory {0}".format(directory))
            raise

    def _setupWorkingDir(self, directory=None):
        """
        Clean up an existing workingdir and create a new one
        """
        try:
            if os.path.exists(directory):
                self._cleanup(directory)
            os.mkdir(directory)
        except IOError:
            log("Unable to setup working dir {0}".format(directory))
            raise

    def _check_intron_measurements(self, sample_dir_group_table_file):
        """
        Check if intron measurements files are non-empty
        :param sample_dir_group_table_file:
        :return:
        """
        log('checking for intron level measurements... ')
        with open(sample_dir_group_table_file, 'r') as table_file:
            textFileLines = table_file.readlines()
        for line in textFileLines:
            expr_dir = line.split()[0]
            log(expr_dir)
            # both intron-level ctab files must contain more than the header line
            for ctab_name in ('i2t.ctab', 'i_data.ctab'):
                with open(os.path.join(expr_dir, ctab_name), 'r') as ctab_file:
                    if len(ctab_file.readlines()) <= 1:  # only header line exists
                        raise Exception(
                            "No intron measurements found! Input expressions are possibly "
                            "from a prokaryote. Ballgown functions only on eukaryotic data. "
                            "Consider using DESeq2 or CuffDiff instead of Ballgown.")

    def run_ballgown_diff_exp(self,
                              rscripts_dir,
                              sample_dir_group_table_file,
                              ballgown_output_dir,
                              output_csv,
                              volcano_plot_file
                              ):
        """ Make R call to execute the system

        :param rscripts_dir:
        :param sample_dir_group_table_file:

        :param ballgown_output_dir:
          sample_group_table is a listing of output Stringtie subdirectories,
         (full path specification) paired with group label (0 or 1), ie
            /path/WT_rep1_stringtie    0
            /path/WT_rep2_stringtie    0
            /path/EXP_rep1_stringtie   1
            /path/EXP_rep2_stringtie   1
          (order doesn't matter, but the directory-group correspondance does)

        :param output_csv:
        :param volcano_plot_file:
        :return:
        """
        # check if intron-level expression measurements are present
        self._check_intron_measurements(sample_dir_group_table_file)

        rcmd_list = ['Rscript', os.path.join(rscripts_dir, 'ballgown_fpkmgenematrix.R'),
                     '--sample_dir_group_table', sample_dir_group_table_file,
                     '--output_dir', ballgown_output_dir,
                     '--output_csvfile', output_csv,
                     '--volcano_plot_file', volcano_plot_file
                     ]
        rcmd_str = " ".join(str(x) for x in rcmd_list)
        log("rcmd_string is {0}".format(rcmd_str))
        openedprocess = subprocess.Popen(rcmd_str, shell=True)
        openedprocess.wait()
        # Make sure the openedprocess.returncode is zero (0)
        if openedprocess.returncode != 0:
            log("R script did not return normally, return code - "
                + str(openedprocess.returncode))
            raise Exception("Rscript failure")

    def load_diff_expr_matrix(self, ballgown_output_dir, output_csv):
        """
        Reads the tab-delimited diff expr matrix file from Ballgown and returns it
        as a dictionary of rows keyed by gene.  Each value is a list of
        length three corresponding to fold_change, pval and qval, in string form
        - values can include 'NA'
        :param ballgown_output_dir
        :param output_csv:
        :return:
        """

        diff_matrix_file = os.path.join(ballgown_output_dir, output_csv)

        if not os.path.isfile(diff_matrix_file):
            raise Exception("differential expression matrix csvfile {0} doesn't exist!".format(
                diff_matrix_file))

        n = 0
        dm = {}
        with open(diff_matrix_file, "r") as csv_file:
            csv_rows = csv.reader(csv_file, delimiter="\t", quotechar='"')
            for row in csv_rows:
                n = n + 1
                if n == 1:
                    if row != ['id', 'fc', 'pval', 'qval']:
                        raise Exception(
                            "did not get expected column heading from {0}".format(
                                diff_matrix_file))
                else:
                    if len(row) != 4:
                        raise Exception(
                            "did not get 4 elements in row {0} of csv file {1} ".format(
                                n, diff_matrix_file))
                    key = row[0]
                    # put in checks for NA or numeric for row[1] through 4
                    if key in dm:
                        raise Exception(
                            "duplicate key {0} in row {1} of csv file {2} ".format(
                                key, n, diff_matrix_file))
                    dm[key] = row[1:4]
        return dm
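
    # Worked example (added, hypothetical values): given a ballgown output file
    #     id      fc      pval    qval
    #     gene1   2.5     0.001   0.01
    # load_diff_expr_matrix returns {'gene1': ['2.5', '0.001', '0.01']}.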

    def _transform_expression_set_data(self, expression_set_data):
        """
        The stitch to connect KBaseSets.ExpressionSet-2.0 type data to
        the older KBaseRNASeq.RNASeqExpressionSet-3.0 that the implementation
        depends on. This is done by doing a dive into the nested alignment
        object ref and getting the required data
        :param expression_set_data:
        :return: transformed expression_set_data
        """
        transform = dict()
        # get genome id
        expression_ref = expression_set_data['items'][0]['ref']
        wsid, objid, ver = expression_ref.split('/')
        expression_obj = self.ws.get_objects([{'objid': objid, 'wsid': wsid}])
        transform['genome_id'] = expression_obj[0]['data']['genome_id']

        # get sampleset_id
        #alignment_ref = expression_obj[0]['data']['mapped_rnaseq_alignment'].values()[0]
        #wsid, objid, ver = alignment_ref.split('/')
        #alignment_obj = self.ws.get_objects([{'objid': objid, 'wsid': wsid}])
        #transform['sampleset_id'] = alignment_obj[0]['data']['sampleset_id']

        # build mapped_expression_ids
        mapped_expression_ids = list()
        for item in expression_set_data['items']:
            expression_ref = item['ref']
            wsid, objid, ver = expression_ref.split('/')
            expression_obj = self.ws.get_objects([{'objid': objid, 'wsid': wsid}])
            # wrap in list() so this also works on Python 3 dict views
            alignment_ref = list(
                expression_obj[0]['data']['mapped_rnaseq_alignment'].values())[0]
            mapped_expression_ids.append({alignment_ref: expression_ref})
        transform['mapped_expression_ids'] = mapped_expression_ids

        return transform
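
    # Sketch (added, hypothetical refs): for a two-sample KBaseSets.ExpressionSet
    # the transformed structure looks like
    #     {'genome_id': '123/4/5',
    #      'mapped_expression_ids': [{'123/10/1': '123/20/1'},
    #                                {'123/11/1': '123/21/1'}]}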

    def _build_condition_label_list(self, mapped_expression_ids):
        """
        Extracts the condition labels from each expression in the specified expression set data
        and builds a list of condition labels
        :param mapped_expression_ids: list of alignment-to-expression mappings
        :return: list of condition labels whose order matches the expression order in
        the expression data
        """
        condition_labels = list()

        for ii in mapped_expression_ids:
            for alignment_id, expression_id in ii.items():
                expression_object = self.ws.get_objects2(
                    {'objects': [{'ref': expression_id}]})['data'][0]
                condition_labels.append(expression_object['data']['condition'])

        return condition_labels

    def _update_output_file_header(self, output_file):
        """
        Modify header of output file (required by DifferentialExpressionUtils)
        :param output_file:
        :return:
        """
        with open(output_file, 'r') as f:
            filedata = f.read()

        modified_output = filedata.replace(
            '"id"\t"fc"\t"pval"\t"qval"',
            'gene_id\tlog2_fold_change\tp_value\tq_value')

        with open(output_file, 'w') as f:
            f.write(modified_output)

    def _check_input_labels(self, condition_pair_subset, available_condition_labels):
        """
        _check_input_labels: check input condition pairs
        """
        checked = True
        # example struct: [{u'condition': u'hy5'}, {u'condition': u'WT'}]
        condition_values = set()
        for condition in condition_pair_subset:
            condition_values.add(condition['condition'])

        if len(condition_values) < 2:
            error_msg = 'At least two unique conditions must be specified. '
            raise ValueError(error_msg)

        for condition in condition_pair_subset:

            label = condition['condition'].strip()
            if label not in available_condition_labels:
                error_msg = 'Condition label "{}" is not a valid condition. '.format(label)
                error_msg += 'Must be one of "{}"'.format(available_condition_labels)
                raise ValueError(error_msg)

        return checked

    def run_ballgown_app(self, params):
        """
        run_ballgown_app: run Ballgown app
        (https://www.bioconductor.org/packages/release/bioc/html/ballgown.html)

        required params:
            expressionset_ref: ExpressionSet object reference
            diff_expression_matrix_set_suffix: suffix appended to the
            KBaseSets.DifferentialExpressionMatrixSet object name
            condition_labels: conditions for expression set object
            alpha_cutoff: q value cutoff
            fold_change_cutoff: fold change cutoff
            workspace_name: the name of the workspace it gets saved to

        optional params:
            fold_scale_type: one of ["linear", "log2+1", "log10+1"]

        return:
            result_directory: folder path that holds all files generated by run_ballgown_app
            diff_expression_matrix_set_ref: generated KBaseSets.DifferentialExpressionMatrixSet
            object reference
            report_name: report name generated by KBaseReport
            report_ref: report reference generated by KBaseReport
        """
        log('--->\nrunning BallgownUtil.run_ballgown_app\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_ballgown_app_params(params)

        expressionset_ref = params.get('expressionset_ref')

        expression_set_info = self.ws.get_object_info3({
            "objects": [{"ref": expressionset_ref}]})['infos'][0]
        expression_object_type = expression_set_info[2]

        # set output object name
        differential_expression_suffix = params['diff_expression_matrix_set_suffix']
        expression_name = expression_set_info[1]
        # check the more specific "_expression_set" suffix first, and use
        # elif/else so an earlier match is not overwritten
        if re.match('.*_[Ee]xpression_[Ss]et$', expression_name):
            params['diff_expression_matrix_set_name'] = re.sub(
                '_[Ee]xpression_[Ss]et$', differential_expression_suffix, expression_name)
        elif re.match('.*_[Ee]xpression$', expression_name):
            params['diff_expression_matrix_set_name'] = re.sub(
                '_[Ee]xpression$', differential_expression_suffix, expression_name)
        else:
            params['diff_expression_matrix_set_name'] = expression_name + \
                differential_expression_suffix

        log('--->\nexpression object type: \n' +
            '{}'.format(expression_object_type))

        if re.match(r'KBaseRNASeq.RNASeqExpressionSet-\d+\.\d+', expression_object_type):
            expression_set_data = self.ws.get_objects2(
                {'objects':
                 [{'ref': expressionset_ref}]})['data'][0]['data']

        elif re.match(r'KBaseSets.ExpressionSet-\d+\.\d+', expression_object_type):
            expression_set_data = self.ws.get_objects2(
                {'objects':
                 [{'ref': expressionset_ref}]})['data'][0]['data']

            expression_set_data = self._transform_expression_set_data(expression_set_data)

        else:
            raise ValueError('Unsupported expression object type: {}'.format(
                expression_object_type))

        mgroup = MultiGroup(self.ws)
        pairwise_mapped_expression_ids = mgroup.build_pairwise_groups(
            expression_set_data['mapped_expression_ids'])

        ballgown_output_dir = os.path.join(self.scratch, "ballgown_out")
        log("ballgown output dir is {0}".format(ballgown_output_dir))
        self._setupWorkingDir(ballgown_output_dir)

        # get set of all condition labels
        available_condition_labels = \
            self._build_condition_label_list(expression_set_data['mapped_expression_ids'])

        if params.get('run_all_combinations'):
            requested_condition_labels = available_condition_labels
        else:
            # get set of user specified condition labels
            condition_pair_subset = params.get('condition_pair_subset')
            if self._check_input_labels(condition_pair_subset, available_condition_labels):
                requested_condition_labels = list()
                # example: [{u'condition': u'hy5'}, {u'condition': u'WT'}]
                for condition in condition_pair_subset:
                    if condition.get('condition').strip() not in requested_condition_labels:
                        requested_condition_labels.append(condition.get('condition').strip())

        log("User requested pairwise combinations from condition label list : " +
            str(requested_condition_labels))

        diff_expr_files = list()

        for mapped_expression_ids in pairwise_mapped_expression_ids:
            print('processing pairwise combination: ')
            pprint(mapped_expression_ids)
            print('with condition labels: ')
            condition_labels = self._build_condition_label_list(mapped_expression_ids)
            pprint(condition_labels)

            # skip if condition labels in this pairwise combination don't exist in
            # set of user requested condition labels
            skip = False
            for condition in condition_labels:
                if condition not in requested_condition_labels:
                    log("skipping " + str(condition_labels))
                    skip = True
            if skip:
                continue

            sample_dir_group_file = self.get_sample_dir_group_file(mapped_expression_ids,
                                                                   condition_labels)

            log("about to run_ballgown_diff_exp")
            rscripts_dir = '/kb/module/rscripts'

            condition_labels_uniqued = list()
            for condition in condition_labels:
                if condition not in condition_labels_uniqued:
                    condition_labels_uniqued.append(condition)

            output_csv = 'ballgown_diffexp_' + \
                condition_labels_uniqued[0] + '_' + condition_labels_uniqued[1] + '.tsv'
            volcano_plot_file = 'volcano_plot_' + \
                condition_labels_uniqued[0] + '_' + condition_labels_uniqued[1] + '.png'

            self.run_ballgown_diff_exp(rscripts_dir,
                                       sample_dir_group_file,
                                       ballgown_output_dir,
                                       output_csv,
                                       volcano_plot_file)

            log("back from run_ballgown_diff_exp, about to load diff exp matrix file")
            # diff_expr_matrix = self.load_diff_expr_matrix(ballgown_output_dir,
            # output_csv)  # read file before its zipped

            self._update_output_file_header(os.path.join(ballgown_output_dir, output_csv))

            diff_expr_file = dict()
            diff_expr_file.update({'condition_mapping':
                                   {condition_labels_uniqued[0]: condition_labels_uniqued[1]}})
            diff_expr_file.update(
                {'diffexpr_filepath': os.path.join(ballgown_output_dir, output_csv)})
            diff_expr_files.append(diff_expr_file)

        deu_param = {
            'destination_ref': params['workspace_name'] + '/' +
            params['diff_expression_matrix_set_name'],
            'diffexpr_data': diff_expr_files,
            'tool_used': TOOL_NAME,
            'tool_version': TOOL_VERSION,
            'genome_ref': expression_set_data.get('genome_id'),
        }

        diff_expression_matrix_set_ref = self.deu.save_differential_expression_matrix_set(
            deu_param)

        returnVal = {'result_directory': ballgown_output_dir,
                     'diff_expression_matrix_set_ref':
                         diff_expression_matrix_set_ref['diffExprMatrixSet_ref']}

        report_output = self._generate_report(params,
                                              ballgown_output_dir, diff_expression_matrix_set_ref)
        returnVal.update(report_output)

        return returnVal
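
# Usage sketch (added; the config values and object reference below are
# hypothetical placeholders — a live KBase environment is required to run this).
def _example_run_ballgown(config):
    ballgown = BallgownUtil(config)
    return ballgown.run_ballgown_app({
        'expressionset_ref': '123/8/9',
        'diff_expression_matrix_set_suffix': '_diff_expression_matrix_set',
        'workspace_name': 'my_workspace',
        'run_all_combinations': 1,
    })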
Example #23
class QualiMapRunner:

    QUALIMAP_PATH = '/kb/module/qualimap-bin/qualimap'
    JAVA_MEM_DEFAULT_SIZE = '16G'
    LARGE_BAM_FILE_SIZE = 20 * 1024 * 1024 * 1024  # 20 GB
    TIMEOUT = 72 * 60 * 60  # 72 hours

    def _get_file_size(self, file_path):
        file_size = os.path.getsize(file_path)
        print('File size: {} -- {}'.format(file_size, file_path))
        return file_size

    def _large_file(self, file_path):

        filename, file_extension = os.path.splitext(file_path)
        multiplier = 0

        if file_extension == '.txt':
            total_file_size = 0
            with open(file_path, 'r') as f:
                for line in f:
                    bam_file_path = line.split('\t')[1]
                    total_file_size += self._get_file_size(bam_file_path)
            print('Total file size: {}'.format(total_file_size))
            # floor division: the multiplier is used as an integer scale factor
            multiplier = int(total_file_size) // int(self.LARGE_BAM_FILE_SIZE)
        else:
            multiplier = int(self._get_file_size(file_path)) // int(
                self.LARGE_BAM_FILE_SIZE)

        print('setting number of windows multiplier to: {}'.format(multiplier))

        return multiplier
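
    # Worked example (added): a single 50 GB BAM gives
    # multiplier = 50 GB // 20 GB = 2, so run_bamqc passes -nw 2 * 400 = 800;
    # files under 20 GB give multiplier 0 and keep QualiMap's default windows.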

    def _timeout_handler(self, signum, frame):
        print('Signal handler called with signal', signum)
        raise ValueError('QualiMap takes too long')

    def __init__(self, scratch_dir, callback_url, workspace_url, srv_wiz_url):
        self.scratch_dir = scratch_dir
        self.rau = ReadsAlignmentUtils(callback_url)
        self.kbr = KBaseReport(callback_url)
        self.dfu = DataFileUtil(callback_url)
        self.gfu = GenomeFileUtil(callback_url)
        self.set_api = SetAPI(srv_wiz_url)
        self.ws = Workspace(workspace_url)
        self.valid_commands = ['bamqc', 'multi-bamqc']

    def run_app(self, params):
        self.validate_params(params)
        print('Validated Params = ')
        pprint(params)
        run_info = self.get_run_info(params)

        if run_info.get('mode') not in ['single', 'multi']:
            raise ValueError(
                'Error in fetching the type to determine run settings.')

        run_error = False
        try:
            signal.signal(signal.SIGALRM, self._timeout_handler)
            signal.alarm(self.TIMEOUT)
            if run_info['mode'] == 'single':
                result = self.run_bamqc(params['input_ref'],
                                        run_info['input_info'])
            elif run_info['mode'] == 'multi':
                result = self.run_multi_sample_qc(params['input_ref'],
                                                  run_info['input_info'])
            signal.alarm(0)
        except Exception:
            run_error = True

            workdir = os.path.join(self.scratch_dir,
                                   'qualimap_' + str(int(time.time() * 10000)))
            os.makedirs(workdir)

            with open(os.path.join(workdir, 'qualimapReport.html'),
                      'w') as report:
                report.write('<html><body><p></p></body></html>')

            package_info = self.package_output_folder(
                workdir, 'QualiMap_report',
                'EMPTY HTML report directory for QualiMap BAM QC',
                'qualimapReport.html')

            result = {
                'qc_result_folder_path': workdir,
                'qc_result_zip_info': package_info,
                'shock_id': None
            }
            error_msg = 'Running QualiMap returned an error:\n{}\n'.format(
                traceback.format_exc())
            error_msg += 'Generating simple report instead\n'
            print(error_msg)

        if params['create_report']:
            result = self.create_report(result, params['output_workspace'],
                                        run_error, params['input_ref'])

        return result

    def create_report(self,
                      result,
                      output_workspace,
                      run_error=None,
                      input_ref=None):

        if run_error:
            objects_created = []
            info = self.get_obj_info(input_ref)
            obj_type = self.get_type_from_obj_info(info)
            if obj_type in ['KBaseRNASeq.RNASeqAlignment']:
                objects_created.append({
                    'ref': input_ref,
                    'description': 'Alignment'
                })

            if obj_type in [
                    'KBaseRNASeq.RNASeqAlignmentSet',
                    'KBaseSets.ReadsAlignmentSet'
            ]:
                objects_created.append({
                    'ref': input_ref,
                    'description': 'AlignmentSet'
                })
                reads_alignment_info = self.get_alignments_from_set(input_ref)
                for alignment in reads_alignment_info:
                    alignment_ref = alignment.get('ref')
                    objects_created.append({
                        'ref': alignment_ref,
                        'description': 'Alignment'
                    })

            report_info = self.kbr.create_extended_report({
                'message': ' ',
                'objects_created': objects_created,
                'report_object_name': 'qualimap_report' + str(uuid.uuid4()),
                'workspace_name': output_workspace
            })
            result['report_name'] = report_info['name']
            result['report_ref'] = report_info['ref']
            return result

        qc_result_zip_info = result['qc_result_zip_info']
        report_info = self.kbr.create_extended_report({
            'message': '',
            'objects_created': [],
            'direct_html_link_index': 0,
            'html_links': [{
                'shock_id': qc_result_zip_info['shock_id'],
                'name': qc_result_zip_info['index_html_file_name'],
                'label': qc_result_zip_info['name']
            }],
            'report_object_name': 'qualimap_report' + str(uuid.uuid4()),
            'workspace_name': output_workspace
        })
        result['report_name'] = report_info['name']
        result['report_ref'] = report_info['ref']
        return result

    def get_gtf_file(self, input_ref, set_op=False):

        print('Start fetching GFF file from genome')

        if set_op:
            set_data = self.set_api.get_reads_alignment_set_v1({
                'ref': input_ref,
                'include_item_info': 1
            })
            input_ref = set_data['data']['items'][0]['ref']

        obj_data = self.dfu.get_objects({"object_refs":
                                         [input_ref]})['data'][0]['data']

        genome_ref = obj_data.get('genome_id')

        if not genome_ref:
            raise ValueError(
                'Alignment is not associated with a Genome object')

        result_directory = os.path.join(self.scratch_dir, str(uuid.uuid4()))
        os.makedirs(result_directory)

        genome_gtf_file = self.gfu.genome_to_gff({
            'genome_ref': genome_ref,
            'is_gtf': True,
            'target_dir': result_directory
        })['file_path']

        return genome_gtf_file

    def run_bamqc(self, input_ref, input_info):
        # download the input and setup a working dir
        alignment_info = self.rau.download_alignment({'source_ref': input_ref})
        bam_file_path = self.find_my_bam_file(
            alignment_info['destination_dir'])
        try:
            gtf_file = self.get_gtf_file(input_ref)
        except Exception:
            gtf_file = ''

        workdir = os.path.join(self.scratch_dir,
                               'qualimap_' + str(int(time.time() * 10000)))

        options = [
            '-bam', bam_file_path, '-c', '-outdir', workdir, '-outformat',
            'html'
        ]

        if gtf_file:
            options += ['-gff', gtf_file]

        options.append('--java-mem-size={}'.format(
            self.JAVA_MEM_DEFAULT_SIZE))  # always use large mem
        multiplier = self._large_file(bam_file_path)
        if multiplier:
            window_size = multiplier * 400
            print('using larger window size: {} and Java memory: {}'.format(
                window_size, self.JAVA_MEM_DEFAULT_SIZE))
            # the command runs with shell=False, so the flag and its value
            # must be separate argv entries
            options.extend(['-nw', str(window_size)])  # increase size of windows

        self.run_cli_command('bamqc', options)

        package_info = self.package_output_folder(
            workdir, 'QualiMap_report',
            'HTML report directory for QualiMap BAM QC', 'qualimapReport.html')

        return {
            'qc_result_folder_path': workdir,
            'qc_result_zip_info': package_info
        }

    def run_multi_sample_qc(self, input_ref, input_info):
        # download the input and setup a working dir
        reads_alignment_info = self.get_alignments_from_set(input_ref)
        try:
            gtf_file = self.get_gtf_file(input_ref, set_op=True)
        except Exception:
            gtf_file = ''
        suffix = 'qualimap_' + str(int(time.time() * 10000))
        workdir = os.path.join(self.scratch_dir, suffix)
        os.makedirs(workdir)

        input_file_path = self.create_multi_qualimap_cfg(
            reads_alignment_info, workdir)

        options = [
            '-d', input_file_path, '-r', '-c', '-outdir', workdir,
            '-outformat', 'html'
        ]

        if gtf_file:
            options += ['-gff', gtf_file]

        multiplier = self._large_file(input_file_path)
        if multiplier:
            window_size = multiplier * 400
            print('using larger window size: {} and Java memory: {}'.format(
                window_size, self.JAVA_MEM_DEFAULT_SIZE))
            # separate argv entries, for the same reason as in run_bamqc
            options.extend(['-nw', str(window_size)])  # increase size of windows
            options.append('--java-mem-size={}'.format(
                self.JAVA_MEM_DEFAULT_SIZE))

        self.run_cli_command('multi-bamqc', options)

        package_info = self.package_output_folder(
            workdir, 'QualiMap_report',
            'HTML report directory for QualiMap Multi-sample BAM QC',
            'multisampleBamQcReport.html')

        return {
            'qc_result_folder_path': workdir,
            'qc_result_zip_info': package_info
        }

    def get_alignments_from_set(self, alignment_set_ref):
        set_data = self.set_api.get_reads_alignment_set_v1({
            'ref': alignment_set_ref,
            'include_item_info': 1
        })
        items = set_data['data']['items']

        reads_alignment_data = []
        for alignment in items:
            alignment_info = self.rau.download_alignment(
                {'source_ref': alignment['ref']})
            bam_file_path = self.find_my_bam_file(
                alignment_info['destination_dir'])
            label = alignment.get('label')
            reads_alignment_data.append({
                'bam_file_path': bam_file_path,
                'ref': alignment['ref'],
                'label': label,
                'info': alignment['info']
            })
        return reads_alignment_data

    def create_multi_qualimap_cfg(self, reads_alignment_info, workdir):
        # Group by labels if there is at least one defined
        use_labels = False
        for alignment in reads_alignment_info:
            if alignment['label']:
                use_labels = True
                break

        # write the file
        input_file_path = os.path.join(workdir, 'multi_input.txt')
        name_lookup = {}
        with open(input_file_path, 'w') as input_file:
            for alignment in reads_alignment_info:
                name = alignment['info'][1]
                if name in name_lookup:
                    name_lookup[name] += 1
                    name = name + '_' + str(name_lookup[name])
                else:
                    name_lookup[name] = 1

                input_file.write(name + '\t' + alignment['bam_file_path'])
                if use_labels:
                    if alignment['label']:
                        input_file.write('\t' + alignment['label'])
                    else:
                        input_file.write('\tunlabeled')
                input_file.write('\n')
        return input_file_path
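
    # Example (added, hypothetical paths): the generated multi_input.txt is the
    # tab-separated layout multi-bamqc expects, one alignment per line:
    #     sample_A    /scratch/a/accepted_hits.bam    treated
    #     sample_B    /scratch/b/accepted_hits.bam    untreated
    # (the third column appears only when at least one label is defined)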

    def get_run_info(self, params):
        info = self.get_obj_info(params['input_ref'])
        obj_type = self.get_type_from_obj_info(info)
        if obj_type in ['KBaseRNASeq.RNASeqAlignment']:
            return {'mode': 'single', 'input_info': info}
        if obj_type in [
                'KBaseRNASeq.RNASeqAlignmentSet', 'KBaseSets.ReadsAlignmentSet'
        ]:
            return {'mode': 'multi', 'input_info': info}
        raise ValueError('Object type of input_ref is not valid, was: ' +
                         str(obj_type))

    def validate_params(self, params):
        if 'input_ref' not in params:
            raise ValueError(
                'required parameter field "input_ref" was not set')

        create_report = False
        if 'create_report' in params:
            if int(params['create_report']) == 1:
                if not params.get('output_workspace'):
                    raise ValueError(
                        'If "create_report" was set, then "output_workspace" is required'
                    )
                create_report = True
        params['create_report'] = create_report

    def run_cli_command(self, command, options, cwd=None):
        if command not in self.valid_commands:
            raise ValueError('Invalid QualiMap command: ' + str(command))
        command = [self.QUALIMAP_PATH, command] + options
        print('Running: ' + ' '.join(command))

        if not cwd:
            cwd = self.scratch_dir

        p = subprocess.Popen(command, cwd=cwd, shell=False)
        exitCode = p.wait()

        if exitCode == 0:
            print('Success, exit code was: ' + str(exitCode))
        else:
            raise ValueError('Error running command: ' + ' '.join(command) +
                             '\n' + 'Exit Code: ' + str(exitCode))

    def find_my_bam_file(self, dirpath):
        bam_path = None
        for f in os.listdir(dirpath):
            fullpath = os.path.join(dirpath, f)
            if os.path.isfile(fullpath) and f.lower().endswith('.bam'):
                if bam_path is not None:
                    raise ValueError(
                        'Error! Too many BAM files were downloaded for this alignment!'
                    )
                bam_path = fullpath
        if bam_path is None:
            raise ValueError(
                'Error! No BAM files were downloaded for this alignment!')
        return bam_path

    def package_output_folder(self, folder_path, zip_file_name,
                              zip_file_description, index_html_file):
        ''' Simple utility for packaging a folder and saving to shock '''
        output = self.dfu.file_to_shock({
            'file_path': folder_path,
            'make_handle': 0,
            'pack': 'zip'
        })
        return {
            'shock_id': output['shock_id'],
            'name': zip_file_name,
            'description': zip_file_description,
            'index_html_file_name': index_html_file
        }

    def get_type_from_obj_info(self, info):
        return info[2].split('-')[0]

    def get_obj_info(self, ref):
        return self.ws.get_object_info3(
            {'objects': [{'ref': ref}]})['infos'][0]
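
# Usage sketch (added; hypothetical service URLs and object reference — a live
# KBase environment is required to actually run this).
def _example_run_qualimap(scratch_dir, callback_url, workspace_url, srv_wiz_url):
    runner = QualiMapRunner(scratch_dir, callback_url, workspace_url, srv_wiz_url)
    return runner.run_app({
        'input_ref': '123/4/5',  # alignment or alignment set ref
        'create_report': 1,
        'output_workspace': 'my_workspace',
    })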
Example #24
class FeatureSetDownload:
    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.gsu = GenomeSearchUtil(os.environ['SDK_CALLBACK_URL'])
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        self.ws = Workspace(config["workspace-url"])

    @staticmethod
    def validate_params(params, expected={"workspace_name", "featureset_name"}):
        expected = set(expected)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))

    def to_tsv(self, params):
        working_dir = os.path.join(self.scratch,
                                   'featureset-download-'+str(uuid.uuid4()))
        os.makedirs(working_dir)
        header = ['Feature Id', 'Aliases', 'Genome', 'Type', 'Function']

        fs_name, fs_dicts = self.make_featureset_dict(params['featureset_ref'])
        files = {'file_path': "{}/{}.tsv".format(working_dir, fs_name)}
        with open(files['file_path'], 'w') as outfile:
            writer = csv.DictWriter(outfile, header, delimiter='\t',
                                    lineterminator='\n')
            writer.writeheader()
            for feat in fs_dicts:
                writer.writerow(feat)
        return fs_name, files

    def make_featureset_dict(self, fs_ref):
        features = []
        ret = self.dfu.get_objects({'object_refs': [fs_ref]})['data'][0]
        feat_set = ret['data']
        fs_name = ret['info'][1]

        feat_by_genome = defaultdict(list)
        for k, v in feat_set['elements'].items():
            feat_by_genome[v[0]].append(k)

        for genome, fids in feat_by_genome.items():
            genome_name = self.ws.get_object_info3({'objects': [{'ref': genome}]})['infos'][0][1]
            res = self.gsu.search({'ref': genome,
                                   'structured_query': {'feature_id': fids},
                                   'sort_by': [['contig_id', 1]],
                                   'start': 0,
                                   'limit': len(fids)
                                   })

            for feat in res['features']:
                features.append({'Feature Id': feat['feature_id'],
                                 'Aliases': ", ".join(feat['aliases'].keys()),
                                 'Genome': "{} ({})".format(genome_name, genome),
                                 'Type': feat['feature_type'],
                                 'Function': feat['function']
                                 })
        return fs_name, features

    def export(self, files, name, params):
        export_package_dir = os.path.join(self.scratch, name+str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        for file in files:
            shutil.move(file, os.path.join(export_package_dir,
                                           os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['featureset_ref']]
        })

        return {'shock_id': package_details['shock_id']}
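
# Usage sketch (added; hypothetical config and ref): to_tsv writes one row per
# feature, export packages the TSV for download.
def _example_featureset_download(config):
    downloader = FeatureSetDownload(config)
    params = {'featureset_ref': '123/4/5', 'workspace_name': 'my_workspace'}
    fs_name, files = downloader.to_tsv(params)
    return downloader.export([files['file_path']], fs_name, params)
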
class FeatureSetBuilder:

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_upload_featureset_from_diff_expr_params(self, params):
        """
        _validate_upload_featureset_from_diff_expr_params:
                validates params passed to upload_featureset_from_diff_expr method
        """

        log('start validating upload_featureset_from_diff_expr params')

        # check for required parameters
        for p in ['diff_expression_ref', 'workspace_name',
                  'p_cutoff', 'q_cutoff', 'fold_change_cutoff']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        p = params.get('fold_scale_type')
        if p and p != 'logarithm':
            raise ValueError('"fold_scale_type" parameter must be set to "logarithm", if used')

    def _generate_report(self, up_feature_set_ref_list, down_feature_set_ref_list,
                         filtered_expression_matrix_ref_list, workspace_name):
        """
        _generate_report: generate summary report
        """

        log('start creating report')

        output_html_files = self._generate_html_report(up_feature_set_ref_list,
                                                       down_feature_set_ref_list)

        objects_created = list()
        for up_feature_set_ref in up_feature_set_ref_list:
            objects_created += [{'ref': up_feature_set_ref,
                                 'description': 'Upper FeatureSet Object'}]
        for down_feature_set_ref in down_feature_set_ref_list:
            objects_created += [{'ref': down_feature_set_ref,
                                 'description': 'Lower FeatureSet Object'}]

        for filtered_expression_matrix_ref in filtered_expression_matrix_ref_list:
            objects_created += [{'ref': filtered_expression_matrix_ref,
                                 'description': 'Filtered ExpressionMatrix Object'}]

        report_params = {'message': '',
                         'workspace_name': workspace_name,
                         'objects_created': objects_created,
                         'html_links': output_html_files,
                         'direct_html_link_index': 0,
                         'html_window_height': 333,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _generate_html_report(self, up_feature_set_ref_list, down_feature_set_ref_list):
        """
        _generate_html_report: generate html summary report
        """

        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        upper_feature_content = ''
        for up_feature_set_ref in up_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2(
                {'objects': [{'ref': up_feature_set_ref}]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']

            feature_set_name = feature_set_info[1]

            elements = feature_set_data.get('elements')
            feature_ids = elements.keys()

            upper_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(feature_set_name,
                                                                              len(feature_ids))

        lower_feature_content = ''
        for down_feature_set_ref in down_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2(
                {'objects': [{'ref': down_feature_set_ref}]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']

            feature_set_name = feature_set_info[1]

            elements = feature_set_data.get('elements')
            feature_ids = elements.keys()

            lower_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(feature_set_name,
                                                                              len(feature_ids))

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<tr><td>Upper_FeatureSet</td></tr>',
                                                          upper_feature_content)

                report_template = report_template.replace('<tr><td>Lower_FeatureSet</td></tr>',
                                                          lower_feature_content)

                result_file.write(report_template)

        html_report.append({'path': result_file_path,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report'})
        return html_report

    def _process_diff_expression(self, diff_expression_set_ref, result_directory,
                                 condition_label_pair):
        """
        _process_diff_expression: process differential expression object info
        """

        log('start processing differential expression object')

        diff_expr_set_data = self.ws.get_objects2(
            {'objects': [{'ref': diff_expression_set_ref}]})['data'][0]['data']

        set_items = diff_expr_set_data['items']

        diff_expr_matrix_file_name = 'gene_results.csv'
        diff_expr_matrix_file = os.path.join(result_directory, diff_expr_matrix_file_name)

        with open(diff_expr_matrix_file, 'w') as csvfile:
            fieldnames = ['gene_id', 'log2_fold_change', 'p_value', 'q_value']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

        # initialize so a missing condition pair fails loudly instead of with a
        # NameError at the return below
        genome_id = None
        selected_diff_expression_ref = None

        for set_item in set_items:
            diff_expression_ref = set_item['ref']

            diff_expression_data = self.ws.get_objects2({'objects':
                                                        [{'ref':
                                                         diff_expression_ref}]})['data'][0]['data']

            label_string = set_item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_1 = label_list[0]
            condition_2 = label_list[1]

            if condition_1 in condition_label_pair and condition_2 in condition_label_pair:
                genome_id = diff_expression_data['genome_ref']
                matrix_data = diff_expression_data['data']
                selected_diff_expression_ref = diff_expression_ref

                with open(diff_expr_matrix_file, 'a') as csvfile:
                    row_ids = matrix_data.get('row_ids')
                    row_values = matrix_data.get('values')
                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

                    for pos, row_id in enumerate(row_ids):
                        row_value = row_values[pos]
                        writer.writerow({'gene_id': row_id,
                                         'log2_fold_change': row_value[0],
                                         'p_value': row_value[1],
                                         'q_value': row_value[2]})

        if selected_diff_expression_ref is None:
            raise ValueError('No differential expression data found for condition pair: '
                             '{}'.format(condition_label_pair))

        return diff_expr_matrix_file, genome_id, selected_diff_expression_ref
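
    # A hypothetical gene_results.csv produced above (illustrative values only),
    # as consumed by _process_matrix_file below:
    #
    #   gene_id,log2_fold_change,p_value,q_value
    #   b0001,2.3,0.001,0.004
    #   b0002,-1.8,0.003,0.010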

    def _generate_feature_set(self, feature_ids, genome_id, workspace_name, feature_set_name):
        """
        _generate_feature_set: generate FeatureSet object

        KBaseCollections.FeatureSet type:
        typedef structure {
            string description;
            list<feature_id> element_ordering;
            mapping<feature_id, list<genome_ref>> elements;
        } FeatureSet;
        """

        log('start saving KBaseCollections.FeatureSet object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        elements = {feature_id: [genome_id] for feature_id in feature_ids}
        feature_set_data = {'description': 'Generated FeatureSet from DifferentialExpression',
                            'element_ordering': feature_ids,
                            'elements': elements}

        object_type = 'KBaseCollections.FeatureSet'
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': object_type,
                         'data': feature_set_data,
                         'name': feature_set_name}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
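        # object_info tuple layout: [objid, name, type, save_date, version,
        # saved_by, wsid, workspace, chsum, size, meta]; the line below builds
        # the canonical wsid/objid/version reference string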
        feature_set_obj_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str(dfu_oi[4])

        return feature_set_obj_ref

    def _process_matrix_file(self, diff_expr_matrix_file, comp_p_value, comp_q_value,
                             comp_fold_change_cutoff):
        """
        _process_matrix_file: filter matrix file by given cutoffs
        """

        log('start processing matrix file')

        up_feature_ids = []
        down_feature_ids = []

        if comp_fold_change_cutoff < 0:
            comp_fold_change_cutoff = -comp_fold_change_cutoff

        with open(diff_expr_matrix_file, 'r') as file:
            reader = csv.DictReader(file)

            for row in reader:
                feature_id = row['gene_id']
                row_p_value = row['p_value']
                row_q_value = row['q_value']
                row_fold_change_cutoff = row['log2_fold_change']

                null_value = set(['NA', 'null', ''])
                col_value = set([row_p_value, row_q_value, row_fold_change_cutoff])

                if not col_value.intersection(null_value):
                    p_value_condition = float(row_p_value) <= comp_p_value
                    q_value_condition = float(row_q_value) <= comp_q_value

                    up_matches_condition = (p_value_condition and q_value_condition and
                                                         (float(row_fold_change_cutoff) >=
                                                         comp_fold_change_cutoff))

                    down_matches_condition = (p_value_condition and q_value_condition and
                                             (float(row_fold_change_cutoff) <=
                                             -comp_fold_change_cutoff))

                    if up_matches_condition:
                        up_feature_ids.append(feature_id)
                    elif down_matches_condition:
                        down_feature_ids.append(feature_id)

        return list(set(up_feature_ids)), list(set(down_feature_ids))
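
    # Worked example of the cutoffs above (hypothetical numbers): with
    # comp_p_value=0.05, comp_q_value=0.05 and comp_fold_change_cutoff=1.0, a row
    # (p=0.01, q=0.02, log2fc=2.3) lands in up_feature_ids, a row (p=0.01,
    # q=0.02, log2fc=-1.5) lands in down_feature_ids, and any row with a value
    # in {'NA', 'null', ''} is skipped.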

    def _filter_expression_matrix(self, expression_matrix_ref, feature_ids,
                                  workspace_name, filtered_expression_matrix_suffix,
                                  diff_expression_matrix_ref):
        """
        _filter_expression_matrix: generate filtered expression matrix
        """

        log('start saving KBaseFeatureValues.ExpressionMatrix object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_matrix_obj = self.dfu.get_objects({'object_refs':
                                                     [expression_matrix_ref]})['data'][0]

        expression_matrix_info = expression_matrix_obj['info']
        expression_matrix_data = expression_matrix_obj['data']

        expression_matrix_name = expression_matrix_info[1]

        if re.match('.*_*[Ee]xpression_*[Mm]atrix', expression_matrix_name):
            filtered_expression_matrix_name = re.sub('_*[Ee]xpression_*[Mm]atrix',
                                                     filtered_expression_matrix_suffix,
                                                     expression_matrix_name)
        else:
            filtered_expression_matrix_name = expression_matrix_name + \
                filtered_expression_matrix_suffix

        filtered_expression_matrix_data = expression_matrix_data.copy()

        data = filtered_expression_matrix_data['data']

        row_ids = data['row_ids']
        values = data['values']
        filtered_data = data.copy()

        filtered_row_ids = list()
        filtered_values = list()
        for pos, row_id in enumerate(row_ids):
            if row_id in feature_ids:
                filtered_row_ids.append(row_id)
                filtered_values.append(values[pos])

        filtered_data['row_ids'] = filtered_row_ids
        filtered_data['values'] = filtered_values

        filtered_expression_matrix_data['data'] = filtered_data

        object_type = 'KBaseFeatureValues.ExpressionMatrix'
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': object_type,
                         'data': filtered_expression_matrix_data,
                         'name': filtered_expression_matrix_name,
                         'extra_provenance_input_refs': [diff_expression_matrix_ref]}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        filtered_expression_matrix_ref = str(
            dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str(dfu_oi[4])

        return filtered_expression_matrix_ref

    def _xor(self, a, b):
        return bool(a) != bool(b)
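
    # Truth table: _xor('a', None) -> True, _xor(None, None) -> False,
    # _xor('a', 'b') -> False. Used in upload_featureset_from_diff_expr to
    # enforce that exactly one of run_all_combinations / condition_pairs is set.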

    def _check_input_labels(self, condition_pairs, available_condition_labels):
        """
        _check_input_labels: check input condition pairs
        """
        checked = True
        for condition_pair in condition_pairs:

            label_string = condition_pair['label_string'][0].strip()
            label_list = [x.strip() for x in label_string.split(',')]
            first_label = label_list[0]
            second_label = label_list[1]

            if first_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(first_label)
                error_msg += 'Available conditions: {}'.format(available_condition_labels)
                raise ValueError(error_msg)

            if second_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(second_label)
                error_msg += 'Available conditions: {}'.format(available_condition_labels)
                raise ValueError(error_msg)

            if first_label == second_label:
                raise ValueError('Input conditions are the same')

        return checked

    def _get_condition_labels(self, diff_expression_set_ref):
        """
        _get_condition_labels: get all possible condition label pairs
        """
        log('getting all possible condition pairs')

        condition_label_pairs = list()
        available_condition_labels = set()
        diff_expression_set_obj = self.ws.get_objects2({'objects':
                                                       [{'ref': diff_expression_set_ref}]
                                                        })['data'][0]
        diff_expression_set_data = diff_expression_set_obj['data']
        items = diff_expression_set_data.get('items')
        for item in items:
            label_string = item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_label_pairs.append(label_list)
            available_condition_labels.update(label_list)

        log('all possible condition pairs:\n{}'.format(condition_label_pairs))

        return condition_label_pairs, available_condition_labels
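
    # For a set whose items carry the labels 'WT, mutant' and 'WT, stress'
    # (hypothetical), this returns ([['WT', 'mutant'], ['WT', 'stress']],
    # {'WT', 'mutant', 'stress'}).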

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.ws = Workspace(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.scratch = config['scratch']

    def upload_featureset_from_diff_expr(self, params):
        """
        upload_featureset_from_diff_expr: create FeatureSet from RNASeqDifferentialExpression
                                          based on given threshold cutoffs

        required params:
        diff_expression_ref: DifferentialExpressionMatrixSet object reference
        expression_matrix_ref: ExpressionMatrix object reference
        p_cutoff: p value cutoff
        q_cutoff: q value cutoff
        fold_scale_type: one of ["linear", "log2+1", "log10+1"]
        fold_change_cutoff: fold change cutoff
        feature_set_suffix: Result FeatureSet object name suffix
        filtered_expression_matrix_suffix: Result ExpressionMatrix object name suffix
        workspace_name: the name of the workspace it gets saved to

        return:
        result_directory: folder path that holds all files generated
        up_feature_set_ref_list: list of generated upper FeatureSet object references
        down_feature_set_ref_list: list of generated down FeatureSet object references
        filtered_expression_matrix_ref_list: list of generated filtered ExpressionMatrix object ref
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """

        self._validate_upload_featureset_from_diff_expr_params(params)

        diff_expression_set_ref = params.get('diff_expression_ref')
        diff_expression_set_info = self.ws.get_object_info3({"objects":
                                                            [{"ref": diff_expression_set_ref}]}
                                                            )['infos'][0]
        diff_expression_set_name = diff_expression_set_info[1]

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        (available_condition_label_pairs,
         available_condition_labels) = self._get_condition_labels(diff_expression_set_ref)

        run_all_combinations = params.get('run_all_combinations')
        condition_pairs = params.get('condition_pairs')
        if not self._xor(run_all_combinations, condition_pairs):
            error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' "
            error_msg += "or provide partial condition pairs. Don't do both or neither"
            raise ValueError(error_msg)

        if run_all_combinations:
            condition_label_pairs = available_condition_label_pairs
        else:
            if self._check_input_labels(condition_pairs, available_condition_labels):
                condition_label_pairs = list()
                for condition_pair in condition_pairs:
                    label_string = condition_pair['label_string'][0].strip()
                    condition_labels = [x.strip() for x in label_string.split(',')]
                    condition_label_pairs.append(condition_labels)

        up_feature_set_ref_list = list()
        down_feature_set_ref_list = list()
        filtered_expression_matrix_ref_list = list()

        for condition_label_pair in condition_label_pairs:
            condition_string = '_' + '_'.join(condition_label_pair)
            diff_expr_matrix_file, genome_id, diff_expr_matrix_ref = self._process_diff_expression(
                                                                diff_expression_set_ref,
                                                                result_directory,
                                                                condition_label_pair)
            up_feature_ids, down_feature_ids = self._process_matrix_file(
                                                                diff_expr_matrix_file,
                                                                params.get('p_cutoff'),
                                                                params.get('q_cutoff'),
                                                                params.get('fold_change_cutoff'))
            filtered_expression_matrix_suffix = condition_string + \
                params.get('filtered_expression_matrix_suffix')
            if params.get('expression_matrix_ref'):
                filtered_expression_matrix_ref = self._filter_expression_matrix(
                                                params.get('expression_matrix_ref'),
                                                up_feature_ids + down_feature_ids,
                                                params.get('workspace_name'),
                                                filtered_expression_matrix_suffix,
                                                diff_expr_matrix_ref)
                filtered_expression_matrix_ref_list.append(filtered_expression_matrix_ref)

            feature_set_suffix = params.get('feature_set_suffix')
            up_feature_set_name = diff_expression_set_name + \
                condition_string + '_up' + feature_set_suffix
            up_feature_set_ref = self._generate_feature_set(up_feature_ids,
                                                            genome_id,
                                                            params.get('workspace_name'),
                                                            up_feature_set_name)
            up_feature_set_ref_list.append(up_feature_set_ref)

            down_feature_set_name = diff_expression_set_name + \
                condition_string + '_down' + feature_set_suffix
            down_feature_set_ref = self._generate_feature_set(down_feature_ids,
                                                              genome_id,
                                                              params.get('workspace_name'),
                                                              down_feature_set_name)
            down_feature_set_ref_list.append(down_feature_set_ref)

        returnVal = {'result_directory': result_directory,
                     'up_feature_set_ref_list': up_feature_set_ref_list,
                     'down_feature_set_ref_list': down_feature_set_ref_list,
                     'filtered_expression_matrix_ref_list': filtered_expression_matrix_ref_list}

        report_output = self._generate_report(up_feature_set_ref_list, down_feature_set_ref_list,
                                              filtered_expression_matrix_ref_list,
                                              params.get('workspace_name'))
        returnVal.update(report_output)

        return returnVal
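
    # A minimal usage sketch, assuming a valid config and a real
    # DifferentialExpressionMatrixSet in the workspace (all values hypothetical):
    #
    #   builder = FeatureSetBuilder(config)
    #   result = builder.upload_featureset_from_diff_expr({
    #       'diff_expression_ref': '1/2/3',
    #       'workspace_name': 'my_workspace',
    #       'p_cutoff': 0.05,
    #       'q_cutoff': 0.05,
    #       'fold_scale_type': 'linear',
    #       'fold_change_cutoff': 1.0,
    #       'feature_set_suffix': '_feature_set',
    #       'filtered_expression_matrix_suffix': '_filtered_matrix',
    #       'run_all_combinations': 1})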
Beispiel #26
0
class QualiMapRunner:

    QUALIMAP_PATH = '/kb/module/qualimap-bin/qualimap'

    def __init__(self, scratch_dir, callback_url, workspace_url, srv_wiz_url):
        self.scratch_dir = scratch_dir
        self.rau = ReadsAlignmentUtils(callback_url)
        self.kbr = KBaseReport(callback_url)
        self.dfu = DataFileUtil(callback_url)
        self.set_api = SetAPI(srv_wiz_url)
        self.ws = Workspace(workspace_url)
        self.valid_commands = ['bamqc', 'multi-bamqc']

    def run_app(self, params):
        self.validate_params(params)
        print('Validated Params = ')
        pprint(params)
        run_info = self.get_run_info(params)

        if run_info['mode'] == 'single':
            result = self.run_bamqc(params['input_ref'],
                                    run_info['input_info'])
        elif run_info['mode'] == 'multi':
            result = self.run_multi_sample_qc(params['input_ref'],
                                              run_info['input_info'])
        else:
            raise ValueError(
                'Error in fetching the type to determine run settings.')

        if params['create_report']:
            result = self.create_report(result, params['output_workspace'])

        return result
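
    # A minimal invocation sketch (ref and workspace hypothetical):
    #
    #   runner = QualiMapRunner(scratch_dir, callback_url, ws_url, srv_wiz_url)
    #   result = runner.run_app({'input_ref': '1/2/3',
    #                            'create_report': 1,
    #                            'output_workspace': 'my_workspace'})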

    def create_report(self, result, output_workspace):
        qc_result_zip_info = result['qc_result_zip_info']
        report_info = self.kbr.create_extended_report({
            'message':
            '',
            'objects_created': [],
            'direct_html_link_index':
            0,
            'html_links': [{
                'shock_id': qc_result_zip_info['shock_id'],
                'name': qc_result_zip_info['index_html_file_name'],
                'label': qc_result_zip_info['name']
            }],
            'report_object_name':
            'qualimap_report' + str(uuid.uuid4()),
            'workspace_name':
            output_workspace
        })
        result['report_name'] = report_info['name']
        result['report_ref'] = report_info['ref']
        return result

    def run_bamqc(self, input_ref, input_info):
        # download the input and setup a working dir
        alignment_info = self.rau.download_alignment({'source_ref': input_ref})
        bam_file_path = self.find_my_bam_file(
            alignment_info['destination_dir'])
        workdir = os.path.join(self.scratch_dir,
                               'qualimap_' + str(int(time.time() * 10000)))

        options = [
            '-bam', bam_file_path, '-outdir', workdir, '-outformat', 'html'
        ]
        self.run_cli_command('bamqc', options)

        package_info = self.package_output_folder(
            workdir, 'QualiMap_report',
            'HTML report directory for QualiMap BAM QC', 'qualimapReport.html')

        return {
            'qc_result_folder_path': workdir,
            'qc_result_zip_info': package_info
        }

    def run_multi_sample_qc(self, input_ref, input_info):
        # download the input and setup a working dir
        reads_alignment_info = self.get_alignments_from_set(input_ref)
        suffix = 'qualimap_' + str(int(time.time() * 10000))
        workdir = os.path.join(self.scratch_dir, suffix)
        os.makedirs(workdir)

        input_file_path = self.create_multi_qualimap_cfg(
            reads_alignment_info, workdir)

        options = [
            '-d', input_file_path, '-r', '-outdir', workdir, '-outformat',
            'html'
        ]
        self.run_cli_command('multi-bamqc', options)

        package_info = self.package_output_folder(
            workdir, 'QualiMap_report',
            'HTML report directory for QualiMap Multi-sample BAM QC',
            'multisampleBamQcReport.html')

        return {
            'qc_result_folder_path': workdir,
            'qc_result_zip_info': package_info
        }

    def get_alignments_from_set(self, alignment_set_ref):
        set_data = self.set_api.get_reads_alignment_set_v1({
            'ref':
            alignment_set_ref,
            'include_item_info':
            1
        })
        items = set_data['data']['items']

        reads_alignment_data = []
        for alignment in items:
            alignment_info = self.rau.download_alignment(
                {'source_ref': alignment['ref']})
            bam_file_path = self.find_my_bam_file(
                alignment_info['destination_dir'])
            label = None
            if 'label' in alignment:
                label = alignment['label']
            reads_alignment_data.append({
                'bam_file_path': bam_file_path,
                'ref': alignment['ref'],
                'label': label,
                'info': alignment['info']
            })
        return reads_alignment_data

    def create_multi_qualimap_cfg(self, reads_alignment_info, workdir):
        # Group by labels if there is at least one defined
        use_labels = False
        for alignment in reads_alignment_info:
            if alignment['label']:
                use_labels = True
                break

        # write the file (one line per alignment, tab-separated)
        input_file_path = os.path.join(workdir, 'multi_input.txt')
        name_lookup = {}
        with open(input_file_path, 'w') as input_file:
            for alignment in reads_alignment_info:
                name = alignment['info'][1]
                if name in name_lookup:
                    name_lookup[name] += 1
                    name = name + '_' + str(name_lookup[name])
                else:
                    name_lookup[name] = 1

                input_file.write(name + '\t' + alignment['bam_file_path'])
                if use_labels:
                    if alignment['label']:
                        input_file.write('\t' + alignment['label'])
                    else:
                        input_file.write('\tunlabeled')
                input_file.write('\n')
        return input_file_path
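
        # The resulting multi_input.txt is tab-separated, one alignment per
        # line; the label column appears only when at least one label is set
        # (sample names and paths hypothetical):
        #
        #   sample_A<tab>/scratch/sample_A.bam<tab>treated
        #   sample_B<tab>/scratch/sample_B.bam<tab>unlabeled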

    def get_run_info(self, params):
        info = self.get_obj_info(params['input_ref'])
        obj_type = self.get_type_from_obj_info(info)
        if obj_type in ['KBaseRNASeq.RNASeqAlignment']:
            return {'mode': 'single', 'input_info': info}
        if obj_type in [
                'KBaseRNASeq.RNASeqAlignmentSet', 'KBaseSets.ReadsAlignmentSet'
        ]:
            return {'mode': 'multi', 'input_info': info}
        raise ValueError('Object type of input_ref is not valid, was: ' +
                         str(obj_type))

    def validate_params(self, params):
        if 'input_ref' not in params:
            raise ValueError(
                'required parameter field "input_ref" was not set')

        create_report = False
        if 'create_report' in params:
            if int(params['create_report']) == 1:
                if 'output_workspace' not in params:
                    raise ValueError(
                        'If "create_report" was set, then "output_workspace" is required'
                    )
                if not params['output_workspace']:
                    raise ValueError(
                        'If "create_report" was set, then "output_workspace" is required'
                    )
                create_report = True
        params['create_report'] = create_report

    def run_cli_command(self, command, options, cwd=None):
        if command not in self.valid_commands:
            raise ValueError('Invalid QualiMap command: ' + str(command))
        command = [self.QUALIMAP_PATH, command] + options
        print('Running: ' + ' '.join(command))

        if not cwd:
            cwd = self.scratch_dir

        p = subprocess.Popen(command, cwd=cwd, shell=False)
        exitCode = p.wait()

        if exitCode == 0:
            print('Success, exit code was: ' + str(exitCode))
        else:
            raise ValueError('Error running command: ' + ' '.join(command) +
                             '\n' + 'Exit Code: ' + str(exitCode))
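
    # For run_bamqc above, the composed command looks like (paths hypothetical):
    #
    #   /kb/module/qualimap-bin/qualimap bamqc -bam /scratch/x.bam \
    #       -outdir /scratch/qualimap_123 -outformat html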

    def find_my_bam_file(self, dirpath):
        bam_path = None
        for f in os.listdir(dirpath):
            fullpath = os.path.join(dirpath, f)
            if os.path.isfile(fullpath) and f.lower().endswith('.bam'):
                if bam_path is not None:
                    raise ValueError(
                        'Error! Too many BAM files were downloaded for this alignment!'
                    )
                bam_path = fullpath
        if bam_path is None:
            raise ValueError(
                'Error! No BAM files were downloaded for this alignment!')
        return bam_path

    def package_output_folder(self, folder_path, zip_file_name,
                              zip_file_description, index_html_file):
        ''' Simple utility for packaging a folder and saving to shock '''
        output = self.dfu.file_to_shock({
            'file_path': folder_path,
            'make_handle': 0,
            'pack': 'zip'
        })
        return {
            'shock_id': output['shock_id'],
            'name': zip_file_name,
            'description': zip_file_description,
            'index_html_file_name': index_html_file
        }

    def get_type_from_obj_info(self, info):
        return info[2].split('-')[0]

    def get_obj_info(self, ref):
        return self.ws.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0]
Beispiel #27
0
class Bowtie2IndexBuilder(object):

    def __init__(self, scratch_dir, ws_url, callback_url, service_wizard_url, provenance):
        self.scratch_dir = scratch_dir
        self.ws_url = ws_url
        self.ws = Workspace(self.ws_url)
        self.callback_url = callback_url
        self.service_wizard_url = service_wizard_url
        self.bowtie2 = Bowtie2Runner(self.scratch_dir)
        self.provenance = provenance


    def get_index(self, params):
        ''' The key function of this module: get a bowtie2 index for the specified input '''

        # validate the parameters and fetch assembly_info
        validated_params = self._validate_params(params)
        assembly_info = self._get_assembly_info(validated_params['ref'])

        # check the cache (keyed off of assembly_info)
        index_info = self._get_cached_index(assembly_info, validated_params)
        if index_info:
            index_info['from_cache'] = 1
            index_info['pushed_to_cache'] = 0
        else:
            # on a cache miss, build the index
            index_info = self._build_index(assembly_info, validated_params)
            index_info['from_cache'] = 0
            # pushed_to_cache will be set in return from _build_index

        index_info['assembly_ref'] = assembly_info['ref']
        index_info['genome_ref'] = assembly_info['genome_ref']

        return index_info
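
    # A minimal usage sketch (ref and cache workspace hypothetical):
    #
    #   builder = Bowtie2IndexBuilder(scratch_dir, ws_url, callback_url,
    #                                 service_wizard_url, provenance)
    #   index_info = builder.get_index({'ref': '1/2/3', 'ws_for_cache': 'my_workspace'})
    #   print(index_info['output_dir'], index_info['from_cache'])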


    def _validate_params(self, params):
        ''' validate parameters; can do some processing here to produce validated params '''
        validated_params = {'ref': None}
        if 'ref' in params and params['ref']:
            validated_params['ref'] = params['ref']
        else:
            raise ValueError('"ref" field indicating either an assembly or genome is required.')

        if 'output_dir' in params:
            validated_params['output_dir'] = params['output_dir']
        else:
            validated_params['output_dir'] = os.path.join(self.scratch_dir,
                                                          'bowtie2_index_' + str(int(time.time() * 100)))

        if os.path.exists(validated_params['output_dir']):
            raise ValueError('Output directory name specified (' + validated_params['output_dir'] +
                             ') already exists. Will not overwrite, so aborting.')

        if 'ws_for_cache' in params and params['ws_for_cache']:
            validated_params['ws_for_cache'] = params['ws_for_cache']
        else:
            print('WARNING: bowtie2 index if created will not be cached because "ws_for_cache" field not set')
            validated_params['ws_for_cache'] = None

        return validated_params


    def _get_assembly_info(self, ref):
        ''' given a ref to an assembly or genome, figure out the assembly and return its info '''
        info = self.ws.get_object_info3({'objects': [{'ref': ref}]})['infos'][0]
        obj_type = info[2]
        if obj_type.startswith('KBaseGenomeAnnotations.Assembly') or obj_type.startswith('KBaseGenomes.ContigSet'):
            return {'info': info, 'ref': ref, 'genome_ref': None}

        if obj_type.startswith('KBaseGenomes.Genome'):
            # we need to get the assembly for this genome
            ga = GenomeAnnotationAPI(self.service_wizard_url)
            assembly_ref = ga.get_assembly({'ref': ref})
            # using the path ensures we can access the assembly even if we don't have direct access
            ref_path = ref + ';' + assembly_ref
            info = self.ws.get_object_info3({'objects': [{'ref': ref_path}]})['infos'][0]
            return {'info': info, 'ref': ref_path, 'genome_ref': ref}

        raise ValueError('Input object was not of type: Assembly, ContigSet or Genome.  Cannot get Bowtie2 Index.')



    def _get_cached_index(self, assembly_info, validated_params):

        try:
            # note: list_reference_objects does not yet support reference paths, so we need to call
            # with the direct reference.  So we won't get a cache hit if you don't have direct access
            # to the assembly object right now (although you can still always build the assembly object)
            # Once this call supports paths, this should be changed to set ref = assembly_info['ref']
            info = assembly_info['info']
            ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
            objs = self.ws.list_referencing_objects([{'ref': ref}])[0]

            # iterate through each of the objects that reference the assembly
            bowtie2_indexes = []
            for o in objs:
                if o[2].startswith('KBaseRNASeq.Bowtie2IndexV2'):
                    bowtie2_indexes.append(o)

            # Nothing refs this assembly, so cache miss
            if len(bowtie2_indexes) == 0:
                return False

            # if there is more than one hit, get the most recent one
            # (obj_info[3] is the save_date timestamp (eg 2017-05-30T22:56:49+0000), so we can sort on that)
            bowtie2_indexes.sort(key=lambda x: x[3])
            bowtie2_index_info = bowtie2_indexes[-1]
            index_ref = str(bowtie2_index_info[6]) + '/' + str(bowtie2_index_info[0]) + '/' + str(bowtie2_index_info[4])

            # get the object data
            index_obj_data = self.ws.get_objects2({'objects': [{'ref': index_ref}]})['data'][0]['data']

            # download the handle object
            os.makedirs(validated_params['output_dir'])

            dfu = DataFileUtil(self.callback_url)
            dfu.shock_to_file({'file_path': os.path.join(validated_params['output_dir'], 'bt2_index.tar.gz'),
                               'handle_id': index_obj_data['handle']['hid'],
                               'unpack': 'unpack'})
            print('Cache hit: ')
            pprint(index_obj_data)
            return {'output_dir': validated_params['output_dir'],
                    'index_files_basename': index_obj_data['index_files_basename']}


        except Exception:
            # if the cache lookup fails, don't worry; just log it and rebuild the index
            print('WARNING: exception encountered when trying to lookup in cache:')
            print(traceback.format_exc())
            print('END WARNING: exception encountered when trying to lookup in cache.')


        return None

    def _put_cached_index(self, assembly_info, index_files_basename, output_dir, ws_for_cache):

        if not ws_for_cache:
            print('WARNING: bowtie2 index cannot be cached because "ws_for_cache" field not set')
            return False

        try:
            dfu = DataFileUtil(self.callback_url)
            result = dfu.file_to_shock({'file_path': output_dir,
                                        'make_handle': 1,
                                        'pack': 'targz'})

            bowtie2_index = {'handle': result['handle'], 'size': result['size'],
                             'assembly_ref': assembly_info['ref'],
                             'index_files_basename': index_files_basename}

            ws = Workspace(self.ws_url)
            save_params = {'objects': [{'hidden': 1,
                                        'provenance': self.provenance,
                                        'name': os.path.basename(output_dir),
                                        'data': bowtie2_index,
                                        'type': 'KBaseRNASeq.Bowtie2IndexV2'
                                        }]
                           }
            if ws_for_cache.strip().isdigit():
                save_params['id'] = int(ws_for_cache)
            else:
                save_params['workspace'] = ws_for_cache.strip()
            save_result = ws.save_objects(save_params)
            print('Bowtie2IndexV2 cached to: ')
            pprint(save_result[0])
            return True

        except Exception:
            # if we fail in saving the cached object, don't worry
            print('WARNING: exception encountered when trying to cache the index files:')
            print(traceback.format_exc())
            print('END WARNING: exception encountered when trying to cache the index files')

        return False


    def _build_index(self, assembly_info, validated_params):
        # get the assembly as a fasta file using AssemblyUtil
        au = AssemblyUtil(self.callback_url)
        fasta_info = au.get_assembly_as_fasta({'ref': assembly_info['ref']})

        # make the target destination folder (check again it wasn't created yet)
        if os.path.exists(validated_params['output_dir']):
            raise ValueError('Output directory name specified (' + validated_params['output_dir'] +
                             ') already exists. Will not overwrite, so aborting.')
        os.makedirs(validated_params['output_dir'])

        # configure the command line args and run it
        cli_params = self._build_cli_params(fasta_info['path'], fasta_info['assembly_name'], validated_params)
        self.bowtie2.run('bowtie2-build', cli_params)
        index_info = {'output_dir': validated_params['output_dir'],
                      'index_files_basename': fasta_info['assembly_name']}

        # cache the result, mark if it worked or not
        cache_success = self._put_cached_index(assembly_info,
                                               fasta_info['assembly_name'],
                                               validated_params['output_dir'],
                                               validated_params['ws_for_cache'])
        if cache_success:
            index_info['pushed_to_cache'] = 1
        else:
            index_info['pushed_to_cache'] = 0

        return index_info


    def _build_cli_params(self, fasta_file_path, index_files_basename, validated_params):
        cli_params = []

        # always run in quiet mode
        cli_params.append('--quiet')

        # positional args: first the fasta path, then the base name used for the index files
        cli_params.append(fasta_file_path)
        cli_params.append(os.path.join(validated_params['output_dir'], index_files_basename))

        return cli_params
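
        # The resulting invocation looks like (paths hypothetical):
        #   bowtie2-build --quiet /scratch/assembly.fa /scratch/bowtie2_index_123/assembly_name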
Beispiel #28
0
    def exec_remove_adapters(self, ctx, params):
        """
        :param params: instance of type "RemoveAdaptersParams" -> structure:
           parameter "output_workspace" of String, parameter
           "output_object_name" of String, parameter "input_reads" of type
           "ws_ref" (@ref ws), parameter "five_prime" of type
           "FivePrimeOptions" (unfortunately, we have to name the fields
           uniquely between 3' and 5' options due to the current
           implementation of grouped parameters) -> structure: parameter
           "adapter_sequence_5P" of String, parameter "anchored_5P" of type
           "boolean" (@range (0, 1)), parameter "three_prime" of type
           "ThreePrimeOptions" -> structure: parameter "adapter_sequence_3P"
           of String, parameter "anchored_3P" of type "boolean" (@range (0,
           1)), parameter "error_tolerance" of Double, parameter
           "min_overlap_length" of Long, parameter "min_read_length" of Long,
           parameter "discard_untrimmed" of type "boolean" (@range (0, 1))
        :returns: instance of type "exec_RemoveAdaptersResult" -> structure:
           parameter "report" of String, parameter "output_reads_ref" of
           String
        """
        # ctx is the context object
        # return variables are: result
        #BEGIN exec_remove_adapters
        console = []
        self.log(console, 'Running exec_remove_adapters() with parameters: ')
        self.log(console, "\n" + pformat(params))
        self.log(console, "-----------------------------------------------\n")
        report = ''
        returnVal = dict()
        returnVal['output_reads_ref'] = None

        token = ctx['token']
        wsClient = workspaceService(self.config['workspace-url'], token=token)
        ws = Workspace(self.config['workspace-url'], token=token)
        #setAPI_Client = SetAPI (url=self.config['SDK_CALLBACK_URL'], token=token) # for SDK local, doesn't work for SetAPI
        setAPI_Client = SetAPI(url=self.config['service-wizard-url'],
                               token=token)  # for dynamic service
        headers = {'Authorization': 'OAuth ' + token}
        env = os.environ.copy()
        env['KB_AUTH_TOKEN'] = token

        # 0. param checks
        required_params = [
            'output_workspace', 'input_reads', 'output_object_name'
        ]
        for arg in required_params:
            if arg not in params or params[arg] is None or params[arg] == '':
                raise ValueError("Must define required param: '" + arg + "'")

        # 1. load provenance
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        # add additional info to provenance here, in this case the input data object reference
        provenance[0]['input_ws_objects'] = [str(params['input_reads'])]

        # 2. Determine whether read library, ReadsSet or RNASeqSampleSet is input object
        #
        try:
            # object_info tuple
            [
                OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
                WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
            ] = range(11)

            input_reads_obj_info = wsClient.get_object_info_new(
                {'objects': [{
                    'ref': params['input_reads']
                }]})[0]
            input_reads_obj_type = input_reads_obj_info[TYPE_I]
            input_reads_obj_type = re.sub(
                r'-[0-9]+\.[0-9]+$', "",
                input_reads_obj_type)  # remove trailing version
            #input_reads_obj_version = input_reads_obj_info[VERSION_I]  # this is object version, not type version
        except Exception as e:
            raise ValueError(
                'Unable to get read library object from workspace: (' +
                str(params['input_reads']) + ')' + str(e))

        acceptable_types = [
            "KBaseSets.ReadsSet", "KBaseRNASeq.RNASeqSampleSet",
            "KBaseFile.PairedEndLibrary", "KBaseFile.SingleEndLibrary",
            "KBaseAssembly.PairedEndLibrary", "KBaseAssembly.SingleEndLibrary"
        ]
        if input_reads_obj_type not in acceptable_types:
            raise ValueError("Input reads of type: '" + input_reads_obj_type +
                             "'.  Must be one of " +
                             ", ".join(acceptable_types))

        # 3. Retrieve the set details
        #
        readsSet_ref_list = []
        readsSet_names_list = []
        readsSet_types_list = []
        if "KBaseSets.ReadsSet" in input_reads_obj_type:
            try:
                input_readsSet_obj = setAPI_Client.get_reads_set_v1({
                    'ref':
                    params['input_reads'],
                    'include_item_info':
                    1
                })

            except Exception as e:
                raise ValueError(
                    'SetAPI FAILURE: Unable to get read library set object from workspace: ('
                    + str(params['input_reads']) + ")\n" + str(e))
            for readsLibrary_obj in input_readsSet_obj['data']['items']:
                readsSet_ref_list.append(readsLibrary_obj['ref'])
                NAME_I = 1
                TYPE_I = 2
                readsSet_names_list.append(readsLibrary_obj['info'][NAME_I])
                this_type = readsLibrary_obj['info'][TYPE_I]
                this_type = re.sub(r'-[0-9]+\.[0-9]+$', "",
                                   this_type)  # remove trailing version
                readsSet_types_list.append(this_type)

        elif "KBaseRNASeq.RNASeqSampleSet" in input_reads_obj_type:
            sample_set = ws.get_objects2(
                {"objects": [{
                    "ref": params['input_reads']
                }]})["data"][0]["data"]
            sample_refs = list()
            for i in range(len(sample_set["sample_ids"])):
                readsSet_ref_list.append(sample_set["sample_ids"][i])
                sample_refs.append({"ref": sample_set["sample_ids"][i]})

            info = ws.get_object_info3({"objects": sample_refs})
            for j in range(len(info["infos"])):
                NAME_I = 1
                TYPE_I = 2
                readsSet_names_list.append(info["infos"][j][NAME_I])
                sample_type = info["infos"][j][TYPE_I]
                sample_type = re.sub(r'-[0-9]+\.[0-9]+$', "",
                                     sample_type)  # remove trailing version
                readsSet_types_list.append(sample_type)
        else:
            readsSet_ref_list = [params['input_reads']]
            readsSet_names_list = [params['output_object_name']]
            readsSet_types_list = [input_reads_obj_type]

        # 4. Iterate through readsLibrary members of set
        #
        report = ''
        cutadapt_readsSet_ref = None
        cutadapt_readsLib_refs = []

        for reads_item_i, input_reads_library_ref in enumerate(
                readsSet_ref_list):
            exec_remove_adapters_OneLibrary_params = {
                'output_workspace': params['output_workspace'],
                'input_reads': input_reads_library_ref,
                'reads_type': readsSet_types_list[reads_item_i]
            }
            if (input_reads_obj_type != "KBaseSets.ReadsSet"
                    and input_reads_obj_type != "KBaseRNASeq.RNASeqSampleSet"):
                exec_remove_adapters_OneLibrary_params[
                    'output_object_name'] = params['output_object_name']
            else:
                exec_remove_adapters_OneLibrary_params[
                    'output_object_name'] = readsSet_names_list[
                        reads_item_i] + "_cutadapt"

            optional_params = [
                'error_tolerance', 'min_overlap_length',
                'min_read_length', 'discard_untrimmed'
            ]
            optional_g_params = {
                'five_prime': ['adapter_sequence_5P', 'anchored_5P'],
                'three_prime': ['adapter_sequence_3P', 'anchored_3P']
            }
            for arg in optional_params:
                if arg in params and params[arg] is not None:
                    exec_remove_adapters_OneLibrary_params[arg] = params[arg]

            for group in optional_g_params:
                if group in params and params[group] is not None:
                    exec_remove_adapters_OneLibrary_params[group] = dict()
                    for arg in optional_g_params[group]:
                        if arg in params[group] and params[group][arg] is not None:
                            exec_remove_adapters_OneLibrary_params[group][
                                arg] = params[group][arg]

            msg = "\n\nRUNNING exec_remove_adapters_OneLibrary() ON LIBRARY: " + str(
                input_reads_library_ref) + " " + str(
                    readsSet_names_list[reads_item_i]) + "\n"
            msg += "----------------------------------------------------------------------------\n"
            report += msg
            self.log(console, msg)

            # RUN
            exec_remove_adapters_OneLibrary_retVal = self.exec_remove_adapters_OneLibrary(
                ctx, exec_remove_adapters_OneLibrary_params)[0]

            report += exec_remove_adapters_OneLibrary_retVal['report'] + "\n\n"
            cutadapt_readsLib_refs.append(
                exec_remove_adapters_OneLibrary_retVal['output_reads_ref'])

        # 5. Conclude
        # Just one Library
        if (input_reads_obj_type != "KBaseSets.ReadsSet"
                and input_reads_obj_type != "KBaseRNASeq.RNASeqSampleSet"):

            # create return output object
            result = {
                'report': report,
                'output_reads_ref': cutadapt_readsLib_refs[0],
            }
        # ReadsSet or SampleSet
        else:
            # save cutadapt readsSet
            some_cutadapt_output_created = False
            items = []
            for i, lib_ref in enumerate(cutadapt_readsLib_refs):

                if lib_ref is None:
                    #items.append(None)  # can't have 'None' items in ReadsSet
                    continue
                else:
                    some_cutadapt_output_created = True
                    try:
                        label = input_readsSet_obj['data']['items'][i]['label']
                    except Exception:
                        NAME_I = 1
                        label = ws.get_object_info3(
                            {'objects': [{
                                'ref': lib_ref
                            }]})['infos'][0][NAME_I]
                    label = label + "_cutadapt"

                    items.append({
                        'ref': lib_ref,
                        'label': label
                        #'data_attachment': ,
                        #'info':
                    })
            if some_cutadapt_output_created:
                reads_desc_ext = " + Cutadapt"
                #reads_name_ext = "_cutadapt"
                descText = ""
                reads_name_ext = ""
                try:
                    descText = input_readsSet_obj['data']['description']
                except Exception:
                    NAME_I = 1
                    descText = ws.get_object_info3(
                        {'objects': [{
                            'ref': params['input_reads']
                        }]})['infos'][0][NAME_I]
                descText = descText + reads_desc_ext

                output_readsSet_obj = {'description': descText, 'items': items}
                output_readsSet_name = str(
                    params['output_object_name']) + reads_name_ext
                cutadapt_readsSet_ref = setAPI_Client.save_reads_set_v1({
                    'workspace_name':
                    params['output_workspace'],
                    'output_object_name':
                    output_readsSet_name,
                    'data':
                    output_readsSet_obj
                })['set_ref']
            else:
                raise ValueError("No cutadapt output created")

            # create return output object
            result = {
                'report': report,
                'output_reads_ref': cutadapt_readsSet_ref
            }
        #END exec_remove_adapters

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method exec_remove_adapters return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]
Beispiel #29
0
def fetch_reads_refs_from_sampleset(ref, ws_url, callback_url, params):
    """
    From the given object ref, return a list of all reads objects that are a part of that
    object. E.g., if ref is a ReadsSet, return a list of all PairedEndLibrary or SingleEndLibrary
    refs that are a member of that ReadsSet. This is returned as a list of dictionaries as follows:
    {
        "ref": reads object reference,
        "condition": condition string associated with that reads object
    }
    Only "ref" is guaranteed; the other keys may or may not be present, depending on the
    reads object or the type of the initial ref. E.g., an RNASeqSampleSet might have condition
    info for each reads object, but a single PairedEndLibrary may not have that info.
    If ref is already a reads library, this just returns a list with ref as its single element.
    """
    obj_type = get_object_type(ref, ws_url)
    ws = Workspace(ws_url)
    refs = list()
    refs_for_ws_info = list()
    if "KBaseSets.ReadsSet" in obj_type:
        print("Looking up reads references in ReadsSet object")
        set_client = SetAPI(callback_url)
        reads_set = set_client.get_reads_set_v1({
            "ref": ref,
            "include_item_info": 0
        })
        for reads in reads_set["data"]["items"]:
            refs.append({"ref": reads["ref"], "condition": reads["label"]})
            refs_for_ws_info.append({'ref': reads['ref']})
    elif "KBaseRNASeq.RNASeqSampleSet" in obj_type:
        print("Looking up reads references in RNASeqSampleSet object")
        sample_set = ws.get_objects2({"objects": [{
            "ref": ref
        }]})["data"][0]["data"]
        for i in range(len(sample_set["sample_ids"])):
            refs.append({
                "ref": sample_set["sample_ids"][i],
                "condition": sample_set["condition"][i]
            })
            refs_for_ws_info.append({'ref': sample_set['sample_ids'][i]})
    elif ("KBaseAssembly.SingleEndLibrary" in obj_type
          or "KBaseFile.SingleEndLibrary" in obj_type
          or "KBaseFile.SingleEndLibrary-2.0" in obj_type
          or "KBaseFile.SingleEndLibrary-2.1" in obj_type
          or "KBaseAssembly.PairedEndLibrary" in obj_type
          or "KBaseFile.PairedEndLibrary" in obj_type
          or "KBaseFile.PairedEndLibrary-2.0" in obj_type
          or "KBaseFile.PairedEndLibrary-2.1" in obj_type):
        refs.append({"ref": ref})
        refs_for_ws_info.append({'ref': ref})
    else:
        raise ValueError("Unable to fetch reads reference from object {} "
                         "which is a {}".format(ref, obj_type))

    # get object info so we can name things properly
    infos = ws.get_object_info3({'objects': refs_for_ws_info})['infos']

    name_ext = '_alignment'
    if ('alignment_suffix' in params
            and params['alignment_suffix'] is not None):
        ext = params['alignment_suffix'].replace(' ', '')
        if ext:
            name_ext = ext

    unique_names = get_unique_names(infos)
    for k in range(0, len(refs)):
        refs[k]['info'] = infos[k]
        name = unique_names[k] + name_ext
        refs[k]['alignment_output_name'] = name

    return refs
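
# A hypothetical return value for a two-sample RNASeqSampleSet:
#
#   [{'ref': '1/2/3', 'condition': 'WT', 'info': [...],
#     'alignment_output_name': 'sample_A_alignment'},
#    {'ref': '1/4/1', 'condition': 'mutant', 'info': [...],
#     'alignment_output_name': 'sample_B_alignment'}]
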
class FeatureSetBuilder:

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_upload_featureset_from_diff_expr_params(self, params):
        """
        _validate_upload_featureset_from_diff_expr_params:
                validates params passed to upload_featureset_from_diff_expr method
        """

        log('start validating upload_featureset_from_diff_expr params')

        # check for required parameters
        for p in ['diff_expression_ref', 'workspace_name',
                  'p_cutoff', 'q_cutoff', 'fold_scale_type', 'fold_change_cutoff']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        fold_scale_type = params.get('fold_scale_type')
        validate_fold_scale_type = ['linear', 'logarithm']
        if fold_scale_type not in validate_fold_scale_type:
            error_msg = 'Input fold scale type value [{}] is not valid'.format(fold_scale_type)
            raise ValueError(error_msg)

    def _generate_report(self, up_feature_set_ref, down_feature_set_ref, 
                         filtered_expression_matrix_ref,
                         up_feature_ids, down_feature_ids, genome_id, workspace_name):
        """
        _generate_report: generate summary report
        """

        log('start creating report')

        output_html_files = self._generate_html_report(up_feature_ids, down_feature_ids, genome_id)
        objects_created = [{'ref': up_feature_set_ref,
                            'description': 'Upper FeatureSet Object'},
                           {'ref': down_feature_set_ref,
                            'description': 'Lower FeatureSet Object'}]

        if filtered_expression_matrix_ref:
            objects_created += [{'ref': filtered_expression_matrix_ref,
                                 'description': 'Filtered ExpressionMatrix Object'}]

        report_params = {'message': '',
                         'workspace_name': workspace_name,
                         'objects_created': objects_created,
                         'html_links': output_html_files,
                         'direct_html_link_index': 0,
                         'html_window_height': 333,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _generate_html_report(self, up_feature_ids, down_feature_ids, genome_id):
        """
        _generate_html_report: generate html summary report
        """

        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        genome_name = self.ws.get_object_info([{"ref": genome_id}],
                                              includeMetadata=None)[0][1]

        reference_genome_info = ''
        reference_genome_info += '{} ({})'.format(genome_name, genome_id)

        upper_feature_ids_content = ''
        for feature_id in up_feature_ids:
            upper_feature_ids_content += '<tr><td>{}</td><td>'.format(feature_id)

        lower_feature_ids_content = ''
        for feature_id in down_feature_ids:
            lower_feature_ids_content += '<tr><td>{}</td><td>'.format(feature_id)

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('Reference_Genome_Info',
                                                          reference_genome_info)

                report_template = report_template.replace('Upper_Filtered_Features',
                                                          str(len(up_feature_ids)))

                report_template = report_template.replace('Lower_Filtered_Features',
                                                          str(len(down_feature_ids)))

                report_template = report_template.replace('<tr><td>Upper_Feature_IDs</td><td>',
                                                          upper_feature_ids_content)

                report_template = report_template.replace('<tr><td>Lower_Feature_IDs</td><td>',
                                                          lower_feature_ids_content)

                result_file.write(report_template)

        html_report.append({'path': result_file_path,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report'})
        return html_report

    def _process_diff_expression(self, diff_expression_set_ref, result_directory):
        """
        _process_diff_expression: process differential expression object info
        """

        log('start processing differential expression object')

        diff_expr_set_data = self.ws.get_objects2(
            {'objects': [{'ref': diff_expression_set_ref}]})['data'][0]['data']

        set_items = diff_expr_set_data['items']

        diff_expr_matrix_file_name = 'gene_results.csv'
        diff_expr_matrix_file = os.path.join(result_directory, diff_expr_matrix_file_name)

        # write the header and all rows through a single text-mode handle so
        # the csv module behaves the same on Python 2 and 3
        fieldnames = ['gene_id', 'log2_fold_change', 'p_value', 'q_value']
        with open(diff_expr_matrix_file, 'w') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            for set_item in set_items:
                diff_expression_ref = set_item['ref']

                diff_expression_data = self.ws.get_objects2(
                    {'objects': [{'ref': diff_expression_ref}]})['data'][0]['data']

                # genome_id is taken from the set items (the last one wins)
                genome_id = diff_expression_data['genome_ref']
                matrix_data = diff_expression_data['data']

                row_ids = matrix_data.get('row_ids')
                row_values = matrix_data.get('values')

                for pos, row_id in enumerate(row_ids):
                    row_value = row_values[pos]
                    writer.writerow({'gene_id': row_id,
                                     'log2_fold_change': row_value[0],
                                     'p_value': row_value[1],
                                     'q_value': row_value[2]})

        return diff_expr_matrix_file, genome_id
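    # For orientation, gene_results.csv produced above has one row per feature
    # across all set items; the values here are made up for illustration:
    #
    #     gene_id,log2_fold_change,p_value,q_value
    #     b0001,2.35,0.0012,0.0048
    #     b0002,-1.87,0.0300,0.0410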

    def _generate_feature_set(self, feature_ids, genome_id, workspace_name, feature_set_name):
        """
        _generate_feature_set: generate FeatureSet object

        KBaseCollections.FeatureSet type:
        typedef structure {
            string description;
            list<feature_id> element_ordering;
            mapping<feature_id, list<genome_ref>> elements;
        } FeatureSet;
        """

        log('start saving KBaseCollections.FeatureSet object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        # build the elements mapping directly; map() is lazy under Python 3
        # and would otherwise never execute the updates
        elements = {feature_id: [genome_id] for feature_id in feature_ids}
        feature_set_data = {'description': 'Generated FeatureSet from DifferentialExpression',
                            'element_ordering': feature_ids,
                            'elements': elements}

        object_type = 'KBaseCollections.FeatureSet'
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': object_type,
                         'data': feature_set_data,
                         'name': feature_set_name}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str(dfu_oi[4])

        return feature_set_obj_ref
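    # The object_info tuple returned by save_objects uses index 6 for wsid,
    # 0 for objid and 4 for version, so the reference built above has the
    # canonical 'wsid/objid/ver' form, e.g. '12345/6/1' (illustrative numbers).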

    def _process_matrix_file(self, diff_expr_matrix_file, comp_p_value, comp_q_value,
                             fold_scale_type, comp_fold_change_cutoff):
        """
        _process_matrix_file: filter matrix file by given cutoffs
        """

        log('start processing matrix file')

        up_feature_ids = []
        down_feature_ids = []

        with open(diff_expr_matrix_file, 'r') as csvfile:
            reader = csv.DictReader(csvfile)

            for row in reader:
                feature_id = row['gene_id']
                row_p_value = row['p_value']
                row_q_value = row['q_value']
                row_fold_change = row['log2_fold_change']

                null_values = {'NA', 'null', ''}
                col_values = {row_p_value, row_q_value, row_fold_change}

                if not col_values.intersection(null_values):
                    p_value_condition = float(row_p_value) <= comp_p_value
                    q_value_condition = float(row_q_value) <= comp_q_value

                    if fold_scale_type == 'linear':
                        row_fold_change = float(row_fold_change)
                        if row_fold_change <= 0:
                            error_msg = 'Invalid fold change value '
                            error_msg += '[{}] for linear scale. \n'.format(row_fold_change)
                            error_msg += 'Linear fold change values must be greater than 0'
                            raise ValueError(error_msg)
                        # convert linear fold change to log2 scale before comparing
                        row_fold_change = math.log(row_fold_change, 2)

                    up_matches_condition = (p_value_condition and q_value_condition and
                                            (float(row_fold_change) >=
                                             comp_fold_change_cutoff))

                    down_matches_condition = (p_value_condition and q_value_condition and
                                              (float(row_fold_change) <=
                                               -comp_fold_change_cutoff))

                    if up_matches_condition:
                        up_feature_ids.append(feature_id)
                    elif down_matches_condition:
                        down_feature_ids.append(feature_id)

        return list(set(up_feature_ids)), list(set(down_feature_ids))
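    # Worked example of the cutoffs above (hypothetical numbers): with
    # comp_p_value = comp_q_value = 0.05 and comp_fold_change_cutoff = 1.0,
    # a 'linear' row with fold change 4.0 is converted to log2(4.0) = 2.0,
    # and 2.0 >= 1.0 counts it as up-regulated; a row already on log scale
    # with value -1.5 satisfies -1.5 <= -1.0 and counts as down-regulated.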

    def _filter_expression_matrix(self, expression_matrix_ref, feature_ids, 
                                  workspace_name, filtered_expression_matrix_suffix):
        """
        _filter_expression_matrix: generate a filtered expression matrix
        """

        log('start saving KBaseFeatureValues.ExpressionMatrix object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_matrix_obj = self.dfu.get_objects(
            {'object_refs': [expression_matrix_ref]})['data'][0]

        expression_matrix_info = expression_matrix_obj['info']
        expression_matrix_data = expression_matrix_obj['data']

        expression_matrix_name = expression_matrix_info[1]

        if re.match('.*_*[Ee]xpression_*[Mm]atrix', expression_matrix_name):
            filtered_expression_matrix_name = re.sub('_*[Ee]xpression_*[Mm]atrix',
                                                     filtered_expression_matrix_suffix,
                                                     expression_matrix_name)
        else:
            filtered_expression_matrix_name = (expression_matrix_name +
                                               filtered_expression_matrix_suffix)

        filtered_expression_matrix_data = expression_matrix_data.copy()

        data = filtered_expression_matrix_data['data']

        row_ids = data['row_ids']
        values = data['values']
        filtered_data = data.copy()

        filtered_row_ids = list()
        filtered_values = list()
        for pos, row_id in enumerate(row_ids):
            if row_id in feature_ids:
                filtered_row_ids.append(row_id)
                filtered_values.append(values[pos])

        filtered_data['row_ids'] = filtered_row_ids
        filtered_data['values'] = filtered_values

        filtered_expression_matrix_data['data'] = filtered_data

        object_type = 'KBaseFeatureValues.ExpressionMatrix'
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': object_type,
                         'data': filtered_expression_matrix_data,
                         'name': filtered_expression_matrix_name}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        filtered_expression_matrix_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str(dfu_oi[4])

        return filtered_expression_matrix_ref
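    # Naming sketch for the branch above (made-up names): an input matrix
    # called 'test_Expression_Matrix' with suffix '_filtered' becomes
    # 'test_filtered' via re.sub; a name without an 'ExpressionMatrix' part
    # simply gets the suffix appended.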

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.ws = Workspace(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.scratch = config['scratch']

    def upload_featureset_from_diff_expr(self, params):
        """
        upload_featureset_from_diff_expr: create FeatureSet from RNASeqDifferentialExpression
                                          based on given threshold cutoffs

        required params:
        diff_expression_ref: DifferentialExpressionMatrixSet object reference
        expression_matrix_ref: ExpressionMatrix object reference (optional; if
                               absent, no filtered matrix is saved)
        p_cutoff: p value cutoff
        q_cutoff: q value cutoff
        fold_scale_type: one of ["linear", "logarithm"]
        fold_change_cutoff: fold change cutoff, compared on log2 scale
        feature_set_suffix: result FeatureSet object name suffix
        filtered_expression_matrix_suffix: result ExpressionMatrix object name suffix
        workspace_name: the name of the workspace it gets saved to

        return:
        result_directory: folder path that holds all files generated
        up_feature_set_ref: generated up-regulated FeatureSet object reference
        down_feature_set_ref: generated down-regulated FeatureSet object reference
        filtered_expression_matrix_ref: generated filtered ExpressionMatrix object reference
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """

        self._validate_upload_featureset_from_diff_expr_params(params)

        diff_expression_set_ref = params.get('diff_expression_ref')
        diff_expression_set_info = self.ws.get_object_info3(
            {"objects": [{"ref": diff_expression_set_ref}]})['infos'][0]
        diff_expression_set_name = diff_expression_set_info[1]

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        diff_expr_matrix_file, genome_id = self._process_diff_expression(
                                                            diff_expression_set_ref,
                                                            result_directory)

        up_feature_ids, down_feature_ids = self._process_matrix_file(
                                                                diff_expr_matrix_file,
                                                                params.get('p_cutoff'),
                                                                params.get('q_cutoff'),
                                                                params.get('fold_scale_type'),
                                                                params.get('fold_change_cutoff'))

        if params.get('expression_matrix_ref'):
            filtered_expression_matrix_ref = self._filter_expression_matrix(
                                                params.get('expression_matrix_ref'),
                                                up_feature_ids + down_feature_ids,
                                                params.get('workspace_name'),
                                                params.get('filtered_expression_matrix_suffix'))
        else:
            filtered_expression_matrix_ref = None

        up_feature_set_ref = self._generate_feature_set(
                            up_feature_ids,
                            genome_id,
                            params.get('workspace_name'),
                            diff_expression_set_name + '_up' + params.get('feature_set_suffix'))

        down_feature_set_ref = self._generate_feature_set(
                            down_feature_ids,
                            genome_id,
                            params.get('workspace_name'),
                            diff_expression_set_name + '_down' + params.get('feature_set_suffix'))

        returnVal = {'result_directory': result_directory,
                     'up_feature_set_ref': up_feature_set_ref,
                     'down_feature_set_ref': down_feature_set_ref,
                     'filtered_expression_matrix_ref': filtered_expression_matrix_ref}

        report_output = self._generate_report(up_feature_set_ref, down_feature_set_ref,
                                              filtered_expression_matrix_ref,
                                              up_feature_ids, down_feature_ids,
                                              genome_id,
                                              params.get('workspace_name'))
        returnVal.update(report_output)

        return returnVal
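A minimal usage sketch for upload_featureset_from_diff_expr; the refs, names and
cutoffs below are placeholders, and the class name is assumed for this snippet
(the snippet itself does not show it):

    util = FeatureSetBuilder(config)  # hypothetical class name and config
    result = util.upload_featureset_from_diff_expr({
        'diff_expression_ref': '123/4/5',    # DifferentialExpressionMatrixSet ref
        'expression_matrix_ref': '123/6/7',  # optional ExpressionMatrix ref
        'p_cutoff': 0.05,
        'q_cutoff': 0.05,
        'fold_scale_type': 'logarithm',
        'fold_change_cutoff': 1.0,
        'feature_set_suffix': '_feature_set',
        'filtered_expression_matrix_suffix': '_filtered_expression_matrix',
        'workspace_name': 'my_workspace'})
    print(result['up_feature_set_ref'], result['down_feature_set_ref'])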
Beispiel #31
0
    def stage_input(self, input_ref, fasta_file_extension):
        '''
        Stage input based on an input data reference for CheckM

        input_ref can be a reference to an Assembly, BinnedContigs, or (not yet implemented) a Genome

        This method creates a directory in the scratch area with the set of Fasta files, names
        will have the fasta_file_extension parameter tacked on.

            ex:

            staged_input = stage_input('124/15/1', 'fna')

            staged_input
            {"input_dir": '...', "folder_suffix": '...', "all_seq_fasta": '...'}
        '''
        # 1) generate a folder in scratch to hold the input
        suffix = str(int(time.time() * 1000))
        input_dir = os.path.join(self.scratch, 'bins_' + suffix)
        all_seq_fasta = os.path.join(self.scratch,
                                     'all_sequences_' + suffix + '.' + fasta_file_extension)


        # 2) based on type, download the files
        ws = Workspace(self.ws_url)
        input_info = ws.get_object_info3({'objects': [{'ref': input_ref}]})['infos'][0]
        # 0 obj_id objid - the numerical id of the object.
        # 1 obj_name name - the name of the object.
        # 2 type_string type - the type of the object.
        # 3 timestamp save_date - the save date of the object.
        # 4 obj_ver ver - the version of the object.
        # 5 username saved_by - the user that saved or copied the object.
        # 6 ws_id wsid - the workspace containing the object.
        # 7 ws_name workspace - the workspace containing the object.
        # 8 string chsum - the md5 checksum of the object.
        # 9 int size - the size of the object in bytes.
        # 10 usermeta meta - arbitrary user-supplied metadata about
        #     the object.
        obj_name = input_info[1]
        type_name = input_info[2].split('-')[0]
        if type_name in ['KBaseGenomeAnnotations.Assembly', 'KBaseGenomes.ContigSet']:
            au = AssemblyUtil(self.callback_url)
            os.makedirs(input_dir)
            filename = os.path.join(input_dir, obj_name + '.' + fasta_file_extension)
            au.get_assembly_as_fasta({'ref': input_ref, 'filename': filename})
            if not os.path.isfile(filename):
                raise ValueError('Error generating fasta file from an Assembly or '
                                 'ContigSet with AssemblyUtil')
        elif type_name == 'KBaseMetagenomes.BinnedContigs':
            # download the bins as fasta and set the input folder name
            au = MetagenomeUtils(self.callback_url)
            bin_file_dir = au.binned_contigs_to_file({'input_ref': input_ref,
                                                      'save_to_shock': 0})['bin_file_directory']
            os.rename(bin_file_dir, input_dir)
            self.set_fasta_file_extensions(input_dir, fasta_file_extension)
        elif type_name == 'KBaseGenomes.Genome':
            raise ValueError('Cannot yet stage fasta file input directory from KBaseGenomes.Genome')
        else:
            raise ValueError('Cannot stage fasta file input directory from type: ' + type_name)


        # create summary fasta file with all bins
        self.cat_fasta_files(input_dir, fasta_file_extension, all_seq_fasta)

        return {'input_dir': input_dir, 'folder_suffix': suffix, 'all_seq_fasta': all_seq_fasta}