def check_contigs_in_bin_cache(self, binnedcontigs_ref, bin_id, token):
        ws = Workspace(self.ws_url, token=token)

        info = ws.get_object_info3({"objects": [{
            "ref": binnedcontigs_ref
        }]})['infos'][0]

        # base64-encode the bin id so it is safe for filenames and still unique per bin
        b64key = base64.urlsafe_b64encode(
            bin_id.encode("utf-8")).decode('utf-8')

        inner_chsum = info[8] + '_' + b64key
        index_file = os.path.join(
            self.metagenome_index_dir,
            inner_chsum + self.CONTIGS_SUFFIX + ".tsv.gz")
        if not os.path.isfile(index_file):
            if self.debug:
                t1 = time.time()

            # get the position in the array of the bin
            binnedcontigs = ws.get_objects2({
                'objects': [{
                    'ref': binnedcontigs_ref,
                    'included': ["/bins/[*]/bid"]
                }]
            })['data'][0]['data']
            pos = 0
            found = False
            for b in binnedcontigs['bins']:
                if b['bid'] == bin_id:
                    found = True
                    break
                pos += 1
            if not found:
                raise ValueError('No Bin with ID: "' + bin_id + '" found.')

            # get the contigs map
            selection = ws.get_objects2({
                'objects': [{
                    'ref':
                    binnedcontigs_ref,
                    'included': [
                        '/bins/' + str(pos) + '/contigs',
                        '/bins/' + str(pos) + '/bid'
                    ]
                }]
            })['data'][0]['data']
            if selection['bins'][0]['bid'] != bin_id:
                raise ValueError(
                    'Something went wrong- bin ids do not match on second ws get_objects2 call'
                )

            contigs = selection['bins'][0]['contigs']
            self.save_contigs_in_bin_tsv(contigs, inner_chsum)
            if self.debug:
                print("    (time=" + str(time.time() - t1) + ")")
        return inner_chsum
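The cache key built above pairs the object checksum (info[8] from get_object_info3) with a URL-safe base64 encoding of the bin id, so an unchanged object and bin always resolve to the same index file. A small standalone sketch of that key construction follows; the checksum, cache directory, and suffix values are made up for illustration.

import base64
import os

bin_id = "Bin.001"                    # illustrative bin id
object_checksum = "abc123"            # stands in for info[8] from get_object_info3
metagenome_index_dir = "/tmp/cache"   # illustrative cache directory
CONTIGS_SUFFIX = "_ctg"               # illustrative suffix

b64key = base64.urlsafe_b64encode(bin_id.encode("utf-8")).decode("utf-8")
inner_chsum = object_checksum + "_" + b64key
index_file = os.path.join(metagenome_index_dir, inner_chsum + CONTIGS_SUFFIX + ".tsv.gz")
print(index_file)  # /tmp/cache/abc123_QmluLjAwMQ==_ctg.tsv.gz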
Example 2
    def check_assembly_cache(self, ref, token):
        ws = Workspace(self.ws_url, token=token)
        info = ws.get_object_info3({"objects": [{"ref": ref}]})['infos'][0]
        inner_chsum = info[8]
        index_file = os.path.join(
            self.assembly_index_dir,
            inner_chsum + self.ASSEMBLY_SUFFIX + ".tsv.gz")
        if not os.path.isfile(index_file):
            if self.debug:
                print("    Loading WS object...")
                t1 = time.time()

            if 'KBaseGenomeAnnotations.Assembly' in info[2]:
                included = ["/contigs"]
                assembly_data = ws.get_objects2(
                    {'objects': [{
                        'ref': ref,
                        'included': included
                    }]})['data'][0]['data']
                contigs = list(assembly_data['contigs'].values())
                self.save_assembly_tsv(contigs, inner_chsum)

            elif 'KBaseGenomes.ContigSet' in info[2]:
                included = [
                    "/contigs/[*]/id", "/contigs/[*]/length",
                    "/contigs/[*]/md5", "/contigs/[*]/description"
                ]
                cs_data = ws.get_objects2(
                    {'objects': [{
                        'ref': ref,
                        'included': included
                    }]})['data'][0]['data']
                contigs = []
                for c in cs_data['contigs']:
                    this_contig_data = {'contig_id': ''}
                    if 'id' in c:
                        this_contig_data['contig_id'] = c['id']
                    if 'md5' in c:
                        this_contig_data['md5'] = c['md5']
                    if 'length' in c:
                        this_contig_data['length'] = c['length']
                    if 'description' in c:
                        this_contig_data['description'] = c['description']
                    contigs.append(this_contig_data)

                self.save_assembly_tsv(contigs, inner_chsum)
            else:
                raise ValueError(
                    'The "ref" is not an Assembly or ContigSet data object.  It was a '
                    + info[2])

            if self.debug:
                print("    (time=" + str(time.time() - t1) + ")")
        return inner_chsum
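The 'included' lists above are Workspace object-subdata selection paths; get_objects2 then returns only those parts of the object. A hedged sketch of the request shape used by the ContigSet branch (the reference is illustrative and no call is issued here):

request = {
    'objects': [{
        'ref': '123/4/5',   # illustrative ContigSet reference
        'included': [
            '/contigs/[*]/id',
            '/contigs/[*]/length',
            '/contigs/[*]/md5',
            '/contigs/[*]/description',
        ],
    }]
}
# ws.get_objects2(request)['data'][0]['data'] would then hold only the selected
# contig fields, which is what the ContigSet loop above iterates over.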
Example 3
    def test_build_hisat2_index_from_assembly_ok(self):
        manager = Hisat2IndexManager(self.wsURL, self.callback_url,
                                     self.scratch)
        ws = Workspace(self.wsURL)
        genome_obj_info = ws.get_objects2({
            'objects': [{
                'ref': self.genome_ref
            }],
            'no_data': 1
        })
        # get the list of genome refs from the returned info.
        # if there are no refs (or something funky with the return), this will be an empty list.
        # this WILL fail if data is an empty list. But it shouldn't be, and we know because
        # we have a real genome reference, or get_objects2 would fail.
        genome_obj_refs = genome_obj_info.get('data', [{}])[0].get('refs', [])

        # see which of those are of an appropriate type (ContigSet or Assembly), if any.
        assembly_ref = list()
        ref_params = [{'ref': x} for x in genome_obj_refs]
        ref_info = ws.get_object_info3({'objects': ref_params})
        for idx, info in enumerate(ref_info.get('infos')):
            if "KBaseGenomeAnnotations.Assembly" in info[
                    2] or "KBaseGenomes.ContigSet" in info[2]:
                assembly_ref.append(";".join(ref_info.get('paths')[idx]))
        assembly_ref = assembly_ref[0]
        idx_prefix = manager.get_hisat2_index(assembly_ref)
        self.assertIn("kb_hisat2_idx", idx_prefix)
Example 4
    def get_fasta_file(self, genome_ref):
        ws = Workspace(self.ws_url)
        # test if genome references an assembly type
        # do get_objects2 without data. get list of refs
        genome_obj_info = ws.get_objects2({
            'objects': [{
                'ref': genome_ref
            }],
            'no_data': 1
        })
        # get the list of genome refs from the returned info.
        # if there are no refs (or something funky with the return), this will be an empty list.
        # this WILL fail if data is an empty list. But it shouldn't be, and we know because
        # we have a real genome reference, or get_objects2 would fail.
        genome_obj_refs = genome_obj_info.get('data', [{}])[0].get('refs', [])

        # see which of those are of an appropriate type (ContigSet or Assembly), if any.
        assembly_ref = list()
        ref_params = [{'ref': x} for x in genome_obj_refs]
        ref_info = ws.get_object_info3({'objects': ref_params})
        for idx, info in enumerate(ref_info.get('infos')):
            if "KBaseGenomeAnnotations.Assembly" in info[
                    2] or "KBaseGenomes.ContigSet" in info[2]:
                assembly_ref.append(";".join(ref_info.get('paths')[idx]))
        # now just get the file.
        au = AssemblyUtil(self.callback_url)
        fasta_file = au.get_assembly_as_fasta({'ref': assembly_ref[0]})
        return fasta_file["path"]
Example 5
def fetch_fasta_from_genome(genome_ref, ws_url, callback_url):
    """
    Returns an assembly or contigset as FASTA.
    """
    if not check_ref_type(genome_ref, ['KBaseGenomes.Genome'], ws_url):
        raise ValueError("The given genome_ref {} is not a KBaseGenomes.Genome type!")
    # test if genome references an assembly type
    # do get_objects2 without data. get list of refs
    ws = Workspace(ws_url)
    genome_obj_info = ws.get_objects2({
        'objects': [{'ref': genome_ref}],
        'no_data': 1
    })
    # get the list of genome refs from the returned info.
    # if there are no refs (or something funky with the return), this will be an empty list.
    # this WILL fail if data is an empty list. But it shouldn't be, and we know because
    # we have a real genome reference, or get_objects2 would fail.
    genome_obj_refs = genome_obj_info.get('data', [{}])[0].get('refs', [])

    # see which of those are of an appropriate type (ContigSet or Assembly), if any.
    assembly_ref = list()
    ref_params = [{'ref': genome_ref + ";" + x} for x in genome_obj_refs]
    ref_info = ws.get_object_info3({'objects': ref_params})
    for idx, info in enumerate(ref_info.get('infos')):
        if "KBaseGenomeAnnotations.Assembly" in info[2] or "KBaseGenomes.ContigSet" in info[2]:
            assembly_ref.append(";".join(ref_info.get('paths')[idx]))

    if len(assembly_ref) == 1:
        return fetch_fasta_from_assembly(assembly_ref[0], ws_url, callback_url)
    else:
        raise ValueError("Multiple assemblies found associated with the given genome ref {}! "
                         "Unable to continue.")
Example 6
    def check_binnedcontigs_cache(self, ref, token):
        ws = Workspace(self.ws_url, token=token)
        info = ws.get_object_info3({"objects": [{"ref": ref}]})['infos'][0]
        inner_chsum = info[8]
        index_file = os.path.join(self.metagenome_index_dir,
                                  inner_chsum + self.BIN_SUFFIX + ".tsv.gz")
        if not os.path.isfile(index_file):
            if self.debug:
                print("    Loading WS object...")
                t1 = time.time()

            included = [
                "/bins/[*]/bid", "/bins/[*]/gc", "/bins/[*]/n_contigs",
                "/bins/[*]/sum_contig_len", "/bins/[*]/cov"
            ]

            binnedcontigs = ws.get_objects2(
                {'objects': [{
                    'ref': ref,
                    'included': included
                }]})['data'][0]['data']
            self.save_binnedcontigs_tsv(binnedcontigs["bins"], inner_chsum)
            if self.debug:
                print("    (time=" + str(time.time() - t1) + ")")
        return inner_chsum
Example 7
    def get_one_genome(self, params, token=None):
        """Fetch a genome using WSLargeDataIO and return it as a python dict"""

        callback_url = os.environ.get('SDK_CALLBACK_URL')
        if callback_url:
            print('fetching genome object using WsLargeDataIO')
            ws_large_data = WsLargeDataIO(callback_url)
            res = ws_large_data.get_objects(params)['data'][0]
            with open(res['data_json_file']) as json_file:
                data = json.load(json_file)
        else:
            print('fetching genome object using Workspace')
            ws_client = Workspace(self.ws_url, token=token)
            data = ws_client.get_objects2(params)["data"][0]["data"]

        return data
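Both branches above accept the same Workspace-style params structure ({'objects': [{'ref': ...}]}). A hedged call sketch with an illustrative reference; a live KBase environment and a configured instance of the class above (here called 'fetcher', a hypothetical name) would be needed to actually run it.

params = {'objects': [{'ref': '123/4/5'}]}   # '123/4/5' is an illustrative genome reference
# genome_dict = fetcher.get_one_genome(params, token=token)
# genome_dict is the genome as a plain Python dict, regardless of which branch served it.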
Example 8
    def search_orthologs_from_pangenome(self, token, ref, query, sort_by,
                                        start, limit, num_found):

        search_object = 'orthologs'
        info_included = [
            'id', 'type', 'function', 'md5', 'protein_translation', 'orthologs'
        ]
        table_indexer = TableIndexer(token, self.ws_url)

        ret = table_indexer.run_search(ref, self.pangenome_index_dir,
                                       self.ORTHOLOGS_SUFFIX, search_object,
                                       info_included, query, sort_by, start,
                                       limit, num_found, self.debug)

        for orthologs in ret['orthologs']:
            orthologs_string = orthologs['orthologs']
            if orthologs_string:
                orthologs['orthologs'] = list(eval(orthologs_string))
                if not isinstance(orthologs['orthologs'][0], list):
                    orthologs['orthologs'] = [orthologs['orthologs']]

        ws = Workspace(self.ws_url, token=token)
        genome_feature_function_map = {}
        for orthologs in ret['orthologs']:
            for orthologs_obj in orthologs['orthologs']:
                gene_id = orthologs_obj[0]

                if gene_id in genome_feature_function_map:
                    orthologs_obj.append(
                        genome_feature_function_map.get(gene_id))
                else:
                    included = ["/features/[*]/function", "/features/[*]/id"]
                    object_info = ws.get_objects2({
                        'objects': [{
                            'ref': orthologs_obj[2],
                            'included': included
                        }]
                    })['data'][0]['data']

                    # Build the id -> function cache with an explicit loop; map() is
                    # lazy in Python 3, so its side effects would never run.
                    for feature in object_info['features']:
                        genome_feature_function_map[feature.get('id')] = feature.get('function')

                    orthologs_obj.append(
                        genome_feature_function_map.get(gene_id))

        return ret
Example 9
    def check_object_cache(self, ref, search_object, info_included,
                           index_dir, object_suffix, debug):
        ws = Workspace(self.ws_url, token=self.token)
        info = ws.get_object_info3({"objects": [{"ref": ref}]})['infos'][0]
        inner_chsum = info[8]
        index_file = os.path.join(index_dir,
                                  inner_chsum + object_suffix + ".tsv.gz")
        if not os.path.isfile(index_file):
            if debug:
                print("    Loading WS object...")
                t1 = time.time()

            included = self.build_info_included(search_object, info_included)
            obj_data = ws.get_objects2({'objects': [{'ref': ref,
                                                     'included': included}]})['data'][0]['data']
            self.save_object_tsv(obj_data[search_object], inner_chsum, info_included,
                                 index_dir, object_suffix)
            if debug:
                print("    (time=" + str(time.time() - t1) + ")")
        return inner_chsum
Example 10
class WorkspaceAdminUtil:
    def __init__(self, config):
        wsurl = config.get('workspace-url')
        self.atoken = config.get('ws-admin-token')
        self.noadmin = False
        if self.atoken is None or self.atoken == '':
            self.noadmin = True
            self.atoken = config['token']
        self.ws = Workspace(wsurl, token=self.atoken)

    def list_objects(self, params):
        """
        Provide something that acts like a standard listObjects
        """
        if self.noadmin:
            return self.ws.list_objects(params)
        return self.ws.administer({'command': 'listObjects', 'params': params})

    def get_objects2(self, params):
        """
        Provide something that acts like a standard getObjects
        """
        if self.noadmin:
            return self.ws.get_objects2(params)
        return self.ws.administer({'command': 'getObjects', 'params': params})

    def get_workspace_info(self, params):
        """
        Provide something that acts like a standard getWorkspaceInfo
        """
        if self.noadmin:
            return self.ws.get_workspace_info(params)
        return self.ws.administer({
            'command': 'getWorkspaceInfo',
            'params': params
        })
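A hedged usage sketch for the wrapper above; the config keys match what __init__ reads, but the URL, tokens, and workspace id are placeholders, so a real workspace deployment and credentials are needed to run it.

config = {
    'workspace-url': 'https://example.org/services/ws',   # placeholder URL
    'ws-admin-token': '',        # empty -> noadmin mode, falls back to 'token'
    'token': '<auth token>',     # placeholder user token
}
admin_ws = WorkspaceAdminUtil(config)
# In noadmin mode this calls ws.list_objects directly; with an admin token it
# routes through ws.administer({'command': 'listObjects', 'params': ...}).
objs = admin_ws.list_objects({'ids': [12345]})   # 12345 is an illustrative workspace id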
Example 11
class AveExpressionMatrixBuilder:

    def _validate_calculate_average_expression_matrix_params(self, params):
        """
        _validate_calculate_average_expression_matrix_params:
                validates params passed to calculate_average_expression_matrix method
        """

        log('start validating calculate_average_expression_matrix params')

        # check for required parameters
        for p in ['expression_matrix_ref', 'output_suffix', 'workspace_name']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

    def _generate_report(self, expression_matrix_ref, workspace_name):
        """
        _generate_report: generate report
        """

        objects_created = [{'ref': expression_matrix_ref,
                            'description': 'Average ExpressionMatrix'}]

        report_params = {'message': '',
                         'workspace_name': workspace_name,
                         'objects_created': objects_created,
                         # 'html_links': output_html_files,
                         # 'direct_html_link_index': 0,
                         'html_window_height': 366,
                         'report_object_name': 'kb_ave_expr_matrix_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _save_expression_matrix(self, em_data, em_obj_name, workspace_name):
        """
        _save_expression_matrix: saving ExpressionMatrix
        """

        try:
            log('saving ExpressionMatrix [{}]'.format(em_obj_name))
        
            data_type = 'KBaseFeatureValues.ExpressionMatrix'
            obj_info = self.dfu.save_objects({'id': self.dfu.ws_name_to_id(workspace_name),
                                              'objects': [{'type': data_type,
                                                           'data': em_data,
                                                           'name': em_obj_name}]})[0]
        except Exception as e:
            log(e)
            raise Exception('Failed Saving ExpressionMatrix to Workspace')

        expression_matrix_ref = str(obj_info[6]) + '/' + str(obj_info[0]) + '/' + str(obj_info[4])

        return expression_matrix_ref

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.ws = Workspace(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.scratch = config['scratch']

    def calculate_average_expression_matrix(self, params):
        """
        calculate_average_expression_matrix: create an average ExpressionMatrix object
                                             from an ExpressionMatrix object

        required params:
        expression_matrix_ref: ExpressionMatrix object reference
        output_suffix: output average ExpressionMatrix name suffix
        workspace_name: the name of the workspace it gets saved to
        
        return:
        average_expression_matrix_ref: generated average ExpressionMatrix object reference
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """

        log('--->\nrunning AveExpressionMatrixBuilder.calculate_average_expression_matrix\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_calculate_average_expression_matrix_params(params)

        expression_matrix_ref = params.get('expression_matrix_ref')
        expression_matrix = self.ws.get_objects2(
            {'objects': [{'ref': expression_matrix_ref}]})['data'][0]

        expression_matrix_data = expression_matrix['data']
        expression_matrix_info = expression_matrix['info']

        condition_map = expression_matrix_data['condition_mapping']

        ori_data = expression_matrix_data['data']
        ori_col_ids = ori_data['col_ids']
        ori_row_ids = ori_data['row_ids']
        ori_values = ori_data['values']

        labels = condition_map.keys()

        if set(labels) != set(ori_col_ids):
            error_msg = 'labels in condition_mapping do not match the matrix columns\n'
            error_msg += 'available labels (col_ids): {}\n'.format(ori_col_ids)
            error_msg += 'labels in condition_mapping: {}'.format(labels)
            raise ValueError(error_msg)

        condition_pos = {}

        for label, condition in condition_map.iteritems():
            if condition not in condition_pos:
                condition_pos.update({condition: [ori_col_ids.index(label)]})
            else:
                condition_list = condition_pos[condition]
                condition_list.append(ori_col_ids.index(label))
                condition_pos.update({condition: condition_list})

        conditions = condition_pos.keys()

        ave_values = []
        for ori_value in ori_values:
            ave_value = [None] * len(conditions)
            for condition, poss in condition_pos.iteritems():
                ave_pos = conditions.index(condition)
                sum_value = 0.0
                for pos in poss:
                    sum_value += round(float(ori_value[pos]), 3) 
                average = sum_value / len(poss)
                ave_value[ave_pos] = round(average, 2)

            ave_values.append(ave_value)

        average_data = {}
        average_data.update({'row_ids': ori_row_ids})
        average_data.update({'col_ids': conditions})
        average_data.update({'values': ave_values})

        em_data = {}
        genome_ref = expression_matrix_data.get('genome_ref')
        if genome_ref:
            em_data.update({'genome_ref': genome_ref})
        em_data.update({'scale': expression_matrix_data.get('scale')})
        em_data.update({'type': expression_matrix_data.get('type')})
        em_data.update({'feature_mapping': expression_matrix_data.get('feature_mapping')})
        em_data.update({'condition_mapping': expression_matrix_data.get('condition_mapping')})
        em_data.update({'data': average_data})

        expression_matrix_name = expression_matrix_info[1]
        ave_expression_matrix_name = expression_matrix_name + params.get('output_suffix')

        workspace_name = params.get('workspace_name')

        ave_expression_matrix_ref = self._save_expression_matrix(em_data, 
                                                                 ave_expression_matrix_name, 
                                                                 workspace_name)

        returnVal = {'average_expression_matrix_ref': ave_expression_matrix_ref}

        report_output = self._generate_report(ave_expression_matrix_ref,
                                              workspace_name)
        returnVal.update(report_output)

        return returnVal
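A self-contained sketch of the per-condition averaging performed by calculate_average_expression_matrix above, on a toy matrix with made-up labels and values; it mirrors the positional grouping via condition_mapping and the round(..., 3) / round(..., 2) steps (written with .items() so it runs under Python 2 or 3).

condition_mapping = {'rep1': 'control', 'rep2': 'control', 'rep3': 'treated'}
col_ids = ['rep1', 'rep2', 'rep3']
values = [[1.0, 2.0, 9.0],
          [2.0, 4.0, 8.0]]

# group column positions by condition, as the method above does
condition_pos = {}
for label, condition in condition_mapping.items():
    condition_pos.setdefault(condition, []).append(col_ids.index(label))

conditions = list(condition_pos.keys())
ave_values = []
for row in values:
    ave_row = [None] * len(conditions)
    for condition, poss in condition_pos.items():
        total = sum(round(float(row[p]), 3) for p in poss)
        ave_row[conditions.index(condition)] = round(total / len(poss), 2)
    ave_values.append(ave_row)

print(conditions)  # e.g. ['control', 'treated']
print(ave_values)  # e.g. [[1.5, 9.0], [3.0, 8.0]]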
Example 12
class DESeqUtil:

    PREPDE_TOOLKIT_PATH = '/kb/deployment/bin/prepDE'

    def _validate_run_deseq2_app_params(self, params):
        """
        _validate_run_deseq2_app_params:
                validates params passed to run_deseq2_app method
        """

        log('start validating run_deseq2_app params')

        # check for required parameters
        for p in [
                'expressionset_ref', 'differential_expression_set_suffix',
                'workspace_name'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _xor(self, a, b):
        return bool(a) != bool(b)

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """
        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if (exitCode == 0):
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)
            raise ValueError(error_msg)

    def _generate_html_report(self, result_directory, diff_expression_obj_ref,
                              params):
        """
        _generate_html_report: generate html summary report
        """

        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')


        result_dirs = os.listdir(result_directory)
        visualization_content = ''
        for result_dir in result_dirs:
            dispersion_plots_name = result_dir + '_dispersion_plots.png'
            dispersion_plots_display_name = '{} {} dispersion plot'.format(
                result_dir.split('_')[0],
                result_dir.split('_')[1])

            shutil.copy2(
                os.path.join(result_directory, result_dir,
                             'deseq2_MAplot.png'),
                os.path.join(output_directory, dispersion_plots_name))
            visualization_content += '<div class="gallery">'
            visualization_content += '<a target="_blank" href="{}">'.format(
                dispersion_plots_name)
            visualization_content += '<img src="{}" '.format(
                dispersion_plots_name)
            visualization_content += 'alt="{}" width="600" height="400">'.format(
                dispersion_plots_display_name)
            visualization_content += '</a><div class="desc">{}</div></div>'.format(
                dispersion_plots_display_name)

            pca_plots_name = result_dir + '_PCA_MAplot.png'
            pca_plots_display_name = '{} {} PCA plot'.format(
                result_dir.split('_')[0],
                result_dir.split('_')[1])

            shutil.copy2(
                os.path.join(result_directory, result_dir, 'PCA_MAplot.png'),
                os.path.join(output_directory, pca_plots_name))
            visualization_content += '<div class="gallery">'
            visualization_content += '<a target="_blank" href="{}">'.format(
                pca_plots_name)
            visualization_content += '<img src="{}" '.format(pca_plots_name)
            visualization_content += 'alt="{}" width="600" height="400">'.format(
                pca_plots_display_name)
            visualization_content += '</a><div class="desc">{}</div></div>'.format(
                pca_plots_display_name)

        diff_expr_set_data = self.ws.get_objects2(
            {'objects': [{
                'ref': diff_expression_obj_ref
            }]})['data'][0]['data']

        items = diff_expr_set_data['items']

        # expression_ref = self.expression_set_data['items'][0]['ref']
        # expression_object = self.ws.get_objects2({'objects':
        #                                          [{'ref': expression_ref}]})['data'][0]
        # expression_data = expression_object['data']
        # genome_ref = expression_data['genome_id']
        # genome_name = self.ws.get_object_info([{"ref": genome_ref}], includeMetadata=None)[0][1]

        # feature_num = self.gsu.search({'ref': genome_ref})['num_found']
        # genome_features = self.gsu.search({'ref': genome_ref,
        #                                    'limit': feature_num,
        #                                    'sort_by': [['feature_id', True]]})['features']
        # feature_ids = []
        # for genome_feature in genome_features:
        #     if not re.match('.*\.\d*', genome_feature.get('feature_id')):
        #         feature_ids.append(genome_feature.get('feature_id'))
        # total_feature_num = len(feature_ids)

        overview_content = ''
        overview_content += '<br/><table><tr><th>Generated DifferentialExpressionMatrixSet'
        overview_content += ' Object</th></tr>'
        overview_content += '<tr><td>{} ({})'.format(
            params.get('diff_expression_obj_name'), diff_expression_obj_ref)
        overview_content += '</td></tr></table>'

        overview_content += '<p><br/></p>'

        overview_content += '<br/><table><tr><th>Generated DifferentialExpressionMatrix'
        overview_content += ' Object</th><th></th><th></th><th></th></tr>'
        overview_content += '<tr><th>Differential Expression Matrix Name</th>'
        # overview_content += '<th>Reference Genome</th>'
        # overview_content += '<th>Reference Genome Feature Count</th>'
        overview_content += '<th>Feature Count</th>'
        overview_content += '</tr>'
        for item in items:
            diff_expr_ref = item['ref']
            diff_expr_object = self.ws.get_objects2(
                {'objects': [{
                    'ref': diff_expr_ref
                }]})['data'][0]

            diff_expr_data = diff_expr_object['data']
            diff_expr_info = diff_expr_object['info']
            diff_expr_name = diff_expr_info[1]
            number_features = len(diff_expr_data['data']['row_ids'])

            overview_content += '<tr><td>{} ({})</td>'.format(
                diff_expr_name, diff_expr_ref)
            # overview_content += '<td>{} ({})</td>'.format(genome_name, genome_ref)
            # overview_content += '<td>{}</td>'.format(total_feature_num)
            overview_content += '<td>{}</td></tr>'.format(number_features)
        overview_content += '</table>'

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', overview_content)
                report_template = report_template.replace(
                    '<p>Visualization_Content</p>', visualization_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id': report_shock_id,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report for DESeq2 App'
        })
        return html_report

    def _generate_output_file_list(self, result_directory):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """

        log('start packing result files')
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'DESeq2_result.zip')
        plot_file = os.path.join(output_directory, 'DESeq2_plot.zip')

        with zipfile.ZipFile(result_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if not (file.endswith('.zip') or file.endswith('.png')
                            or file.endswith('.DS_Store')):
                        zip_file.write(
                            os.path.join(root, file),
                            os.path.join(os.path.basename(root), file))

        output_files.append({
            'path': result_file,
            'name': os.path.basename(result_file),
            'label': os.path.basename(result_file),
            'description': 'File(s) generated by DESeq2 App'
        })

        with zipfile.ZipFile(plot_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if file.endswith('.png'):
                        zip_file.write(
                            os.path.join(root, file),
                            os.path.join(os.path.basename(root), file))

        output_files.append({
            'path': plot_file,
            'name': os.path.basename(plot_file),
            'label': os.path.basename(plot_file),
            'description': 'Visualization plots by DESeq2 App'
        })

        return output_files

    def _generate_report(self, diff_expression_obj_ref, params,
                         result_directory):
        """
        _generate_report: generate summary report
        """

        log('creating report')

        output_files = self._generate_output_file_list(result_directory)

        output_html_files = self._generate_html_report(
            result_directory, diff_expression_obj_ref, params)

        diff_expr_set_data = self.ws.get_objects2(
            {'objects': [{
                'ref': diff_expression_obj_ref
            }]})['data'][0]['data']

        items = diff_expr_set_data['items']

        description_set = 'DifferentialExpressionMatrixSet generated by DESeq2'
        description_object = 'DifferentialExpressionMatrix generated by DESeq2'
        objects_created = []
        objects_created.append({
            'ref': diff_expression_obj_ref,
            'description': description_set
        })

        for item in items:
            diff_expr_ref = item['ref']
            objects_created.append({
                'ref': diff_expr_ref,
                'description': description_object
            })

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'file_links': output_files,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 333,
            'report_object_name': 'kb_deseq2_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _save_count_matrix_file(self, expressionset_ref, result_directory):
        """
        _save_count_matrix_file: download gtf file for each expression
                                 run prepDE.py on them and save the resulting count matrix file
        """

        log('generating count matrix file')

        items = self.expression_set_data['items']

        gtf_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(gtf_directory)

        transcript_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(transcript_directory)

        for item in items:
            expression_ref = item['ref']
            expression_object = self.ws.get_objects2(
                {'objects': [{
                    'ref': expression_ref
                }]})['data'][0]
            expression_data = expression_object['data']
            expression_info = expression_object['info']
            handle_id = expression_data.get('file').get('hid')
            expression_name = expression_info[1]

            tmp_gtf_directory = os.path.join(gtf_directory, expression_name)
            self._mkdir_p(tmp_gtf_directory)

            self.dfu.shock_to_file({
                'handle_id': handle_id,
                'file_path': tmp_gtf_directory,
                'unpack': 'unpack'
            })

            tmp_transcript_directory = os.path.join(transcript_directory,
                                                    expression_name)
            self._mkdir_p(tmp_transcript_directory)

            cp_command = 'cp {} {}'.format(
                os.path.join(tmp_gtf_directory, 'transcripts.gtf'),
                tmp_transcript_directory)

            self._run_command(cp_command)

        self._run_prepDE(result_directory, transcript_directory)

    def _run_prepDE(self, result_directory, input_directory):
        """
        _run_prepDE: run prepDE.py script

        ref: http://ccb.jhu.edu/software/stringtie/index.shtml?t=manual#deseq
        """

        log('generating matrix of read counts')

        command = self.PREPDE_TOOLKIT_PATH + '/prepDE.py '
        command += '-i {} '.format(input_directory)
        command += '-g {} '.format(
            os.path.join(result_directory, 'gene_count_matrix.csv'))
        command += '-t {} '.format(
            os.path.join(result_directory, 'transcript_count_matrix.csv'))

        self._run_command(command)

    def _generate_diff_expression_csv(self, result_directory, alpha_cutoff,
                                      fold_change_cutoff, condition_string):
        """
        _generate_diff_expression_csv: generate differential expression matrix with DESeq2
        """

        result_files = os.listdir(result_directory)
        if 'gene_count_matrix.csv' not in result_files:
            error_msg = 'Missing gene_count_matrix.csv, available files: {}'.format(
                result_files)
            raise ValueError(error_msg)

        rcmd_list = [
            'Rscript',
            os.path.join(os.path.dirname(__file__), 'run_DESeq.R')
        ]
        rcmd_list.extend(['--result_directory', result_directory])
        rcmd_list.extend(['--alpha_cutoff', alpha_cutoff])
        rcmd_list.extend(['--fold_change_cutoff', fold_change_cutoff])
        rcmd_list.extend(['--condition_string', condition_string])

        rcmd_str = " ".join(str(x) for x in rcmd_list)

        self._run_command(rcmd_str)

    def _get_condition_string(self, result_directory, condition_labels):
        """
        _get_condition_string: get condition string corresponding to given condition_labels
        """

        log('generating condition string')

        count_matrix_file = os.path.join(result_directory,
                                         'gene_count_matrix.csv')
        tmp_count_matrix_file = os.path.join(result_directory,
                                             'tmp_gene_count_matrix.csv')

        with open(count_matrix_file, "rb") as f:
            reader = csv.reader(f)
            columns = reader.next()[1:]

        condition_list = [None] * len(columns)

        items = self.expression_set_data.get('items')
        expr_name_condition_mapping = {}
        for item in items:
            expression_ref = item['ref']
            expr_object = self.ws.get_objects2(
                {'objects': [{
                    'ref': expression_ref
                }]})['data'][0]
            expr_data = expr_object['data']
            expr_info = expr_object['info']
            expr_name = expr_info[1]
            expr_condition = expr_data['condition']
            expr_name_list = expr_name_condition_mapping.get(expr_condition)
            if expr_name_list:
                expr_name_list.append(expr_name)
                expr_name_condition_mapping.update(
                    {expr_condition: expr_name_list})
            else:
                expr_name_condition_mapping.update(
                    {expr_condition: [expr_name]})

        for condition_label in condition_labels:
            if condition_label in expr_name_condition_mapping.keys():
                expression_names = expr_name_condition_mapping.get(
                    condition_label)
                for expression_name in expression_names:
                    pos = columns.index(expression_name)
                    condition_list[pos] = condition_label
            else:
                error_msg = 'Condition: {} is not available. '.format(
                    condition_label)
                error_msg += 'Available conditions: {}'.format(
                    expr_name_condition_mapping.keys())
                raise ValueError(error_msg)

        if None in condition_list:
            filtered_pos = [0]
            filtered_condition_list = []
            for condition in condition_list:
                if condition:
                    pos = [
                        i + 1 for i, val in enumerate(condition_list)
                        if val == condition
                    ]
                    filtered_pos += pos
                    filtered_condition_list.append(condition)
            filtered_pos = list(set(filtered_pos))
            with open(count_matrix_file, "rb") as source:
                rdr = csv.reader(source)
                with open(tmp_count_matrix_file, "wb") as result:
                    wtr = csv.writer(result)
                    for r in rdr:
                        wtr.writerow(tuple(list(numpy.array(r)[filtered_pos])))
            os.rename(tmp_count_matrix_file, count_matrix_file)
            condition_string = ','.join(filtered_condition_list)
        else:
            condition_string = ','.join(condition_list)

        return condition_string

    def _save_diff_expression(self, result_directory, params):
        """
        _save_diff_expression: save DifferentialExpression object to workspace
        """

        log('start saving KBaseFeatureValues.DifferentialExpressionMatrix object'
            )

        workspace_name = params.get('workspace_name')
        diff_expression_obj_name = params.get('diff_expression_obj_name')

        destination_ref = workspace_name + '/' + diff_expression_obj_name

        result_dirs = os.listdir(result_directory)

        diff_expr_files = list()

        for result_dir in result_dirs:

            diff_expr_file = dict()
            condition_labels = result_dir.split('_')

            genes_results_filepath = os.path.join(result_directory, result_dir,
                                                  'deseq_results.csv')

            with open(genes_results_filepath, "rb") as f:
                reader = csv.reader(f)
                columns = reader.next()[1:]

            columns[columns.index('log2FoldChange')] = 'log2_fold_change'
            columns[columns.index('pvalue')] = 'p_value'
            columns[columns.index('padj')] = 'q_value'
            for line in fileinput.input(genes_results_filepath, inplace=True):
                if fileinput.isfirstline():
                    print 'gene_id,' + ','.join(columns)
                else:
                    print line,

            reader = csv.DictReader(open(genes_results_filepath))

            diffexpr_filepath = os.path.join(
                result_directory, result_dir,
                'differential_expression_result.csv')
            with open(diffexpr_filepath, 'w') as csvfile:
                fieldnames = [
                    'gene_id', 'log2_fold_change', 'p_value', 'q_value'
                ]
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

                writer.writeheader()

                for row in reader:
                    writer.writerow({
                        'gene_id':
                        row.get('gene_id'),
                        'log2_fold_change':
                        row.get('log2_fold_change'),
                        'p_value':
                        row.get('p_value'),
                        'q_value':
                        row.get('q_value')
                    })

            diff_expr_file.update({
                'condition_mapping': {
                    condition_labels[0]: condition_labels[1]
                }
            })
            diff_expr_file.update({'diffexpr_filepath': diffexpr_filepath})

            diff_expr_files.append(diff_expr_file)

        expression_ref = self.expression_set_data['items'][0]['ref']
        expression_data = self.ws.get_objects2(
            {'objects': [{
                'ref': expression_ref
            }]})['data'][0]['data']
        genome_ref = expression_data['genome_id']

        upload_diff_expr_params = {
            'destination_ref': destination_ref,
            'diffexpr_data': diff_expr_files,
            'tool_used': 'deseq',
            'tool_version': '1.16.1',
            'genome_ref': genome_ref
        }

        deu_upload_return = self.deu.save_differential_expression_matrix_set(
            upload_diff_expr_params)

        diff_expression_obj_ref = deu_upload_return['diffExprMatrixSet_ref']

        return diff_expression_obj_ref

    def _generate_deseq_files(self, result_directory, params):
        """
        _generate_deseq_files: generate DESeq files
        """
        gene_result_file = os.path.join(result_directory,
                                        'gene_count_matrix.csv')
        with open(gene_result_file, "rb") as f:
            reader = csv.reader(f)
            columns = reader.next()[1:]

        for line in fileinput.input(gene_result_file, inplace=True):
            if fileinput.isfirstline():
                print 'gene_id,' + ','.join(columns)
            else:
                print line,

        condition_string = self._get_condition_string(
            result_directory, params.get('condition_labels'))

        self._generate_diff_expression_csv(result_directory,
                                           params.get('alpha_cutoff'),
                                           params.get('fold_change_cutoff'),
                                           condition_string)

    def _get_condition_labels(self):
        """
        _get_condition_labels: get all possible condition label pairs
        """
        log('getting all possible condition pairs')

        condition_label_pairs = []
        items = self.expression_set_data.get('items')
        condition_replicate_name_mapping = {}
        for item in items:
            expression_ref = item['ref']
            expr_object = self.ws.get_objects2(
                {'objects': [{
                    'ref': expression_ref
                }]})['data'][0]
            expr_data = expr_object['data']
            expr_info = expr_object['info']
            expr_name = expr_info[1]
            expr_condition = expr_data['condition']
            expr_name_list = condition_replicate_name_mapping.get(
                expr_condition)
            if expr_name_list:
                expr_name_list.append(expr_name)
                condition_replicate_name_mapping.update(
                    {expr_condition: expr_name_list})
            else:
                condition_replicate_name_mapping.update(
                    {expr_condition: [expr_name]})

        condition_labels = condition_replicate_name_mapping.keys()

        condition_label_pairs = [
            list(pair) for pair in itertools.combinations(condition_labels, 2)
        ]

        log('all possible condition pairs:\n{}'.format(condition_label_pairs))

        return condition_label_pairs, condition_labels

    def _check_input_labels(self, condition_pairs, available_condition_labels):
        """
        _check_input_labels: check input condition pairs
        """
        checked = True
        for condition_pair in condition_pairs:

            first_label = condition_pair['condition_label_1'][0].strip()
            second_label = condition_pair['condition_label_2'][0].strip()
            if first_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(
                    first_label)
                error_msg += 'Available conditions: {}'.format(
                    available_condition_labels)
                raise ValueError(error_msg)

            if second_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(
                    second_label)
                error_msg += 'Available conditions: {}'.format(
                    available_condition_labels)
                raise ValueError(error_msg)

            if first_label == second_label:
                raise ValueError('Input conditions are the same')

        return checked

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)
        self.deu = DifferentialExpressionUtils(self.callback_url,
                                               service_ver='dev')
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)
        self.scratch = config['scratch']

    def run_deseq2_app(self, params):
        """
        run_deseq2_app: run DESeq2 app
        (https://www.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html)

        required params:
            expressionset_ref: ExpressionSet object reference
            differential_expression_set_suffix: DifferentialExpressionMatrixSet object suffix
            workspace_name: the name of the workspace it gets saved to

        optional params:
            run_all_combinations: run all paired condition combinations
            condition_labels: conditions for expression set object
            alpha_cutoff: q value cutoff
            fold_change_cutoff: fold change cutoff
            num_threads: number of threads
            fold_scale_type: one of ["linear", "log2+1", "log10+1"]

        return:
            result_directory: folder path that holds all files generated by run_deseq2_app
            diff_expression_obj_ref: generated RNASeqDifferentialExpression object reference
            report_name: report name generated by KBaseReport
            report_ref: report reference generated by KBaseReport
        """
        log('--->\nrunning DESeqUtil.run_deseq2_app\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_deseq2_app_params(params)

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        expressionset_ref = params.get('expressionset_ref')
        expression_set_obj = self.ws.get_objects2(
            {'objects': [{
                'ref': expressionset_ref
            }]})['data'][0]
        self.expression_set_data = expression_set_obj['data']
        expression_set_name = expression_set_obj['info'][1]

        differential_expression_set_suffix = params.get(
            'differential_expression_set_suffix')
        if re.match('.*_*[Ee]xpression_*[Ss]et', expression_set_name):
            diff_expression_obj_name = re.sub(
                '_*[Ee]xpression_*[Ss]et', differential_expression_set_suffix,
                expression_set_name)
        else:
            diff_expression_obj_name = expression_set_name + differential_expression_set_suffix

        params['diff_expression_obj_name'] = diff_expression_obj_name

        available_condition_label_pairs, available_condition_labels = self._get_condition_labels()

        run_all_combinations = params.get('run_all_combinations')
        condition_pairs = params.get('condition_pairs')
        if not self._xor(run_all_combinations, condition_pairs):
            error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' "
            error_msg += "or provide partial condition pairs. Don't do both"
            raise ValueError(error_msg)

        if run_all_combinations:
            condition_label_pairs = available_condition_label_pairs
        else:
            if self._check_input_labels(condition_pairs,
                                        available_condition_labels):
                condition_label_pairs = list()
                for condition_pair in condition_pairs:
                    condition_labels = [
                        condition_pair.get('condition_label_1')[0].strip(),
                        condition_pair.get('condition_label_2')[0].strip()
                    ]
                    condition_label_pairs.append(condition_labels)

        for condition_label_pair in condition_label_pairs:
            params['condition_labels'] = condition_label_pair

            dir_suffix = '_'.join(condition_label_pair)

            sub_result_directory = os.path.join(result_directory, dir_suffix)
            self._mkdir_p(sub_result_directory)

            # run prepDE.py and save count matrix file
            self._save_count_matrix_file(expressionset_ref,
                                         sub_result_directory)

            self._generate_deseq_files(sub_result_directory, params)

        diff_expression_obj_ref = self._save_diff_expression(
            result_directory, params)

        returnVal = {
            'result_directory': result_directory,
            'diff_expression_obj_ref': diff_expression_obj_ref
        }

        report_output = self._generate_report(diff_expression_obj_ref, params,
                                              result_directory)
        returnVal.update(report_output)

        return returnVal
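A hedged sketch of the params run_deseq2_app expects, matching the required and optional fields in its docstring; the reference, workspace name, and cutoffs are illustrative, and a configured DESeqUtil instance plus a live KBase environment are needed to actually run it.

params = {
    # required
    'expressionset_ref': '123/4/5',                          # illustrative ExpressionSet reference
    'differential_expression_set_suffix': '_deseq2_DEM_set',
    'workspace_name': 'my_workspace',                        # illustrative workspace name
    # optional: pick exactly one of run_all_combinations / condition_pairs (see the _xor check)
    'run_all_combinations': 1,
    'alpha_cutoff': 0.05,
    'fold_change_cutoff': 1.0,
}
# result = deseq_util.run_deseq2_app(params)   # deseq_util: a configured DESeqUtil instance
# result['result_directory'], result['diff_expression_obj_ref'],
# result['report_name'], result['report_ref']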
Example 13
class Service:
    def __init__(self, fba_url, ws_url, ctx):
        self.ws_client = Workspace(ws_url, token=ctx['token'])
        self.fba_client = fba_tools(fba_url)

    def get_object(self, objid, wsid, name=None):
        """
        Returns an object and its associated KBase information

        Returns an ObjectData (dictionary) like what is returned in the workspace service 'get_objects' function:

        /* The data and supplemental info for an object.

            UnspecifiedObject data - the object's data or subset data.
            object_info info - information about the object.
            list<ProvenanceAction> provenance - the object's provenance.
            username creator - the user that first saved the object to the
                workspace.
            timestamp created - the date the object was first saved to the
                workspace.
            list<obj_ref> - the references contained within the object.
            obj_ref copied - the reference of the source object if this object is
                a copy and the copy source exists and is accessible.
                null otherwise.
            boolean copy_source_inaccessible - true if the object was copied from
                another object, but that object is no longer accessible to the
                user. False otherwise.
            mapping<id_type, list<extracted_id>> extracted_ids - any ids extracted
                from the object.
            string handle_error - if an error occurs while setting ACLs on
                embedded handle IDs, it will be reported here.
            string handle_stacktrace - the stacktrace for handle_error.

        */
        typedef structure {
            UnspecifiedObject data;
            object_info info;
            list<ProvenanceAction> provenance;
            username creator;
            timestamp created;
            list<obj_ref> refs;
            obj_ref copied;
            boolean copy_source_inaccessible;
            mapping<id_type, list<extracted_id>> extracted_ids;
            string handle_error;
            string handle_stacktrace;
        } ObjectData;

        :param name: (optional) the name for the object to be retrieved. if included, favored over ID
        :param wsid: the workspace to retrieve the object from
        :param objid: the id of the object to be retrieved

        """
        if name is None:
            result = self.ws_client.get_objects2(
                {'objects': [{
                    'objid': objid,
                    'workspace': wsid
                }]})['data'][0]
        else:
            result = self.ws_client.get_objects2(
                {'objects': [{
                    'name': name,
                    'workspace': wsid
                }]})['data'][0]
        return result['data'], result['info']

    def get_info(self, wsid, objid=None, name=None):
        if name is None:
            return self.ws_client.get_object_info_new(
                {'objects': [{
                    'objid': objid,
                    'workspace': wsid
                }]})[0]
        else:
            return self.ws_client.get_object_info_new(
                {'objects': [{
                    'name': name,
                    'workspace': wsid
                }]})[0]

    def save_object(self, data, type, wsid, objid=None, name=None):
        """
        Saves an object in KBase

        :param data: data representing the object to be saved
        :param type: a string representing the KBase type of the object
        :param wsid: destination workspace
        :param objid: (optional) ID for location of object to be saved (use with care, overwriting/failures are at KBase's
            discretion).
        :param name: (optional) string name for the pbject to be saved
        :return: a list of information about the object as it is stored in KBase
        """
        sv = {u'data': data, u'type': type}
        if objid is not None:
            sv[u'objid'] = objid
        if name is not None:
            sv[u'name'] = name
        info = self.ws_client.save_objects({
            u'workspace': wsid,
            u'objects': [sv]
        })[0]
        return info[0], info[7]
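    # Usage sketch (hedged; workspace and object names are hypothetical). The type
    # string can come from the types() helper used elsewhere in this module:
    #   obj_id, ws_name = service.save_object(model_data, types()['FBAModel'],
    #                                         'my_workspace', name='my_model_copy')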

    def list_objects(self, workspace_id, typestr=None):
        """
        returns a list of all the objects within a workspace in tuples (obj_id, ws_id, object_name)

        :rtype: list
        :param typestr: (optional) if set, lists only objects of this type (filter over default case)
        :param workspace_id: the workspace to list the objects from
        :return: a list of tuples of objects
        """
        objects = self.ws_client.list_objects({'workspaces': [workspace_id]})
        result = list()
        for obj in objects:
            object_type = obj[2]
            if typestr is None or typestr in object_type \
                    or types()[typestr] in object_type:  # type filtering of our list
                result.append((obj[0], obj[6], obj[1], obj[2]))
        return result
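    # Usage sketch (hedged; workspace name and type filter are hypothetical); each
    # entry in the result is an (obj_id, ws_id, name, type) tuple:
    #   for obj_id, ws_id, obj_name, obj_type in service.list_objects('my_workspace', typestr='FBAModel'):
    #       print(obj_id, obj_name, obj_type)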

    def clear_workspace(self, workspace_id):
        """
        clear all objects in a workspace (except for a Narrative object if applicable)
        :param workspace_id: workspace to clear
        :return: None
        """
        object_ids = [{
            'objid': info[0],
            'wsid': workspace_id
        } for info in self.ws_client.list_objects({'ids': [workspace_id]})
                      if not info[2].startswith('KBaseNarrative')]
        if len(object_ids) > 0:
            self.ws_client.delete_objects(object_ids)

    def delete_objects(self, object_tuples):
        """
        delete objects
        :param object_tuples: list of tuples representing objects to delete of the form (obj_id, ws_id)
        :return: None
        """
        object_ids = [{
            'objid': info[0],
            'wsid': info[1]
        } for info in object_tuples]
        if len(object_ids) > 0:
            self.ws_client.delete_objects(object_ids)

    def copy_object(self, from_tuple, to_tuple):
        """
        Copies an object in the service to another location in the service

        :param from_tuple: (objid, wsid) of the object to be copied
        :param to_tuple: (name, wsid) of the destination. workspace may differ. NOTE NAME IS A STRING
        :return: a tuple with information on the new objectmodel
        """
        info = self.ws_client.copy_object({
            'from': {
                'workspace': from_tuple[1],
                'objid': from_tuple[0]
            },
            'to': {
                'workspace': to_tuple[1],
                'name': to_tuple[0]
            }
        })
        return info[0], info[7]
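    # Usage sketch (hedged; ids and workspace names are hypothetical); note the source
    # is addressed by numeric object id and the destination by string name:
    #   new_id, new_ws = service.copy_object((12, 'source_ws'), ('model_copy', 'dest_ws'))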

    def gapfill_model(self, model, media, workspace=None):
        """

        :param model: FBAModel to gapfill
        :param media: Media to gapfill the model to
        :param workspace: destination workspace for new model and gapfill object
        :param name: (optional) name for new model. KBase will overwrite original if left unspecified.
        :return: the information for a new gap-filled model
        """
        if workspace is None:
            workspace = model.workspace_id
        params = {
            u'fbamodel_id': str(model.object_id),
            u'fbamodel_workspace': str(model.workspace_id),
            u'fbamodel_output_id': str(model.name),
            u'workspace': workspace,
            u'media_id': media.object_id,
            u'media_workspace': media.workspace_id,
            u'comprehensive_gapfill': False
        }
        self.fba_client.gapfill_metabolic_model(params)
        return model.object_id, model.workspace_id
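    # Usage sketch (hedged; assumes 'model' and 'media' are wrapper objects exposing
    # object_id, workspace_id and name, as used throughout this class):
    #   obj_id, ws_id = service.gapfill_model(model, media)
    #   obj_id, ws_id = service.gapfill_model(model, media, workspace='scratch_ws')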

    def _gapfill_solution(self, fba):
        """
            If this FBA was created as a gapfilling solution, then this returns a list of reactions to be added/adjusted
            :return: list(tuple) (rxn_id, direction, etc.)
            """
        # For now, naively assume first = best = only gap-filling solution
        solutions = fba['gapfillingSolutions']
        if len(solutions) < 1:
            raise ValueError("This is not a gapfilling solution")
        gsol = solutions[0]['gapfillingSolutionReactions']
        result = []
        for r in gsol:
            reaction_id = r['reaction_ref'].split('/')[-1] + '_' + \
                          r['compartment_ref'].split('/')[-1] + str(r['compartmentIndex'])
            direction = r['direction']
            result.append((reaction_id, direction))
        return result

    def fba_formulation(self, media):
        return {
            u'media': str(media.object_id),
            u'media_workspace': str(media.workspace_id)
        }

    def runfba(self, model, media, workspace=None):
        """
        runs Flux Balance Analysis on an FBAModel in the fba modeling service

        :param model: FBAModel to run flux balance analysis on
        :param media: Media to run FBA with
        :param workspace: (optional) workspace for the FBA object to be left in, default is model workspace
        :return: tuple identity of the FBA stored in the service
        """
        if workspace is None:
            workspace = model.workspace_id
        fba_params = {
            u'workspace': workspace,
            u'fbamodel_id': model.object_id,
            u'fbamodel_workspace': model.workspace_id,
            u'media_workspace': str(media.workspace_id),
            u'media_id': str(media.object_id),
            u'fba_output_id': model.name + '_fba'
        }
        info = self.fba_client.run_flux_balance_analysis(fba_params)
        obj_id = info['new_fba_ref'].split('/')[1]
        return obj_id, workspace
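    # Usage sketch (hedged; same wrapper-object assumption as above); the FBA result
    # is saved as '<model name>_fba' in the chosen workspace:
    #   fba_id, fba_ws = service.runfba(model, media)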

    def runfva(self, model, media, workspace=None):
        """
        runs Flux Balance Analysis on an FBAModel in the fba modeling service

        :param model: FBAModel to run flux balance analysis on
        :param media: Media to run FBA with
        :param workspace: (optional) workspace for the FBA object to be left in, default is model workspace
        :return: tuple identity of the FBA stored in the service
        """
        if workspace is None:
            workspace = model.workspace_id
        fba_params = {
            u'workspace': workspace,
            u'model': model.object_id,
            u'model_workspace': model.workspace_id,
            u'formulation': self.fba_formulation(media),
            u'fva': True
        }
        info = self.fba_client.runfba(fba_params)
        obj_id = info['new_fba_ref'].split('/')[1]
        return obj_id, workspace

    def translate_model(self, src_model, protcomp, workspace=None):
        """
        Uses the service to translate an FBAModel to a close genome relative

        :param protcomp: ProteomeComparison with source and target Genome
        :param src_model: FBAModel of source
        :param workspace: (optional) destination workspace, default is src_model.workspace_id
        :return: tuple identity of the translated model stored in the service
        """
        if workspace is None:
            workspace = src_model.workspace_id
        trans_params = {
            u'keep_nogene_rxn': 1,
            u'proteincomparison_id': protcomp.object_id,
            u'proteincomparison_workspace': protcomp.workspace_id,
            u'fbamodel_id': src_model.object_id,
            u'fbamodel_output_id': 'translated_' + src_model.name,
            u'fbamodel_workspace': src_model.workspace_id,
            u'workspace': workspace
        }
        info = self.fba_client.propagate_model_to_new_genome(trans_params)
        obj_id = info['new_fbamodel_ref'].split('/')[1]
        return obj_id, workspace

    def reconstruct_genome(self, genome, workspace=None):
        """
        Reconstructs a genome and returns the identity of a stored draft recon model (FBAModel)
        :param workspace: (optional) destination workspace. Default is genome.workspace_id
        :param genome: Genome to draft a reconstruction for
        :return: tuple identity of the draft model stored in the service (FBAModel)
        """
        if workspace is None:
            workspace = genome.workspace_id
        recon_params = {
            u'genome_id': genome.object_id,
            u'genome_workspace': genome.workspace_id,
            u'fbamodel_output_id': 'recon_' + genome.name,
            u'gapfill_model': False,  # TODO parameterize as option
            u'workspace': workspace
        }
        info = self.fba_client.build_metabolic_model(recon_params)
        # references returned here are sometimes inconsistent with other fba_tools APIs. Fetch obj info from ws service
        obj_name = info['new_fbamodel_ref'].split('/')[1]
        try:
            return int(obj_name), workspace
        except ValueError:
            ws_object_info = self.ws_client.get_object_info_new(
                {'objects': [{
                    'name': obj_name,
                    'workspace': workspace
                }]})[0]
            return ws_object_info[0], workspace
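    # Usage sketch (hedged; assumes 'genome' is a wrapper object exposing object_id,
    # workspace_id and name); the draft model is saved as 'recon_<genome name>':
    #   model_id, model_ws = service.reconstruct_genome(genome)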

    def remove_reactions_in_place(self, model, reactions_to_remove):
        """
        Removes reactions from an FBAModel IN PLACE (changes object as it is stored)

        Recommended to make a copy first

        :param model: FBAModel to remove reactions form
        :param reactions_to_remove: reactions to remove (removal_id's)
        :return:
        """
        model_data, model_info = self.get_object(model.object_id,
                                                 model.workspace_id)
        rxns_to_remove = set(reactions_to_remove)
        prior_ids = set([r['id'] for r in model_data['modelreactions']])
        model_data['modelreactions'] = [
            r for r in model_data['modelreactions']
            if r['id'] not in rxns_to_remove
        ]
        current_ids = set([r['id'] for r in model_data['modelreactions']])
        removed = set(
            [rxn_id for rxn_id in prior_ids if rxn_id not in current_ids])
        if len(reactions_to_remove) != len(removed):
            print "WARNING: expected to remove", len(
                reactions_to_remove), "reactions but only removed", removed
            print "Failed to remove", set(reactions_to_remove) - removed
            print "Full arg reactions_to_remove:", ', '.join(
                reactions_to_remove)
        return self.save_object(model_data,
                                model_info[2],
                                model.workspace_id,
                                name=model.name)

    def remove_reaction(self, model, reaction, output_id=None, in_place=False):
        """

        :param model: FBAModel to remove the reaction from
        :param reaction: removal_id (str) of the reaction to remove
        :param output_id: (optional) (str) of the new name for the output model
        :param in_place: (optional) set to true if you want to remove the reaction from the model in place instead of making
            a new model. Will disregard output_id argument if set to true
        :return: info tuple for the new FBAModel in the stored environment
        """

        if in_place:
            return self.remove_reactions_in_place(model, [reaction])
        if output_id is None:
            i = 0
            output_id = model.name + '-' + str(i)
            names = set(
                [info[3] for info in self.list_objects(model.workspace_id)])
            while output_id in names:
                i += 1
                output_id = model.name + '-' + str(i)

        model_data, model_info = self.get_object(model.object_id,
                                                 model.workspace_id)
        # remove the reaction from the model data and save
        model_data['modelreactions'] = [
            r for r in model_data['modelreactions'] if r['id'] != reaction
        ]
        return self.save_object(model_data,
                                model_info[2],
                                model.workspace_id,
                                name=output_id)
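    # Usage sketch (hedged; the reaction id is a hypothetical removal id); with
    # in_place=False and no output_id, a uniquely named copy such as '<model name>-0' is created:
    #   new_id, new_ws = service.remove_reaction(model, 'rxn00001_c0', output_id='model_minus_rxn')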

    def add_reactions(self, model, new_reactions, workspace=None, name=None):
        """
        adds reactions to an FBAModel, in place or with a copy (set name to a new name)
        :param model: FBAModel to add reactions to
        :param new_reactions: list of tuples of the form (rxn_id, rxn_comp, direction, gpr) (gpr is optional)
        :param workspace: (optional) destination workspace, default is model.workspace_id
        :param name: output name for the new model. use to make a new one or modify in place
        :return: tuple identity of the model stored in the service (FBAModel)
        """
        reactions_to_add = [{
            'add_reaction_id': r[0],
            'reaction_compartment_id': len(r) > 1 and [r[1]] or [],
            'add_reaction_name': r[0],
            'add_reaction_direction': len(r) > 2 and r[2] or '=',
            'add_reaction_gpr': len(r) > 3 and r[3] or '',
        } for r in new_reactions]
        add_rxn_args = {
            'fbamodel_id': model.object_id,
            'fbamodel_workspace': model.workspace_id,
            'fbamodel_output_id': name or model.name,
            'workspace': workspace or model.workspace_id,
            'reactions_to_add': reactions_to_add
        }
        info = self.fba_client.edit_metabolic_model(add_rxn_args)
        return self._parse_objid_from_ref(
            info['new_fbamodel_ref']), model.workspace_id
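    # Usage sketch (hedged; reaction tuples are hypothetical); each tuple is
    # (rxn_id, rxn_comp, direction, gpr) with compartment, direction and gpr optional:
    #   new_reactions = [('rxn00001', 'c0', '>', 'b0001'), ('rxn00002', 'c0')]
    #   model_id, model_ws = service.add_reactions(model, new_reactions, name='model_plus_rxns')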

    def add_reactions_manually(self,
                               model,
                               reactions,
                               workspace=None,
                               name=None):
        """
        Manually fix special reactions within the object itself (use with caution)
        :param name: what to name the model when it is saved
        :param workspace: workspace to save the new FBAModel in
        :param reactions: (list<ModelReaction>) list of reactions to add manually
        :param model: FBAModel to add the reactions to
        """
        model.get_object()
        if workspace is None:
            workspace = model.workspace_id
        obj = model.data
        cpds = dict([(c['id'], c) for c in obj['modelcompounds']])
        for r in reactions:
            obj['modelreactions'].append(r.data)
            for cpd in r.data['modelReactionReagents']:
                c = cpd['modelcompound_ref'].split('/')[-1]
                if c not in cpds:
                    compound = {
                        'id': c,
                        'name': c,
                        'aliases': [u'mdlid:' + c.split('_')[0]],
                        'charge': 0,
                        'compound_ref': '489/6/6/compounds/id/cpd00000',
                        'modelcompartment_ref': '~/modelcompartments/id/' + c.split('_')[-1],
                        'formula': ''
                    }
                    obj['modelcompounds'].append(compound)
                    cpds = dict([(c['id'], c) for c in obj['modelcompounds']])
        if name is not None:
            return self.save_object(obj,
                                    types()['FBAModel'],
                                    workspace,
                                    name=name)
        return self.save_object(obj,
                                types()['FBAModel'],
                                workspace,
                                objid=model.object_id)

    def adjust_directions_and_gprs(self, model, adjustments):
        reactions_to_change = [{
            'change_reaction_id': [r[0]],
            'change_reaction_direction': str(r[1]),
            'change_reaction_gpr': str(r[2])[1:-1],
        } for r in adjustments]
        change_rxn_args = {
            'fbamodel_id': model.object_id,
            'fbamodel_workspace': model.workspace_id,
            'fbamodel_output_id': model.name,
            'workspace': model.workspace_id,
            'reactions_to_change': reactions_to_change
        }
        self.fba_client.edit_metabolic_model(change_rxn_args)

    def adjust_directions(self, model, adjustments):
        """
        adjusts directions for reactions in an FBAModel
        :param model: FBAModel to adjust directions for
        :param adjustments: list<tuple> (rxn_id, direction). if rxn_id is not already in the model, it may be added
        :return: None
        """
        adjust_args = {
            'model': model.object_id,
            'workspace': model.workspace_id,
            'reaction': [r[0] for r in adjustments],
            'direction': [str(r[1]) for r in adjustments]
        }
        self.fba_client.adjust_model_reaction(adjust_args)
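    # Usage sketch (hedged; reaction ids and directions are hypothetical); each
    # adjustment is a (rxn_id, direction) tuple as described in the docstring:
    #   service.adjust_directions(model, [('rxn00001_c0', '>'), ('rxn00002_c0', '<')])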

    def _integrate_gapfill(self, model, solution_fba, workspace=None):
        changes = self._gapfill_solution(solution_fba)
        reactions = dict([(r.rxn_id(), r) for r in model.get_reactions()])
        dirs = []
        additions = []
        for r in changes:
            if r[0] in reactions:
                dirs.append((reactions[r[0]].get_removal_id(), r[1]))
            else:
                temp = r[0].split('_')
                rxn_id = temp[0]
                rxn_comp = temp[1]
                additions.append((rxn_id, rxn_comp, r[1]))
        self.adjust_directions(model, dirs)
        info = self.add_reactions(model, additions, workspace=workspace)
        return info

    def model_info(self, model):
        comp = self.fba_client.compare_models({
            'models': [model.object_id],
            'workspaces': [model.workspace_id]
        })
        return (comp['model_comparisons'],
                dict([(r['reaction'], r)
                      for r in comp['reaction_comparisons']]))

    def init_workspace(self, ws=None, name=None):
        ws_id = ws
        ws_name = name
        if ws_name is None:
            ws_name = 'MMws'
        if ws is None:
            ws_conflict = True
            while ws_conflict:
                create_ws_params = {
                    'workspace': ws_name,
                    'globalread': 'r',
                    'description': "A workspace for storing the FBA's and meta data of the algorithm"
                }
                # Try to create a workspace, catch an error if the name is already in use
                try:
                    new_ws = self.ws_client.create_workspace(create_ws_params)
                    # new_ws is type workspace_info, a tuple where 0, 1 are id, name
                    ws_id = new_ws[0]
                    ws_name = new_ws[1]
                    ws_conflict = False
                except ServerError:
                    ws_name += str(random.randint(1, 9))
        return ws_id, ws_name

    def _parse_objid_from_ref(self, ref):
        return ref.split('/')[1]
Example 14
class TreeUtils:
    '''
    Module Name:
    TreeUtils

    Module Description:
    
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = ""
    GIT_COMMIT_HASH = "acb216cd302c161d5b4dfb272bd4bbae44cdac28"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.utils = Utils(config)
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        self.ws = Workspace(config['workspace-url'])
        logging.basicConfig(level=logging.INFO)
        #END_CONSTRUCTOR
        pass


    def get_trees(self, ctx, params):
        """
        :param params: instance of type "GetTreesParams" (tree_refs -
           (required) list of WS references included_fields - (optional)
           subset of tree fields to include) -> structure: parameter
           "tree_refs" of list of String, parameter "included_fields" of list
           of String
        :returns: instance of list of type "TreeData" -> structure: parameter
           "data" of type "Tree" (Data type for phylogenetic trees. @optional
           name description type tree_attributes @optional
           default_node_labels ws_refs kb_refs leaf_list) -> structure:
           parameter "name" of String, parameter "description" of String,
           parameter "type" of String, parameter "tree" of type "newick_tree"
           (Trees are represented in KBase by default in newick format
           (http://en.wikipedia.org/wiki/Newick_format) and are returned to
           you in this format by default.) -> type "tree" (A string
           representation of a phylogenetic tree.  The format/syntax of the
           string is specified by using one of the available typedefs
           declaring a particular format, such as 'newick_tree',
           'phylo_xml_tree' or 'json_tree'.  When a format is not explictily
           specified, it is possible to return trees in different formats
           depending on addtional parameters. Regardless of format, all leaf
           nodes in trees built from MSAs are indexed to a specific MSA row. 
           You can use the appropriate functionality of the API to replace
           these IDs with other KBase Ids instead. Internal nodes may or may
           not be named. Nodes, depending on the format, may also be
           annotated with structured data such as bootstrap values and
           distances.), parameter "tree_attributes" of mapping from String to
           String, parameter "default_node_labels" of mapping from type
           "node_id" to type "label", parameter "ws_refs" of mapping from
           type "node_id" to mapping from type "ref_type" (An enumeration of
           reference types for a node.  Either the one letter abreviation or
           full name can be given.  For large trees, it is strongly advised
           you use the one letter abreviations. Supported types are: g |
           genome  => genome typed object or CDS data p | protein => protein
           sequence object or CDS data, often given as the MD5 of the
           sequence n | dna     => dna sequence object or CDS data, often
           given as the MD5 of the sequence f | feature => feature object or
           CDS data) to list of type "ws_obj_id" (@id ws), parameter
           "kb_refs" of mapping from type "node_id" to mapping from type
           "ref_type" (An enumeration of reference types for a node.  Either
           the one letter abreviation or full name can be given.  For large
           trees, it is strongly advised you use the one letter abreviations.
           Supported types are: g | genome  => genome typed object or CDS
           data p | protein => protein sequence object or CDS data, often
           given as the MD5 of the sequence n | dna     => dna sequence
           object or CDS data, often given as the MD5 of the sequence f |
           feature => feature object or CDS data) to list of type "kbase_id"
           (A KBase ID is a string starting with the characters "kb|".  KBase
           IDs are typed. The types are designated using a short string. For
           instance," g" denotes a genome, "tree" denotes a Tree, and "aln"
           denotes a sequence alignment. KBase IDs may be hierarchical.  For
           example, if a KBase genome identifier is "kb|g.1234", a protein
           encoding gene within that genome may be represented as
           "kb|g.1234.peg.771". @id kb), parameter "leaf_list" of list of
           type "node_id", parameter "info" of type "object_info"
           (Information about an object, including user provided metadata.
           obj_id objid - the numerical id of the object. obj_name name - the
           name of the object. type_string type - the type of the object.
           timestamp save_date - the save date of the object. obj_ver ver -
           the version of the object. username saved_by - the user that saved
           or copied the object. ws_id wsid - the workspace containing the
           object. ws_name workspace - the workspace containing the object.
           string chsum - the md5 checksum of the object. int size - the size
           of the object in bytes. usermeta meta - arbitrary user-supplied
           metadata about the object.) -> tuple of size 11: parameter "objid"
           of type "obj_id" (The unique, permanent numerical ID of an
           object.), parameter "name" of type "obj_name" (A string used as a
           name for an object. Any string consisting of alphanumeric
           characters and the characters |._- that is not an integer is
           acceptable.), parameter "type" of type "type_string" (A type
           string. Specifies the type and its version in a single string in
           the format [module].[typename]-[major].[minor]: module - a string.
           The module name of the typespec containing the type. typename - a
           string. The name of the type as assigned by the typedef statement.
           major - an integer. The major version of the type. A change in the
           major version implies the type has changed in a non-backwards
           compatible way. minor - an integer. The minor version of the type.
           A change in the minor version implies that the type has changed in
           a way that is backwards compatible with previous type definitions.
           In many cases, the major and minor versions are optional, and if
           not provided the most recent version will be used. Example:
           MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A
           time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the
           character Z (representing the UTC timezone) or the difference in
           time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500
           (EST time) 2013-04-03T08:56:32+0000 (UTC time)
           2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long,
           parameter "saved_by" of type "username" (Login name of a KBase
           user account.), parameter "wsid" of type "ws_id" (The unique,
           permanent numerical ID of a workspace.), parameter "workspace" of
           type "ws_name" (A string used as a name for a workspace. Any
           string consisting of alphanumeric characters and "_", ".", or "-"
           that is not an integer is acceptable. The name may optionally be
           prefixed with the workspace owner's user name and a colon, e.g.
           kbasetest:my_workspace.), parameter "chsum" of String, parameter
           "size" of Long, parameter "meta" of type "usermeta" (User provided
           metadata about an object. Arbitrary key-value pairs provided by
           the user.) -> mapping from String to String
        """
        # ctx is the context object
        # return variables are: result
        #BEGIN get_trees
        logging.info("Starting 'get_trees' with params:{}".format(params))
        self.utils.validate_params(params, ("tree_refs",), ("included_fields",))
        ws_objs = [{'ref': r, 'included': params.get('included_fields', None)}
                   for r in params['tree_refs']]
        result = self.ws.get_objects2({'objects': ws_objs})['data']
        #END get_trees

        # At some point might do deeper type checking...
        if not isinstance(result, list):
            raise ValueError('Method get_trees return value ' +
                             'result is not type list as required.')
        # return the results
        return [result]
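    # Example params for this method (a hedged sketch; the refs are hypothetical and
    # 'included_fields' is optional):
    #   params = {'tree_refs': ['12345/6/1', '12345/7/1'],
    #             'included_fields': ['tree', 'default_node_labels']}
    #   trees = self.get_trees(ctx, params)[0]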

    def save_trees(self, ctx, params):
        """
        :param params: instance of type "SaveTreesParams" -> structure:
           parameter "ws_id" of type "ws_id" (The unique, permanent numerical
           ID of a workspace.), parameter "trees" of list of type
           "ObjectSaveData" (An object and associated data required for
           saving. Required arguments: type_string type - the type of the
           object. Omit the version information to use the latest version.
           UnspecifiedObject data - the object data. Optional arguments: One
           of an object name or id. If no name or id is provided the name
           will be set to 'auto' with the object id appended as a string,
           possibly with -\d+ appended if that object id already exists as a
           name. obj_name name - the name of the object. obj_id objid - the
           id of the object to save over. usermeta meta - arbitrary
           user-supplied metadata for the object, not to exceed 16kb; if the
           object type specifies automatic metadata extraction with the 'meta
           ws' annotation, and your metadata name conflicts, then your
           metadata will be silently overwritten. list<ProvenanceAction>
           provenance - provenance data for the object. boolean hidden - true
           if this object should not be listed when listing workspace
           objects.) -> structure: parameter "type" of type "type_string" (A
           type string. Specifies the type and its version in a single string
           in the format [module].[typename]-[major].[minor]: module - a
           string. The module name of the typespec containing the type.
           typename - a string. The name of the type as assigned by the
           typedef statement. major - an integer. The major version of the
           type. A change in the major version implies the type has changed
           in a non-backwards compatible way. minor - an integer. The minor
           version of the type. A change in the minor version implies that
           the type has changed in a way that is backwards compatible with
           previous type definitions. In many cases, the major and minor
           versions are optional, and if not provided the most recent version
           will be used. Example: MyModule.MyType-3.1), parameter "data" of
           unspecified object, parameter "name" of type "obj_name" (A string
           used as a name for an object. Any string consisting of
           alphanumeric characters and the characters |._- that is not an
           integer is acceptable.), parameter "objid" of type "obj_id" (The
           unique, permanent numerical ID of an object.), parameter "meta" of
           type "usermeta" (User provided metadata about an object. Arbitrary
           key-value pairs provided by the user.) -> mapping from String to
           String, parameter "provenance" of list of type "ProvenanceAction"
           (A provenance action. A provenance action (PA) is an action taken
           while transforming one data object to another. There may be
           several PAs taken in series. A PA is typically running a script,
           running an api command, etc. All of the following fields are
           optional, but more information provided equates to better data
           provenance. resolved_ws_objects should never be set by the user;
           it is set by the workspace service when returning data. On input,
           only one of the time or epoch may be supplied. Both are supplied
           on output. The maximum size of the entire provenance object,
           including all actions, is 1MB. timestamp time - the time the
           action was started epoch epoch - the time the action was started.
           string caller - the name or id of the invoker of this provenance
           action. In most cases, this will be the same for all PAs. string
           service - the name of the service that performed this action.
           string service_ver - the version of the service that performed
           this action. string method - the method of the service that
           performed this action. list<UnspecifiedObject> method_params - the
           parameters of the method that performed this action. If an object
           in the parameters is a workspace object, also put the object
           reference in the input_ws_object list. string script - the name of
           the script that performed this action. string script_ver - the
           version of the script that performed this action. string
           script_command_line - the command line provided to the script that
           performed this action. If workspace objects were provided in the
           command line, also put the object reference in the input_ws_object
           list. list<obj_ref> input_ws_objects - the workspace objects that
           were used as input to this action; typically these will also be
           present as parts of the method_params or the script_command_line
           arguments. list<obj_ref> resolved_ws_objects - the workspace
           objects ids from input_ws_objects resolved to permanent workspace
           object references by the workspace service. list<string>
           intermediate_incoming - if the previous action produced output
           that 1) was not stored in a referrable way, and 2) is used as
           input for this action, provide it with an arbitrary and unique ID
           here, in the order of the input arguments to this action. These
           IDs can be used in the method_params argument. list<string>
           intermediate_outgoing - if this action produced output that 1) was
           not stored in a referrable way, and 2) is used as input for the
           next action, provide it with an arbitrary and unique ID here, in
           the order of the output values from this action. These IDs can be
           used in the intermediate_incoming argument in the next action.
           list<ExternalDataUnit> external_data - data external to the
           workspace that was either imported to the workspace or used to
           create a workspace object. list<SubAction> subactions - the
           subactions taken as a part of this action. mapping<string, string>
           custom - user definable custom provenance fields and their values.
           string description - a free text description of this action.) ->
           structure: parameter "time" of type "timestamp" (A time in the
           format YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z
           (representing the UTC timezone) or the difference in time to UTC
           in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time)
           2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC
           time)), parameter "epoch" of type "epoch" (A Unix epoch (the time
           since 00:00:00 1/1/1970 UTC) in milliseconds.), parameter "caller"
           of String, parameter "service" of String, parameter "service_ver"
           of String, parameter "method" of String, parameter "method_params"
           of list of unspecified object, parameter "script" of String,
           parameter "script_ver" of String, parameter "script_command_line"
           of String, parameter "input_ws_objects" of list of type "obj_ref"
           (A string that uniquely identifies an object in the workspace
           service. There are two ways to uniquely identify an object in one
           string: "[ws_name or id]/[obj_name or id]/[obj_ver]" - for
           example, "MyFirstWorkspace/MyFirstObject/3" would identify the
           third version of an object called MyFirstObject in the workspace
           called MyFirstWorkspace. 42/Panic/1 would identify the first
           version of the object name Panic in workspace with id 42.
           Towel/1/6 would identify the 6th version of the object with id 1
           in the Towel workspace. "kb|ws.[ws_id].obj.[obj_id].ver.[obj_ver]"
           - for example, "kb|ws.23.obj.567.ver.2" would identify the second
           version of an object with id 567 in a workspace with id 23. In all
           cases, if the version number is omitted, the latest version of the
           object is assumed.), parameter "resolved_ws_objects" of list of
           type "obj_ref" (A string that uniquely identifies an object in the
           workspace service. There are two ways to uniquely identify an
           object in one string: "[ws_name or id]/[obj_name or id]/[obj_ver]"
           - for example, "MyFirstWorkspace/MyFirstObject/3" would identify
           the third version of an object called MyFirstObject in the
           workspace called MyFirstWorkspace. 42/Panic/1 would identify the
           first version of the object name Panic in workspace with id 42.
           Towel/1/6 would identify the 6th version of the object with id 1
           in the Towel workspace. "kb|ws.[ws_id].obj.[obj_id].ver.[obj_ver]"
           - for example, "kb|ws.23.obj.567.ver.2" would identify the second
           version of an object with id 567 in a workspace with id 23. In all
           cases, if the version number is omitted, the latest version of the
           object is assumed.), parameter "intermediate_incoming" of list of
           String, parameter "intermediate_outgoing" of list of String,
           parameter "external_data" of list of type "ExternalDataUnit" (An
           external data unit. A piece of data from a source outside the
           Workspace. On input, only one of the resource_release_date or
           resource_release_epoch may be supplied. Both are supplied on
           output. string resource_name - the name of the resource, for
           example JGI. string resource_url - the url of the resource, for
           example http://genome.jgi.doe.gov string resource_version -
           version of the resource timestamp resource_release_date - the
           release date of the resource epoch resource_release_epoch - the
           release date of the resource string data_url - the url of the
           data, for example
           http://genome.jgi.doe.gov/pages/dynamicOrganismDownload.jsf?
           organism=BlaspURHD0036 string data_id - the id of the data, for
           example 7625.2.79179.AGTTCC.adnq.fastq.gz string description - a
           free text description of the data.) -> structure: parameter
           "resource_name" of String, parameter "resource_url" of String,
           parameter "resource_version" of String, parameter
           "resource_release_date" of type "timestamp" (A time in the format
           YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z
           (representing the UTC timezone) or the difference in time to UTC
           in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time)
           2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC
           time)), parameter "resource_release_epoch" of type "epoch" (A Unix
           epoch (the time since 00:00:00 1/1/1970 UTC) in milliseconds.),
           parameter "data_url" of String, parameter "data_id" of String,
           parameter "description" of String, parameter "subactions" of list
           of type "SubAction" (Information about a subaction that is invoked
           by a provenance action. A provenance action (PA) may invoke
           subactions (SA), e.g. calling a separate piece of code, a service,
           or a script. In most cases these calls are the same from PA to PA
           and so do not need to be listed in the provenance since providing
           information about the PA alone provides reproducibility. In some
           cases, however, SAs may change over time, such that invoking the
           same PA with the same parameters may produce different results.
           For example, if a PA calls a remote server, that server may be
           updated between a PA invoked on day T and another PA invoked on
           day T+1. The SubAction structure allows for specifying information
           about SAs that may dynamically change from PA invocation to PA
           invocation. string name - the name of the SA. string ver - the
           version of SA. string code_url - a url pointing to the SA's
           codebase. string commit - a version control commit ID for the SA.
           string endpoint_url - a url pointing to the access point for the
           SA - a server url, for instance.) -> structure: parameter "name"
           of String, parameter "ver" of String, parameter "code_url" of
           String, parameter "commit" of String, parameter "endpoint_url" of
           String, parameter "custom" of mapping from String to String,
           parameter "description" of String, parameter "hidden" of type
           "boolean" (A boolean. 0 = false, other = true.)
        :returns: instance of list of type "object_info" (Information about
           an object, including user provided metadata. obj_id objid - the
           numerical id of the object. obj_name name - the name of the
           object. type_string type - the type of the object. timestamp
           save_date - the save date of the object. obj_ver ver - the version
           of the object. username saved_by - the user that saved or copied
           the object. ws_id wsid - the workspace containing the object.
           ws_name workspace - the workspace containing the object. string
           chsum - the md5 checksum of the object. int size - the size of the
           object in bytes. usermeta meta - arbitrary user-supplied metadata
           about the object.) -> tuple of size 11: parameter "objid" of type
           "obj_id" (The unique, permanent numerical ID of an object.),
           parameter "name" of type "obj_name" (A string used as a name for
           an object. Any string consisting of alphanumeric characters and
           the characters |._- that is not an integer is acceptable.),
           parameter "type" of type "type_string" (A type string. Specifies
           the type and its version in a single string in the format
           [module].[typename]-[major].[minor]: module - a string. The module
           name of the typespec containing the type. typename - a string. The
           name of the type as assigned by the typedef statement. major - an
           integer. The major version of the type. A change in the major
           version implies the type has changed in a non-backwards compatible
           way. minor - an integer. The minor version of the type. A change
           in the minor version implies that the type has changed in a way
           that is backwards compatible with previous type definitions. In
           many cases, the major and minor versions are optional, and if not
           provided the most recent version will be used. Example:
           MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A
           time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the
           character Z (representing the UTC timezone) or the difference in
           time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500
           (EST time) 2013-04-03T08:56:32+0000 (UTC time)
           2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long,
           parameter "saved_by" of type "username" (Login name of a KBase
           user account.), parameter "wsid" of type "ws_id" (The unique,
           permanent numerical ID of a workspace.), parameter "workspace" of
           type "ws_name" (A string used as a name for a workspace. Any
           string consisting of alphanumeric characters and "_", ".", or "-"
           that is not an integer is acceptable. The name may optionally be
           prefixed with the workspace owner's user name and a colon, e.g.
           kbasetest:my_workspace.), parameter "chsum" of String, parameter
           "size" of Long, parameter "meta" of type "usermeta" (User provided
           metadata about an object. Arbitrary key-value pairs provided by
           the user.) -> mapping from String to String
        """
        # ctx is the context object
        # return variables are: result
        #BEGIN save_trees
        logging.info("Starting 'save_trees'")
        self.utils.validate_params(params, ("ws_id", "trees"), ('type',))
        trees = []
        for i, t in enumerate(params['trees']):
            self.utils.validate_params(t, ("data",), ("name", "hidden", "meta", "type"))
            if 'type' in t and t['type'] != 'KBaseTrees.Tree':
                raise ValueError("This method only saves KBaseTrees.Tree objects")
            if "tree" not in t['data']:
                raise ValueError("Object {} missing 'tree' attribute containing newick tree"
                                 .format(i))
            if not Utils.validate_newick(t['data']['tree']):
                raise ValueError("Object {} has an invalid newick tree: {}"
                                 .format(i, t['data']['tree']))

            t['type'] = 'KBaseTrees.Tree'
            trees.append(t)

        result = self.dfu.save_objects({"id": params["ws_id"], "objects": trees})
        #END save_trees

        # At some point might do deeper type checking...
        if not isinstance(result, list):
            raise ValueError('Method save_trees return value ' +
                             'result is not type list as required.')
        # return the results
        return [result]
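    # Example params for this method (a hedged sketch; ws_id, name and newick string
    # are hypothetical); each entry follows the ObjectSaveData structure and must carry
    # a newick string under data['tree']:
    #   params = {'ws_id': 12345,
    #             'trees': [{'name': 'my_tree',
    #                        'data': {'tree': '(A:0.1,B:0.2,(C:0.3,D:0.4):0.5);'}}]}
    #   infos = self.save_trees(ctx, params)[0]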

    def tree_to_newick_file(self, ctx, params):
        """
        :param params: instance of type "TreeToNewickFileParams" ->
           structure: parameter "input_ref" of type "Tree_id" (@id kb
           KBaseTrees.Tree), parameter "destination_dir" of String
        :returns: instance of type "TreeToNewickFileOutput" -> structure:
           parameter "file_path" of String
        """
        # ctx is the context object
        # return variables are: result
        #BEGIN tree_to_newick_file
        logging.info("Starting 'tree_to_newick' with params: {}".format(params))
        self.utils.validate_params(params, ("destination_dir", "input_ref"))
        _, result = self.utils.to_newick(params)
        #END tree_to_newick_file

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method tree_to_newick_file return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]

    def export_tree_newick(self, ctx, params):
        """
        :param params: instance of type "ExportTreeParams" -> structure:
           parameter "input_ref" of type "Tree_id" (@id kb KBaseTrees.Tree)
        :returns: instance of type "ExportTreeOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: result
        #BEGIN export_tree_newick
        logging.info("Starting 'export_tree_newick' with params:{}".format(params))
        self.utils.validate_params(params, ("input_ref",))
        params['destination_dir'] = self.scratch
        cs_id, files = self.utils.to_newick(params)
        result = self.utils.export(files['file_path'], cs_id, params['input_ref'])
        #END export_tree_newick

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method export_tree_newick return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {'state': "OK",
                     'message': "",
                     'version': self.VERSION,
                     'git_url': self.GIT_URL,
                     'git_commit_hash': self.GIT_COMMIT_HASH}
        #END_STATUS
        return [returnVal]


class DiffExprMatrixUtils:
    """
     Constains a set of functions for expression levels calculations.
    """

    PARAM_IN_WS_NAME = 'workspace_name'
    PARAM_IN_OBJ_NAME = 'output_obj_name'
    PARAM_IN_DIFFEXPMATSET_REF = 'diffExprMatrixSet_ref'

    def __init__(self, config, logger=None):
        self.config = config
        self.logger = logger
        self.scratch = os.path.join(config['scratch'], 'DEM_' + str(uuid.uuid4()))
        self.ws_url = config['workspace-url']
        self._mkdir_p(self.scratch)
        pass

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def process_params(self, params):
        """
        validates params passed to gen expression matrix method
        """
        for p in [self.PARAM_IN_DIFFEXPMATSET_REF]:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

    def get_expressionset_data(self, expressionset_ref):

        expr_set_obj = self.ws_client.get_objects2(
            {'objects': [{'ref': expressionset_ref}]})['data'][0]

        expr_set_obj_type = expr_set_obj.get('info')[2]
        expr_set_data = dict()
        expr_set_data['ws_name'] = expr_set_obj.get('info')[7]
        expr_set_data['obj_name'] = expr_set_obj.get('info')[1]

        if re.match(r'KBaseRNASeq.RNASeqExpressionSet-\d.\d', expr_set_obj_type):
            expr_set_data['genome_ref'] = expr_set_obj['data']['genome_id']
            expr_obj_refs = list()
            for expr_obj in expr_set_obj['data']['mapped_expression_ids']:
                expr_obj_refs.append(list(expr_obj.values())[0])
            expr_set_data['expr_obj_refs'] = expr_obj_refs

        elif re.match(r'KBaseSets.ExpressionSet-\d.\d', expr_set_obj_type):
            items = expr_set_obj.get('data').get('items')
            expr_obj_refs = list()
            for item in items:
                expr_obj_refs.append(item['ref'])
            expr_obj = self.ws_client.get_objects2(
                {'objects': [{'ref': expr_obj_refs[0]}]})['data'][0]
            expr_set_data['genome_ref'] = expr_obj['data']['genome_id']
            expr_set_data['expr_obj_refs'] = expr_obj_refs
        else:
            raise TypeError('expressionset_ref should be of type '
                            'KBaseRNASeq.RNASeqExpressionSet '
                            'or KBaseSets.ExpressionSet')
        return expr_set_data

    def get_diffexpr_matrixset(self, params, token):

        self.ws_client = Workspace(self.ws_url, token=token)

        col_names = {'gene_id': 'gene',
                     'log2_fold_change': 'log2fc_f',
                     'p_value': 'p_value_f',
                     'q_value': 'q_value'}

        json_fields = ['log2fc_f', 'p_value_f', 'q_value']

        self.process_params(params)

        diffexprmatset_list = list()
        diffexprmatset_ref = params.get(self.PARAM_IN_DIFFEXPMATSET_REF)

        diffexprmatset_obj = self.ws_client.get_objects2(
                                {'objects': [{'ref': diffexprmatset_ref}]})['data'][0]

        items = diffexprmatset_obj.get('data').get('items')
        diffexprmat_refs = list()

        for item in items:
            diffexprmat_refs.append(item['ref'])
            self.logger.info('DiffExprMatrix ref: ' + item['ref'])

        for diffexprmat_ref in diffexprmat_refs:
            diffexprmat_dict = dict()
            diffexprmat_obj = self.ws_client.get_objects2(
                                {'objects': [{'ref': diffexprmat_ref}]})['data'][0]
            diffexprmat = diffexprmat_obj.get('data')
            diffexprmat_dict['condition_1'] = list(diffexprmat.get('condition_mapping').keys())[0]
            diffexprmat_dict['condition_2'] = list(diffexprmat.get('condition_mapping').values())[0]
            voldata = list()
            data = diffexprmat.get('data')

            for row_index, row_id in enumerate(data.get('row_ids')):
                row_data = dict()
                row_data['gene'] = row_id
                values = data.get('values')[row_index]
                for col_index in range(len(values)):
                    row_data[json_fields[col_index]] = values[col_index]

                voldata.append(row_data)

            diffexprmat_dict['voldata'] = voldata
            diffexprmatset_list.append(diffexprmat_dict)

        return diffexprmatset_list
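    # Usage sketch (hedged; the set reference and token are hypothetical); the return
    # value is one dict per matrix with 'condition_1', 'condition_2' and per-gene 'voldata':
    #   utils = DiffExprMatrixUtils(config)
    #   matrices = utils.get_diffexpr_matrixset({'diffExprMatrixSet_ref': '12345/8/1'}, token)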
Example 16
class BallgownUtil:

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)
        self.fv = KBaseFeatureValues(self.callback_url)
        self.deu = DifferentialExpressionUtils(self.callback_url, service_ver='dev')
        self.ws = Workspace(self.ws_url, token=self.token)
        self.scratch = config['scratch']
        self.config = config

    def _xor(self, a, b):
        return bool(a) != bool(b)

    def _validate_run_ballgown_app_params(self, params):
        """
        _validate_run_ballgown_app_params:
                validates params passed to run_ballgown_app method
        """

        log('start validating run_ballgown_app params')

        # check for required parameters
        for p in ['expressionset_ref', 'diff_expression_matrix_set_suffix',
                  'workspace_name']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        run_all_combinations = params.get('run_all_combinations')
        condition_pair_subset = params.get('condition_pair_subset')

        if not self._xor(run_all_combinations, condition_pair_subset):
            error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' "
            error_msg += "or provide subset of condition pairs. Don't provide both, or neither."
            raise ValueError(error_msg)
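    # Example of params that pass this validation (a hedged sketch; the ref, suffix and
    # workspace name are hypothetical). Exactly one of 'run_all_combinations' or
    # 'condition_pair_subset' must be supplied:
    #   params = {'expressionset_ref': '12345/9/1',
    #             'diff_expression_matrix_set_suffix': '_DEM',
    #             'workspace_name': 'my_workspace',
    #             'run_all_combinations': 1}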

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _generate_html_report(self, result_directory, params, diff_expression_matrix_set_ref):
        """
        _generate_html_report: generate html summary report
        """

        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        for file in glob.glob(os.path.join(result_directory, '*.tsv')):
            shutil.copy(file, output_directory)

        # volcano_plot exists only if there are two condition groups
        for file in glob.glob(os.path.join(result_directory, '*.png')):
            shutil.copy(file, output_directory)

        diff_expr_set = self.ws.get_objects2({'objects':
                                              [{'ref':
                                                diff_expression_matrix_set_ref[
                                                    'diffExprMatrixSet_ref']}]})['data'][0]
        diff_expr_set_data = diff_expr_set['data']
        diff_expr_set_info = diff_expr_set['info']
        diff_expr_set_name = diff_expr_set_info[1]

        overview_content = ''
        overview_content += '<br/><table><tr><th>Generated DifferentialExpressionMatrixSet'
        overview_content += ' Object</th></tr>'
        overview_content += '<tr><td>{} ({})'.format(diff_expr_set_name,
                                                     diff_expression_matrix_set_ref[
                                                         'diffExprMatrixSet_ref'])
        overview_content += '</td></tr></table>'

        overview_content += '<p><br/></p>'

        overview_content += '<br/><table><tr><th>Generated DifferentialExpressionMatrix'
        overview_content += ' Object</th><th></th><th></th><th></th></tr>'
        overview_content += '<tr><th>Differential Expression Matrix Name</th>'
        overview_content += '<th>Condition 1</th>'
        overview_content += '<th>Condition 2</th>'
        overview_content += '</tr>'

        for item in diff_expr_set_data['items']:
            item_diffexprmatrix_object = self.ws.get_objects2({'objects':
                                                               [{'ref': item['ref']}]})[
                'data'][0]
            item_diffexprmatrix_info = item_diffexprmatrix_object['info']
            item_diffexprmatrix_data = item_diffexprmatrix_object['data']
            diffexprmatrix_name = item_diffexprmatrix_info[1]

            overview_content += '<tr><td>{} ({})</td>'.format(diffexprmatrix_name,
                                                              item['ref'])
            # dict views are not subscriptable in Python 3; convert to list first
            condition_mapping = item_diffexprmatrix_data.get('condition_mapping')
            overview_content += '<td>{}</td>'.format(list(condition_mapping.keys())[0])
            overview_content += '<td>{}</td>'.format(list(condition_mapping.values())[0])
            overview_content += '</tr>'
        overview_content += '</table>'

        # visualization
        image_content = ''
        for image in glob.glob(output_directory + "/*.png"):
            image = os.path.basename(image)
            caption = image.replace('.png', '')
            image_content += '<p style="text-align:center"><img align="center" src="{}" ' \
                             'width="600" height="400"><br>' \
                             '<p align="center">{}</p></p>'.format(
                                 image, caption)

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<p>Overview_Content</p>',
                                                          overview_content)
                report_template = report_template.replace('<p>Image Gallery</p>',
                                                          image_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({'file_path': output_directory,
                                                  'pack': 'zip'})['shock_id']

        html_report.append({'shock_id': report_shock_id,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for Ballgown App'})
        return html_report

    def _generate_output_file_list(self, result_directory):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """
        log('Start packing result files')
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'ballgown_result.zip')

        with zipfile.ZipFile(result_file, 'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if not (file.endswith('.zip') or
                            file.endswith('.png') or
                            file.endswith('.DS_Store')):
                        zip_file.write(os.path.join(root, file), file)

        output_files.append({'path': result_file,
                             'name': os.path.basename(result_file),
                             'label': os.path.basename(result_file),
                             'description': 'File(s) generated by Ballgown App'})

        return output_files

    def _generate_report(self, params, result_directory, diff_expression_matrix_set_ref):
        """
        _generate_report: generate summary report
        """
        log('creating report')

        output_files = self._generate_output_file_list(result_directory)

        output_html_files = self._generate_html_report(
            result_directory, params, diff_expression_matrix_set_ref)

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'file_links': output_files,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 333,
            'report_object_name': 'kb_ballgown_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def get_sample_dir_group_file(self, mapped_expression_ids, condition_labels):
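        """
        get_sample_dir_group_file: download the expression data for each alignment/expression
        pair, stage the StringTie ctab files into per-expression subdirectories, and write the
        sample-dir/group table consumed by the Ballgown R script.
        :param mapped_expression_ids: list of {alignment_ref: expression_ref} mappings
        :param condition_labels: condition label for each expression, in the same order
        :return: path to the generated sample_dir_group_file
        """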

        ngroups = 0
        group_name_indices = {}
        group_counts = {}

        for group in condition_labels:
            if group not in group_name_indices:
                group_name_indices[group] = ngroups
                ngroups += 1
            group_counts[group] = group_counts.get(group, 0) + 1

        # checks for proper ballgown execution:
        if ngroups < 2:
            raise Exception("At least two condition groups are needed for this analysis.")
        for group in condition_labels:
            if group_counts[group] < 2:
                raise Exception(
                    "Condition group {0} has fewer than 2 members; ballgown will not run. "
                    "At least two replicates per condition group are needed "
                    "for this analysis.".format(group))

        group_file_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(group_file_dir)

        try:
            condition_labels_uniqued = list(set(condition_labels))
            sgf_name = os.path.join(group_file_dir, 'sample_dir_group_file_' +
                                    condition_labels_uniqued[0] + '_' +
                                    condition_labels_uniqued[1])
            sgf = open(sgf_name, "w")
        except Exception:
            raise Exception(
                "Can't open file {0} for writing {1}".format(
                    sgf_name, traceback.format_exc()))

        index = 0  # condition label index
        for ii in mapped_expression_ids:
            for alignment_id, expression_id in ii.items():
                expression_object = self.ws.get_objects2(
                    {'objects':
                     [{'ref': expression_id}]})['data'][0]
                handle_id = expression_object['data']['file']['hid']
                expression_name = expression_object['info'][1]

                expression_dir = os.path.join(group_file_dir, expression_name)
                self._mkdir_p(expression_dir)

                print('expression_name: ' + str(expression_dir) + ' ' +
                      str(group_name_indices[condition_labels[index]]))
                sgf.write("{0}  {1}\n".format(expression_dir,
                                              group_name_indices[condition_labels[index]]))

                self.dfu.shock_to_file({'handle_id': handle_id,
                                        'file_path': expression_dir,
                                        'unpack': 'unpack'})

                required_files = [
                    'e2t.ctab',
                    'e_data.ctab',
                    'i2t.ctab',
                    'i_data.ctab',
                    't_data.ctab']
                for file in glob.glob(expression_dir + '/*'):
                    if not os.path.basename(file) in required_files:
                        os.remove(file)

            index += 1

        sgf.close()
        return sgf_name

    def _cleanup(self, directory=None):
        """
        Clean up after the job.  At the moment this just means removing the working
        directory, but later could mean other things.
        """

        try:
            # remove the directory tree; ignore_errors=True suppresses per-file removal errors
            shutil.rmtree(directory, ignore_errors=True)
        except IOError:
            log("Unable to remove working directory {0}".format(directory))
            raise

    def _setupWorkingDir(self, directory=None):
        """
        Clean up an existing workingdir and create a new one
        """
        try:
            if os.path.exists(directory):
                self._cleanup(directory)
            os.mkdir(directory)
        except IOError:
            log("Unable to setup working dir {0}".format(directory))
            raise

    def _check_intron_measurements(self, sample_dir_group_table_file):
        """
        Check if intron measurements files are non-empty
        :param sample_dir_group_table_file:
        :return:
        """
        log('checking for intron level measurements... ')
        with open(sample_dir_group_table_file, 'r') as table_file:
            table_lines = table_file.readlines()
        for line in table_lines:
            expr_dir = line.split()[0]
            log(expr_dir)
            for intron_file_name in ('i2t.ctab', 'i_data.ctab'):
                with open(os.path.join(expr_dir, intron_file_name), 'r') as intron_file:
                    if len(intron_file.readlines()) <= 1:  # only header line exists
                        raise Exception(
                            "No intron measurements found! Input expressions are possibly "
                            "from a prokaryote. Ballgown functions only on eukaryotic data. "
                            "Consider using DESeq2 or CuffDiff instead of Ballgown.")

    def run_ballgown_diff_exp(self,
                              rscripts_dir,
                              sample_dir_group_table_file,
                              ballgown_output_dir,
                              output_csv,
                              volcano_plot_file
                              ):
        """ Make R call to execute the system

        :param rscripts_dir:
        :param sample_dir_group_table_file:

        :param ballgown_output_dir:
          sample_group_table is a listing of output StringTie subdirectories
          (full path specification) paired with a group label (0 or 1), e.g.
            /path/WT_rep1_stringtie    0
            /path/WT_rep2_stringtie    0
            /path/EXP_rep1_stringtie   1
            /path/EXP_rep2_stringtie   1
          (order doesn't matter, but the directory-group correspondence does)

        :param output_csv:
        :param volcano_plot_file:
        :return:
        """
        # check if intron-level expression measurements are present
        self._check_intron_measurements(sample_dir_group_table_file)

        rcmd_list = ['Rscript', os.path.join(rscripts_dir, 'ballgown_fpkmgenematrix.R'),
                     '--sample_dir_group_table', sample_dir_group_table_file,
                     '--output_dir', ballgown_output_dir,
                     '--output_csvfile', output_csv,
                     '--volcano_plot_file', volcano_plot_file
                     ]
        rcmd_str = " ".join(str(x) for x in rcmd_list)
        log("rcmd_string is {0}".format(rcmd_str))
        openedprocess = subprocess.Popen(rcmd_str, shell=True)
        openedprocess.wait()
        # Make sure the openedprocess.returncode is zero (0)
        if openedprocess.returncode != 0:
            log("R script did not return normally, return code - "
                + str(openedprocess.returncode))
            raise Exception("Rscript failure")

    def load_diff_expr_matrix(self, ballgown_output_dir, output_csv):
        """
        Reads csv diff expr matrix file from Ballgown and returns as a
        dictionary of rows with the gene as key.  Each key gives a row of
        length three corresponding to fold_change, pval and qval in string form
        - can include 'NA'
        :param ballgown_output_dir
        :param output_csv:
        :return:
        """

        diff_matrix_file = os.path.join(ballgown_output_dir, output_csv)

        if not os.path.isfile(diff_matrix_file):
            raise Exception("differential expression matrix csvfile {0} doesn't exist!".format(
                diff_matrix_file))

        n = 0
        dm = {}
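        # Expected layout of the tab-delimited file (values below are illustrative only):
        #   id      fc      pval    qval
        #   gene_1  2.3     0.001   0.01
        #   gene_2  NA      NA      NA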
        with open(diff_matrix_file, "r") as csv_file:
            csv_rows = csv.reader(csv_file, delimiter="\t", quotechar='"')
            for row in csv_rows:
                n = n + 1
                if (n == 1):
                    if (row != ['id', 'fc', 'pval', 'qval']):
                        raise Exception(
                            "did not get expected column heading from {0}".format(
                                diff_matrix_file))
                else:
                    if (len(row) != 4):
                        raise Exception(
                            "did not get 4 elements in row {0} of csv file {1} ".format(
                                n, diff_matrix_file))
                    key = row[0]
                    # put in checks for NA or numeric for row[1] through 4
                    if (key in dm):
                        raise Exception(
                            "duplicate key {0} in row {1} of csv file {2} ".format(
                                key, n, diff_matrix_file))
                    dm[key] = row[1:5]

        return dm

    def _transform_expression_set_data(self, expression_set_data):
        """
        The stitch to connect KBaseSets.ExpressionSet-2.0 type data to
        the older KBaseRNASeq.RNASeqExpressionSet-3.0 that the implementation
        depends on. This is done by doing a dive into the nested alignment
        object ref and getting the required data
        :param expression_set_data:
        :return: transformed expression_set_data
        """
        transform = dict()
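        # Shape of the returned dict (refs below are placeholders):
        #   {
        #       'genome_id': '<genome ref>',
        #       'mapped_expression_ids': [{'<alignment ref>': '<expression ref>'}, ...]
        #   }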
        # get genome id
        expression_ref = expression_set_data['items'][0]['ref']
        wsid, objid, ver = expression_ref.split('/')
        expression_obj = self.ws.get_objects([{'objid': objid, 'wsid': wsid}])
        transform['genome_id'] = expression_obj[0]['data']['genome_id']

        # get sampleset_id
        #alignment_ref = expression_obj[0]['data']['mapped_rnaseq_alignment'].values()[0]
        #wsid, objid, ver = alignment_ref.split('/')
        #alignment_obj = self.ws.get_objects([{'objid': objid, 'wsid': wsid}])
        #transform['sampleset_id'] = alignment_obj[0]['data']['sampleset_id']

        # build mapped_expression_ids
        mapped_expression_ids = list()
        for item in expression_set_data['items']:
            expression_ref = item['ref']
            wsid, objid, ver = expression_ref.split('/')
            expression_obj = self.ws.get_objects([{'objid': objid, 'wsid': wsid}])
            # dict.values() is not subscriptable in Python 3
            alignment_ref = list(expression_obj[0]['data']['mapped_rnaseq_alignment'].values())[0]
            mapped_expression_ids.append({alignment_ref: expression_ref})
        transform['mapped_expression_ids'] = mapped_expression_ids

        return transform

    def _build_condition_label_list(self, mapped_expression_ids):
        """
        Extracts the condition labels from each expression object referenced in
        mapped_expression_ids and builds a list of condition labels
        :param mapped_expression_ids: list of {alignment_ref: expression_ref} mappings
        :return: list of condition labels whose order matches the expression order in
        the expression set data
        """
        condition_labels = list()

        for ii in mapped_expression_ids:
            for alignment_id, expression_id in ii.items():
                expression_object = self.ws.get_objects2(
                    {'objects':
                     [{'ref': expression_id}]})['data'][0]
                condition_labels.append(expression_object['data']['condition'])

        return condition_labels

    def _update_output_file_header(self, output_file):
        """
        Modify header of output file (required by DifferentialExpressionUtils)
        :param output_file:
        :return:
        """
        f = open(output_file, 'r')
        filedata = f.read()
        f.close()

        modified_output = filedata.replace(
            '"id"\t"fc"\t"pval"\t"qval"',
            'gene_id\tlog2_fold_change\tp_value\tq_value')

        f = open(output_file, 'w')
        f.write(modified_output)
        f.close()

    def _check_input_labels(self, condition_pair_subset, available_condition_labels):
        """
        _check_input_labels: check input condition pairs
        """
        checked = True
        # example struct: [{u'condition': u'hy5'}, {u'condition': u'WT'}]
        condition_values = set()
        for condition in condition_pair_subset:
            condition_values.add(condition['condition'])

        if len(condition_values) < 2:
            error_msg = 'At least two unique conditions must be specified. '
            raise ValueError(error_msg)

        for condition in condition_pair_subset:

            label = condition['condition'].strip()
            if label not in available_condition_labels:
                error_msg = 'Condition label "{}" is not a valid condition. '.format(label)
                error_msg += 'Must be one of "{}"'.format(available_condition_labels)
                raise ValueError(error_msg)

        return checked

    def run_ballgown_app(self, params):
        """
        run_ballgown_app: run Ballgown app
        (https://www.bioconductor.org/packages/release/bioc/html/ballgown.html)

        required params:
            expressionset_ref: ExpressionSet object reference
            diff_expression_matrix_set_suffix: suffix appended to the
            KBaseSets.DifferentialExpressionMatrixSet object name
            condition_labels: conditions for expression set object
            alpha_cutoff: q value cutoff
            fold_change_cutoff: fold change cutoff
            workspace_name: the name of the workspace it gets saved to

        optional params:
            fold_scale_type: one of ["linear", "log2+1", "log10+1"]

        return:
            result_directory: folder path that holds all files generated by run_ballgown_app
            diff_expression_matrix_set_ref: generated KBaseSets.DifferentialExpressionMatrixSet
            object reference
            report_name: report name generated by KBaseReport
            report_ref: report reference generated by KBaseReport
        """
        log('--->\nrunning BallgownUtil.run_ballgown_app\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_ballgown_app_params(params)

        expressionset_ref = params.get('expressionset_ref')

        expression_set_info = self.ws.get_object_info3({
            "objects": [{"ref": expressionset_ref}]})['infos'][0]
        expression_object_type = expression_set_info[2]

        # set output object name
        differential_expression_suffix = params['diff_expression_matrix_set_suffix']
        expression_name = expression_set_info[1]
        if re.match('.*_[Ee]xpression$', expression_name):
            params['diff_expression_matrix_set_name'] = re.sub(
                '_[Ee]xpression$', differential_expression_suffix, expression_name)
        elif re.match('.*_[Ee]xpression_[Ss]et$', expression_name):
            params['diff_expression_matrix_set_name'] = re.sub(
                '_[Ee]xpression_[Ss]et$', differential_expression_suffix, expression_name)
        else:
            params['diff_expression_matrix_set_name'] = expression_name + \
                differential_expression_suffix

        log('--->\nexpression object type: \n' +
            '{}'.format(expression_object_type))

        if re.match(r'KBaseRNASeq.RNASeqExpressionSet-\d.\d', expression_object_type):
            expression_set_data = self.ws.get_objects2(
                {'objects':
                 [{'ref': expressionset_ref}]})['data'][0]['data']

        elif re.match(r'KBaseSets.ExpressionSet-\d.\d', expression_object_type):
            expression_set_data = self.ws.get_objects2(
                {'objects':
                 [{'ref': expressionset_ref}]})['data'][0]['data']

            expression_set_data = self._transform_expression_set_data(expression_set_data)

        else:
            raise ValueError('Unsupported expression object type: ' + expression_object_type)

        mgroup = MultiGroup(self.ws)
        pairwise_mapped_expression_ids = mgroup.build_pairwise_groups(
            expression_set_data['mapped_expression_ids'])

        ballgown_output_dir = os.path.join(self.scratch, "ballgown_out")
        log("ballgown output dir is {0}".format(ballgown_output_dir))
        self._setupWorkingDir(ballgown_output_dir)

        # get set of all condition labels
        available_condition_labels = \
            self._build_condition_label_list(expression_set_data['mapped_expression_ids'])

        if params.get('run_all_combinations'):
            requested_condition_labels = available_condition_labels
        else:
            # get set of user specified condition labels
            condition_pair_subset = params.get('condition_pair_subset')
            if self._check_input_labels(condition_pair_subset, available_condition_labels):
                requested_condition_labels = list()
                # example: [{u'condition': u'hy5'}, {u'condition': u'WT'}]
                for condition in condition_pair_subset:
                    if condition.get('condition').strip() not in requested_condition_labels:
                        requested_condition_labels.append(condition.get('condition').strip())

        log("User requested pairwise combinations from condition label list : " +
            str(requested_condition_labels))

        diff_expr_files = list()

        for mapped_expression_ids in pairwise_mapped_expression_ids:
            print('processing pairwise combination: ')
            pprint(mapped_expression_ids)
            print('with condition labels: ')
            condition_labels = self._build_condition_label_list(mapped_expression_ids)
            pprint(condition_labels)

            # skip if condition labels in this pairwise combination don't exist in
            # set of user requested condition labels
            skip = False
            for condition in condition_labels:
                if condition not in requested_condition_labels:
                    log("skipping " + str(condition_labels))
                    skip = True
            if skip:
                continue

            sample_dir_group_file = self.get_sample_dir_group_file(mapped_expression_ids,
                                                                   condition_labels)

            log("about to run_ballgown_diff_exp")
            rscripts_dir = '/kb/module/rscripts'

            condition_labels_uniqued = list()
            for condition in condition_labels:
                if condition not in condition_labels_uniqued:
                    condition_labels_uniqued.append(condition)

            output_csv = 'ballgown_diffexp_' + \
                condition_labels_uniqued[0] + '_' + condition_labels_uniqued[1] + '.tsv'
            volcano_plot_file = 'volcano_plot_' + \
                condition_labels_uniqued[0] + '_' + condition_labels_uniqued[1] + '.png'

            self.run_ballgown_diff_exp(rscripts_dir,
                                       sample_dir_group_file,
                                       ballgown_output_dir,
                                       output_csv,
                                       volcano_plot_file)

            log("back from run_ballgown_diff_exp, about to load diff exp matrix file")
            # diff_expr_matrix = self.load_diff_expr_matrix(ballgown_output_dir,
            # output_csv)  # read file before its zipped

            self._update_output_file_header(os.path.join(ballgown_output_dir, output_csv))

            diff_expr_file = dict()
            diff_expr_file.update({'condition_mapping':
                                   {condition_labels_uniqued[0]: condition_labels_uniqued[1]}})
            diff_expr_file.update(
                {'diffexpr_filepath': os.path.join(ballgown_output_dir, output_csv)})
            diff_expr_files.append(diff_expr_file)

        deu_param = {
            'destination_ref': params['workspace_name'] + '/' +
            params['diff_expression_matrix_set_name'],
            'diffexpr_data': diff_expr_files,
            'tool_used': TOOL_NAME,
            'tool_version': TOOL_VERSION,
            'genome_ref': expression_set_data.get('genome_id'),
        }

        diff_expression_matrix_set_ref = self.deu.save_differential_expression_matrix_set(
            deu_param)

        returnVal = {'result_directory': ballgown_output_dir,
                     'diff_expression_matrix_set_ref':
                         diff_expression_matrix_set_ref['diffExprMatrixSet_ref']}

        report_output = self._generate_report(params,
                                              ballgown_output_dir, diff_expression_matrix_set_ref)
        returnVal.update(report_output)

        return returnVal
class STARUtils:
    STAR_VERSION = 'STAR 2.5.3a'
    STAR_BIN = '/kb/deployment/bin/STAR'
    STAR_IDX_DIR = 'STAR_Genome_index'
    STAR_OUT_DIR = 'STAR_Output'
    PARAM_IN_WS = 'output_workspace'
    PARAM_IN_FASTA_FILES = 'genomeFastaFiles'
    PARAM_IN_OUTFILE_PREFIX = 'outFileNamePrefix'
    PARAM_IN_STARMODE = 'runMode'
    PARAM_IN_THREADN = 'runThreadN'
    PARAM_IN_READS_FILES = 'readFilesIn'
    PARAM_IN_READS = 'readsset_ref'
    PARAM_IN_GENOME = 'genome_ref'
    SET_READS = 'set_reads_refs'

    def __init__(self, scratch_dir, workspace_url, callback_url, srv_wiz_url, provenance):
        self.workspace_url = workspace_url
        self.callback_url = callback_url
        self.srv_wiz_url = srv_wiz_url
        self.au = AssemblyUtil(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url, service_ver='beta')
        self.scratch = scratch_dir
        self.working_dir = scratch_dir
        self.prog_runner = Program_Runner(self.STAR_BIN, self.scratch)
        self.provenance = provenance
        self.ws_client = Workspace(self.workspace_url)

        self.parallel_runner = KBParallel(self.callback_url)
        self.qualimap = kb_QualiMap(self.callback_url, service_ver='dev')
        self.set_api_client = SetAPI(self.srv_wiz_url, service_ver='dev')
        self.eu = ExpressionUtils(self.callback_url, service_ver='beta')

    def process_params(self, params):
        """
        process_params: checks params passed to run_star method and set default values
        """
        log('Start validating run_star parameters')
        # check for required parameters
        if params.get(self.PARAM_IN_WS, None) is None:
            raise ValueError(self.PARAM_IN_WS + ' parameter is required')

        if params.get(self.PARAM_IN_STARMODE, None) is None:
            params[self.PARAM_IN_STARMODE] = 'alignReads'

        if params.get(self.PARAM_IN_GENOME, None) is None:
            raise ValueError(self.PARAM_IN_GENOME +
                             ' parameter is required for generating genome index')

        if (params.get(self.PARAM_IN_STARMODE, None) is not None and
                params[self.PARAM_IN_STARMODE] != "genomeGenerate"):
            if params.get(self.PARAM_IN_READS, None) is None:
                raise ValueError(self.PARAM_IN_READS +
                                 ' parameter is required for reads mapping')
            if not valid_string(params[self.PARAM_IN_READS], is_ref=True):
                raise ValueError("Parameter readsset_ref must be a valid Workspace object "
                                 "reference, not {}".format(
                                     params.get(self.PARAM_IN_READS, None)))

        if params.get(self.PARAM_IN_THREADN, None) is not None:
            if not isinstance(params[self.PARAM_IN_THREADN], int):
                raise ValueError(self.PARAM_IN_THREADN + ' must be of type int')
        else:
            params[self.PARAM_IN_THREADN] = 2

        if "alignment_suffix" not in params or not valid_string(params["alignment_suffix"]):
            raise ValueError("Parameter alignment_suffix must be a valid Workspace object string, "
                             "not {}".format(params.get("alignment_suffix", None)))

        if params.get(self.PARAM_IN_OUTFILE_PREFIX, None) is not None:
            if params[self.PARAM_IN_OUTFILE_PREFIX].find('/') != -1:
                raise ValueError(self.PARAM_IN_OUTFILE_PREFIX + ' cannot contain subfolder(s).')
        else:
            params[self.PARAM_IN_OUTFILE_PREFIX] = 'star_'

        if params.get('create_report', None) is None:
            params['create_report'] = 0

        return self._setDefaultParameters(params)


    def _setDefaultParameters(self, params_in):
        """set default for this group of parameters
        """
        params = copy.deepcopy(params_in)
        if params.get('outFilterType', None) is None:
            params['outFilterType'] = "\"BySJout\""
        if params.get('outFilterMultimapNmax', None) is None:
            params['outFilterMultimapNmax'] = 20
        if params.get('outSAMtype', None) is None:
            params['outSAMtype'] = 'BAM'
        if params.get('outSAMattrIHstart', None) is None:
            params['outSAMattrIHstart'] = 0
        if params.get('outSAMstrandField', None) is None:
            params['outSAMstrandField'] = 'intronMotif'
        if params.get('outFilterIntronMotifs', None) is None:
            params['outFilterIntronMotifs'] = 'RemoveNoncanonical'
        if params.get(self.SET_READS, None) is None:
            params[self.SET_READS] = self._get_reads_refs_from_setref(params)

        return params

    def _get_genome_gtf_file(self, gnm_ref, gtf_file_dir):
        """
        Get data from genome object ref and return the GTF filename (with path)
        for STAR indexing and mapping.
        STAR uses the reference annotation to guide assembly and for creating alignment
        """
        log("Converting genome {0} to GTF file in folder {1}".format(gnm_ref, gtf_file_dir))
        gfu = GenomeFileUtil(self.callback_url)
        try:
            gfu_ret = gfu.genome_to_gff({self.PARAM_IN_GENOME: gnm_ref,
                                         'is_gtf': 1,
                                         'target_dir': gtf_file_dir})
        except ValueError as egfu:
            log('GFU getting GTF file raised error:\n')
            pprint(egfu)
            return None
        else:  # no exception raised
            return gfu_ret.get('file_path')


    def _construct_indexing_cmd(self, params):
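        """
        _construct_indexing_cmd: assemble the argument list for
        `STAR --runMode genomeGenerate ...` from the indexing parameters
        (genome index directory, thread count, FASTA files, optional sjdbGTFfile/sjdbOverhang).
        :param params: indexing parameters built by _get_indexing_params
        :return: list of command tokens ready to be executed
        """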
        # STEP 1: construct the command for running `STAR --runMode genomeGenerate...`
        idx_cmd = [self.STAR_BIN]
        idx_cmd.append('--genomeDir')
        idx_cmd.append(params[self.STAR_IDX_DIR])
        idx_cmd.append('--' + self.PARAM_IN_STARMODE)
        idx_cmd.append('genomeGenerate')
        idx_cmd.append('--' + self.PARAM_IN_THREADN)
        idx_cmd.append(str(params[self.PARAM_IN_THREADN]))

        if params.get(self.PARAM_IN_FASTA_FILES, None) is not None:
            idx_cmd.append('--' + self.PARAM_IN_FASTA_FILES)
            for fasta_file in params[self.PARAM_IN_FASTA_FILES]:
                idx_cmd.append(fasta_file)

        # STEP 2: append the standard optional inputs
        if params.get('sjdbGTFfile', None) is not None:
            idx_cmd.append('--sjdbGTFfile')
            idx_cmd.append(params['sjdbGTFfile'])
        if (params.get('sjdbOverhang', None) is not None
                and params['sjdbOverhang'] > 0):
            idx_cmd.append('--sjdbOverhang')
            idx_cmd.append(str(params['sjdbOverhang']))

        # print('STAR indexing CMD:')
        # print(' '.join(idx_cmd))
        return idx_cmd

    def _construct_mapping_cmd(self, params):
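        """
        _construct_mapping_cmd: assemble the argument list for a STAR alignment run
        (`--runMode alignReads`), including the reads files, a decompression command for
        .gz/.bz2 inputs, the output prefix and the advanced optional STAR parameters.
        :param params: mapping parameters built by _get_mapping_params
        :return: list of command tokens ready to be executed
        """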
        if params.get(self.PARAM_IN_STARMODE, None) is None:
            params[self.PARAM_IN_STARMODE] = 'alignReads'

        # STEP 1: set the working folder housing the STAR output results as well as the reads info
        star_out_dir = ''
        if params.get('align_output', None) is None:
            star_out_dir = self.scratch
        else:
            star_out_dir = params['align_output']

        # STEP 2: construct the command for running STAR mapping
        mp_cmd = [self.STAR_BIN]
        mp_cmd.append('--genomeDir')
        mp_cmd.append(params[self.STAR_IDX_DIR])
        mp_cmd.append('--' + self.PARAM_IN_STARMODE)
        mp_cmd.append(params[self.PARAM_IN_STARMODE])
        mp_cmd.append('--' + self.PARAM_IN_THREADN)
        mp_cmd.append(str(params[self.PARAM_IN_THREADN]))

        if params.get(self.PARAM_IN_READS_FILES, None) is not None:
            # print('Input reads files:\n' + pformat(params[self.PARAM_IN_READS_FILES]))
            mp_cmd.append('--' + self.PARAM_IN_READS_FILES)
            for reads_file in params[self.PARAM_IN_READS_FILES]:
                mp_cmd.append(reads_file)
                readName, readsExtension = os.path.splitext(reads_file)
                # print('Reads file name-- {}/extension-- {}:'.format(readName, readsExtension))
                if readsExtension == '.gz':
                    mp_cmd.append('--readFilesCommand')
                    mp_cmd.append('gunzip')
                    mp_cmd.append('-c')

                if readsExtension == '.bz2':
                    mp_cmd.append('--readFilesCommand')
                    mp_cmd.append('bunzip2')
                    mp_cmd.append('-c')

        # STEP 3: appending the advanced optional inputs
        mp_cmd.append('--' + self.PARAM_IN_OUTFILE_PREFIX)
        mp_cmd.append(os.path.join(star_out_dir, params[self.PARAM_IN_OUTFILE_PREFIX]))

        if params.get('sjdbGTFfile', None) is not None:
            mp_cmd.append('--sjdbGTFfile')
            mp_cmd.append(params['sjdbGTFfile'])
        if (params.get('sjdbOverhang', None) is not None
		and params['sjdbOverhang'] > 0):
            mp_cmd.append('--sjdbOverhang')
            mp_cmd.append(str(params['sjdbOverhang']))

        if (params.get('outFilterType', None) is not None
                and isinstance(params['outFilterType'], str)):
            mp_cmd.append('--outFilterType')
            mp_cmd.append(params['outFilterType'])
        if (params.get('outFilterMultimapNmax', None) is not None
                and isinstance(params['outFilterMultimapNmax'], int)
                and params['outFilterMultimapNmax'] >= 0):
            mp_cmd.append('--outFilterMultimapNmax')
            mp_cmd.append(str(params['outFilterMultimapNmax']))

        #output sorted file:Aligned.sortedByCoord.out.bam
        #allowed values of --outSAMtype are BAM Unsorted or SortedByCoordinate or both
        if params.get('outSAMtype', None) is not None:
            mp_cmd.append('--outSAMtype')
            mp_cmd.append(params['outSAMtype'])
            if params.get('outSAMtype', None) == 'BAM':
                mp_cmd.append('SortedByCoordinate')

        # 'It is recommended to remove the non-canonical junctions for Cufflinks runs using
        # --outFilterIntronMotifs RemoveNoncanonical'
        if params.get('outFilterIntronMotifs', None) is not None:
            mp_cmd.append('--outFilterIntronMotifs')
            mp_cmd.append('RemoveNoncanonical')

        if (params.get('outSAMattrIHstart', None) is not None
                and isinstance(params['outSAMattrIHstart'], int)
                and params['outSAMattrIHstart'] >= 0):
            mp_cmd.append('--outSAMattrIHstart')
            mp_cmd.append(str(params['outSAMattrIHstart']))
        if (params.get('outSAMstrandField', None) is not None
                and isinstance(params['outSAMstrandField'], str)):
            mp_cmd.append('--outSAMstrandField')
            mp_cmd.append(params['outSAMstrandField'])

        quant_modes = ["TranscriptomeSAM", "GeneCounts", "Both"]
        if (params.get('quantMode', None) is not None
                and params.get('quantMode', None) in quant_modes):
            mp_cmd.append('--quantMode')
            if params['quantMode'] == "Both":
                mp_cmd.append("TranscriptomeSAM")
                mp_cmd.append("GeneCounts")
            else:
                mp_cmd.append(params['quantMode'])
        if (params.get('alignSJoverhangMin', None) is not None
                and isinstance(params['alignSJoverhangMin'], int)
                and params['alignSJoverhangMin'] > 0):
            mp_cmd.append('--alignSJoverhangMin')
            mp_cmd.append(str(params['alignSJoverhangMin']))
        if (params.get('alignSJDBoverhangMin', None) is not None
                and isinstance(params['alignSJDBoverhangMin'], int)
                and params['alignSJDBoverhangMin'] > 0):
            mp_cmd.append('--alignSJDBoverhangMin')
            mp_cmd.append(str(params['alignSJDBoverhangMin']))
        if (params.get('outFilterMismatchNmax', None) is not None
                and isinstance(params['outFilterMismatchNmax'], int)
                and params['outFilterMismatchNmax'] > 0):
            mp_cmd.append('--outFilterMismatchNmax')
            mp_cmd.append(str(params['outFilterMismatchNmax']))
        if (params.get('alignIntronMin', None) is not None
                and isinstance(params['alignIntronMin'], int)
                and params['alignIntronMin'] > 0):
            mp_cmd.append('--alignIntronMin')
            mp_cmd.append(str(params['alignIntronMin']))
        if (params.get('alignIntronMax', None) is not None
                and isinstance(params['alignIntronMax'], int)
                and params['alignIntronMax'] >= 0):
            mp_cmd.append('--alignIntronMax')
            mp_cmd.append(str(params['alignIntronMax']))
        if (params.get('alignMatesGapMax', None) is not None
                and isinstance(params['alignMatesGapMax'], int)
                and params['alignMatesGapMax'] >= 0):
            mp_cmd.append('--alignMatesGapMax')
            mp_cmd.append(str(params['alignMatesGapMax']))

        #print ('STAR mapping CMD:')
        #print ' '.join(mp_cmd)
        return mp_cmd

    def _exec_indexing(self, params):
        log('Running STAR index generating with params:\n' + pformat(params))

        idx_cmd = self._construct_indexing_cmd(params)

        exitCode = self.prog_runner.run(idx_cmd, self.scratch)

        return exitCode

    def _exec_mapping(self, params):
        log('Running STAR mapping with params:\n' + pformat(params))

        mp_cmd = self._construct_mapping_cmd(params)

        exitCode = self.prog_runner.run(mp_cmd, self.scratch)

        return exitCode

    def _exec_star_pipeline(self, params, rds_files, rds_name, idx_dir, out_dir):
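        """
        _exec_star_pipeline: run STAR genome indexing (when runMode is 'genomeGenerate')
        followed by the alignment step, returning the index directory and alignment
        output directory on success.
        """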
        # build the parameters
        params_idx = self._get_indexing_params(params, idx_dir)
        params_mp = self._get_mapping_params(params, rds_files, rds_name, idx_dir, out_dir)

        # execute indexing and then mapping
        ret = None
        try:
            if params[self.PARAM_IN_STARMODE] == 'genomeGenerate':
                ret = self._exec_indexing(params_idx)
            else:
                ret = 0
            # a non-zero exit code means STAR indexing failed
            if ret != 0:
                raise ValueError('STAR genome indexing failed with exit code {}'.format(ret))
        except ValueError as eidx:
            log('STAR genome indexing raised error:\n')
            pprint(eidx)
        else:  # no exception raised by genome indexing and STAR returns 0, then run mapping
            params_mp[self.PARAM_IN_STARMODE] = 'alignReads'
            try:
                ret = self._exec_mapping(params_mp)
                if ret != 0:
                    raise ValueError('STAR mapping failed with exit code {}'.format(ret))
            except ValueError as emp:
                log('STAR mapping raised error:\n')
                pprint(emp)
            else:  # no exception raised by STAR mapping and STAR returns 0, then move to saving and reporting
                ret = {'star_idx': idx_dir, 'star_output': params_mp.get('align_output')}

        return ret


    def upload_STARalignment(self, input_params, reads_ref, reads_info, output_bam_file):
        """
        Uploads the alignment file + metadata.
        Returns the STAR alignment reference.
        """

        aligner_opts = dict()
        for k in input_params:
            aligner_opts[k] = str(input_params[k])
        pprint(reads_info)

        alignment_name = reads_ref['alignment_output_name']
        align_upload_params = {
            "destination_ref": "{}/{}".format(input_params[self.PARAM_IN_WS], alignment_name),
            "file_path": output_bam_file,
            "assembly_or_genome_ref": input_params[self.PARAM_IN_GENOME],
            "read_library_ref": reads_info['object_ref'],
            "library_type": reads_info['style'],
            "condition": reads_info['condition'],
            "aligned_using": 'STAR',
            "aligner_version":self.STAR_VERSION,
            "aligner_opts": aligner_opts
        }

        pprint(align_upload_params)

        ra_util = ReadsAlignmentUtils(self.callback_url, service_ver='beta')
        rau_upload_ret = ra_util.upload_alignment(align_upload_params)
        alignment_ref = rau_upload_ret["obj_ref"]
        print("STAR alignment uploaded as object {}".format(alignment_ref))
        return rau_upload_ret


    def generate_report_for_single_run(self, run_output_info, params):
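        """
        generate_report_for_single_run: run QualiMap BAM QC on the uploaded alignment and
        build a KBaseReport that links the zipped STAR index/output files and the QC HTML.
        """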
        input_ref = run_output_info['upload_results']['obj_ref']
        index_dir = run_output_info['index_dir']
        output_dir = run_output_info['output_dir']
        output_files = self._generate_output_file_list(index_dir, output_dir)

        # first run qualimap
        qualimap_report = self.qualimap.run_bamqc({'input_ref': input_ref})
        qc_result_zip_info = qualimap_report['qc_result_zip_info']

        # create report
        report_text = 'Ran on a single reads library.\n\n'
        alignment_info = self.get_obj_infos(input_ref)[0]
        report_text += 'Created ReadsAlignment: ' + str(alignment_info[1]) + '\n'
        report_text += '                        ' + input_ref + '\n'
        kbr = KBaseReport(self.callback_url)
        report_info = kbr.create_extended_report({'message': report_text,
                                                  'file_links': output_files,
                                                  'objects_created': [{'ref': input_ref,
                                                                       'description': 'ReadsAlignment'}],
                                                  'report_object_name': 'kb_STAR_report_' + str(uuid.uuid4()),
                                                  'direct_html_link_index': 0,
                                                  'html_links': [{'shock_id': qc_result_zip_info['shock_id'],
                                                                  'name': qc_result_zip_info['index_html_file_name'],
                                                                  'label': qc_result_zip_info['name']}],
                                                  'html_window_height': 366,
                                                  'workspace_name': params['output_workspace']
                                                  })
        return report_info #{'report_name': report_info['name'], 'report_ref': report_info['ref']}

    def _get_reads_info(self, reads, readsSet_ref):
        '''
        _get_reads_info: fetches the detailed info for each reads object with ref in list reads_refs
        return an object of the following structure:
        {
            "style": "paired", "single", or "interleaved",
            "file_fwd": path_to_file,
            "name": name of the reads,
            "file_rev": path_to_file, only if paired end,
            "object_ref": reads reference for downstream convenience,
            "condition": the condition for the reads.
        }
        '''
        try:
            print("Fetching FASTA file from reads reference {}".format(reads['ref']))
            ret_reads_info = fetch_reads_from_reference(reads['ref'], self.callback_url)
        except ValueError:
            print("Incorrect object type for fetching a FASTA file!")
            raise

        if ret_reads_info.get("file_fwd", None) is None:
            raise RuntimeError("FASTA file fetched from reads {} doesn't seem to exist!".format(reads['ref']))
        else:
            if reads.get('condition', None) is not None:
                ret_reads_info['condition'] = reads['condition']
            else:
                ret_reads_info['condition'] = 'unspecified'
            if reads.get('object_ref', None) != readsSet_ref:
                ret_reads_info[self.PARAM_IN_READS] = readsSet_ref

        return ret_reads_info


    def _get_genome_fasta(self, gnm_ref):
        genome_fasta_files = list()
        if gnm_ref is not None:
            try:
                print("Fetching FASTA file from object {}".format(gnm_ref))
                genome_fasta_file = fetch_fasta_from_object(gnm_ref, self.workspace_url,
                                                            self.callback_url)
                print("Done fetching FASTA file! Path = {}".format(
                    genome_fasta_file.get("path", None)))
            except ValueError:
                print("Incorrect object type for fetching a FASTA file!")
                raise

            if genome_fasta_file.get("path", None) is None:
                raise RuntimeError(
                    "FASTA file fetched from object {} doesn't seem to exist!".format(gnm_ref))
            else:
                genome_fasta_files.append(genome_fasta_file["path"])
        return genome_fasta_files


    def convert_params(self, validated_params):
        """
        Convert input parameters with KBase ref format into STAR parameters,
        and add the advanced options.
        """
        params = copy.deepcopy(validated_params)
        params['runMode'] = 'genomeGenerate'

        if validated_params.get('create_report', None) is not None:
            params['create_report'] = validated_params['create_report']
        if validated_params.get('concurrent_local_tasks', None) is not None:
            params['concurrent_local_tasks'] = validated_params['concurrent_local_tasks']
        if validated_params.get('concurrent_njsw_tasks', None) is not None:
            params['concurrent_njsw_tasks'] = validated_params['concurrent_njsw_tasks']
        if validated_params.get('alignmentset_suffix', None) is not None:
            params['alignmentset_suffix'] = validated_params['alignmentset_suffix']

        # Add advanced options from validated_params to params
        sjdbGTFfile = validated_params.get("sjdbGTFfile", None)
        if sjdbGTFfile is not None:
            params['sjdbGTFfile'] = sjdbGTFfile
        else:
            params['sjdbGTFfile'] = self._get_genome_gtf_file(
                params[self.PARAM_IN_GENOME],
                os.path.join(self.scratch, self.STAR_IDX_DIR))
        if validated_params.get('sjdbOverhang', None) is not None:
            params['sjdbOverhang'] = validated_params['sjdbOverhang']
        else:
            params['sjdbOverhang'] = 100

        quant_modes = ["TranscriptomeSAM", "GeneCounts", "Both"]
        if (validated_params.get('quantMode', None) is not None
                and validated_params.get('quantMode', None) in quant_modes):
            params['quantMode'] = validated_params['quantMode']
        else:
            params['quantMode'] = 'Both'

        return params


    def _get_indexing_params(self, params, star_idx_dir):
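        """
        _get_indexing_params: collect the subset of params needed for STAR genome indexing
        (runMode, runThreadN, index directory, FASTA files, optional GTF file and overhang).
        """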
        params_idx = {
            'runMode': 'genomeGenerate',
            'runThreadN': params[self.PARAM_IN_THREADN],
            self.STAR_IDX_DIR: star_idx_dir,
            'genomeFastaFiles': params[self.PARAM_IN_FASTA_FILES]
        }
        if params.get('sjdbGTFfile', None) is not None:
            params_idx['sjdbGTFfile'] = params['sjdbGTFfile']
        if params.get('sjdbOverhang', None) is not None :
            params_idx['sjdbOverhang'] = params['sjdbOverhang']

        return params_idx


    def _get_mapping_params(self, params, rds_files, rds_name, idx_dir, out_dir):
        ''' build the mapping parameters'''
        aligndir = out_dir
        if rds_name:
            aligndir = os.path.join(out_dir, rds_name)
            self._mkdir_p(aligndir)
            #print '**********STAR output directory created:{}'.format(aligndir)

        params_mp = copy.deepcopy(params)
        params_mp['runMode'] = 'alignReads'
        params_mp['readFilesIn'] = rds_files
        params_mp[self.STAR_IDX_DIR] = idx_dir
        params_mp['align_output'] = aligndir

        return params_mp


    def determine_input_info(self, validated_params):
        ''' get info on the readsset_ref object and determine if we run once or run on a set
        input info provides information on the input and tells us if we should
        run as a single_library or as a set:
             input_info = {'run_mode': '', 'info': [..], 'ref': '55/1/2'}
        '''
        info = self.get_obj_infos(validated_params[self.PARAM_IN_READS])[0]
        obj_type = self.get_type_from_obj_info(info)
        if obj_type in ['KBaseAssembly.PairedEndLibrary', 'KBaseAssembly.SingleEndLibrary',
                        'KBaseFile.PairedEndLibrary', 'KBaseFile.SingleEndLibrary']:
            return {'run_mode': 'single_library', 'info': info, 'ref': validated_params[self.PARAM_IN_READS]}
        if obj_type == 'KBaseRNASeq.RNASeqSampleSet':
            return {'run_mode': 'sample_set', 'info': info, 'ref': validated_params[self.PARAM_IN_READS]}
        if obj_type == 'KBaseSets.ReadsSet':
            return {'run_mode': 'sample_set', 'info': info, 'ref': validated_params[self.PARAM_IN_READS]}

        raise ValueError('Object type of readsset_ref is not valid, was: ' + str(obj_type))

    def determine_unique_reads_names(self, validated_params):
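        """
        determine_unique_reads_names: look up object infos for the reads ref(s) and return
        the unique object names via get_unique_names.
        """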
        infos = self.get_obj_infos(validated_params[self.PARAM_IN_READS])
        return get_unique_names(infos)

    def get_type_from_obj_info(self, info):
        return info[2].split('-')[0]

    def get_name_from_obj_info(self, info):
        return info[1]

    def get_obj_infos(self, ref):
        return self.ws_client.get_object_info3({'objects': [{'ref': ref}]})['infos']

    def get_object_names(self, ref_list):
        """
        From a list of workspace references, returns a mapping from ref -> name of the object.
        """
        obj_ids = list()
        for ref in ref_list:
            obj_ids.append({"ref": ref})
        info = self.ws_client.get_object_info3({"objects": obj_ids})
        name_map = dict()
        # we already have the refs as passed previously, so use those for mapping, as they're in
        # the same order as what's returned.
        for i in range(len(info["infos"])):
            name_map[ref_list[i]] = info["infos"][i][1]
        return name_map


    def _mkdir_p(self, dir):
        """
        _mkdir_p: make directory for given path
        """
        log('Creating a new dir: ' + dir)
        if not dir:
            return
        if not os.path.exists(dir):
            os.makedirs(dir)
        else:
            log('{} already exists; skipping creation.'.format(dir))


    def create_star_dirs(self, star_home):
        '''creating the directories for STAR'''
        # the index directory
        idxdir = os.path.join(star_home, self.STAR_IDX_DIR)
        self._mkdir_p(idxdir)
        # the output directory
        outdir = os.path.join(star_home, self.STAR_OUT_DIR)
        self._mkdir_p(outdir)

        return (idxdir, outdir)


    def _get_reads_refs_from_setref(self, params):
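        """
        _get_reads_refs_from_setref: resolve the individual reads refs contained in the
        given reads/sample set reference using fetch_reads_refs_from_sampleset.
        """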
        readsSet_ref = params[self.PARAM_IN_READS]
        reads_refs = list()
        try:
            #print("Fetching reads ref(s) from sample/reads set ref {}".format(readsSet_ref))
            reads_refs = fetch_reads_refs_from_sampleset(
                                    readsSet_ref,
                                    self.workspace_url,
                                    self.callback_url,
                                    params)
            #print("\nDone fetching reads ref(s) from readsSet {}--\nDetails:\n".format(readsSet_ref))
            #pprint(reads_refs)
        except ValueError:
            print("Incorrect object type for fetching reads ref(s)!")
            raise

        return reads_refs

    def _generate_output_file_list(self, idx_dir, out_dir):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """

        log('start packing result files')

        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        star_index = os.path.join(output_directory, 'star_index.zip')
        star_output = os.path.join(output_directory, 'star_output.zip')
        self.zip_folder(idx_dir, star_index)
        self.zip_folder(out_dir, star_output)

        #star_index = self.zip_folder_withDFU(idx_dir, 'star_index')
        #star_output = self.zip_folder_withDFU(out_dir, 'star_output')

        output_files.append({'path': star_index,
                             'name': os.path.basename(star_index),
                             'label': os.path.basename(star_index),
                             'description': 'Index file(s) generated by STAR'})

        output_files.append({'path': star_output,
                             'name': os.path.basename(star_output),
                             'label': os.path.basename(star_output),
                             'description': 'Output file(s) generated by STAR'})

        return output_files


    def zip_folder_withDFU(self, folder_path, output_name):
        """Zip the contents of an entire folder (with that folder included
        in the archive). Empty subfolders will be included in the archive
        as well.
        """
        output_path = self.dfu.pack_file(
                {'file_path': folder_path + '/' + output_name,
                 'pack': 'zip'})['file_path']

        print("{} created successfully.".format(output_path))

        #with zipfile.ZipFile(output_path, "r") as f:
            #print 'Checking the zipped file......\n'
            #for info in f.infolist():
                #    print info.filename, info.date_time, info.file_size, info.compress_size
            #for fn in f.namelist():
                #print fn

        return output_path


    def zip_folder(self, folder_path, output_path):
        """Zip the contents of an entire folder (with that folder included in the archive). 
        Empty subfolders could be included in the archive as well if the commented portion is used.
        """
        with zipfile.ZipFile(output_path, 'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as ziph:
            for root, folders, files in os.walk(folder_path):
                # Include all subfolders, including empty ones.
                #for folder_name in folders:
                #    absolute_path = os.path.join(root, folder_name)
                #    relative_path = os.path.join(os.path.basename(root), folder_name)
                #    print "Adding {} to archive.".format(absolute_path)
                #    ziph.write(absolute_path, relative_path)
                for f in files:
                    absolute_path = os.path.join(root, f)
                    relative_path = os.path.join(os.path.basename(root), f)
                    #print "Adding {} to archive.".format(absolute_path)
                    ziph.write(absolute_path, relative_path)

        print("{} created successfully.".format(output_path))

        #with zipfile.ZipFile(output_path, "r") as f:
        #    print 'Checking the zipped file......\n'
        #    for info in f.infolist():
        #        print info.filename, info.date_time, info.file_size, info.compress_size


    def _generate_html_report(self, out_dir, obj_ref):
        """
        _generate_html_report: generate html summary report
        """

        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        star_obj = self.ws_client.get_objects2({'objects':
                                                 [{'ref': obj_ref}]})['data'][0]
        star_obj_info = star_obj['info']
        star_obj_data = star_obj['data']
        star_obj_type = star_obj_info[2]

        Overview_Content = ''
        if re.match(r'KBaseRNASeq.RNASeqAlignment-\d.\d', star_obj_type):
            Overview_Content += '<br/><table><tr><th>Generated Alignment Object</th>'
            Overview_Content += '<th></th></tr>'
            Overview_Content += '<tr><th>Alignment Name</th><th>Condition</th></tr>'
            Overview_Content += '<tr><td>{} ({})</td>'.format(star_obj_info[1],obj_ref)
            Overview_Content += '<td>{}</td></tr>'.format(star_obj_data['condition'])
            Overview_Content += '</table>'
        elif (re.match(r'KBaseRNASeq.RNASeqAlignmentSet-\d.\d', star_obj_type)
                or re.match(r'KBaseSets.ReadsAlignmentSet-\d.\d', star_obj_type)
                or re.match(r'KBaseSet.RNASeqAlignmentSet-\d.\d', star_obj_type)):
            Overview_Content += '<br/><table><tr><th>Generated AlignmentSet Object</th></tr>'
            Overview_Content += '<tr><td>{} ({})'.format(star_obj_info[1],obj_ref)
            Overview_Content += '</td></tr></table>'
            Overview_Content += '<p><br/></p>'
            Overview_Content += '<table><tr><th>Generated Alignment Objects</th>'
            Overview_Content += '<th></th></tr>'
            Overview_Content += self._fill_html_trs('Alignment Name', star_obj_data)
            Overview_Content += '</table>'
        elif re.match('KBaseRNASeq.RNASeqExpression-\d.\d', star_obj_type):
            Overview_Content += '<br/><table><tr><th>Generated Expression Object</th>'
            Overview_Content += '<th></th></tr>'
            Overview_Content += '<tr><th>Expression Name</th><th>Condition</th></tr>'
            Overview_Content += '<tr><td>{} ({})</td>'.format(star_obj_info[1], obj_ref)
            Overview_Content += '<td>{}</td></tr>'.format(star_obj_data['condition'])
            Overview_Content += '</table>'
        elif re.match('KBaseSets.ExpressionSet-\d.\d', star_obj_type):
            Overview_Content += '<br/><table><tr><th>Generated ExpressionSet Object</th></tr>'
            Overview_Content += '<tr><td>{} ({})'.format(star_obj_info[1], obj_ref)
            Overview_Content += '</td></tr></table>'
            Overview_Content += '<p><br/></p>'
            Overview_Content += '<table><tr><th>Generated Expression Objects</th>'
            Overview_Content += '<th></th></tr>'
            Overview_Content += self._fill_html_trs('Expression Name', star_obj_data)
            Overview_Content += '</table>'

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<p>Overview_Content</p>',
                                                          Overview_Content)
                result_file.write(report_template)

        html_report.append({'path': result_file_path,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for STAR'})
        return html_report
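
    # Note on the template mechanism above (assumed layout, based on the replace()
    # call): report_template.html ships next to this module and contains a literal
    # '<p>Overview_Content</p>' placeholder paragraph, e.g.
    #
    #     <html><body>
    #       <p>Overview_Content</p>
    #     </body></html>
    #
    # so a plain string replace() is enough to inject the table markup; no templating
    # engine is involved.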

    def _fill_html_trs(self, col_caption, obj_data):
        '''
        _fill_html_trs: simply creates an HTML string of table rows (tr) with td cells
        '''
        tr_html_str = '<tr><th>{}</th><th>Condition</th></tr>'.format(col_caption)

        for item in obj_data['items']:
            item_obj = self.ws_client.get_objects2({'objects':[{'ref': item['ref']}]})['data'][0]
            item_obj_info = item_obj['info']
            item_obj_data = item_obj['data']
            obj_name = item_obj_info[1]

            tr_html_str += '<tr><td>{} ({})</td>'.format(obj_name, item['ref'])
            tr_html_str += '<td>{}</td></tr>'.format(item_obj_data['condition'])

        return tr_html_str

    def _generate_star_report(self, obj_ref, report_text, html_links, workspace_name, index_dir, output_dir):
        """
        _generate_star_report: generate summary report
        """
        log('creating STAR report')

        output_files = self._generate_output_file_list(index_dir, output_dir)
        output_html_files = self._generate_html_report(output_dir, obj_ref)
        output_html_files += html_links

        star_obj = self.ws_client.get_objects2({'objects':[{'ref': obj_ref}]})['data'][0]
        star_obj_info = star_obj['info']
        star_obj_data = star_obj['data']

        star_obj_type = star_obj_info[2]
        if re.match('KBaseRNASeq.RNASeqAlignment-\d+.\d+', star_obj_type):
            objects_created = [{'ref': obj_ref,
                                'description': 'RNASeqAlignment generated by STAR'}]
        elif (re.match('KBaseRNASeq.RNASeqAlignmentSet-\d+.\d+', star_obj_type)
                or re.match('KBaseSets.ReadsAlignmentSet-\d+.\d+', star_obj_type)
                or re.match('KBaseSet.RNASeqAlignmentSet-\d+.\d+', star_obj_type)):
            objects_created = [{'ref': obj_ref,
                'description': '{} generated by STAR'.format(re.sub(r"-\d+.\d+", "",star_obj_type))}]
            items = star_obj_data['items']
            for item in items:
                objects_created.append({'ref': item['ref'],
                                        'description': 'Alignment generated by STAR'})
        elif re.match('KBaseRNASeq.RNASeqExpression-\d+.\d+', star_obj_type):
            objects_created = [{'ref': obj_ref,
                                'description': 'Expression generated by STAR'}]
        elif re.match('KBaseSets.ExpressionSet-\d+.\d+', star_obj_type):
            objects_created = [{'ref': obj_ref,
                                'description': 'ExpressionSet generated by STAR'}]
            items = star_obj_data['items']
            for item in items:
                objects_created.append({'ref': item['ref'],
                                        'description': 'Expression generated by STAR'})

        report_params = {'message': report_text,
                         'workspace_name': workspace_name,
                         'file_links': output_files,
                         'objects_created': objects_created,
                         'html_links': output_html_files,
                         'direct_html_link_index': 0,
                         'html_window_height': 366,
                         'report_object_name': 'kb_STAR_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        report_output = kbase_report_client.create_extended_report(report_params)

        return report_output

    def upload_alignment_set(self, alignment_items, alignmentset_name, ws_name):
        """
        Compiles and saves a set of alignment references (plus a description) into a
        KBaseRNASeq.RNASeqAlignmentSet.
        Returns the SetAPI save result for the new alignment set.
        alignment_items: [{
            "ref": alignment_ref,
            "label": condition label
        }]
        """
        print("Uploading completed alignment set")
        alignment_set = {
            "description": "Alignments using STAR, v.{}".format(self.STAR_VERSION),
            "items": alignment_items
        }
        set_info = self.set_api_client.save_reads_alignment_set_v1({
            "workspace": ws_name,
            "output_object_name": alignmentset_name,
            "data": alignment_set
        })
        return set_info
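

# A minimal usage sketch (hypothetical references and names, not part of the original
# module) showing the shape of alignment_items expected by upload_alignment_set above:
#
#     alignment_items = [
#         {"ref": "123/4/1", "label": "wild_type"},
#         {"ref": "123/5/1", "label": "treated"},
#     ]
#     set_info = star_util.upload_alignment_set(
#         alignment_items, "my_star_alignment_set", "my_workspace")
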
class MutualInfoUtil:
    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)
        self.scratch = config['scratch']

    def _mkdir_p(self, path):
        """
		_mkdir_p: make directory for given path
		"""
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def test_dfu(self):
        output_directory = self.scratch
        #output_directory = "/kb/module/test1/"
        #os.mkdir(output_directory)
        #self._mkdir_p(output_directory)

        test_file = os.path.join(output_directory, 'index.html')
        with open(test_file, 'w') as file:
            file.write("test!")
        print("OUTPUT DIR")
        print(output_directory)
        print(os.listdir(output_directory))
        print("file_to_shock")
        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'targz'
            #'pack': 'zip'
        })
        print(report_shock_id)
        return

    def _validate_run_flux_mutual_information_analysis_params(self, params):
        """
		_validate_run_flux_mutual_information_analysis_params:
				validates params passed to run_flux_mutual_information_analysis method
		"""

        log('start validating run_flux_mutual_information_analysis params')

        # check for required parameters
        for p in ['fbamodel_id', 'compounds', 'media_id', 'workspace_name']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _get_file_from_ws(self, ref):
        try:
            file_path = self.ws.get_objects2({'objects': [{'ref': ref}]})
            file_path = file_path['data'][0]
        except Exception as e:
            raise ValueError('Unable to get object from workspace: (' + ref +
                             ')' + str(e))
        return file_path

    def _make_media_files(self, ws_name, base, compounds):
        """
        Build and store media objects for each combination of compounds added to the base media.
        :param ws_name: the workspace to read the base media from and to save the new media to
        :param base: the base media object name or reference
        :param compounds: the set of compounds to test
        :return: a list of media ids, a matrix defining each media combination, and the uuid
                 prefixed to the names of the saved media objects
        """

        ref = ws_name + "/" + base
        if base.find("/") != -1:
            ref = base

        output = self._get_file_from_ws(ref)
        base_media = output['data']
        base = output['info'][1]
        myuuid = str(uuid.uuid4())
        media_ids = [base]
        new_media_list = []
        media_matrix = [[""] + compounds]
        # first data row: the unmodified base media with no added compounds
        media_matrix.append([base] + [0] * len(compounds))
        for n_comp in range(1, len(compounds) + 1):
            for combo in combinations(compounds, n_comp):
                new_media_id = base + '_v%s' % len(media_matrix)
                media_ids.append(new_media_id)
                media_matrix.append(
                    [new_media_id] +
                    [1 if comp in combo else 0 for comp in compounds])
                new_media = deepcopy(base_media)
                new_media['id'] = new_media_id
                new_media['name'] = new_media_id
                for new_comp in combo:
                    new_media['mediacompounds'].append({
                        'compound_ref':
                        '48/1/1/compounds/id/%s' % new_comp.split('_')[0],
                        'concentration':
                        1.0,
                        'maxFlux':
                        1000,
                        'minFlux':
                        -1000
                    })
                new_media_list.append(new_media)

        print("Made %s Media Files" % (len(media_ids) - 1))
        info = self.ws.save_objects({
            'workspace':
            ws_name,
            "objects": [{
                "hidden": 1,
                "type": "KBaseBiochem.Media",
                "data": media,
                "name": myuuid + "-" + media['name']
            } for media in new_media_list]
        })
        #print(info)
        return media_ids, media_matrix, myuuid
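
    # Illustrative example (not part of the original code) of the return values for
    # compounds ['A', 'B'] and a base media named 'm0':
    #
    #   media_ids    = ['m0', 'm0_v2', 'm0_v3', 'm0_v4']
    #   media_matrix = [['', 'A', 'B'],
    #                   ['m0',    0, 0],
    #                   ['m0_v2', 1, 0],
    #                   ['m0_v3', 0, 1],
    #                   ['m0_v4', 1, 1]]
    # plus the uuid string prefixed to the names of the newly saved media objects.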

    def _run_fba(self, workspace_name, media_id_list, fbamodel_id, myuuid,
                 base_media):
        print('running fba')
        fba_tool_obj = fba_tools(self.callback_url, service_ver='dev')
        new_media_list = []
        for media in media_id_list:
            if media == base_media:
                new_media_list.append(workspace_name + "/" + media)
            else:
                new_media_list.append(workspace_name + "/" + myuuid + "-" +
                                      media)

        fba_tool_obj.run_flux_balance_analysis({
            "max_c_uptake":
            60,  #"max_c_uptake": 6, // previously default is 6 later set to 60
            "workspace": workspace_name,
            "fbamodel_id": fbamodel_id,
            "fba_output_id": fbamodel_id + ".mifba",
            "fbamodel_workspace": workspace_name,
            "media_id_list": new_media_list,
            "target_reaction": "bio1",
            "minimize_flux": 1
        })
        output = self.ws.get_objects2({
            'objects': [{
                'ref': workspace_name + "/" + fbamodel_id + '.mifba'
            }]
        })

        #json.dump(output, open(self.scratch+'/fba.json', 'w'))

        fba = output['data'][0]['data']
        biomass_data = "FBAs,Biomass\n"
        secretion_file = "," + ','.join(media_id_list) + "\n"
        full_secretion_file = "," + ','.join(media_id_list) + "\n"
        full_flux_file = "," + ','.join(media_id_list) + "\n"
        flux_file = "," + ','.join(media_id_list) + "\n"
        objectives = fba['other_objectives']
        for i in range(0, len(objectives)):
            biomass_data = biomass_data + media_id_list[i] + "," + str(
                objectives[i]) + "\n"

        flux_vars = fba['FBAReactionVariables']
        for var in flux_vars:
            id = var['modelreaction_ref'].split("/").pop()
            flux_file = flux_file + id
            full_flux_file = full_flux_file + id
            fluxes = var['other_values']
            for i in range(0, len(objectives)):
                if objectives[i] == 0:
                    full_flux_file = full_flux_file + ",0"
                    flux_file = flux_file + ",0"
                else:
                    full_flux_file = full_flux_file + "," + str(fluxes[i])
                    if abs(fluxes[i]) < 1e-7:
                        flux_file = flux_file + ",0"
                    else:
                        flux_file = flux_file + ",1"
            flux_file = flux_file + "\n"
            full_flux_file = full_flux_file + "\n"

        secretion_vars = fba['FBACompoundVariables']
        for var in secretion_vars:
            id = var['modelcompound_ref'].split("/").pop()
            secretion_file = secretion_file + id
            full_secretion_file = full_secretion_file + id
            fluxes = var['other_values']
            for i in range(0, len(objectives)):
                if objectives[i] == 0:
                    full_secretion_file = full_secretion_file + ",0"
                    secretion_file = secretion_file + ",0"
                else:
                    full_secretion_file = full_secretion_file + "," + str(
                        fluxes[i])
                    if abs(fluxes[i]) < 1e-7:
                        secretion_file = secretion_file + ",0"
                    elif fluxes[i] < 0:
                        secretion_file = secretion_file + ",-1"
                    else:
                        secretion_file = secretion_file + ",1"
            secretion_file = secretion_file + "\n"
            full_secretion_file = full_secretion_file + "\n"

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        biomass_path = os.path.join(output_directory, 'biomass.csv')
        secretion_path = os.path.join(output_directory, 'secretion.csv')
        flux_path = os.path.join(output_directory, 'flux.csv')
        full_secretion_path = os.path.join(output_directory,
                                           'full_secretion.csv')
        full_flux_path = os.path.join(output_directory, 'full_flux.csv')

        with open(biomass_path, 'w') as biomass_f:
            biomass_f.write(biomass_data)

        with open(secretion_path, 'w') as secretion_f:
            secretion_f.write(secretion_file)

        with open(flux_path, 'w') as flux_f:
            flux_f.write(flux_file)

        with open(full_secretion_path, 'w') as full_secretion_f:
            full_secretion_f.write(full_secretion_file)

        with open(full_flux_path, 'w') as full_flux_f:
            full_flux_f.write(full_flux_file)

        return [
            biomass_path, secretion_path, flux_path, full_secretion_path,
            full_flux_path
        ]
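
    # Illustrative example (not part of the original code) of the thresholding used
    # when writing flux.csv and secretion.csv above. For one row with
    # other_objectives = [0.9, 0.0, 0.4] and other_values = [-5.1, 2.0, 3e-9]:
    #
    #   flux.csv row      -> <rxn_id>,1,0,0   (1 if the FBA grew and |flux| >= 1e-7)
    #   secretion.csv row -> <cpd_id>,-1,0,0  (sign of the flux, 0 below the cutoff)
    #
    # The full_*.csv files keep the raw flux values (zeroed only for media in which
    # the FBA did not grow).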

    def _make_index_html(self, result_file_path, mutual_info_dict):
        overview_content = ''
        overview_content += '<table><tr><th>Mutual Information for various chemical compound combinations'
        overview_content += '</th></tr>'
        overview_content += '<tr><th>Input Chemical Compound Combination</th>'
        overview_content += '<th>Mutual Information (in Bits)</th>'
        overview_content += '</tr>'

        for k, v in mutual_info_dict.items():
            overview_content += '<tr><td>{}</td><td>{}</td></tr>'.format(k, v)
        overview_content += '</table>'
        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', overview_content)
                result_file.write(report_template)
        return

    def _generate_html_report(self, result_directory, mutual_info_dict):
        """
		_generate_html_report: generate html summary report
		"""
        #scratch, uui, datafileutil, file_to_shock, shockId, extended report

        log('start generating html report')

        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))

        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory,
                                        'mutual_information_report.html')

        shutil.copy(os.path.join(result_directory, 'MI_plot.png'),
                    os.path.join(output_directory, 'MI_plot.png'))

        overview_content = ''
        overview_content += '<table><tr><th>Mutual Information for various chemical compound combinations'
        overview_content += '</th></tr>'
        overview_content += '<tr><th>Input Chemical Compound Combination</th>'
        overview_content += '<th>Mutual Information (in Bits)</th>'
        overview_content += '</tr>'

        for k, v in mutual_info_dict.items():
            overview_content += '<tr><td>{}</td><td>{}</td></tr>'.format(k, v)
        overview_content += '</table>'

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', overview_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'targz'
        })['shock_id']

        #report_shock_id = self.dfu.file_to_shock({'file_path': output_directory,
        #										  'pack': 'zip'})['shock_id']

        html_report.append({
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for Mutual Information App'
        })

        return html_report

    def _generate_report(self, result_directory, mutual_info_dict,
                         workspace_name):
        """
		_generate_report: generate summary report
		"""
        print('-->I am here *************')
        uuidStr = str(uuid.uuid4())
        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        test_file = os.path.join(output_directory, "index.html")
        self._make_index_html(test_file, mutual_info_dict[1])
        #shutil.copy2(os.path.join(os.path.dirname(__file__), 'data', 'index.html'), output_directory)

        # shutil.copy('/kb/module/data/index.html', result_directory + '/' + uuidStr + '/index.html')
        json.dump(mutual_info_dict[0],
                  open(os.path.join(output_directory, 'pdata.json'), 'w'))
        #shutil.copy('pdata.json', result_directory + '/' + uuidStr + '/pdata.json')

        # DataFileUtils to shock
        print(output_directory)
        print(os.listdir(output_directory))
        report_shock_result = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'targz'
        })
        #report_shock_result = self.dfu.file_to_shock({'file_path': output_directory,
        #											 'pack': 'zip'})

        report_shock_id = report_shock_result['shock_id']
        print(report_shock_result)

        report_file = {
            'name': 'index.html',
            'description': 'the report',
            'shock_id': report_shock_id
        }
        log('creating report')
        #output_html_files = self._generate_html_report(result_directory,
        #											   mutual_info_dict)
        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'html_links': [report_file],
            'file_links': [],
            'direct_html_link_index': 0,
            'html_window_height': 333,
            'report_object_name': 'MutualInfomation_report_' + uuidStr
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

######### ALL THREE MUTUAL INFORMATION CALCULATIONS START FROM HERE #############

    def _generate_mutual_info(self, media_matrix, fba_file, mi_options):

        #print('this is fba_file')
        #print(fba_file)
        df1 = pd.read_csv(fba_file[0])
        df1.values

        #df1.as_matrix()
        #print('-->printing df1')# **** rm
        #print(df1.to_string())# **** rm
        #print(type(df1))  # **** rm
        #print('-->printing media_matrix')
        #print(media_matrix)

        df3 = pd.DataFrame(columns=media_matrix[0][1:])
        # each media_matrix row after the header is [media_id, 0/1 flag per compound]
        for i in range(1, len(media_matrix)):
            df3.loc[media_matrix[i][0]] = media_matrix[i][1:]

        #print('-->*************OK')
        #print(df3)

        #----Input validation of Media/FBAs against the binary-matrix FBAs------
        # 1.0 The number of media rows must equal the number of FBA columns in the flux file
        #     (i.e. its column count minus the reaction-id column).
        #     If not: raise an error about a mismatched number of FBAs between the media and the binary matrix.
        # 1.1 Check that the Media.csv file contains only binary values (i.e. 0 and 1).
        #     If not: raise an error about inappropriate input values.
        # 1.2 Check that the compounds in the Media.csv file match the number of FBAs.
        #     If not: raise an error about inappropriate input values.

        media_matrix = df3
        s_df1 = df1.shape
        s_df2 = media_matrix.shape
        #print(media_matrix,type(media_matrix))

        Temp_df2 = np.array(media_matrix.values)
        #print('-->******')
        #print(Temp_df2)
        # Create matrix with only the elements remove first column and all the rows
        Temp_df2 = Temp_df2[0:, 1:]

        Bin_val_check = np.array_equal(Temp_df2, Temp_df2.astype(bool))
        #num_compounds = (s_df2[1])-1
        num_compounds = s_df2[1]

        if ((s_df1[1] - 1) != s_df2[0]) or (Bin_val_check != True) or (int(
                math.log(s_df2[0], 2)) != num_compounds):
            print('invalid input values')

        #-----All possible combinations of the chemical compounds----------------------
        # 2.0 Separating m0 from the rest of the labels

        Temp1_df2 = media_matrix
        #print('-->*************OK')
        #print(Temp1_df2)
        cols = Temp1_df2.columns
        for i in range(0, len(cols)):
            Temp1_df2.loc[Temp1_df2[cols[i]] == 1, cols[i]] = cols[i]
        #print('-->*************OK')
        #print (Temp1_df2)

        # 2.1 Creating a dictionary for all FBAs except m0
        #print(len(Temp1_df2))
        #print('--->*********')
        #print(Temp1_df2)

        mydict = {}
        for x in range(0, len(Temp1_df2)):
            for i in range(0, s_df2[1]):
                currentvalue = Temp1_df2.iloc[x, i]
                currentid = Temp1_df2.index[x]
                mydict.setdefault(currentid, [])
                if currentvalue != 0:
                    mydict[currentid].append(currentvalue)
                # Add the first key as m0
        media_0_name = Temp1_df2.index[0]
        mydict[media_0_name] = ["0"]
        # Sort the keys
        mydict = collections.OrderedDict(natsort.natsorted(mydict.items()))
        #print ('--> ********')
        compoundslist = Temp1_df2.columns.get_values()
        compoundslist = compoundslist.tolist()
        #print(compoundslist)
        #print('all possible combination')
        #print(len(compoundslist))

        # Build the list of all compound combinations
        my_combi_list = []
        for L in range(0, len(compoundslist) + 1):
            for subset in itertools.combinations(compoundslist, L):
                my_combi_list.append(list(subset))

        my_combi_list[0] = [0]
        # print(my_combi_list)
        '''
		for k, v in mydict.iteritems():
			#print('--> ********')
			print(k, v)
		'''

        # Create a dictionary where the keys are compound-combination labels
        # and the values are the lists of corresponding FBAs in df2
        result_dict = {}
        for element in my_combi_list[1:]:
            for k, v in mydict.iteritems():
                if set(v).issubset(set(map(lambda x: str(x), element))):
                    key = ','.join(map(lambda x: str(x), element))
                    if result_dict.get(key):
                        media_list = result_dict[key]
                        media_list.append(k)
                        media_list = list(set(media_list))
                        result_dict.update({key: media_list})
                    else:
                        result_dict.update({key: [media_0_name, k]})

        # Sort the keys
        result_dict['0'] = [media_0_name]
        result_dict = collections.OrderedDict(
            natsort.natsorted(result_dict.items()))
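
        # Illustrative example (not part of the original code) for compounds A and B:
        #   mydict      = {'m0': ['0'], 'm0_v2': ['A'], 'm0_v3': ['B'], 'm0_v4': ['A', 'B']}
        #   result_dict = {'0':   ['m0'],
        #                  'A':   ['m0', 'm0_v2'],
        #                  'B':   ['m0', 'm0_v3'],
        #                  'A,B': ['m0', 'm0_v2', 'm0_v3', 'm0_v4']}
        # i.e. each compound combination maps to the base media plus every FBA whose
        # added compounds are a subset of that combination (member order may vary).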
        # print(result_dict)
        #print('-->I am here **** OK')
        #print(result_dict)
        #print (df1)

        # Create a dictionary where the keys are compound-combination labels
        # and the values are the corresponding FBA columns (df1 values) for that combination
        All_Comp_Combi_dic = {}
        for column, value in result_dict.items():
            All_Comp_Combi_dic.update({column: df1.get(value)})

        # print('-->All_Comp_Combi_dic******')
        # print (All_Comp_Combi_dic)
        # print(result_dict)

        # Put the combination labels and their flux DataFrames into a two-column
        # DataFrame (df[0] = label, df[1] = data)
        df = (pd.DataFrame(All_Comp_Combi_dic.items()))
        #print('--> printing df')
        #print(df[0].to_string())
        #print(df[1][7])

        ######### INTRACELLULAR FLUX MUTUAL INFORMATION CALCULATION #############
        if mi_options == "flux":
            print('Intracellular flux')
            MI_dict = {}
            for k in range(0, len(df[0])):
                drop_rows_df = df[1][k].drop_duplicates(keep="first")
                drop_columns_df = drop_rows_df.T.drop_duplicates(
                    keep="first").T
                remove = []
                removed = {}
                count_values = {}
                cols = df[1][k].columns
                for i in range(len(cols) - 1):
                    duplicated = []
                    v = df[1][k][cols[i]].values
                    for j in range(i + 1, len(cols)):
                        if np.array_equal(v, df[1][k][cols[j]].values):
                            remove.append(cols[j])
                            duplicated.append(cols[j])
                    if duplicated and cols[i] not in remove:
                        removed.update({cols[i]: duplicated})
                    count = {}
                    for key, value in removed.items():
                        count.update({key: len(value)})

                    #print v

                    # print drop_columns_df
                    count_values = count.values()
                    # print count_values
                    count_values = map(lambda x: x + 1, count_values)
                    # print count_values
                    d = {x: count_values.count(x) for x in count_values}
                #print('-->count_values')
                #print(count_values)

                #-------Mutual Information (MI) calculation-------------
                FBAs = len(df[1][k].columns)
                pure_entropy = math.log(FBAs, 2)
                #print (pure_entropy) (-->ok rm)
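                # Worked example (illustrative): with 4 FBA columns, two of which are
                # identical and two unique, d == {2: 1}, so
                #   pure_entropy        = log2(4)             = 2.0 bits
                #   conditional_entropy = 1 * (2/4) * log2(2) = 0.5 bits
                #   Mutual_Info         = 2.0 - 0.5           = 1.5 bits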

                # If No duplicates exist and list "value" is empty
                if not count_values:
                    #print("List is empty")
                    No_duplicate_FBAs = len(drop_columns_df.columns)
                    conditional_entropy = -1 * (No_duplicate_FBAs * (
                        (1 / No_duplicate_FBAs) *
                        ((1 / 1) * math.log(1.0 / 1.0, 2))))
                    Mutual_Info = pure_entropy - conditional_entropy
                    #print('Mutual Info:', Mutual_Info)

                if count_values:
                    # If duplicates exist and list "value" is not empty
                    conditional_entropy = 0
                    for key in d:
                        #print key, d[key]
                        Temp = -1 * d[key] * (key / float(FBAs)) * key * (
                            1.0 / key) * math.log(1.0 / key, 2)
                        conditional_entropy = Temp + conditional_entropy
                    #print "%3f" %Temp
                    Mutual_Info = pure_entropy - conditional_entropy

                MI_dict[df[0][k]] = Mutual_Info
                MI_dict['0'] = 0.0

            #Sorted MI_dict
            MI_dict = sorted(MI_dict.items(), key=lambda x: (-len(x[0]), x[0]))
            MI_dict = OrderedDict(MI_dict)
            #print(MI_dict)

            #print('-->rest')
            #print(compoundslist)
            #print(num_compounds)

            x_groups = [[] for x in range(num_compounds)]
            y_groups = [[] for x in range(num_compounds)]
            names = [[] for x in range(num_compounds)]
            Comp_Mapping = [[] for x in range(num_compounds)]

            for key, val in MI_dict.iteritems():
                del_count = key.count(',')
                x_groups[del_count].append(key)
                y_groups[del_count].append(val)

                # for x, y in zip(x_groups, y_groups):
                # data.append(go.Bar(x=x, y=y, name='test'))

            pdata = []
            for i in range(0, len(x_groups)):
                names[i] = str(i + 1) + ' Compound Combination'
                Comp_Mapping = str(i + 1) + '-' + compoundslist[i]

                record = {}
                record["x"] = []
                for e in x_groups[i]:
                    record["x"].append("c" + e)
                record["y"] = y_groups[i]
                record["names"] = names[i]
                record["Comp_Mapping"] = Comp_Mapping
                pdata.append(record)

            #print (pdata)
            #json.dump(pdata, open(self.scratch+'/pdata.json', 'w'))
            return [pdata, MI_dict]
            #return MI_dict

######### INPUT COMPONENTS AND BIOMASS FLUX MUTUAL INFORMATION CALCULATION #############
        if mi_options == "biomass":
            # Load the file containing the FBA (media) information along with the corresponding biomass (growth)
            print('biomass flux')
            df2 = pd.read_csv(fba_file[1])
            df2.values
            #print(df)

            MI_dict_biomass = {}
            for r in range(0, len(df[0])):
                reaction_states = df[1][r].head(1000)

                def get_groups(flux_df):
                    groups = collections.defaultdict(list)
                    unique = flux_df.aggregate(lambda x: hash(str(x.values)))
                    for k, v in unique[0:].iteritems():
                        groups[v].append(k)
                    return dict([(i, g)
                                 for i, g in enumerate(groups.values())])

                n_group = collections.defaultdict(int)
                groups = get_groups(reaction_states)

                for group in groups.values():
                    n_group[len(group)] += 1

                groups_count = {}
                for key, values in groups.items():
                    groups_count[key] = len(values)
                    # print groups_count

                # Take first FBA label of every group
                group_id = {}
                for k, v in groups.items():
                    group_id.update({k: groups[k][0]})

                # Obtain the Biomass of each Group
                cols_df = group_id.values()
                cols_df2 = df2.columns
                #print (cols_df)

                # Dictionary of first FBA label of every group and its corresponding number of members
                groups_label_count = {}
                for k, v in groups_count.items():
                    groups_label_count.update({cols_df[k]: v})
                #print('groups_label_count')
                #print(groups_label_count)

                def get_cond_count(re_group):
                    media_cond = 0
                    for media in re_group['FBAs']:
                        media_cond += groups_label_count[media]
                    return media_cond

                # Extract FBA Groups biomass inside df2
                Groups_Biomass = df2[df2['FBAs'].isin(cols_df)]
                #print('-->I am here')
                #print(Groups_Biomass)

                # Regroup based on the biomass values
                re_group = Groups_Biomass.groupby('Biomass')
                biomass_FBAs_groups = re_group.aggregate(get_cond_count)

                biomass_FBAs_label_groups = Groups_Biomass.groupby(
                    "Biomass", sort=True).sum()
                #print(biomass_FBAs_label_groups)

                #print (biomass_FBAs_label_groups)

                Summery = pd.merge(left=biomass_FBAs_label_groups,
                                   left_index=True,
                                   right=biomass_FBAs_groups,
                                   right_index=True,
                                   how='inner')
                Data_4_CondMI = Summery.groupby('FBAs_y').count()
                Data_4_CondMI = Data_4_CondMI.to_dict(orient='dict')
                for k, v in Data_4_CondMI.items():
                    Data_4_CondMI = v

                Num_of_FBAs = Data_4_CondMI.keys()
                Count_Num_of_FBAs = Data_4_CondMI.values()

                # -------Mutual Information (MI) calculation Stage II (input compounds with respect to BIOMASS)-------------
                # Pure Entropy
                FBAs = len(df[1][r].columns)
                pure_entropy = math.log(FBAs, 2)

                conditional_entropy = 0.0
                for l in range(0, len(Count_Num_of_FBAs)):
                    temp = -1 * Count_Num_of_FBAs[l] * (
                        Num_of_FBAs[l] / float(FBAs)) * Num_of_FBAs[l] * (
                            1.0 / float(Num_of_FBAs[l]) *
                            (math.log(1.0 / float(Num_of_FBAs[l]), 2)))
                    conditional_entropy += temp

                Mutual_Info_Biomass = pure_entropy - conditional_entropy
                # print('Mutual Info:', Mutual_Info_Biomass)

                #print(Mutual_Info_Biomass)
                MI_dict_biomass.update({df[0][r]: Mutual_Info_Biomass})

                #print(MI_dict_biomass)

            # Sorted MI_dict_biomass
            MI_dict_biomass = sorted(MI_dict_biomass.items(),
                                     key=lambda x: (-len(x[0]), x[0]))
            MI_dict_biomass = OrderedDict(MI_dict_biomass)

            #print(MI_dict_biomass)

            x_groups = [[] for x in range(num_compounds)]
            y_groups = [[] for x in range(num_compounds)]
            names = [[] for x in range(num_compounds)]
            Comp_Mapping = [[] for x in range(num_compounds)]

            for key, val in MI_dict_biomass.iteritems():
                del_count = key.count(',')
                x_groups[del_count].append(key)
                y_groups[del_count].append(val)

            pdata = []
            for i in range(0, len(x_groups)):
                names[i] = str(i + 1) + ' Compound Combination'
                Comp_Mapping = str(i + 1) + '-' + compoundslist[i]

                record = {}
                record["x"] = []
                for e in x_groups[i]:
                    record["x"].append("c" + e)
                record["y"] = y_groups[i]
                record["names"] = names[i]
                record["Comp_Mapping"] = Comp_Mapping
                pdata.append(record)
            return [pdata, MI_dict_biomass]

######### INPUT COMPONENTS AND BIOMASS, SECRETION FLUX MUTUAL INFORMATION CALCULATION #############

        if mi_options == "secretion":
            # Load the file containing the FBA (media) information along with the corresponding biomass (growth)
            print('secretion flux')
            df4 = pd.read_csv(fba_file[2], header=0, index_col=0)

            df4.index.name = 'FBAs'
            df4 = df4.T

            dfbiomass = pd.read_csv(fba_file[1])
            aa = dfbiomass['Biomass'].values.tolist()
            # print(len(aa))
            df4['Biomass'] = aa
            # print(df4.shape)
            compoundslist_b_u_s = list(df4.columns.values)
            #print(compoundslist_b_u_s)

            MI_dict_b_u_s = {}
            for r in range(0, len(df[0])):
                reaction_states = df[1][r].head(1000)

                def get_groups(flux_df):
                    groups = collections.defaultdict(list)
                    unique = flux_df.aggregate(lambda x: hash(str(x.values)))
                    for k, v in unique[0:].iteritems():
                        groups[v].append(k)
                    return dict([(i, g)
                                 for i, g in enumerate(groups.values())])

                n_group = collections.defaultdict(int)
                groups = get_groups(reaction_states)
                for group in groups.values():
                    n_group[len(group)] += 1
                #print(n_group)
                #print(groups)

                groups_count = {}
                for key, values in groups.items():
                    groups_count[key] = len(values)
                # print(groups_count)

                # Take first FBA label of every group
                group_id = {}
                for k, v in groups.items():
                    group_id.update({k: groups[k][0]})

                # Obtain the Biomass of each Group
                cols_df = group_id.values()
                cols_df4 = df4.columns

                # Dictionary of first FBA label of every group and its corresponding number of members
                groups_label_count = {}
                for k, v in groups_count.items():
                    groups_label_count.update({cols_df[k]: v})

                #print(groups_label_count)

                # Extract FBA Groups biomass inside df4
                df5 = df4.reset_index()
                Groups_Biomass = df5[df5['index'].isin(cols_df)]
                #print(Groups_Biomass)

                # Regroup based on the biomass values
                re_group = Groups_Biomass.groupby(compoundslist_b_u_s)
                #print(re_group)

                my_list = []
                for index, values in re_group:
                    my_list.append(values['index'].values)

                #print(my_list)

                B_U_S_dict = {}
                for media in my_list:
                    if len(media) > 1:
                        media_cond = 0
                        # sum the member counts over every FBA label in this group
                        for i in range(0, len(media)):
                            media_cond += groups_label_count[media[i]]
                        B_U_S_dict.update({str(media)[1:-1]: media_cond})
                        #final_my_dict.update({tuple(media.tolist()):media_cond})
                    else:
                        B_U_S_dict.update({
                            str(media)[1:-1]:
                            groups_label_count[str(tuple(
                                media.tolist()))[1:-1][:-1][1:-1]]
                        })

                B_U_S_dict = {k: v for k, v in B_U_S_dict.iteritems()}
                #print(B_U_S_dict)

                Summery = pd.DataFrame(B_U_S_dict.items(),
                                       columns=['index_x', 'index_y'])

                Data_4_CondMI = Summery.groupby('index_y').count()
                Data_4_CondMI = Data_4_CondMI.to_dict(orient='dict')

                #print(Data_4_CondMI)
                for k, v in Data_4_CondMI.items():
                    Data_4_CondMI = v

                Num_of_FBAs = Data_4_CondMI.keys()
                Count_Num_of_FBAs = Data_4_CondMI.values()
                #print(Num_of_FBAs)
                #print(Count_Num_of_FBAs)
                #print('-->***<---')

                # -------Mutual Information (MI) calculation Stage II (input compounds with respect to Biomass, Uptake and Secretion)-------------
                # Pure Entropy
                FBAs = len(df[1][r].columns)
                pure_entropy = math.log(FBAs, 2)

                conditional_entropy = 0.0
                for l in range(0, len(Count_Num_of_FBAs)):
                    temp = -1 * Count_Num_of_FBAs[l] * (
                        Num_of_FBAs[l] / float(FBAs)) * Num_of_FBAs[l] * (
                            1.0 / float(Num_of_FBAs[l]) *
                            (math.log(1.0 / float(Num_of_FBAs[l]), 2)))
                    conditional_entropy += temp

                Mutual_Info_B_U_S = pure_entropy - conditional_entropy
                # print('Mutual Info:', Mutual_Info_B_U_S)

                MI_dict_b_u_s.update({df[0][r]: Mutual_Info_B_U_S})

            # Sort MI_dict_b_u_s
            MI_dict_b_u_s = sorted(MI_dict_b_u_s.items(),
                                   key=lambda x: (-len(x[0]), x[0]))
            MI_dict_b_u_s = OrderedDict(MI_dict_b_u_s)

            #print(MI_dict_b_u_s)

            x_groups = [[] for x in range(num_compounds)]
            y_groups = [[] for x in range(num_compounds)]
            names = [[] for x in range(num_compounds)]
            Comp_Mapping = [[] for x in range(num_compounds)]

            for key, val in MI_dict_b_u_s.iteritems():
                del_count = key.count(',')
                x_groups[del_count].append(key)
                y_groups[del_count].append(val)

            # for x, y in zip(x_groups, y_groups):
            # data.append(go.Bar(x=x, y=y, name='test'))

            pdata = []
            for i in range(0, len(x_groups)):
                names[i] = str(i + 1) + ' Compound Combination'
                Comp_Mapping = str(i + 1) + '-' + compoundslist[i]

                record = {}
                record["x"] = []
                for e in x_groups[i]:
                    record["x"].append("c" + e)
                record["y"] = y_groups[i]
                record["names"] = names[i]
                record["Comp_Mapping"] = Comp_Mapping
                pdata.append(record)

            return [pdata, MI_dict_b_u_s]
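

# A self-contained sketch (not part of the original class) of the entropy bookkeeping
# used by _generate_mutual_info above: FBAs whose binarized flux patterns are
# identical are merged into groups, and the mutual information is the pure entropy
# log2(#FBAs) minus the conditional entropy of those groups.
def _mutual_info_sketch(flux_patterns):
    """flux_patterns: dict mapping an FBA label to a tuple of binarized fluxes."""
    import collections
    import math

    groups = collections.defaultdict(list)
    for fba_label, pattern in flux_patterns.items():
        groups[pattern].append(fba_label)

    n_fbas = float(len(flux_patterns))
    pure_entropy = math.log(n_fbas, 2)
    # Each group of s indistinguishable FBAs contributes (s / N) * log2(s) bits.
    conditional_entropy = sum(
        (len(g) / n_fbas) * math.log(len(g), 2) for g in groups.values())
    return pure_entropy - conditional_entropy

# Example: 4 FBAs, two of which share a flux pattern:
#   log2(4) - (2/4) * log2(2) = 2.0 - 0.5 = 1.5 bits
# print(_mutual_info_sketch(
#     {'m0': (0, 0), 'm0_v2': (1, 0), 'm0_v3': (0, 1), 'm0_v4': (1, 0)}))
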
Esempio n. 19
class TaxonAPI:
    '''
    Module Name:
    TaxonAPI

    Module Description:
    A KBase module: TaxonAPI
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "1.0.2"
    GIT_URL = ""
    GIT_COMMIT_HASH = "5b8cdf63a676a609ea4f180891cf75006640f148"

    #BEGIN_CLASS_HEADER
    _GENOME_TYPES = ['KBaseGenomes.Genome',
                     'KBaseGenomeAnnotations.GenomeAnnotation']
    _TAXON_TYPES = ['KBaseGenomeAnnotations.Taxon']

    @functools32.lru_cache(maxsize=1000)
    def get_object(self, ref):
        res = self.ws.get_objects2({'objects': [{'ref': ref}]})['data'][0]
        return res

    def get_data(self, ref):
        obj = self.get_object(ref)
        return obj['data']

    @functools32.lru_cache(maxsize=1000)
    def translate_to_MD5_types(self, ktype):
        return self.ws.translate_to_MD5_types([ktype]).values()[0]

    def get_referrers(self, ref):
        referrers = self.ws.list_referencing_objects(
            [{"ref": ref}])[0]
        object_refs_by_type = dict()
        tlist = []
        for x in referrers:
            tlist.append(x[2])
        typemap = self.ws.translate_to_MD5_types(tlist)
        for x in referrers:
            typestring = typemap[x[2]]
            if typestring not in object_refs_by_type:
                object_refs_by_type[typestring] = list()
            upa = '%d/%d/%d' % (x[6], x[0], x[4])
            object_refs_by_type[typestring].append(upa)
        return object_refs_by_type
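
    # For reference (derived from the workspace object_info tuples used above and in
    # make_hash below): x[0] = object id, x[1] = object name, x[2] = type string,
    # x[4] = version, x[6] = workspace id; so '%d/%d/%d' % (x[6], x[0], x[4]) yields
    # a versioned reference such as '123/45/6'.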

    def get_reffers_type(self, ref, types):
        referrers = self.get_referrers(ref)
        children = list()
        for object_type in referrers:
            if object_type.split('-')[0] in types:
                children.extend(referrers[object_type])

        return children

    def make_hash(self, i):
        omd = i[10]
        if i[10] == {}:
            omd = None

        return {
            'type_string': i[2],
            'workspace_id': i[6],
            'object_checksum': i[8],
            'object_reference': '%d/%d' % (i[6], i[0]),
            'object_size': i[9],
            'saved_by': i[5],
            'object_id': i[0],
            'save_date': i[3],
            'object_metadata': omd,
            'object_name': i[1],
            'version': i[4],
            'workspace_name': i[7],
            'object_reference_versioned': '%d/%d/%d' % (i[6], i[0], i[4])
        }
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.workspaceURL = config['workspace-url']
        self.ws = Workspace(self.workspaceURL)
        self.shockURL = config['shock-url']
        self.logger = logging.getLogger()
        log_handler = logging.StreamHandler()
        log_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(message)s"))
        self.logger.addHandler(log_handler)

        #END_CONSTRUCTOR
        pass


    def get_parent(self, ctx, ref):
        """
        Retrieve parent Taxon.
        @return Reference to parent Taxon.
        :param ref: instance of type "ObjectReference"
        :returns: instance of type "ObjectReference"
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_parent
        data = self.get_data(ref)
        try:
            returnVal = data['parent_taxon_ref']
            # returnVal=taxon_api.get_parent(ref_only=True)
        except KeyError:
            returnVal = ''
        #END get_parent

        # At some point might do deeper type checking...
        if not isinstance(returnVal, basestring):
            raise ValueError('Method get_parent return value ' +
                             'returnVal is not type basestring as required.')
        # return the results
        return [returnVal]

    def get_children(self, ctx, ref):
        """
        Retrieve children Taxon.
        @return List of references to child Taxons.
        :param ref: instance of type "ObjectReference"
        :returns: instance of list of type "ObjectReference"
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_children
        returnVal = self.get_reffers_type(ref, self._TAXON_TYPES)
        #END get_children

        # At some point might do deeper type checking...
        if not isinstance(returnVal, list):
            raise ValueError('Method get_children return value ' +
                             'returnVal is not type list as required.')
        # return the results
        return [returnVal]

    def get_genome_annotations(self, ctx, ref):
        """
        Retrieve the GenomeAnnotation(s) that refer to this Taxon.
         If this is accessing a KBaseGenomes.Genome object, it will
         return an empty list (this information is not available).
         @return List of references to GenomeAnnotation objects.
        :param ref: instance of type "ObjectReference"
        :returns: instance of list of type "ObjectReference"
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_genome_annotations
        returnVal = self.get_reffers_type(ref, self._GENOME_TYPES)
        #END get_genome_annotations

        # At some point might do deeper type checking...
        if not isinstance(returnVal, list):
            raise ValueError('Method get_genome_annotations return value ' +
                             'returnVal is not type list as required.')
        # return the results
        return [returnVal]

    def get_scientific_lineage(self, ctx, ref):
        """
        Retrieve the scientific lineage.
        @return Strings for each 'unit' of the lineage, ordered in
          the usual way from Domain to Kingdom to Phylum, etc.
        :param ref: instance of type "ObjectReference"
        :returns: instance of list of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_scientific_lineage
        o = self.ws.get_objects2({'objects': [{'ref': ref}]})['data'][0]['data']
        returnVal = [x.strip() for x in o['scientific_lineage'].split(";")]
        #END get_scientific_lineage

        # At some point might do deeper type checking...
        if not isinstance(returnVal, list):
            raise ValueError('Method get_scientific_lineage return value ' +
                             'returnVal is not type list as required.')
        # return the results
        return [returnVal]

    def get_scientific_name(self, ctx, ref):
        """
        Retrieve the scientific name.
        @return The scientific name, e.g., "Escherichia Coli K12 str. MG1655"
        :param ref: instance of type "ObjectReference"
        :returns: instance of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_scientific_name
        obj = self.ws.get_objects2({'objects': [{'ref': ref}]})['data'][0]['data']
        returnVal = obj['scientific_name']
        #END get_scientific_name

        # At some point might do deeper type checking...
        if not isinstance(returnVal, basestring):
            raise ValueError('Method get_scientific_name return value ' +
                             'returnVal is not type basestring as required.')
        # return the results
        return [returnVal]

    def get_taxonomic_id(self, ctx, ref):
        """
        Retrieve the NCBI taxonomic ID of this Taxon.
        For type KBaseGenomes.Genome, the ``source_id`` will be returned.
        @return Integer taxonomic ID.
        :param ref: instance of type "ObjectReference"
        :returns: instance of Long
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_taxonomic_id
        obj = self.ws.get_objects2({'objects': [{'ref': ref}]})['data'][0]['data']
        returnVal = obj['taxonomy_id']
        #END get_taxonomic_id

        # At some point might do deeper type checking...
        if not isinstance(returnVal, int):
            raise ValueError('Method get_taxonomic_id return value ' +
                             'returnVal is not type int as required.')
        # return the results
        return [returnVal]

    def get_kingdom(self, ctx, ref):
        """
        Retrieve the kingdom.
        :param ref: instance of type "ObjectReference"
        :returns: instance of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_kingdom
        obj = self.ws.get_objects2({'objects': [{'ref': ref}]})['data'][0]['data']
        returnVal = obj['kingdom']
        #END get_kingdom

        # At some point might do deeper type checking...
        if not isinstance(returnVal, basestring):
            raise ValueError('Method get_kingdom return value ' +
                             'returnVal is not type basestring as required.')
        # return the results
        return [returnVal]

    def get_domain(self, ctx, ref):
        """
        Retrieve the domain.
        :param ref: instance of type "ObjectReference"
        :returns: instance of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_domain
        obj = self.ws.get_objects2({'objects': [{'ref': ref}]})['data'][0]['data']
        returnVal = obj['domain']
        #END get_domain

        # At some point might do deeper type checking...
        if not isinstance(returnVal, basestring):
            raise ValueError('Method get_domain return value ' +
                             'returnVal is not type basestring as required.')
        # return the results
        return [returnVal]

    def get_genetic_code(self, ctx, ref):
        """
        Retrieve the genetic code.
        :param ref: instance of type "ObjectReference"
        :returns: instance of Long
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_genetic_code
        obj = self.ws.get_objects2({'objects': [{'ref': ref}]})['data'][0]['data']
        returnVal = obj['genetic_code']
        #END get_genetic_code

        # At some point might do deeper type checking...
        if not isinstance(returnVal, int):
            raise ValueError('Method get_genetic_code return value ' +
                             'returnVal is not type int as required.')
        # return the results
        return [returnVal]

    def get_aliases(self, ctx, ref):
        """
        Retrieve the aliases.
        :param ref: instance of type "ObjectReference"
        :returns: instance of list of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_aliases
        obj = self.ws.get_objects2({'objects': [{'ref': ref}]})['data'][0]['data']
        if 'aliases' in obj:
            returnVal = obj['aliases']
        else:
            returnVal = list()
        #END get_aliases

        # At some point might do deeper type checking...
        if not isinstance(returnVal, list):
            raise ValueError('Method get_aliases return value ' +
                             'returnVal is not type list as required.')
        # return the results
        return [returnVal]

    def get_info(self, ctx, ref):
        """
        Retrieve object info.
        @skip documentation
        :param ref: instance of type "ObjectReference"
        :returns: instance of type "ObjectInfo" (* @skip documentation) ->
           structure: parameter "object_id" of Long, parameter "object_name"
           of String, parameter "object_reference" of String, parameter
           "object_reference_versioned" of String, parameter "type_string" of
           String, parameter "save_date" of String, parameter "version" of
           Long, parameter "saved_by" of String, parameter "workspace_id" of
           Long, parameter "workspace_name" of String, parameter
           "object_checksum" of String, parameter "object_size" of Long,
           parameter "object_metadata" of mapping from String to String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_info
        # returnVal = self.ws.get_objects2({'objects': [{'ref': ref}]})['data'][0]['info']
        i = self.get_object(ref)['info']
        #md5_typestr = self.ws.translate_to_MD5_types([i[2]]).values()[0]
        returnVal = self.make_hash(i)
        #END get_info

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method get_info return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
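
    # Illustrative sketch (not part of the generated code above): make_hash,
    # defined elsewhere in this module, presumably converts the 11-element
    # workspace object_info tuple into the ObjectInfo structure documented in
    # get_info(). Assuming the standard workspace tuple ordering
    # [objid, name, type, save_date, version, saved_by, wsid, workspace,
    #  chsum, size, meta], the mapping would look roughly like:
    #
    #   {
    #       'object_id': info[0],
    #       'object_name': info[1],
    #       'type_string': info[2],
    #       'save_date': info[3],
    #       'version': info[4],
    #       'saved_by': info[5],
    #       'workspace_id': info[6],
    #       'workspace_name': info[7],
    #       'object_checksum': info[8],
    #       'object_size': info[9],
    #       'object_metadata': info[10],
    #       'object_reference': '{}/{}'.format(info[6], info[0]),
    #       'object_reference_versioned':
    #           '{}/{}/{}'.format(info[6], info[0], info[4]),
    #   }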

    def get_history(self, ctx, ref):
        """
        Retrieve object history.
        @skip documentation
        :param ref: instance of type "ObjectReference"
        :returns: instance of type "ObjectHistory" (* @skip documentation) ->
           list of type "ObjectInfo" (* @skip documentation) -> structure:
           parameter "object_id" of Long, parameter "object_name" of String,
           parameter "object_reference" of String, parameter
           "object_reference_versioned" of String, parameter "type_string" of
           String, parameter "save_date" of String, parameter "version" of
           Long, parameter "saved_by" of String, parameter "workspace_id" of
           Long, parameter "workspace_name" of String, parameter
           "object_checksum" of String, parameter "object_size" of Long,
           parameter "object_metadata" of mapping from String to String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_history
        # returnVal = self.ws.get_object_history({'ref': ref})
        returnVal = []
        for i in self.ws.get_object_history({'ref': ref}):
            returnVal.append(self.make_hash(i))
        #END get_history

        # At some point might do deeper type checking...
        if not isinstance(returnVal, list):
            raise ValueError('Method get_history return value ' +
                             'returnVal is not type list as required.')
        # return the results
        return [returnVal]

    def get_provenance(self, ctx, ref):
        """
        Retrieve object provenance.
        @skip documentation
        :param ref: instance of type "ObjectReference"
        :returns: instance of type "ObjectProvenance" (* @skip documentation)
           -> list of type "ObjectProvenanceAction" (* @skip documentation)
           -> structure: parameter "time" of String, parameter "service_name"
           of String, parameter "service_version" of String, parameter
           "service_method" of String, parameter "method_parameters" of list
           of String, parameter "script_name" of String, parameter
           "script_version" of String, parameter "script_command_line" of
           String, parameter "input_object_references" of list of String,
           parameter "validated_object_references" of list of String,
           parameter "intermediate_input_ids" of list of String, parameter
           "intermediate_output_ids" of list of String, parameter
           "external_data" of list of type "ExternalDataUnit" (* @skip
           documentation) -> structure: parameter "resource_name" of String,
           parameter "resource_url" of String, parameter "resource_version"
           of String, parameter "resource_release_date" of String, parameter
           "data_url" of String, parameter "data_id" of String, parameter
           "description" of String, parameter "description" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_provenance
        prov = self.ws.get_object_provenance([{"ref": ref}])[0]['provenance']
        returnVal = []
        copy_keys = {"time": "time",
                     "service": "service_name",
                     "service_ver": "service_version",
                     "method": "service_method",
                     "method_params": "method_parameters",
                     "script": "script_name",
                     "script_ver": "script_version",
                     "script_command_line": "script_command_line",
                     "input_ws_objects": "input_object_references",
                     "resolved_ws_objects": "validated_object_references",
                     "intermediate_incoming": "intermediate_input_ids",
                     "intermediate_outgoing": "intermediate_output_ids",
                     "external_data": "external_data",
                     "description": "description"
                     }

        for object_provenance in prov:
            action = dict()

            for k in copy_keys:
                if k in object_provenance:
                    if isinstance(object_provenance[k], list) and len(object_provenance[k]) == 0:
                        continue

                    action[copy_keys[k]] = object_provenance[k]

            returnVal.append(action)
        #END get_provenance

        # At some point might do deeper type checking...
        if not isinstance(returnVal, list):
            raise ValueError('Method get_provenance return value ' +
                             'returnVal is not type list as required.')
        # return the results
        return [returnVal]

    def get_id(self, ctx, ref):
        """
        Retrieve object identifier.
        @skip documentation
        :param ref: instance of type "ObjectReference"
        :returns: instance of Long
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_id
        returnVal = self.get_object(ref)['info'][0]
        #END get_id

        # At some point might do deeper type checking...
        if not isinstance(returnVal, int):
            raise ValueError('Method get_id return value ' +
                             'returnVal is not type int as required.')
        # return the results
        return [returnVal]

    def get_name(self, ctx, ref):
        """
        Retrieve object name.
        @skip documentation
        :param ref: instance of type "ObjectReference"
        :returns: instance of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_name
        returnVal = self.get_object(ref)['info'][1]
        #END get_name

        # At some point might do deeper type checking...
        if not isinstance(returnVal, basestring):
            raise ValueError('Method get_name return value ' +
                             'returnVal is not type basestring as required.')
        # return the results
        return [returnVal]

    def get_version(self, ctx, ref):
        """
        Retrieve object version.
        @skip documentation
        :param ref: instance of type "ObjectReference"
        :returns: instance of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_version
        returnVal = str(self.get_object(ref)['info'][4])
        #END get_version

        # At some point might do deeper type checking...
        if not isinstance(returnVal, basestring):
            raise ValueError('Method get_version return value ' +
                             'returnVal is not type basestring as required.')
        # return the results
        return [returnVal]

    def get_all_data(self, ctx, params):
        """
        :param params: instance of type "GetAllDataParams" -> structure:
           parameter "ref" of type "ObjectReference", parameter
           "include_decorated_scientific_lineage" of type "boolean" (A
           boolean. 0 = false, other = true.), parameter
           "include_decorated_children" of type "boolean" (A boolean. 0 =
           false, other = true.), parameter "exclude_children" of type
           "boolean" (A boolean. 0 = false, other = true.)
        :returns: instance of type "TaxonData" -> structure: parameter
           "parent" of type "ObjectReference", parameter "children" of list
           of type "ObjectReference", parameter "decorated_children" of list
           of type "TaxonInfo" -> structure: parameter "ref" of type
           "ObjectReference", parameter "scientific_name" of String,
           parameter "scientific_lineage" of list of String, parameter
           "decorated_scientific_lineage" of list of type "TaxonInfo" ->
           structure: parameter "ref" of type "ObjectReference", parameter
           "scientific_name" of String, parameter "scientific_name" of
           String, parameter "taxonomic_id" of Long, parameter "kingdom" of
           String, parameter "domain" of String, parameter "genetic_code" of
           Long, parameter "aliases" of list of String, parameter "obj_info"
           of type "ObjectInfo" (* @skip documentation) -> structure:
           parameter "object_id" of Long, parameter "object_name" of String,
           parameter "object_reference" of String, parameter
           "object_reference_versioned" of String, parameter "type_string" of
           String, parameter "save_date" of String, parameter "version" of
           Long, parameter "saved_by" of String, parameter "workspace_id" of
           Long, parameter "workspace_name" of String, parameter
           "object_checksum" of String, parameter "object_size" of Long,
           parameter "object_metadata" of mapping from String to String
        """
        # ctx is the context object
        # return variables are: d
        #BEGIN get_all_data
        d = {}
        ref = params['ref']

        obj = self.get_object(ref)
        data = obj['data']

        try:
            d['parent'] = data['parent_taxon_ref']
        except KeyError:
            print('Error getting parent for ' + ref)
            # +':\n'+ str(traceback.format_exc()))
            d['parent'] = None

        if 'exclude_children' in params and params['exclude_children'] == 1:
            pass
        else:
            d['children'] = self.get_reffers_type(ref, self._TAXON_TYPES)

        d['scientific_lineage'] = data['scientific_lineage']
        d['scientific_name'] = data['scientific_name']
        d['taxonomic_id'] = data['taxonomy_id']
        try:
            d['kingdom'] = data['kingdom']
            # throws error if not found, so catch and log it
        except KeyError:
            print('Error getting kingdom for ' + ref)
            # +':\n'+ str(traceback.format_exc()))
            d['kingdom'] = None

        d['domain'] = data['domain']
        d['genetic_code'] = data['genetic_code']
        d['aliases'] = None
        if 'aliases' in data:
            d['aliases'] = data['aliases']
        d['info'] = self.make_hash(obj['info'])

        key = 'include_decorated_scientific_lineage'
        if key in params and params[key] == 1:
            l = self.get_decorated_scientific_lineage(ctx, {'ref': ref})[0]
            d['decorated_scientific_lineage'] = l['decorated_scientific_lineage']

        key = 'include_decorated_children'
        if key in params and params[key] == 1:
            l = self.get_decorated_children(ctx, {'ref': ref})[0]
            d['decorated_children'] = l['decorated_children']
        #END get_all_data

        # At some point might do deeper type checking...
        if not isinstance(d, dict):
            raise ValueError('Method get_all_data return value ' +
                             'd is not type dict as required.')
        # return the results
        return [d]

    def get_decorated_scientific_lineage(self, ctx, params):
        """
        :param params: instance of type "GetDecoratedScientificLineageParams"
           -> structure: parameter "ref" of type "ObjectReference"
        :returns: instance of type "DecoratedScientificLineage" (list starts
           at the root, and goes on down to this) -> structure: parameter
           "decorated_scientific_lineage" of list of type "TaxonInfo" ->
           structure: parameter "ref" of type "ObjectReference", parameter
           "scientific_name" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_decorated_scientific_lineage

        lineageList = []
        ref = params['ref']

        while True:
            parent_data = None
            try:
                # note: doesn't look like there is a way to get a reference
                # of a Taxon directly (without constructing it from
                # object_info), so first get reference, then instantiate
                # another API object
                parent_ref = self.get_data(ref)['parent_taxon_ref']
                if parent_ref is not None:
                    data = self.get_data(ref)
                    scientific_name = data['scientific_name']
                    if scientific_name != 'root':
                        parent_data = {
                            'ref': parent_ref,
                            'scientific_name': scientific_name
                        }
                        ref = parent_ref

            except KeyError:
                # case where parent is not found
                pass

            if parent_data is not None:
                lineageList.append(parent_data)
            else:
                break

        lineageList.reverse()  # reverse list to match scientific_lineage style
        returnVal = {'decorated_scientific_lineage': lineageList[:-1]}

        #END get_decorated_scientific_lineage

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method get_decorated_scientific_lineage return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def get_decorated_children(self, ctx, params):
        """
        :param params: instance of type "GetDecoratedChildrenParams" ->
           structure: parameter "ref" of type "ObjectReference"
        :returns: instance of type "DecoratedChildren" -> structure:
           parameter "decorated_children" of list of type "TaxonInfo" ->
           structure: parameter "ref" of type "ObjectReference", parameter
           "scientific_name" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_decorated_children
        ref = params['ref']
        children_refs = self.get_reffers_type(ref, self._TAXON_TYPES)

        decorated_children = []
        for child_ref in children_refs:
            decorated_children.append({
                'ref': child_ref,
                'scientific_name': self.get_data(child_ref)['scientific_name']
            })

        returnVal = {'decorated_children': decorated_children}
        #END get_decorated_children

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method get_decorated_children return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {'state': "OK", 'message': "", 'version': self.VERSION,
                     'git_url': self.GIT_URL, 'git_commit_hash': self.GIT_COMMIT_HASH}
        #END_STATUS
        return [returnVal]
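
# Minimal usage sketch (not part of the generated server file above): the
# implementation class name and its construction are outside this excerpt, so
# "impl" and "ctx" below are placeholders for an instance of that class and a
# KBase call context, and the reference is a placeholder workspace ref.
#
# params = {
#     'ref': 'ws_id/obj_id',                         # Taxon object reference
#     'include_decorated_scientific_lineage': 1,     # 0 = false, other = true
#     'include_decorated_children': 1,
#     'exclude_children': 0,
# }
# taxon_data = impl.get_all_data(ctx, params)[0]
# info_hash = impl.get_info(ctx, params['ref'])[0]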
Example 20
class CompoundSetUtils:
    '''
    Module Name:
    CompoundSetUtils

    Module Description:
    A KBase module: CompoundSetUtils
Contains tools for import & export of compound sets
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbaseapps/CompoundSetUtils.git"
    GIT_COMMIT_HASH = "53bac077a8efaaea9ead90d5557b1af1c0b23394"

    #BEGIN_CLASS_HEADER
    @staticmethod
    def _check_required_param(in_params, param_list):
        """
        Check if each of the params in the list are in the input params
        """
        for param in param_list:
            if param not in in_params or not in_params[param]:
                raise ValueError('{} parameter is required'.format(param))

    def _save_to_ws_and_report(self, ctx, method, workspace, source, compoundset):
        """Save compound set to the workspace and make report"""
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        if 'model' in method:
            provenance[0]['input_ws_objects'] = [workspace + '/' + source]
        provenance[0]['service'] = 'CompoundSetUtils'
        provenance[0]['method'] = method
        info = self.ws_client.save_objects(
            {'workspace': workspace,
             "objects": [{
                 "type": "KBaseBiochem.CompoundSet",
                 "data": compoundset,
                 "name": compoundset['name']
             }]})[0]
        compoundset_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        report_params = {
            'objects_created': [{'ref': compoundset_ref,
                                 'description': 'Compound Set'}],
            'message': 'Imported %s as %s' % (source, compoundset_ref),
            'workspace_name': workspace,
            'report_object_name': 'compound_set_creation_report'
        }

        # Construct the output to send back
        report_client = KBaseReport(self.callback_url)
        report_info = report_client.create_extended_report(report_params)
        output = {'report_name': report_info['name'],
                  'report_ref': report_info['ref'],
                  'compoundset_ref': compoundset_ref}
        return output
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.ws_url = config['workspace-url']
        self.ws_client = Workspace(self.ws_url)
        self.dfu = DataFileUtil(self.callback_url)
        #END_CONSTRUCTOR
        pass


    def compound_set_from_file(self, ctx, params):
        """
        CompoundSetFromFile
        string staging_file_path
        :param params: instance of type "compoundset_upload_params" ->
           structure: parameter "workspace_name" of String, parameter
           "staging_file_path" of String, parameter "compound_set_name" of
           String
        :returns: instance of type "compoundset_upload_results" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "compoundset_ref" of type "obj_ref"
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN compound_set_from_file
        self._check_required_param(params, ['workspace_name',
                                            'staging_file_path',
                                            'compound_set_name'])
        scratch_file_path = self.dfu.download_staging_file(
            {'staging_file_subdir_path': params['staging_file_path']}
        ).get('copy_file_path')
        # I probably should be uploading the raw files to shock

        ext = os.path.splitext(scratch_file_path)[1]
        file_name = os.path.basename(scratch_file_path)
        if ext == '.sdf':
            compounds = parse.read_sdf(scratch_file_path)
        elif ext == '.tsv':
            compounds = parse.read_tsv(scratch_file_path)
        else:
            raise ValueError('Invalid input file type. Expects .tsv or .sdf')

        compoundset = {
            'id': params['compound_set_name'],
            'name': params['compound_set_name'],
            'description': 'Compound Set produced from %s' % file_name,
            'compounds': compounds,
        }

        output = self._save_to_ws_and_report(ctx, 'compound_set_from_file',
                                             params['workspace_name'],
                                             params['staging_file_path'],
                                             compoundset)
        #END compound_set_from_file

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method compound_set_from_file return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def compound_set_to_file(self, ctx, params):
        """
        CompoundSetToFile
        string compound_set_name
        string output_format
        :param params: instance of type "compoundset_download_params" ->
           structure: parameter "workspace_name" of String, parameter
           "compound_set_name" of String, parameter "output_format" of String
        :returns: instance of type "compoundset_download_results" ->
           structure: parameter "report_name" of String, parameter
           "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN compound_set_to_file
        self._check_required_param(params, ['workspace_name', 'compound_set_name',
                                            'output_format'])
        compoundset = self.ws_client.get_objects2({'objects': [
            {'workspace': params['workspace_name'],
             'name': params['compound_set_name']}]})['data'][0]['data']
        ext = params['output_format']
        out = "%s/%s.%s" % (self.scratch, compoundset['name'], ext)
        if ext == 'sdf':
            outfile_path = parse.write_sdf(compoundset, out)
        elif ext == 'tsv':
            outfile_path = parse.write_tsv(compoundset, out)
        else:
            raise ValueError('Invalid output file type. Expects tsv or sdf')

        report_files = [{'path': outfile_path,
                         'name': os.path.basename(outfile_path),
                         'label': os.path.basename(outfile_path),
                         'description': 'A compound set in %s format' % ext}]

        report_params = {
            'objects_created': [],
            'message': 'Converted %s compound set to %s format.' % (
                params['compound_set_name'], params['output_format']),
            'file_links': report_files,
            'workspace_name': params['workspace_name'],
            'report_object_name': 'compound_set_download_report'
        }

        # Construct the output to send back
        report_client = KBaseReport(self.callback_url)
        report_info = report_client.create_extended_report(report_params)
        output = {'report_name': report_info['name'],
                  'report_ref': report_info['ref'],
                  }
        #END compound_set_to_file

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method compound_set_to_file return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def compound_set_from_model(self, ctx, params):
        """
        CompoundSetFromModel
        required:
        string workspace_name
        string model_name
        string compound_set_name
        :param params: instance of type "compoundset_from_model_params" ->
           structure: parameter "workspace_name" of String, parameter
           "model_name" of String, parameter "compound_set_name" of String
        :returns: instance of type "compoundset_upload_results" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "compoundset_ref" of type "obj_ref"
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN compound_set_from_model
        self._check_required_param(params, ['workspace_name', 'model_name',
                                            'compound_set_name'])
        model = self.ws_client.get_objects2({'objects': [
            {'workspace': params['workspace_name'],
             'name': params['model_name']}]})['data'][0]['data']
        compounds, undef = parse.parse_model(model)
        compoundset = {
            'id': params['compound_set_name'],
            'name': params['compound_set_name'],
            'description': 'Compound Set produced from %s, a metabolic model'
                           % model['id'],
            'compounds': compounds,
        }

        output = self._save_to_ws_and_report(ctx, 'compound_set_from_model',
                                             params['workspace_name'],
                                             params['model_name'], compoundset)
        #END compound_set_from_model

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method compound_set_from_model return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {'state': "OK",
                     'message': "",
                     'version': self.VERSION,
                     'git_url': self.GIT_URL,
                     'git_commit_hash': self.GIT_COMMIT_HASH}
        #END_STATUS
        return [returnVal]
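
# Minimal usage sketch (assumptions: a configured CompoundSetUtils instance
# "csu", a KBase call context "ctx", and a staged input file; the names below
# are placeholders).
#
# upload_params = {
#     'workspace_name': 'my_workspace',
#     'staging_file_path': 'my_compounds.tsv',   # .tsv or .sdf
#     'compound_set_name': 'my_compound_set',
# }
# upload_result = csu.compound_set_from_file(ctx, upload_params)[0]
#
# download_params = {
#     'workspace_name': 'my_workspace',
#     'compound_set_name': 'my_compound_set',
#     'output_format': 'sdf',                    # 'sdf' or 'tsv'
# }
# download_result = csu.compound_set_to_file(ctx, download_params)[0]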
Example 21
from Workspace.WorkspaceClient import Workspace
import json
wsid = 16962
upa = '16962/3'
upa2 = '16962/23'


ws = Workspace('https://ci.kbase.us/services/ws')
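# The calls below appear to dump raw Workspace API responses to JSON files,
# presumably for use as test fixtures; the workspace id and object refs above
# are specific to the KBase CI environment.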
d = ws.get_workspace_info({'id': wsid})
with open('get_workspace_info.json', 'w') as f:
    f.write(json.dumps(d))
d = ws.list_objects({'ids': [wsid]})
with open('list_objects.json', 'w') as f:
    f.write(json.dumps(d))
d = ws.get_objects2({'objects': [{'ref': upa}]})
with open('get_objects.json', 'w') as f:
    f.write(json.dumps(d))
d = ws.get_objects2({'objects': [{'ref': upa2}]})
with open('narrative_object.json', 'w') as f:
    f.write(json.dumps(d))
Example 22
def fetch_reads_refs_from_sampleset(ref, ws_url, callback_url, params):
    """
    From the given object ref, return a list of all reads objects that are a part of that
    object. E.g., if ref is a ReadsSet, return a list of all PairedEndLibrary or SingleEndLibrary
    refs that are a member of that ReadsSet. This is returned as a list of dictionaries as follows:
    {
        "ref": reads object reference,
        "condition": condition string associated with that reads object
    }
    The only required key is "ref"; all other keys may or may not be present, depending on the
    reads object or the type of the initial ref. E.g., an RNASeqSampleSet might have condition info
    for each reads object, but a single PairedEndLibrary may not have that info.
    If ref is already a Reads library, just returns a list with ref as a single element.
    """
    obj_type = get_object_type(ref, ws_url)
    ws = Workspace(ws_url)
    refs = list()
    refs_for_ws_info = list()
    if "KBaseSets.ReadsSet" in obj_type:
        print("Looking up reads references in ReadsSet object")
        set_client = SetAPI(callback_url)
        reads_set = set_client.get_reads_set_v1({
            "ref": ref,
            "include_item_info": 0
        })
        for reads in reads_set["data"]["items"]:
            refs.append({"ref": reads["ref"], "condition": reads["label"]})
            refs_for_ws_info.append({'ref': reads['ref']})
    elif "KBaseRNASeq.RNASeqSampleSet" in obj_type:
        print("Looking up reads references in RNASeqSampleSet object")
        sample_set = ws.get_objects2({"objects": [{
            "ref": ref
        }]})["data"][0]["data"]
        for i in range(len(sample_set["sample_ids"])):
            refs.append({
                "ref": sample_set["sample_ids"][i],
                "condition": sample_set["condition"][i]
            })
            refs_for_ws_info.append({'ref': sample_set['sample_ids'][i]})
    elif ("KBaseAssembly.SingleEndLibrary" in obj_type
          or "KBaseFile.SingleEndLibrary" in obj_type
          or "KBaseFile.SingleEndLibrary-2.0" in obj_type
          or "KBaseFile.SingleEndLibrary-2.1" in obj_type
          or "KBaseAssembly.PairedEndLibrary" in obj_type
          or "KBaseFile.PairedEndLibrary" in obj_type
          or "KBaseFile.PairedEndLibrary-2.0" in obj_type
          or "KBaseFile.PairedEndLibrary-2.1" in obj_type):
        refs.append({"ref": ref})
        refs_for_ws_info.append({'ref': ref})
    else:
        raise ValueError("Unable to fetch reads reference from object {} "
                         "which is a {}".format(ref, obj_type))

    # get object info so we can name things properly
    infos = ws.get_object_info3({'objects': refs_for_ws_info})['infos']

    name_ext = '_alignment'
    if ('alignment_suffix' in params
            and params['alignment_suffix'] is not None):
        ext = params['alignment_suffix'].replace(' ', '')
        if ext:
            name_ext = ext

    unique_names = get_unique_names(infos)
    for k in range(0, len(refs)):
        refs[k]['info'] = infos[k]
        name = unique_names[k] + name_ext
        refs[k]['alignment_output_name'] = name

    return refs
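
# Minimal usage sketch (assumptions: reachable workspace and callback
# services and a valid reads/sample-set reference; values are placeholders).
#
# reads_refs = fetch_reads_refs_from_sampleset(
#     ref='ws_id/obj_id',
#     ws_url='https://ci.kbase.us/services/ws',
#     callback_url=os.environ['SDK_CALLBACK_URL'],
#     params={'alignment_suffix': '_alignment'},
# )
# Each returned entry carries 'ref', optionally 'condition', the object
# 'info' tuple and an 'alignment_output_name' built from the unique object
# name plus the suffix.
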
class DiffExprMatrixUtils:
    """
     Contains a set of functions for expression level calculations.
    """

    PARAM_IN_WS_NAME = 'workspace_name'
    PARAM_IN_OBJ_NAME = 'output_obj_name'
    PARAM_IN_DIFFEXPMATSET_REF = 'diffExprMatrixSet_ref'

    def __init__(self, config, logger=None):
        self.config = config
        self.logger = logger
        self.scratch = os.path.join(config['scratch'],
                                    'DEM_' + str(uuid.uuid4()))
        self.ws_url = config['workspace-url']
        self.serviceWizardURL = config['srv-wiz-url']
        self._mkdir_p(self.scratch)
        pass

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def process_params(self, params):
        """
        validates params passed to gen expression matrix method
        """
        for p in [self.PARAM_IN_DIFFEXPMATSET_REF]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def get_expressionset_data(self, expressionset_ref):

        expr_set_obj = self.ws_client.get_objects2(
            {'objects': [{
                'ref': expressionset_ref
            }]})['data'][0]

        expr_set_obj_type = expr_set_obj.get('info')[2]
        expr_set_data = dict()
        expr_set_data['ws_name'] = expr_set_obj.get('info')[7]
        expr_set_data['obj_name'] = expr_set_obj.get('info')[1]

        if re.match('KBaseRNASeq.RNASeqExpressionSet-\d.\d',
                    expr_set_obj_type):
            expr_set_data['genome_ref'] = expr_set_obj['data']['genome_id']
            expr_obj_refs = list()
            for expr_obj in expr_set_obj['data']['mapped_expression_ids']:
                expr_obj_refs.append(list(expr_obj.values())[0])
            expr_set_data['expr_obj_refs'] = expr_obj_refs

        elif re.match('KBaseSets.ExpressionSet-\d.\d', expr_set_obj_type):
            items = expr_set_obj.get('data').get('items')
            expr_obj_refs = list()
            for item in items:
                expr_obj_refs.append(item['ref'])
            expr_obj = self.ws_client.get_objects2(
                {'objects': [{
                    'ref': expr_obj_refs[0]
                }]})['data'][0]
            expr_set_data['genome_ref'] = expr_obj['data']['genome_id']
            expr_set_data['expr_obj_refs'] = expr_obj_refs
        else:
            raise TypeError('expressionset_ref should be of type ' +
                            'KBaseRNASeq.RNASeqExpressionSet ' +
                            'or KBaseSets.ExpressionSet')
        return expr_set_data

    def get_diffexpr_matrixset(self, params, token):

        self.ws_client = Workspace(self.ws_url, token=token)

        col_names = {
            'gene_id': 'gene',
            'log2_fold_change': 'log2fc_f',
            'p_value': 'p_value_f',
            'q_value': 'q_value'
        }

        json_fields = ['log2fc_f', 'p_value_f', 'q_value']

        self.process_params(params)

        diffexprmatset_list = list()
        diffexprmatset_ref = params.get(self.PARAM_IN_DIFFEXPMATSET_REF)

        diffexprmatset_obj = self.ws_client.get_objects2(
            {'objects': [{
                'ref': diffexprmatset_ref
            }]})['data'][0]

        items = diffexprmatset_obj.get('data').get('items')
        diffexprmat_refs = list()

        for item in items:
            diffexprmat_refs.append(item['ref'])
            self.logger.info('DiffExprMatrix ref: ' + item['ref'])

        for diffexprmat_ref in diffexprmat_refs:
            diffexprmat_dict = dict()
            diffexprmat_obj = self.ws_client.get_objects2(
                {'objects': [{
                    'ref': diffexprmat_ref
                }]})['data'][0]
            diffexprmat = diffexprmat_obj.get('data')
            condition_mapping = diffexprmat.get('condition_mapping')
            diffexprmat_dict['condition_1'] = list(condition_mapping.keys())[0]
            diffexprmat_dict['condition_2'] = list(condition_mapping.values())[0]
            voldata = list()
            data = diffexprmat.get('data')

            for row_index, row_id in enumerate(data.get('row_ids')):
                row_data = dict()
                row_data['gene'] = row_id
                values = data.get('values')[row_index]
                for col_index in range(len(values)):
                    row_data[json_fields[col_index]] = values[col_index]

                voldata.append(row_data)

            diffexprmat_dict['voldata'] = voldata
            diffexprmatset_list.append(diffexprmat_dict)

        return diffexprmatset_list

    def get_matrix_stats(self, raw_row):
        """
        returns a list [min, max, mean, std. dev., is_data_missing] for one row of
        per-condition expression values
        """
        has_missing = "No"
        row = []
        for r in raw_row:
            # careful here - r can be 0, which is a legitimate value
            if r is None or numpy.isnan(r):
                has_missing = "Yes"
            else:
                row.append(r)

        if len(row) < 1:
            return (['NA', 'NA', 'NA', 'NA', 'Yes'])

        if len(row) == 1:
            sd = 0
        else:
            sd = numpy.std(row, ddof=1)
        return ([min(row), max(row), numpy.mean(row), sd, has_missing])
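
    # Example (sketch): get_matrix_stats([1.0, None, 3.0]) drops the missing
    # value and returns [1.0, 3.0, 2.0, ~1.414, 'Yes']
    # (min, max, mean, sample std. dev., is_data_missing).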

    def convert_dem_to_dict(self, dem):
        """
        returns a dict that maps feature_id -> [ fc, q ]
        """
        row_ids = dem.get('row_ids')
        vals = dem.get('values')

        n_rows = len(row_ids)
        if len(vals) != n_rows:
            raise Exception(
                "length discrepancy in differential expression matrix: {0} row_ids but {1} values"
                .format(n_rows, len(vals)))

        dem_dict = {}
        for _id, val in zip(row_ids, vals):
            # [fc, q] (not bothering to check for duplicate ids here)
            dem_dict[_id] = [val[0], val[2]]

        return dem_dict
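
    # Example (sketch): for dem = {'row_ids': ['gene1'],
    #                              'values': [[1.5, 0.01, 0.02]]}
    # convert_dem_to_dict(dem) returns {'gene1': [1.5, 0.02]}, i.e.
    # [fold change, q value] keyed by feature id.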

    def get_enhancedFEM(self, params, tok):
        """
        implements get_enhancedFilteredExpressionMatrix() method
        """

        self.ws_client = Workspace(self.ws_url, token=tok)

        if 'fem_object_ref' not in params:
            raise ValueError(
                "fem_object_ref parameter not given to get_enhancedFilteredExpressionMatrix"
            )

        fem_object_ref = params.get('fem_object_ref')

        fem_obj_ret = self.ws_client.get_objects2(
            {'objects': [{
                'ref': fem_object_ref
            }]})['data'][0]
        fem = fem_obj_ret.get('data')
        prov = fem_obj_ret.get('provenance')[0]

        # create the enhanced FEM, starting with the FEM

        efem = {}
        for k in ['genome_ref', 'scale', 'type']:
            efem[k] = fem.get(k)

        efem['data'] = {}
        efem['data']['col_ids'] = [
            "description", "fold-change", "q-value", "min", "max", "mean",
            "std_dev", "is_missing_values"
        ]
        efem['data']['column_labels'] = [
            "Description", "Fold change", "Q value", "Min. expression",
            "Max. expression", "Mean expression", "Std. dev.",
            "Missing values?"
        ]
        fm = fem.get('data')
        efem['data']['row_ids'] = fm.get('row_ids')
        efem['data']['values'] = []
        n_efem_rows = len(efem['data']['row_ids'])
        fvals = fm.get('values')
        if (len(fvals) != n_efem_rows):
            raise Exception(
                "length discrepancy in filtered expression matrix: {0} row_ids but {1} values"
                .format(n_efem_rows, len(fvals)))

        # Get genome object and feature descriptions as a handy feature-indexed dict

        # moved from constructor
        gaa = GenomeAnnotationAPI(self.serviceWizardURL, token=tok)
        feat_dict = gaa.get_feature_functions({
            'ref': fem.get('genome_ref'),
            'feature_id_list': None
        })

        # if this FEM has a "resolved_ws_objects" record in its provenance,
        # then that should be a list of one DEM reference from which we get the FC and q values
        # as a feature (=row_id) -indexed dict.

        if fem.get('diff_expr_matrix_ref'):
            dem_ref = fem.get('diff_expr_matrix_ref')
            dem_obj_ret = self.ws_client.get_objects2(
                {'objects': [{
                    'ref': dem_ref
                }]})['data'][0]

            dem = dem_obj_ret.get('data')
            dem_dict = self.convert_dem_to_dict(
                dem.get('data'))  # convert to dictionary for quick lookups
        else:
            dem_dict = {}  # empty dictionary

        # for each row

        for row_id, fm_val_row in zip(fm.get('row_ids'), fvals):

            # make a new row with NA for description, FC and q

            new_values_row = ['NA', 'NA', 'NA'] + self.get_matrix_stats(fm_val_row)

            # if we have a description for this feature (row_id) put it in the first column

            desc = feat_dict.get(row_id)
            if desc:
                new_values_row[0] = desc  # leave as 'NA' if no entry in feat_dict

            # if we have a DEM entry for this row, put FC and q into 2nd and 3rd columns
            d = dem_dict.get(row_id)
            if d:
                new_values_row[1], new_values_row[2] = d

            # finally, add this row to the eFEM

            efem['data']['values'].append(new_values_row)

        return efem
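
# Each row of the enhanced FEM built above follows the col_ids order
# [description, fold-change, q-value, min, max, mean, std_dev,
# is_missing_values], e.g. (sketch) ['putative transporter', 1.5, 0.02,
# 0.8, 3.1, 1.9, 0.7, 'No']; description, fold change and q value stay 'NA'
# when the feature has no entry in the genome feature functions or in the
# linked differential expression matrix.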
Example 24
class ExprMatrixUtils:
    """
     Contains a set of functions for expression level calculations.
    """

    PARAM_IN_WS_NAME = 'workspace_name'
    PARAM_IN_OBJ_NAME = 'output_obj_name'
    PARAM_IN_EXPSET_REF = 'expressionset_ref'

    def __init__(self, config, logger=None):
        self.config = config
        self.logger = logger
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = os.path.join(config['scratch'], 'EM_' + str(uuid.uuid4()))
        self.ws_url = config['workspace-url']
        self.ws_client = Workspace(self.ws_url)
        self.dfu = DataFileUtil(self.callback_url)
        pass

    def process_params(self, params):
        """
        validates params passed to gen expression matrix method
        """
        for p in [self.PARAM_IN_EXPSET_REF,
                  self.PARAM_IN_OBJ_NAME,
                  self.PARAM_IN_WS_NAME
                 ]:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        ws_name_id = params.get(self.PARAM_IN_WS_NAME)
        if not isinstance(ws_name_id, int):
            try:
                ws_name_id = self.dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)
        self.ws_id = ws_name_id

    def get_expressionset_data(self, expressionset_ref):

        expr_set_obj = self.ws_client.get_objects2(
            {'objects': [{'ref': expressionset_ref}]})['data'][0]

        expr_set_obj_type = expr_set_obj.get('info')[2]
        expr_set_data = dict()
        expr_set_data['ws_name'] = expr_set_obj.get('info')[7]
        expr_set_data['obj_name'] = expr_set_obj.get('info')[1]

        if re.match('KBaseRNASeq.RNASeqExpressionSet-\d.\d', expr_set_obj_type):
            expr_set_data['genome_ref'] = expr_set_obj['data']['genome_id']
            expr_obj_refs = list()
            for expr_obj in expr_set_obj['data']['mapped_expression_ids']:
                expr_obj_refs.append(list(expr_obj.values())[0])
            expr_set_data['expr_obj_refs'] = expr_obj_refs

        elif re.match('KBaseSets.ExpressionSet-\d.\d', expr_set_obj_type):
            items = expr_set_obj.get('data').get('items')
            expr_obj_refs = list()
            for item in items:
                expr_obj_refs.append(item['ref'])
            expr_obj = self.ws_client.get_objects2(
                {'objects': [{'ref': expr_obj_refs[0]}]})['data'][0]
            expr_set_data['genome_ref'] = expr_obj['data']['genome_id']
            expr_set_data['expr_obj_refs'] = expr_obj_refs
        else:
            raise TypeError(self.PARAM_IN_EXPSET_REF + ' should be of type ' +
                            'KBaseRNASeq.RNASeqExpressionSet ' +
                            'or KBaseSets.ExpressionSet')
        return expr_set_data

    def save_expression_matrix(self, tables, expr_set_data, em_obj_name, hidden=0):

        # build a dictionary of keys only which is a union of all row ids (gene_ids)
        all_rows = {}
        self.logger.info('***** length of tables is {0}'.format(len(tables)))
        for table in tables:
            for r in table.keys():
                all_rows[r] = []

        for gene_id in all_rows.keys():
            row = []
            for table in tables:
                if gene_id in table:
                    row.append(table[gene_id])
                else:
                    row.append(0)
            all_rows[gene_id] = row

        em_data = {
                    'genome_ref': expr_set_data['genome_ref'],
                    'scale': 'log2',
                    'type': 'level',
                    'data': {
                            'row_ids': [],
                            'values': [],
                            'col_ids': expr_set_data['expr_obj_names']
                            },
                    'feature_mapping' : {},
                    'condition_mapping': expr_set_data['condition_map']
                   }

        # we need to load row-by-row to preserve the order
        self.logger.info('loading expression matrix data')

        for gene_id in all_rows.keys():
            em_data['feature_mapping'][gene_id] = gene_id
            em_data['data']['row_ids'].append(gene_id)
            em_data['data']['values'].append(all_rows[gene_id])

        try:
            self.logger.info( 'saving em_data em_name {0}'.format(em_obj_name))
            obj_info = self.dfu.save_objects({'id': self.ws_id,
                                              'objects': [
                                                          { 'type': 'KBaseFeatureValues.ExpressionMatrix',
                                                            'data': em_data,
                                                            'name': em_obj_name,
                                                            'hidden': hidden,
                                                            'extra_provenance_input_refs': [
                                                                em_data.get('genome_ref'),
                                                                self.params[self.PARAM_IN_EXPSET_REF]]
                                                          }
                                                    ]})[0]
            self.logger.info('ws save return:\n' + pformat(obj_info))
        except Exception as e:
            self.logger.exception(e)
            raise Exception('Failed Saving Expression Matrix to Workspace')

        return str(obj_info[6]) + '/' + str(obj_info[0]) + '/' + str(obj_info[4])
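
    # Example (sketch): for tables = [{'g1': 1.2}, {'g1': 2.3, 'g2': 0.5}]
    # the union of row ids is {'g1', 'g2'} and the assembled rows are
    # 'g1' -> [1.2, 2.3] and 'g2' -> [0, 0.5] (0 fills genes missing from a
    # table); these rows become the ExpressionMatrix 'values' saved above.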

    def get_expression_matrix(self, params):

        self.process_params(params)
        self.params = params

        expressionset_ref = params.get(self.PARAM_IN_EXPSET_REF)

        expr_set_data = self.get_expressionset_data(expressionset_ref)
        expr_obj_names = list()
        fpkm_tables = list()
        tpm_tables = list()
        condition_map = dict()
        tpm_table = None
        for expr_obj_ref in expr_set_data['expr_obj_refs']:
            try:
                self.logger.info('*** getting expression set {0} from workspace ****'
                                 .format(expr_obj_ref))

                expr = self.ws_client.get_objects2(
                                            {'objects':
                                            [{'ref': expr_obj_ref}]})['data'][0]

            except Exception as e:
                self.logger.exception(e)
                raise Exception('Unable to download expression object {0} from workspace {1}'.
                                format(expr_obj_ref, expr_set_data['ws_name']))

            expr_name = expr.get('info')[1]
            expr_obj_names.append(expr_name)
            condition_map.update({expr_name: expr.get('data').get('condition')})
            num_interp = expr.get('data').get('numerical_interpretation')
            if num_interp != 'FPKM':
                raise Exception(
                    'Did not get expected FPKM value from numerical_interpretation key of '
                    'Expression object {0}, instead got {1}'.format(expr_obj_ref, num_interp))

            pr_comments = expr.get('data').get('processing_comments', None)  # log2 Normalized
            if pr_comments is not None:
                self.logger.info('pr_comments are {0}'.format(pr_comments))

            fpkm_table = expr.get('data').get('expression_levels') # QUESTION: is this really FPKM levels?
            self.logger.info('FPKM keycount: {0}'.format(len(fpkm_table.keys())))
            fpkm_tables.append(fpkm_table)

            tpm_table = None  # Cufflinks doesn't generate TPM
            if 'tpm_expression_levels' in expr['data']:  # so we need to check for this key
                tpm_table = expr.get('data').get('tpm_expression_levels')
                self.logger.info('TPM keycount: {0}'.format(len(tpm_table.keys())))
                tpm_tables.append(tpm_table)

        expr_set_data['expr_obj_names'] = expr_obj_names
        expr_set_data['condition_map'] = condition_map
        output_obj_name = params.get(self.PARAM_IN_OBJ_NAME)
        fpkm_ref = self.save_expression_matrix(fpkm_tables,
                                               expr_set_data,
                                               '{0}_FPKM_ExpressionMatrix'.format(output_obj_name))
        tpm_ref = None
        if tpm_table is not None:
            tpm_ref = self.save_expression_matrix(tpm_tables,
                                                  expr_set_data,
                                                  '{0}_TPM_ExpressionMatrix'.format(output_obj_name))
        return fpkm_ref, tpm_ref
Example 25
def fetch_reads_refs_from_sampleset(ref, ws_url, callback_url):
    """
    From the given object ref, return a list of all reads objects that are a part of that
    object. E.g., if ref is a ReadsSet, return a list of all PairedEndLibrary or SingleEndLibrary
    refs that are a member of that ReadsSet. This is returned as a list of dictionaries as follows:
    {
        "ref": reads object reference,
        "condition": condition string associated with that reads object,
        "name": reads object name (needed for saving an AlignmentSet)
    }
    The only required key is "ref"; all other keys may or may not be present, depending on the
    reads object or the type of the initial ref. E.g., an RNASeqSampleSet might have condition info
    for each reads object, but a single PairedEndLibrary may not have that info.

    If ref is already a Reads library, just returns a list with ref as a single element.
    """
    obj_type = get_object_type(ref, ws_url)
    refs = list()
    if "KBaseSets.ReadsSet" in obj_type:
        print("Looking up reads references in ReadsSet object")
        set_client = SetAPI(callback_url)
        reads_set = set_client.get_reads_set_v1({
            "ref": ref,
            "include_item_info": 0
        })
        print("Got results from ReadsSet object")
        pprint(reads_set)
        ref_list = [r["ref"] for r in reads_set["data"]["items"]]
        reads_names = get_object_names(ref_list, ws_url)
        for reads in reads_set["data"]["items"]:
            ref = reads["ref"]
            refs.append({
                "ref": ref,
                "condition": reads["label"],
                "name": reads_names[ref]
            })

    elif "KBaseRNASeq.RNASeqSampleSet" in obj_type:
        print("Looking up reads references in RNASeqSampleSet object")
        ws = Workspace(ws_url)
        sample_set = ws.get_objects2({"objects": [{
            "ref": ref
        }]})["data"][0]["data"]
        sample_names = get_object_names(sample_set["sample_ids"], ws_url)
        for i in range(len(sample_set["sample_ids"])):
            ref = sample_set["sample_ids"][i]
            refs.append({
                "ref": ref,
                "condition": sample_set["condition"][i],
                "name": sample_names[ref]
            })

    elif ("KBaseAssembly.SingleEndLibrary" in obj_type
          or "KBaseFile.SingleEndLibrary" in obj_type
          or "KBaseAssembly.PairedEndLibrary" in obj_type
          or "KBaseFile.PairedEndLibrary" in obj_type):
        refs.append({"ref": ref, "name": get_object_names([ref], ws_url)[ref]})
    else:
        raise ValueError("Unable to fetch reads reference from object {} "
                         "which is a {}".format(ref, obj_type))

    return refs
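
# Compared with the variant shown earlier (which decorates each entry with
# the object info tuple and an alignment output name), this version only adds
# the object 'name' (and 'condition' where the source object provides one),
# e.g. (sketch) {'ref': 'ws_id/obj_id', 'condition': 'wt', 'name': 'my_reads'}.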
Example 26
    def exec_remove_adapters(self, ctx, params):
        """
        :param params: instance of type "RemoveAdaptersParams" -> structure:
           parameter "output_workspace" of String, parameter
           "output_object_name" of String, parameter "input_reads" of type
           "ws_ref" (@ref ws), parameter "five_prime" of type
           "FivePrimeOptions" (unfortunately, we have to name the fields
           uniquely between 3' and 5' options due to the current
           implementation of grouped parameters) -> structure: parameter
           "adapter_sequence_5P" of String, parameter "anchored_5P" of type
           "boolean" (@range (0, 1)), parameter "three_prime" of type
           "ThreePrimeOptions" -> structure: parameter "adapter_sequence_3P"
           of String, parameter "anchored_3P" of type "boolean" (@range (0,
           1)), parameter "error_tolerance" of Double, parameter
           "min_overlap_length" of Long, parameter "min_read_length" of Long,
           parameter "discard_untrimmed" of type "boolean" (@range (0, 1))
        :returns: instance of type "exec_RemoveAdaptersResult" -> structure:
           parameter "report" of String, parameter "output_reads_ref" of
           String
        """
        # ctx is the context object
        # return variables are: result
        #BEGIN exec_remove_adapters
        console = []
        self.log(console, 'Running exec_remove_adapters() with parameters: ')
        self.log(console, "\n" + pformat(params))
        self.log(console, "-----------------------------------------------\n")
        report = ''
        returnVal = dict()
        returnVal['output_reads_ref'] = None

        token = ctx['token']
        wsClient = workspaceService(self.config['workspace-url'], token=token)
        ws = Workspace(self.config['workspace-url'], token=token)
        #setAPI_Client = SetAPI (url=self.config['SDK_CALLBACK_URL'], token=token) # for SDK local, doesn't work for SetAPI
        setAPI_Client = SetAPI(url=self.config['service-wizard-url'],
                               token=token)  # for dynamic service
        headers = {'Authorization': 'OAuth ' + token}
        env = os.environ.copy()
        env['KB_AUTH_TOKEN'] = token

        # 0. param checks
        required_params = [
            'output_workspace', 'input_reads', 'output_object_name'
        ]
        for arg in required_params:
            if arg not in params or params[arg] == None or params[arg] == '':
                raise ValueError("Must define required param: '" + arg + "'")

        # 1. load provenance
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        # add additional info to provenance here, in this case the input data object reference
        provenance[0]['input_ws_objects'] = [str(params['input_reads'])]

        # 2. Determine whether read library, ReadsSet or RNASeqSampleSet is input object
        #
        try:
            # object_info tuple
            [
                OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
                WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
            ] = range(11)

            input_reads_obj_info = wsClient.get_object_info_new(
                {'objects': [{
                    'ref': params['input_reads']
                }]})[0]
            input_reads_obj_type = input_reads_obj_info[TYPE_I]
            input_reads_obj_type = re.sub(
                '-[0-9]+\.[0-9]+$', "",
                input_reads_obj_type)  # remove trailing version
            #input_reads_obj_version = input_reads_obj_info[VERSION_I]  # this is object version, not type version
        except Exception as e:
            raise ValueError(
                'Unable to get read library object from workspace: (' +
                str(params['input_reads']) + ')' + str(e))

        acceptable_types = [
            "KBaseSets.ReadsSet", "KBaseRNASeq.RNASeqSampleSet",
            "KBaseFile.PairedEndLibrary", "KBaseFile.SingleEndLibrary",
            "KBaseAssembly.PairedEndLibrary", "KBaseAssembly.SingleEndLibrary"
        ]
        if input_reads_obj_type not in acceptable_types:
            raise ValueError("Input reads of type: '" + input_reads_obj_type +
                             "'.  Must be one of " +
                             ", ".join(acceptable_types))

        # 3. Retrieve the set details
        #
        readsSet_ref_list = []
        readsSet_names_list = []
        readsSet_types_list = []
        if "KBaseSets.ReadsSet" in input_reads_obj_type:
            try:
                input_readsSet_obj = setAPI_Client.get_reads_set_v1({
                    'ref':
                    params['input_reads'],
                    'include_item_info':
                    1
                })

            except Exception as e:
                raise ValueError(
                    'SetAPI FAILURE: Unable to get read library set object from workspace: ('
                    + str(params['input_reads']) + ")\n" + str(e))
            for readsLibrary_obj in input_readsSet_obj['data']['items']:
                readsSet_ref_list.append(readsLibrary_obj['ref'])
                NAME_I = 1
                TYPE_I = 2
                readsSet_names_list.append(readsLibrary_obj['info'][NAME_I])
                this_type = readsLibrary_obj['info'][TYPE_I]
                this_type = re.sub('-[0-9]+\.[0-9]+$', "",
                                   this_type)  # remove trailing version
                readsSet_types_list.append(this_type)

        elif "KBaseRNASeq.RNASeqSampleSet" in input_reads_obj_type:
            sample_set = ws.get_objects2(
                {"objects": [{
                    "ref": params['input_reads']
                }]})["data"][0]["data"]
            sample_refs = list()
            for i in range(len(sample_set["sample_ids"])):
                readsSet_ref_list.append(sample_set["sample_ids"][i])
                sample_refs.append({"ref": sample_set["sample_ids"][i]})

            info = ws.get_object_info3({"objects": sample_refs})
            for j in range(len(info["infos"])):
                NAME_I = 1
                TYPE_I = 2
                readsSet_names_list.append(info["infos"][j][NAME_I])
                sample_type = info["infos"][j][TYPE_I]
                sample_type = re.sub(r'-[0-9]+\.[0-9]+$', "",
                                     sample_type)  # remove trailing version
                readsSet_types_list.append(sample_type)
        else:
            readsSet_ref_list = [params['input_reads']]
            readsSet_names_list = [params['output_object_name']]
            readsSet_types_list = [input_reads_obj_type]

        # 4. Iterate through readsLibrary members of the set
        #
        report = ''
        cutadapt_readsSet_ref = None
        cutadapt_readsLib_refs = []

        for reads_item_i, input_reads_library_ref in enumerate(
                readsSet_ref_list):
            exec_remove_adapters_OneLibrary_params = {
                'output_workspace': params['output_workspace'],
                'input_reads': input_reads_library_ref,
                'reads_type': readsSet_types_list[reads_item_i]
            }
            if (input_reads_obj_type != "KBaseSets.ReadsSet"
                    and input_reads_obj_type != "KBaseRNASeq.RNASeqSampleSet"):
                exec_remove_adapters_OneLibrary_params[
                    'output_object_name'] = params['output_object_name']
            else:
                exec_remove_adapters_OneLibrary_params[
                    'output_object_name'] = readsSet_names_list[
                        reads_item_i] + "_cutadapt"

            optional_params = [
                'error_tolerance', 'min_overlap_length',
                'min_read_length', 'discard_untrimmed'
            ]
            optional_g_params = {
                'five_prime': ['adapter_sequence_5P', 'anchored_5P'],
                'three_prime': ['adapter_sequence_3P', 'anchored_3P']
            }
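            # For illustration (hypothetical values): a grouped param such as
            # {'three_prime': {'adapter_sequence_3P': 'AGATCGGAAGAGC', 'anchored_3P': 0}}
            # is copied field-by-field into exec_remove_adapters_OneLibrary_params below.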
            for arg in optional_params:
                if arg in params and params[arg] is not None:
                    exec_remove_adapters_OneLibrary_params[arg] = params[arg]

            for group in optional_g_params.keys():
                if group in params and params[group] is not None:
                    exec_remove_adapters_OneLibrary_params[group] = dict()
                    for arg in optional_g_params[group]:
                        if arg in params[group] and params[group][arg] is not None:
                            exec_remove_adapters_OneLibrary_params[group][
                                arg] = params[group][arg]

            msg = "\n\nRUNNING exec_remove_adapters_OneLibrary() ON LIBRARY: " + str(
                input_reads_library_ref) + " " + str(
                    readsSet_names_list[reads_item_i]) + "\n"
            msg += "----------------------------------------------------------------------------\n"
            report += msg
            self.log(console, msg)

            # RUN
            exec_remove_adapters_OneLibrary_retVal = self.exec_remove_adapters_OneLibrary(
                ctx, exec_remove_adapters_OneLibrary_params)[0]

            report += exec_remove_adapters_OneLibrary_retVal['report'] + "\n\n"
            cutadapt_readsLib_refs.append(
                exec_remove_adapters_OneLibrary_retVal['output_reads_ref'])

        # 5. Conclude
        # Just one Library
        if (input_reads_obj_type != "KBaseSets.ReadsSet"
                and input_reads_obj_type != "KBaseRNASeq.RNASeqSampleSet"):

            # create return output object
            result = {
                'report': report,
                'output_reads_ref': cutadapt_readsLib_refs[0],
            }
        # ReadsSet or SampleSet
        else:
            # save cutadapt readsSet
            some_cutadapt_output_created = False
            items = []
            for i, lib_ref in enumerate(cutadapt_readsLib_refs):

                if lib_ref is None:
                    #items.append(None)  # can't have 'None' items in ReadsSet
                    continue
                else:
                    some_cutadapt_output_created = True
                    try:
                        label = input_readsSet_obj['data']['items'][i]['label']
                    except Exception:
                        NAME_I = 1
                        label = ws.get_object_info3(
                            {'objects': [{
                                'ref': lib_ref
                            }]})['infos'][0][NAME_I]
                    label = label + "_cutadapt"

                    items.append({
                        'ref': lib_ref,
                        'label': label
                        #'data_attachment': ,
                        #'info':
                    })
            if some_cutadapt_output_created:
                reads_desc_ext = " + Cutadapt"
                #reads_name_ext = "_cutadapt"
                descText = ""
                reads_name_ext = ""
                try:
                    descText = input_readsSet_obj['data']['description']
                except Exception:
                    NAME_I = 1
                    descText = ws.get_object_info3(
                        {'objects': [{
                            'ref': params['input_reads']
                        }]})['infos'][0][NAME_I]
                descText = descText + reads_desc_ext

                output_readsSet_obj = {'description': descText, 'items': items}
                output_readsSet_name = str(
                    params['output_object_name']) + reads_name_ext
                cutadapt_readsSet_ref = setAPI_Client.save_reads_set_v1({
                    'workspace_name':
                    params['output_workspace'],
                    'output_object_name':
                    output_readsSet_name,
                    'data':
                    output_readsSet_obj
                })['set_ref']
            else:
                raise ValueError("No cutadapt output created")

            # create return output object
            result = {
                'report': report,
                'output_reads_ref': cutadapt_readsSet_ref
            }
        #END exec_remove_adapters

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method exec_remove_adapters return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]
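    # Hypothetical usage sketch (illustrative values only, not from the source):
    # params = {
    #     'input_reads': '12345/6/1',            # UPA of a reads library, ReadsSet or RNASeqSampleSet
    #     'output_workspace': 'my_workspace',
    #     'output_object_name': 'my_reads_cutadapt',
    #     'three_prime': {'adapter_sequence_3P': 'AGATCGGAAGAGC', 'anchored_3P': 0},
    # }
    # result = self.exec_remove_adapters(ctx, params)[0]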
Example 27
class TaxonAPI:
    '''
    Module Name:
    TaxonAPI

    Module Description:
    A KBase module: TaxonAPI
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "1.0.1"
    GIT_URL = "git@github.com:kbase/taxon_api.git"
    GIT_COMMIT_HASH = "de62da0a5e2d7e098927862ec0e0c09be1103f68"

    #BEGIN_CLASS_HEADER
    _GENOME_TYPES = [
        'KBaseGenomes.Genome', 'KBaseGenomeAnnotations.GenomeAnnotation'
    ]
    _TAXON_TYPES = ['KBaseGenomeAnnotations.Taxon']

    @functools32.lru_cache(maxsize=1000)
    def get_object(self, ref):
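        # functools32 backports functools.lru_cache to Python 2; results are
        # cached per object reference to avoid repeated workspace fetches.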
        res = self.ws.get_objects2({'objects': [{'ref': ref}]})['data'][0]
        return res

    def get_data(self, ref):
        obj = self.get_object(ref)
        return obj['data']

    @functools32.lru_cache(maxsize=1000)
    def translate_to_MD5_types(self, ktype):
        return self.ws.translate_to_MD5_types([ktype]).values()[0]

    def get_referrers(self, ref):
        referrers = self.ws.list_referencing_objects([{"ref": ref}])[0]
        object_refs_by_type = dict()
        tlist = []
        for x in referrers:
            tlist.append(x[2])
        typemap = self.ws.translate_to_MD5_types(tlist)
        for x in referrers:
            typestring = typemap[x[2]]
            if typestring not in object_refs_by_type:
                object_refs_by_type[typestring] = list()
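            # build the versioned reference (UPA) as workspace_id/object_id/version
            # from the object_info tuple (indices 6, 0 and 4)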
            upa = '%d/%d/%d' % (x[6], x[0], x[4])
            object_refs_by_type[typestring].append(upa)
        return object_refs_by_type

    def get_reffers_type(self, ref, types):
        referrers = self.get_referrers(ref)
        children = list()
        for object_type in referrers:
            if object_type.split('-')[0] in types:
                children.extend(referrers[object_type])

        return children

    def make_hash(self, i):
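        # i is the 11-element workspace object_info tuple; index 10 holds the
        # user metadata dict (normalized to None below when empty)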
        omd = i[10]
        if i[10] == {}:
            omd = None

        return {
            'type_string': i[2],
            'workspace_id': i[6],
            'object_checksum': i[8],
            'object_reference': '%d/%d' % (i[6], i[0]),
            'object_size': i[9],
            'saved_by': i[5],
            'object_id': i[0],
            'save_date': i[3],
            'object_metadata': omd,
            'object_name': i[1],
            'version': i[4],
            'workspace_name': i[7],
            'object_reference_versioned': '%d/%d/%d' % (i[6], i[0], i[4])
        }

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.workspaceURL = config['workspace-url']
        self.ws = Workspace(self.workspaceURL)
        self.shockURL = config['shock-url']
        self.logger = logging.getLogger()
        log_handler = logging.StreamHandler()
        log_handler.setFormatter(
            logging.Formatter("%(asctime)s [%(levelname)s] %(message)s"))
        self.logger.addHandler(log_handler)

        #END_CONSTRUCTOR
        pass

    def get_parent(self, ctx, ref):
        """
        Retrieve parent Taxon.
        @return Reference to parent Taxon.
        :param ref: instance of type "ObjectReference"
        :returns: instance of type "ObjectReference"
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_parent
        data = self.get_data(ref)
        try:
            returnVal = data['parent_taxon_ref']
            # returnVal=taxon_api.get_parent(ref_only=True)
        except Exception:
            returnVal = ''
        #END get_parent

        # At some point might do deeper type checking...
        if not isinstance(returnVal, basestring):
            raise ValueError('Method get_parent return value ' +
                             'returnVal is not type basestring as required.')
        # return the results
        return [returnVal]

    def get_children(self, ctx, ref):
        """
        Retrieve children Taxon.
        @return List of references to child Taxons.
        :param ref: instance of type "ObjectReference"
        :returns: instance of list of type "ObjectReference"
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_children
        returnVal = self.get_reffers_type(ref, self._TAXON_TYPES)
        #END get_children

        # At some point might do deeper type checking...
        if not isinstance(returnVal, list):
            raise ValueError('Method get_children return value ' +
                             'returnVal is not type list as required.')
        # return the results
        return [returnVal]

    def get_genome_annotations(self, ctx, ref):
        """
        Retrieve the GenomeAnnotation(s) that refer to this Taxon.
         If this is accessing a KBaseGenomes.Genome object, it will
         return an empty list (this information is not available).
         @return List of references to GenomeAnnotation objects.
        :param ref: instance of type "ObjectReference"
        :returns: instance of list of type "ObjectReference"
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_genome_annotations
        returnVal = self.get_reffers_type(ref, self._GENOME_TYPES)
        #END get_genome_annotations

        # At some point might do deeper type checking...
        if not isinstance(returnVal, list):
            raise ValueError('Method get_genome_annotations return value ' +
                             'returnVal is not type list as required.')
        # return the results
        return [returnVal]

    def get_scientific_lineage(self, ctx, ref):
        """
        Retrieve the scientific lineage.
        @return Strings for each 'unit' of the lineage, ordered in
          the usual way from Domain to Kingdom to Phylum, etc.
        :param ref: instance of type "ObjectReference"
        :returns: instance of list of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_scientific_lineage
        o = self.ws.get_objects2({'objects': [{
            'ref': ref
        }]})['data'][0]['data']
        returnVal = [x.strip() for x in o['scientific_lineage'].split(";")]
        #END get_scientific_lineage

        # At some point might do deeper type checking...
        if not isinstance(returnVal, list):
            raise ValueError('Method get_scientific_lineage return value ' +
                             'returnVal is not type list as required.')
        # return the results
        return [returnVal]

    def get_scientific_name(self, ctx, ref):
        """
        Retrieve the scientific name.
        @return The scientific name, e.g., "Escherichia Coli K12 str. MG1655"
        :param ref: instance of type "ObjectReference"
        :returns: instance of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_scientific_name
        obj = self.ws.get_objects2({'objects': [{
            'ref': ref
        }]})['data'][0]['data']
        returnVal = obj['scientific_name']
        #END get_scientific_name

        # At some point might do deeper type checking...
        if not isinstance(returnVal, basestring):
            raise ValueError('Method get_scientific_name return value ' +
                             'returnVal is not type basestring as required.')
        # return the results
        return [returnVal]

    def get_taxonomic_id(self, ctx, ref):
        """
        Retrieve the NCBI taxonomic ID of this Taxon.
        For type KBaseGenomes.Genome, the ``source_id`` will be returned.
        @return Integer taxonomic ID.
        :param ref: instance of type "ObjectReference"
        :returns: instance of Long
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_taxonomic_id
        obj = self.ws.get_objects2({'objects': [{
            'ref': ref
        }]})['data'][0]['data']
        returnVal = obj['taxonomy_id']
        #END get_taxonomic_id

        # At some point might do deeper type checking...
        if not isinstance(returnVal, int):
            raise ValueError('Method get_taxonomic_id return value ' +
                             'returnVal is not type int as required.')
        # return the results
        return [returnVal]

    def get_kingdom(self, ctx, ref):
        """
        Retrieve the kingdom.
        :param ref: instance of type "ObjectReference"
        :returns: instance of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_kingdom
        obj = self.ws.get_objects2({'objects': [{
            'ref': ref
        }]})['data'][0]['data']
        returnVal = obj['kingdom']
        #END get_kingdom

        # At some point might do deeper type checking...
        if not isinstance(returnVal, basestring):
            raise ValueError('Method get_kingdom return value ' +
                             'returnVal is not type basestring as required.')
        # return the results
        return [returnVal]

    def get_domain(self, ctx, ref):
        """
        Retrieve the domain.
        :param ref: instance of type "ObjectReference"
        :returns: instance of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_domain
        obj = self.ws.get_objects2({'objects': [{
            'ref': ref
        }]})['data'][0]['data']
        returnVal = obj['domain']
        #END get_domain

        # At some point might do deeper type checking...
        if not isinstance(returnVal, basestring):
            raise ValueError('Method get_domain return value ' +
                             'returnVal is not type basestring as required.')
        # return the results
        return [returnVal]

    def get_genetic_code(self, ctx, ref):
        """
        Retrieve the genetic code.
        :param ref: instance of type "ObjectReference"
        :returns: instance of Long
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_genetic_code
        obj = self.ws.get_objects2({'objects': [{
            'ref': ref
        }]})['data'][0]['data']
        returnVal = obj['genetic_code']
        #END get_genetic_code

        # At some point might do deeper type checking...
        if not isinstance(returnVal, int):
            raise ValueError('Method get_genetic_code return value ' +
                             'returnVal is not type int as required.')
        # return the results
        return [returnVal]

    def get_aliases(self, ctx, ref):
        """
        Retrieve the aliases.
        :param ref: instance of type "ObjectReference"
        :returns: instance of list of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_aliases
        obj = self.ws.get_objects2({'objects': [{
            'ref': ref
        }]})['data'][0]['data']
        if 'aliases' in obj:
            returnVal = obj['aliases']
        else:
            returnVal = list()
        #END get_aliases

        # At some point might do deeper type checking...
        if not isinstance(returnVal, list):
            raise ValueError('Method get_aliases return value ' +
                             'returnVal is not type list as required.')
        # return the results
        return [returnVal]

    def get_info(self, ctx, ref):
        """
        Retrieve object info.
        @skip documentation
        :param ref: instance of type "ObjectReference"
        :returns: instance of type "ObjectInfo" (* @skip documentation) ->
           structure: parameter "object_id" of Long, parameter "object_name"
           of String, parameter "object_reference" of String, parameter
           "object_reference_versioned" of String, parameter "type_string" of
           String, parameter "save_date" of String, parameter "version" of
           Long, parameter "saved_by" of String, parameter "workspace_id" of
           Long, parameter "workspace_name" of String, parameter
           "object_checksum" of String, parameter "object_size" of Long,
           parameter "object_metadata" of mapping from String to String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_info
        # returnVal = self.ws.get_objects2({'objects': [{'ref': ref}]})['data'][0]['info']
        i = self.get_object(ref)['info']
        #md5_typestr = self.ws.translate_to_MD5_types([i[2]]).values()[0]
        returnVal = self.make_hash(i)
        #END get_info

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method get_info return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def get_history(self, ctx, ref):
        """
        Retrieve object history.
        @skip documentation
        :param ref: instance of type "ObjectReference"
        :returns: instance of type "ObjectHistory" (* @skip documentation) ->
           list of type "ObjectInfo" (* @skip documentation) -> structure:
           parameter "object_id" of Long, parameter "object_name" of String,
           parameter "object_reference" of String, parameter
           "object_reference_versioned" of String, parameter "type_string" of
           String, parameter "save_date" of String, parameter "version" of
           Long, parameter "saved_by" of String, parameter "workspace_id" of
           Long, parameter "workspace_name" of String, parameter
           "object_checksum" of String, parameter "object_size" of Long,
           parameter "object_metadata" of mapping from String to String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_history
        # returnVal = self.ws.get_object_history({'ref': ref})
        returnVal = []
        for i in self.ws.get_object_history({'ref': ref}):
            returnVal.append(self.make_hash(i))
        #END get_history

        # At some point might do deeper type checking...
        if not isinstance(returnVal, list):
            raise ValueError('Method get_history return value ' +
                             'returnVal is not type list as required.')
        # return the results
        return [returnVal]

    def get_provenance(self, ctx, ref):
        """
        Retrieve object provenance.
        @skip documentation
        :param ref: instance of type "ObjectReference"
        :returns: instance of type "ObjectProvenance" (* @skip documentation)
           -> list of type "ObjectProvenanceAction" (* @skip documentation)
           -> structure: parameter "time" of String, parameter "service_name"
           of String, parameter "service_version" of String, parameter
           "service_method" of String, parameter "method_parameters" of list
           of String, parameter "script_name" of String, parameter
           "script_version" of String, parameter "script_command_line" of
           String, parameter "input_object_references" of list of String,
           parameter "validated_object_references" of list of String,
           parameter "intermediate_input_ids" of list of String, parameter
           "intermediate_output_ids" of list of String, parameter
           "external_data" of list of type "ExternalDataUnit" (* @skip
           documentation) -> structure: parameter "resource_name" of String,
           parameter "resource_url" of String, parameter "resource_version"
           of String, parameter "resource_release_date" of String, parameter
           "data_url" of String, parameter "data_id" of String, parameter
           "description" of String, parameter "description" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_provenance
        prov = self.ws.get_object_provenance([{"ref": ref}])[0]['provenance']
        returnVal = []
        copy_keys = {
            "time": "time",
            "service": "service_name",
            "service_ver": "service_version",
            "method": "service_method",
            "method_params": "method_parameters",
            "script": "script_name",
            "script_ver": "script_version",
            "script_command_line": "script_command_line",
            "input_ws_objects": "input_object_references",
            "resolved_ws_objects": "validated_object_references",
            "intermediate_incoming": "intermediate_input_ids",
            "intermediate_outgoing": "intermediate_output_ids",
            "external_data": "external_data",
            "description": "description"
        }

        for object_provenance in prov:
            action = dict()

            for k in copy_keys:
                if k in object_provenance:
                    if isinstance(object_provenance[k], list) and len(
                            object_provenance[k]) == 0:
                        continue

                    action[copy_keys[k]] = object_provenance[k]

            returnVal.append(action)
        #END get_provenance

        # At some point might do deeper type checking...
        if not isinstance(returnVal, list):
            raise ValueError('Method get_provenance return value ' +
                             'returnVal is not type list as required.')
        # return the results
        return [returnVal]

    def get_id(self, ctx, ref):
        """
        Retrieve object identifier.
        @skip documentation
        :param ref: instance of type "ObjectReference"
        :returns: instance of Long
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_id
        returnVal = self.get_object(ref)['info'][0]
        #END get_id

        # At some point might do deeper type checking...
        if not isinstance(returnVal, int):
            raise ValueError('Method get_id return value ' +
                             'returnVal is not type int as required.')
        # return the results
        return [returnVal]

    def get_name(self, ctx, ref):
        """
        Retrieve object name.
        @skip documentation
        :param ref: instance of type "ObjectReference"
        :returns: instance of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_name
        returnVal = self.get_object(ref)['info'][1]
        #END get_name

        # At some point might do deeper type checking...
        if not isinstance(returnVal, basestring):
            raise ValueError('Method get_name return value ' +
                             'returnVal is not type basestring as required.')
        # return the results
        return [returnVal]

    def get_version(self, ctx, ref):
        """
        Retrieve object version.
        @skip documentation
        :param ref: instance of type "ObjectReference"
        :returns: instance of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_version
        returnVal = str(self.get_object(ref)['info'][4])
        #END get_version

        # At some point might do deeper type checking...
        if not isinstance(returnVal, basestring):
            raise ValueError('Method get_version return value ' +
                             'returnVal is not type basestring as required.')
        # return the results
        return [returnVal]

    def get_all_data(self, ctx, params):
        """
        :param params: instance of type "GetAllDataParams" -> structure:
           parameter "ref" of type "ObjectReference", parameter
           "include_decorated_scientific_lineage" of type "boolean" (A
           boolean. 0 = false, other = true.), parameter
           "include_decorated_children" of type "boolean" (A boolean. 0 =
           false, other = true.), parameter "exclude_children" of type
           "boolean" (A boolean. 0 = false, other = true.)
        :returns: instance of type "TaxonData" -> structure: parameter
           "parent" of type "ObjectReference", parameter "children" of list
           of type "ObjectReference", parameter "decorated_children" of list
           of type "TaxonInfo" -> structure: parameter "ref" of type
           "ObjectReference", parameter "scientific_name" of String,
           parameter "scientific_lineage" of list of String, parameter
           "decorated_scientific_lineage" of list of type "TaxonInfo" ->
           structure: parameter "ref" of type "ObjectReference", parameter
           "scientific_name" of String, parameter "scientific_name" of
           String, parameter "taxonomic_id" of Long, parameter "kingdom" of
           String, parameter "domain" of String, parameter "genetic_code" of
           Long, parameter "aliases" of list of String, parameter "obj_info"
           of type "ObjectInfo" (* @skip documentation) -> structure:
           parameter "object_id" of Long, parameter "object_name" of String,
           parameter "object_reference" of String, parameter
           "object_reference_versioned" of String, parameter "type_string" of
           String, parameter "save_date" of String, parameter "version" of
           Long, parameter "saved_by" of String, parameter "workspace_id" of
           Long, parameter "workspace_name" of String, parameter
           "object_checksum" of String, parameter "object_size" of Long,
           parameter "object_metadata" of mapping from String to String
        """
        # ctx is the context object
        # return variables are: d
        #BEGIN get_all_data
        d = {}
        ref = params['ref']

        obj = self.get_object(ref)
        data = obj['data']

        try:
            d['parent'] = data['parent_taxon_ref']
        except KeyError:
            print('Error getting parent for ' + ref)
            # +':\n'+ str(traceback.format_exc()))
            d['parent'] = None

        if 'exclude_children' in params and params['exclude_children'] == 1:
            pass
        else:
            d['children'] = self.get_reffers_type(ref, self._TAXON_TYPES)

        d['scientific_lineage'] = data['scientific_lineage']
        d['scientific_name'] = data['scientific_name']
        d['taxonomic_id'] = data['taxonomy_id']
        try:
            d['kingdom'] = data['kingdom']
            # throws error if not found, so catch and log it
        except KeyError:
            print('Error getting kingdom for ' + ref)
            # +':\n'+ str(traceback.format_exc()))
            d['kingdom'] = None

        d['domain'] = data['domain']
        d['genetic_code'] = data['genetic_code']
        d['aliases'] = None
        if 'aliases' in data:
            d['aliases'] = data['aliases']
        d['info'] = self.make_hash(obj['info'])

        key = 'include_decorated_scientific_lineage'
        if key in params and params[key] == 1:
            lineage = self.get_decorated_scientific_lineage(
                ctx, {'ref': ref})[0]
            d['decorated_scientific_lineage'] = lineage[
                'decorated_scientific_lineage']

        key = 'include_decorated_children'
        if key in params and params[key] == 1:
            children = self.get_decorated_children(ctx, {'ref': ref})[0]
            d['decorated_children'] = children['decorated_children']
        #END get_all_data

        # At some point might do deeper type checking...
        if not isinstance(d, dict):
            raise ValueError('Method get_all_data return value ' +
                             'd is not type dict as required.')
        # return the results
        return [d]

    def get_decorated_scientific_lineage(self, ctx, params):
        """
        :param params: instance of type "GetDecoratedScientificLineageParams"
           -> structure: parameter "ref" of type "ObjectReference"
        :returns: instance of type "DecoratedScientificLineage" (list starts
           at the root, and goes on down to this) -> structure: parameter
           "decorated_scientific_lineage" of list of type "TaxonInfo" ->
           structure: parameter "ref" of type "ObjectReference", parameter
           "scientific_name" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_decorated_scientific_lineage

        lineageList = []
        ref = params['ref']

        while True:
            parent_data = None
            try:
                # note: doesn't look like there is a way to get a reference
                # of a Taxon directly (without constructing it from
                # object_info), so first get reference, then instantiate
                # another API object
                parent_ref = self.get_data(ref)['parent_taxon_ref']
                if parent_ref is not None:
                    data = self.get_data(ref)
                    scientific_name = data['scientific_name']
                    if scientific_name != 'root':
                        parent_data = {
                            'ref': parent_ref,
                            'scientific_name': scientific_name
                        }
                        ref = parent_ref

            except KeyError:
                # case where parent is not found
                pass

            if parent_data is not None:
                lineageList.append(parent_data)
            else:
                break

        lineageList.reverse()  # reverse list to match scientific_lineage style
        returnVal = {'decorated_scientific_lineage': lineageList[:-1]}

        #END get_decorated_scientific_lineage

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError(
                'Method get_decorated_scientific_lineage return value ' +
                'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def get_decorated_children(self, ctx, params):
        """
        :param params: instance of type "GetDecoratedChildrenParams" ->
           structure: parameter "ref" of type "ObjectReference"
        :returns: instance of type "DecoratedChildren" -> structure:
           parameter "decorated_children" of list of type "TaxonInfo" ->
           structure: parameter "ref" of type "ObjectReference", parameter
           "scientific_name" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_decorated_children
        ref = params['ref']
        children_refs = self.get_reffers_type(ref, self._TAXON_TYPES)

        decorated_children = []
        for child_ref in children_refs:
            decorated_children.append({
                'ref':
                child_ref,
                'scientific_name':
                self.get_data(child_ref)['scientific_name']
            })

        returnVal = {'decorated_children': decorated_children}
        #END get_decorated_children

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method get_decorated_children return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
Example 28
class KnowledgeEngineAppsUtil:

    METRIC = [
        "braycurtis", "canberra", "chebyshev", "cityblock", "correlation",
        "cosine", "dice", "euclidean", "hamming", "jaccard", "kulsinski",
        "matching", "rogerstanimoto", "russellrao", "sokalmichener",
        "sokalsneath", "sqeuclidean", "yule"
    ]

    METHOD = [
        "single", "complete", "average", "weighted", "centroid", "median",
        "ward"
    ]

    CRITERION = ["inconsistent", "distance", "maxclust"]

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_run_pca_params(self, params):
        """
        _validate_run_pca_params:
            validates params passed to run_pca method
        """

        log('start validating run_pca params')

        # check for required parameters
        for p in ['cluster_set_ref', 'workspace_name', 'pca_matrix_name']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _validate_run_kmeans_cluster_params(self, params):
        """
        _validate_run_kmeans_cluster_params:
                validates params passed to run_kmeans_cluster method
        """

        log('start validating run_kmeans_cluster params')

        # check for required parameters
        for p in ['matrix_ref', 'workspace_name', 'cluster_set_name', 'k_num']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        # check metric validation
        metric = params.get('dist_metric')
        if metric and metric not in self.METRIC:
            error_msg = 'INPUT ERROR:\nInput metric function [{}] is not valid.\n'.format(
                metric)
            error_msg += 'Available metrics: {}'.format(self.METRIC)
            raise ValueError(error_msg)

    def _validate_run_hierarchical_cluster_params(self, params):
        """
        _validate_run_hierarchical_cluster_params:
                validates params passed to run_hierarchical_cluster method
        """

        log('start validating run_hierarchical_cluster params')

        # check for required parameters
        for p in [
                'matrix_ref', 'workspace_name', 'cluster_set_name',
                'dist_cutoff_rate'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        # check metric validation
        metric = params.get('dist_metric')
        if metric and metric not in self.METRIC:
            error_msg = 'INPUT ERROR:\nInput metric function [{}] is not valid.\n'.format(
                metric)
            error_msg += 'Available metrics: {}'.format(self.METRIC)
            raise ValueError(error_msg)

        # check method validation
        method = params.get('linkage_method')
        if method and method not in self.METHOD:
            error_msg = 'INPUT ERROR:\nInput linkage algorithm [{}] is not valid.\n'.format(
                method)
            error_msg += 'Available methods: {}'.format(self.METHOD)
            raise ValueError(error_msg)

        # check criterion validation
        criterion = params.get('fcluster_criterion')
        if criterion and criterion not in self.CRITERION:
            error_msg = 'INPUT ERROR:\nInput criterion [{}] is not valid.\n'.format(
                criterion)
            error_msg += 'Available criteria: {}'.format(self.CRITERION)
            raise ValueError(error_msg)

    def _gen_clusters(self, clusters, conditionset_mapping):
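        # `clusters` is expected to be a dict of cluster-label -> {item_id: data_position}
        # (hypothetical example: {0: {'gene_1': 0, 'gene_2': 1}, 1: {'gene_3': 2}});
        # `conditionset_mapping`, when given, maps item ids to condition ids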
        clusters_list = list()

        for cluster in clusters.values():
            labeled_cluster = {}
            labeled_cluster.update({'id_to_data_position': cluster})
            if conditionset_mapping:
                id_to_condition = {
                    k: v
                    for k, v in conditionset_mapping.items()
                    if k in cluster.keys()
                }
                labeled_cluster.update({'id_to_condition': id_to_condition})

            clusters_list.append(labeled_cluster)

        return clusters_list

    def _gen_hierarchical_clusters(self, clusters, conditionset_mapping,
                                   data_matrix):
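        # Unlike _gen_clusters above, each cluster value here is a plain collection
        # of item ids; their data positions are looked up from the data_matrix row index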
        clusters_list = list()

        df = pd.read_json(data_matrix)
        index = df.index.tolist()

        for cluster in clusters.values():
            labeled_cluster = {}
            id_to_data_position = {}
            for item in cluster:
                id_to_data_position.update({item: index.index(item)})

            labeled_cluster.update(
                {'id_to_data_position': id_to_data_position})
            if conditionset_mapping:
                id_to_condition = {
                    k: v
                    for k, v in conditionset_mapping.items() if k in cluster
                }
                labeled_cluster.update({'id_to_condition': id_to_condition})

            clusters_list.append(labeled_cluster)

        return clusters_list

    def _build_hierarchical_cluster_set(self, clusters, cluster_set_name,
                                        genome_ref, matrix_ref,
                                        conditionset_mapping, conditionset_ref,
                                        workspace_name, clustering_parameters,
                                        data_matrix):
        """
        _build_hierarchical_cluster_set: build KBaseExperiments.ClusterSet object
        """

        log('start saving KBaseExperiments.ClusterSet object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        clusters_list = self._gen_hierarchical_clusters(
            clusters, conditionset_mapping, data_matrix)

        cluster_set_data = {
            'clusters': clusters_list,
            'clustering_parameters': clustering_parameters,
            'original_data': matrix_ref,
            'condition_set_ref': conditionset_ref,
            'genome_ref': genome_ref
        }

        cluster_set_data = {k: v for k, v in cluster_set_data.items() if v}

        object_type = 'KBaseExperiments.ClusterSet'
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': cluster_set_data,
                'name': cluster_set_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
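        # save_objects returns an object_info tuple; assemble the reference as
        # workspace_id/object_id/version (indices 6, 0 and 4)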
        cluster_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str(
            dfu_oi[4])

        return cluster_set_ref

    def _build_kmeans_cluster_set(self, clusters, cluster_set_name, genome_ref,
                                  matrix_ref, conditionset_mapping,
                                  conditionset_ref, workspace_name,
                                  clustering_parameters):
        """
        _build_kmeans_cluster_set: build KBaseExperiments.ClusterSet object
        """

        log('start saving KBaseExperiments.ClusterSet object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        clusters_list = self._gen_clusters(clusters, conditionset_mapping)

        cluster_set_data = {
            'clusters': clusters_list,
            'clustering_parameters': clustering_parameters,
            'original_data': matrix_ref,
            'condition_set_ref': conditionset_ref,
            'genome_ref': genome_ref
        }

        cluster_set_data = {k: v for k, v in cluster_set_data.items() if v}

        object_type = 'KBaseExperiments.ClusterSet'
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': cluster_set_data,
                'name': cluster_set_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        cluster_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str(
            dfu_oi[4])

        return cluster_set_ref

    def _generate_visualization_content(self, output_directory,
                                        row_dendrogram_path,
                                        row_dendrogram_truncate_path,
                                        col_dendrogram_path,
                                        col_dendrogram_truncate_path):
        """
        _generate_visualization_content: generate visualization html content
        """

        visualization_content = ''

        if row_dendrogram_path:
            row_dendrogram_name = 'row_dendrogram.png'
            row_dendrogram_display_name = 'row dendrogram'

            shutil.copy2(row_dendrogram_path,
                         os.path.join(output_directory, row_dendrogram_name))

            visualization_content += '<div class="gallery">'
            visualization_content += '<a target="_blank" href="{}">'.format(
                row_dendrogram_name)
            visualization_content += '<img src="{}" '.format(
                row_dendrogram_name)
            visualization_content += 'alt="{}" width="600" height="400">'.format(
                row_dendrogram_display_name)
            visualization_content += '</a><div class="desc">{}</div></div>'.format(
                row_dendrogram_display_name)

        if row_dendrogram_truncate_path:
            row_den_truncate_name = 'row_dendrogram_last12.png'
            row_den_truncate_display_name = 'row dendrogram truncated (last 12 merges)'

            shutil.copy2(row_dendrogram_truncate_path,
                         os.path.join(output_directory, row_den_truncate_name))

            visualization_content += '<div class="gallery">'
            visualization_content += '<a target="_blank" href="{}">'.format(
                row_den_truncate_name)
            visualization_content += '<img src="{}" '.format(
                row_den_truncate_name)
            visualization_content += 'alt="{}" width="600" height="400">'.format(
                row_den_truncate_display_name)
            visualization_content += '</a><div class="desc">{}</div></div>'.format(
                row_den_truncate_display_name)

        if col_dendrogram_path:
            col_dendrogram_name = 'column_dendrogram.png'
            col_dendrogram_display_name = 'column dendrogram'

            shutil.copy2(col_dendrogram_path,
                         os.path.join(output_directory, col_dendrogram_name))

            visualization_content += '<div class="gallery">'
            visualization_content += '<a target="_blank" href="{}">'.format(
                col_dendrogram_name)
            visualization_content += '<img src="{}" '.format(
                col_dendrogram_name)
            visualization_content += 'alt="{}" width="600" height="400">'.format(
                col_dendrogram_display_name)
            visualization_content += '</a><div class="desc">{}</div></div>'.format(
                col_dendrogram_display_name)

        if col_dendrogram_truncate_path:
            col_den_truncate_name = 'column_dendrogram_last12.png'
            col_den_truncate_display_name = 'column dendrogram truncated (last 12 merges)'

            shutil.copy2(col_dendrogram_truncate_path,
                         os.path.join(output_directory, col_den_truncate_name))

            visualization_content += '<div class="gallery">'
            visualization_content += '<a target="_blank" href="{}">'.format(
                col_den_truncate_name)
            visualization_content += '<img src="{}" '.format(
                col_den_truncate_name)
            visualization_content += 'alt="{}" width="600" height="400">'.format(
                col_den_truncate_display_name)
            visualization_content += '</a><div class="desc">{}</div></div>'.format(
                col_den_truncate_display_name)

        if not visualization_content:
            visualization_content = '<p>Dendrogram is too large to be printed.</p>'

        return visualization_content

    def _generate_hierarchical_html_report(self, cluster_set_refs,
                                           row_dendrogram_path,
                                           row_dendrogram_truncate_path,
                                           col_dendrogram_path,
                                           col_dendrogram_truncate_path):
        """
        _generate_hierarchical_html_report: generate html summary report for hierarchical
                                            clustering app
        """

        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'hier_report.html')

        visualization_content = self._generate_visualization_content(
            output_directory, row_dendrogram_path,
            row_dendrogram_truncate_path, col_dendrogram_path,
            col_dendrogram_truncate_path)

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'hier_report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Visualization_Content</p>', visualization_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for ExpressionMatrix Cluster App'
        })
        return html_report

    def _generate_hierarchical_cluster_report(self, cluster_set_refs,
                                              workspace_name,
                                              row_dendrogram_path,
                                              row_dendrogram_truncate_path,
                                              col_dendrogram_path,
                                              col_dendrogram_truncate_path):
        """
        _generate_hierarchical_cluster_report: generate summary report
        """

        log('creating report')

        output_html_files = self._generate_hierarchical_html_report(
            cluster_set_refs, row_dendrogram_path,
            row_dendrogram_truncate_path, col_dendrogram_path,
            col_dendrogram_truncate_path)

        objects_created = []
        for cluster_set_ref in cluster_set_refs:
            objects_created.append({
                'ref': cluster_set_ref,
                'description': 'Hierarchical ClusterSet'
            })

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 333,
            'report_object_name': 'kb_hier_cluster_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _generate_kmeans_cluster_report(self, cluster_set_refs,
                                        workspace_name):
        """
        _generate_kmeans_cluster_report: generate summary report
        """
        objects_created = []
        for cluster_set_ref in cluster_set_refs:
            objects_created.append({
                'ref': cluster_set_ref,
                'description': 'Kmeans ClusterSet'
            })
        report_params = {
            'message': '',
            'objects_created': objects_created,
            'workspace_name': workspace_name,
            'report_object_name': 'run_kmeans_cluster_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _generate_pca_html_files(self, pca_plots, n_components):

        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'pca_report.html')

        visualization_content = ''

        for pca_plot in pca_plots:
            pca_plot_name = os.path.basename(pca_plot)
            pca_plot_display_name = '{} Component PCA'.format(n_components)

            shutil.copy2(pca_plot, os.path.join(output_directory,
                                                pca_plot_name))

            visualization_content += '<div class="gallery">'
            visualization_content += '<a target="_blank" href="{}">'.format(
                pca_plot_name)
            visualization_content += '<img src="{}" '.format(pca_plot_name)
            visualization_content += 'alt="{}" width="600" height="600">'.format(
                pca_plot_display_name)
            visualization_content += '</a><div class="desc">{}</div></div>'.format(
                pca_plot_display_name)

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'pca_report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Visualization_Content</p>', visualization_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for ExpressionMatrix Cluster App'
        })
        return html_report

    def _generate_pca_plot(self, pca_matrix_data):
        """
        _generate_pca_plot: generate a plot for PCA data
        """
        pca_plots = []
        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)

        df = pd.DataFrame(pca_matrix_data.get('values'),
                          index=pca_matrix_data.get('row_ids'),
                          columns=pca_matrix_data.get('col_ids'))

        n_components = range(1, df.columns.size)
        all_pairs = list(itertools.combinations(n_components, 2))

        for pair in all_pairs:
            first_component = pair[0]
            second_component = pair[1]
            pca_plot = os.path.join(
                output_directory,
                'pca_{}_{}.png'.format(first_component, second_component))

            plt.switch_backend('agg')

            fig = plt.figure(figsize=(8, 8))
            ax = fig.add_subplot(1, 1, 1)
            ax.set_xlabel('Principal Component {}'.format(first_component),
                          fontsize=15)
            ax.set_ylabel('Principal Component {}'.format(second_component),
                          fontsize=15)
            ax.set_title('{} component PCA'.format(len(n_components)),
                         fontsize=20)

            clusters = list(
                set(['cluster_{}'.format(x) for x in df['cluster'].tolist()]))
            colors = [
                'red', 'green', 'blue', 'orange', 'yellow', 'pink',
                'lightcyan', 'cyan'
            ]
            if len(clusters) > len(colors):
                np.random.seed(19680801)
                N = len(clusters)
                colors = []
                for i in range(N):
                    colors.append(np.random.rand(3, ))

            for cluster, color in zip(clusters, colors):
                indicesToKeep = df['cluster'] == int(cluster.split('_')[-1])
                ax.scatter(
                    df.loc[indicesToKeep,
                           'principal_component_{}'.format(first_component)],
                    df.loc[indicesToKeep,
                           'principal_component_{}'.format(second_component)],
                    c=color,
                    s=50)
            ax.legend(clusters, loc='best')
            ax.grid()

            plt.savefig(pca_plot)

            pca_plots.append(pca_plot)

        return pca_plots, len(n_components)

    def _generate_pca_report(self, pca_ref, pca_matrix_data, workspace_name):
        """
        _generate_pca_report: generate summary report
        """
        objects_created = []
        objects_created.append({'ref': pca_ref, 'description': 'PCA Matrix'})

        pca_plots, n_components = self._generate_pca_plot(pca_matrix_data)
        output_html_files = self._generate_pca_html_files(
            pca_plots, n_components)
        report_params = {
            'message': '',
            'objects_created': objects_created,
            'workspace_name': workspace_name,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'report_object_name': 'run_pca_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _save_2D_matrix(self, df, clusters, workspace_name, pca_matrix_name):
        """
        _save_2D_matrix: save dataframe as KBaseFeatureValues.FloatMatrix2D object
        """

        log('start saving KBaseFeatureValues.FloatMatrix2D object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        row_ids = df.index.tolist()
        col_ids = df.columns.tolist()
        col_ids.append('cluster')
        values = df.values.tolist()

        idx = 0
        for cluster in clusters:
            cluster_items = cluster.get('id_to_condition').keys()

            for cluster_item in cluster_items:
                pos = row_ids.index(cluster_item)
                values[pos].append(idx)

            idx += 1

        pca_matrix_data = {
            'row_ids': row_ids,
            'col_ids': col_ids,
            'values': values
        }

        object_type = 'KBaseFeatureValues.FloatMatrix2D'
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': pca_matrix_data,
                'name': pca_matrix_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        float_matrix_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str(
            dfu_oi[4])

        return float_matrix_ref, pca_matrix_data
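
    # Illustrative sketch (not in the original source) of the saved FloatMatrix2D data:
    # a trailing 'cluster' column holds the 0-based cluster index, e.g. for a
    # 2-component PCA over three hypothetical rows split across two clusters:
    #
    #   {'row_ids': ['gene_1', 'gene_2', 'gene_3'],
    #    'col_ids': ['principal_component_1', 'principal_component_2', 'cluster'],
    #    'values': [[0.12, -0.34, 0], [0.56, 0.78, 0], [-0.90, 0.11, 1]]}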

    def _build_flat_cluster(self,
                            data_matrix,
                            dist_cutoff_rate,
                            dist_metric=None,
                            linkage_method=None,
                            fcluster_criterion=None):
        """
        _build_flat_cluster: build flat clusters and dendrogram for data_matrix
        """

        # calculate distance matrix
        pdist_params = {'data_matrix': data_matrix, 'metric': dist_metric}
        pdist_ret = self.ke_util.run_pdist(pdist_params)

        dist_matrix = pdist_ret['dist_matrix']
        labels = pdist_ret['labels']

        # performs hierarchical/agglomerative clustering
        linkage_params = {'dist_matrix': dist_matrix, 'method': linkage_method}
        linkage_ret = self.ke_util.run_linkage(linkage_params)

        linkage_matrix = linkage_ret['linkage_matrix']

        newick = self.ke_util.linkage_2_newick({
            'linkage_matrix': linkage_matrix,
            'labels': labels
        })['newick']

        height = max([item[2] for item in linkage_matrix])
        dist_threshold = height * dist_cutoff_rate
        log('Height: {} Setting dist_threshold: {}'.format(
            height, dist_threshold))
        merges = len(linkage_matrix)

        # generate flat clusters
        fcluster_params = {
            'linkage_matrix': linkage_matrix,
            'dist_threshold': dist_threshold,
            'labels': labels,
            'criterion': fcluster_criterion
        }
        fcluster_ret = self.ke_util.run_fcluster(fcluster_params)

        flat_cluster = fcluster_ret['flat_cluster']

        # generate dendrogram
        try:
            dendrogram_params = {
                'linkage_matrix': linkage_matrix,
                'dist_threshold': dist_threshold,
                'labels': labels
            }

            dendrogram_ret = self.ke_util.run_dendrogram(dendrogram_params)

            dendrogram_path = dendrogram_ret['result_plots'][0]
        except Exception:
            dendrogram_path = None

        # generate truncated (last 12 merges) dendrogram
        if merges > 256:
            dendrogram_truncate_params = {
                'linkage_matrix': linkage_matrix,
                'dist_threshold': dist_threshold,
                'labels': labels,
                'last_merges': 12
            }
            dendrogram_truncate_ret = self.ke_util.run_dendrogram(
                dendrogram_truncate_params)

            dendrogram_truncate_path = dendrogram_truncate_ret['result_plots'][
                0]
        else:
            dendrogram_truncate_path = None

        return flat_cluster, labels, newick, dendrogram_path, dendrogram_truncate_path
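
    # Illustrative note (not in the original source): the flat-cluster threshold is a
    # fraction of the tallest merge in the linkage matrix; e.g. with a maximum merge
    # height of 10.0 and dist_cutoff_rate = 0.7, the dist_threshold passed to fcluster
    # is 7.0.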

    def _build_kmeans_cluster(self, data_matrix, k_num, dist_metric=None):
        """
        _build_kmeans_cluster: Build Kmeans cluster
        """

        # calculate distance matrix
        pdist_params = {'data_matrix': data_matrix, 'metric': dist_metric}
        pdist_ret = self.ke_util.run_pdist(pdist_params)

        dist_matrix = pdist_ret['dist_matrix']
        labels = pdist_ret['labels']

        # run kmeans algorithm
        kmeans_params = {'dist_matrix': dist_matrix, 'k_num': k_num}
        kmeans_ret = self.ke_util.run_kmeans2(kmeans_params)

        centroid = kmeans_ret.get('kmeans_ret')
        idx = kmeans_ret.get('idx')

        df = pd.read_json(data_matrix)
        rows = df.index.tolist()

        clusters = {}
        for list_index, value in enumerate(idx):
            cluster = clusters.get(value)
            if not cluster:
                clusters.update({value: {rows[list_index]: list_index}})
            else:
                cluster.update({rows[list_index]: list_index})

        return clusters
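
    # Illustrative sketch (not in the original source) of the structure returned above,
    # with hypothetical row ids: {0: {'gene_a': 0, 'gene_c': 2}, 1: {'gene_b': 1}}
    # maps each k-means cluster label to {row_id: row_position_in_matrix}.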

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.srv_wiz_url = config['srv-wiz-url']
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(self.callback_url)
        self.ke_util = kb_ke_util(self.callback_url, service_ver="dev")
        self.gen_api = GenericsAPI(self.callback_url, service_ver="dev")

        self.ws = Workspace(self.ws_url, token=self.token)
        self.set_client = SetAPI(self.srv_wiz_url)

    def run_pca(self, params):
        """
        run_pca: generates PCA matrix for KBaseExperiments.ClusterSet data object

        cluster_set_ref: KBaseExperiments.ClusterSet object reference
        workspace_name: the name of the workspace
        pca_matrix_name: name of PCA (KBaseFeatureValues.FloatMatrix2D) object
        n_components: number of components (default 2)

        return:
        pca_ref: PCA object reference (as KBaseFeatureValues.FloatMatrix2D data type)
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """

        self._validate_run_pca_params(params)

        cluster_set_ref = params.get('cluster_set_ref')
        workspace_name = params.get('workspace_name')
        pca_matrix_name = params.get('pca_matrix_name')
        n_components = int(params.get('n_components', 2))

        cluster_set_source = self.dfu.get_objects(
            {"object_refs": [cluster_set_ref]})['data'][0]

        cluster_set_info = cluster_set_source.get('info')
        cluster_set_name = cluster_set_info[1]
        cluster_set_data = cluster_set_source.get('data')
        clusters = cluster_set_data.get('clusters')

        matrix_ref = cluster_set_data.get('original_data')

        data_matrix = self.gen_api.fetch_data({
            'obj_ref': matrix_ref
        }).get('data_matrix')

        if '_column' in cluster_set_name:
            data_matrix = pd.read_json(
                data_matrix).T.to_json()  # transpose matrix

        # run pca algorithm
        pca_params = {'data_matrix': data_matrix, 'n_components': n_components}
        PCA_matrix = self.ke_util.run_PCA(pca_params).get('PCA_matrix')

        df = pd.read_json(PCA_matrix)
        df.fillna(0, inplace=True)

        pca_ref, pca_matrix_data = self._save_2D_matrix(
            df, clusters, workspace_name, pca_matrix_name)

        returnVal = {'pca_ref': pca_ref}

        report_output = self._generate_pca_report(pca_ref, pca_matrix_data,
                                                  workspace_name)

        returnVal.update(report_output)
        return returnVal
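
    # Hypothetical invocation sketch (instance name and object refs/names are
    # placeholders, not taken from the original source):
    #
    #   ret = clustering_util.run_pca({
    #       'cluster_set_ref': '1234/5/6',      # KBaseExperiments.ClusterSet object
    #       'workspace_name': 'my_workspace',
    #       'pca_matrix_name': 'my_pca_matrix',
    #       'n_components': 2,
    #   })
    #   # ret contains 'pca_ref', 'report_name' and 'report_ref'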

    def run_kmeans_cluster(self, params):
        """
        run_kmeans_cluster: generates Kmeans clusters for Matrix data object

        matrix_ref: Matrix object reference
        workspace_name: the name of the workspace
        cluster_set_name: KBaseExperiments.ClusterSet object name
        k_num: number of clusters to form

        Optional arguments:
        dist_metric: The distance metric to use. Default set to 'euclidean'.
                     The distance function can be
                     ["braycurtis", "canberra", "chebyshev", "cityblock", "correlation", "cosine",
                      "dice", "euclidean", "hamming", "jaccard", "kulsinski", "matching",
                      "rogerstanimoto", "russellrao", "sokalmichener", "sokalsneath", "sqeuclidean",
                      "yule"]
                     Details refer to:
                     https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html

        return:
        cluster_set_refs: KBaseExperiments.ClusterSet object references
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """

        self._validate_run_kmeans_cluster_params(params)

        matrix_ref = params.get('matrix_ref')
        workspace_name = params.get('workspace_name')
        cluster_set_name = params.get('cluster_set_name')
        k_num = params.get('k_num')
        dist_metric = params.get('dist_metric')

        matrix_object = self.ws.get_objects2(
            {'objects': [{
                'ref': matrix_ref
            }]})['data'][0]
        matrix_data = matrix_object['data']

        data_matrix = self.gen_api.fetch_data({
            'obj_ref': matrix_ref
        }).get('data_matrix')
        transpose_data_matrix = pd.read_json(data_matrix).T.to_json()

        row_kmeans_clusters = self._build_kmeans_cluster(
            data_matrix, k_num, dist_metric=dist_metric)

        col_kmeans_clusters = self._build_kmeans_cluster(
            transpose_data_matrix, k_num, dist_metric=dist_metric)

        genome_ref = matrix_data.get('genome_ref')
        clustering_parameters = {
            'k_num': str(k_num),
            'dist_metric': str(dist_metric)
        }

        cluster_set_refs = []

        row_cluster_set_name = cluster_set_name + '_row'
        row_cluster_set = self._build_kmeans_cluster_set(
            row_kmeans_clusters, row_cluster_set_name, genome_ref, matrix_ref,
            matrix_data.get('row_mapping'),
            matrix_data.get('row_conditionset_ref'), workspace_name,
            clustering_parameters)
        cluster_set_refs.append(row_cluster_set)

        col_cluster_set_name = cluster_set_name + '_column'
        col_cluster_set = self._build_kmeans_cluster_set(
            col_kmeans_clusters, col_cluster_set_name, genome_ref, matrix_ref,
            matrix_data.get('col_mapping'),
            matrix_data.get('col_conditionset_ref'), workspace_name,
            clustering_parameters)
        cluster_set_refs.append(col_cluster_set)

        returnVal = {'cluster_set_refs': cluster_set_refs}

        report_output = self._generate_kmeans_cluster_report(
            cluster_set_refs, workspace_name)

        returnVal.update(report_output)

        return returnVal
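
    # Hypothetical invocation sketch (instance name and object refs/names are
    # placeholders, not taken from the original source):
    #
    #   ret = clustering_util.run_kmeans_cluster({
    #       'matrix_ref': '1234/7/1',
    #       'workspace_name': 'my_workspace',
    #       'cluster_set_name': 'my_clusters',
    #       'k_num': 3,
    #       'dist_metric': 'euclidean',         # optional, see docstring above
    #   })
    #   # ret contains 'cluster_set_refs' (row and column sets), 'report_name', 'report_ref'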

    def run_hierarchical_cluster(self, params):
        """
        run_hierarchical_cluster: generates hierarchical clusters for Matrix data object

        matrix_ref: Matrix object reference
        workspace_name: the name of the workspace
        cluster_set_name: KBaseExperiments.ClusterSet object name
        dist_cutoff_rate: the threshold to apply when forming flat clusters

        Optional arguments:
        dist_metric: The distance metric to use. Default set to 'euclidean'.
                     The distance function can be
                     ["braycurtis", "canberra", "chebyshev", "cityblock", "correlation", "cosine",
                      "dice", "euclidean", "hamming", "jaccard", "kulsinski", "matching",
                      "rogerstanimoto", "russellrao", "sokalmichener", "sokalsneath",
                      "sqeuclidean", "yule"]
                     Details refer to:
                     https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html

        linkage_method: The linkage algorithm to use. Default set to 'ward'.
                        The method can be
                        ["single", "complete", "average", "weighted", "centroid", "median", "ward"]
                        Details refer to:
                        https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html

        fcluster_criterion: The criterion to use in forming flat clusters.
                            Default set to 'inconsistent'.
                            The criterion can be
                            ["inconsistent", "distance", "maxclust"]
                            Details refer to:
                            https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fcluster.html

        return:
        cluster_set_refs: KBaseExperiments.ClusterSet object references
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """
        log('--->\nrunning run_hierarchical_cluster\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_hierarchical_cluster_params(params)

        matrix_ref = params.get('matrix_ref')
        workspace_name = params.get('workspace_name')
        cluster_set_name = params.get('cluster_set_name')
        dist_cutoff_rate = float(params.get('dist_cutoff_rate'))
        dist_metric = params.get('dist_metric')
        linkage_method = params.get('linkage_method')
        fcluster_criterion = params.get('fcluster_criterion')

        matrix_object = self.ws.get_objects2(
            {'objects': [{
                'ref': matrix_ref
            }]})['data'][0]
        matrix_data = matrix_object['data']

        data_matrix = self.gen_api.fetch_data({
            'obj_ref': matrix_ref
        }).get('data_matrix')
        transpose_data_matrix = pd.read_json(data_matrix).T.to_json()

        (row_flat_cluster, row_labels, row_newick, row_dendrogram_path,
         row_dendrogram_truncate_path) = self._build_flat_cluster(
             data_matrix,
             dist_cutoff_rate,
             dist_metric=dist_metric,
             linkage_method=linkage_method,
             fcluster_criterion=fcluster_criterion)

        (col_flat_cluster, col_labels, col_newick, col_dendrogram_path,
         col_dendrogram_truncate_path) = self._build_flat_cluster(
             transpose_data_matrix,
             dist_cutoff_rate,
             dist_metric=dist_metric,
             linkage_method=linkage_method,
             fcluster_criterion=fcluster_criterion)

        genome_ref = matrix_data.get('genome_ref')

        clustering_parameters = {
            'dist_cutoff_rate': str(dist_cutoff_rate),
            'dist_metric': dist_metric,
            'linkage_method': linkage_method,
            'fcluster_criterion': fcluster_criterion
        }

        cluster_set_refs = []

        row_cluster_set_name = cluster_set_name + '_row'
        row_cluster_set = self._build_hierarchical_cluster_set(
            row_flat_cluster, row_cluster_set_name, genome_ref, matrix_ref,
            matrix_data.get('row_mapping'),
            matrix_data.get('row_conditionset_ref'), workspace_name,
            clustering_parameters, data_matrix)
        cluster_set_refs.append(row_cluster_set)

        col_cluster_set_name = cluster_set_name + '_column'
        col_cluster_set = self._build_hierarchical_cluster_set(
            col_flat_cluster, col_cluster_set_name, genome_ref, matrix_ref,
            matrix_data.get('col_mapping'),
            matrix_data.get('col_conditionset_ref'), workspace_name,
            clustering_parameters, transpose_data_matrix)
        cluster_set_refs.append(col_cluster_set)

        returnVal = {'cluster_set_refs': cluster_set_refs}

        report_output = self._generate_hierarchical_cluster_report(
            cluster_set_refs, workspace_name, row_dendrogram_path,
            row_dendrogram_truncate_path, col_dendrogram_path,
            col_dendrogram_truncate_path)
        returnVal.update(report_output)

        return returnVal
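
    # Hypothetical invocation sketch (instance name and object refs/names are
    # placeholders, not taken from the original source):
    #
    #   ret = clustering_util.run_hierarchical_cluster({
    #       'matrix_ref': '1234/7/1',
    #       'workspace_name': 'my_workspace',
    #       'cluster_set_name': 'my_hier_clusters',
    #       'dist_cutoff_rate': 0.7,
    #       'dist_metric': 'euclidean',         # optional
    #       'linkage_method': 'ward',           # optional
    #       'fcluster_criterion': 'distance',   # optional
    #   })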
Esempio n. 29
from Workspace.WorkspaceClient import Workspace
import json
import os


def grab(upa, file):
    # fetch the object from the workspace once and cache it locally as pretty-printed JSON
    if not os.path.exists(file):
        d = ws.get_objects2({'objects': [{'ref': upa}]})
        with open(file, 'w') as f:
            f.write(json.dumps(d, indent=4))


ws = Workspace('https://ci.kbase.us/services/ws')

grab('36815/4/1', './test/mock_data/media_object.json')

grab('17335/21/2', './test/mock_data/fbamodel_object.json')

grab('4/23/1', './test/mock_data/media2_object.json')

upa = '16174/15/1'
included_paths = ['scientific_name', 'taxonomy', 'id']
fname = './test/mock_data/genome_sub_object.json'
if not os.path.exists(fname):
    d = ws.get_objects2({'objects': [{'ref': upa, 'included': included_paths}]})
    with open(fname, 'w') as f:
        f.write(json.dumps(d, indent=2))
Esempio n. 30
class CufflinksUtils:
    CUFFLINKS_TOOLKIT_PATH = '/opt/cufflinks/'
    GFFREAD_TOOLKIT_PATH = '/opt/cufflinks/'

    def __init__(self, config):
        """

        :param config: SDK module configuration (service URLs, auth token, scratch path)
        """
        # BEGIN_CONSTRUCTOR
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.srv_wiz_url = config['srv-wiz-url']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)
        self.set_api = SetAPI(self.srv_wiz_url, service_ver='dev')
        self.eu = ExpressionUtils(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)

        self.scratch = os.path.join(config['scratch'], str(uuid.uuid4()))
        self._mkdir_p(self.scratch)

        self.tool_used = "Cufflinks"
        self.tool_version = os.environ['VERSION']
        # END_CONSTRUCTOR
        pass

    def parse_FPKMtracking_calc_TPM(self, filename):
        """
        Generates TPM values from the FPKM values in a Cufflinks tracking file
        :return: (fpkm_dict, tpm_dict) keyed by gene id, both log2(x + 1)-transformed
        """
        fpkm_dict = {}
        tpm_dict = {}
        gene_col = 0
        fpkm_col = 9
        sum_fpkm = 0.0
        with open(filename) as f:
            next(f)
            for line in f:
                larr = line.split("\t")
                gene_id = larr[gene_col]
                if gene_id != "":
                    fpkm = float(larr[fpkm_col])
                    sum_fpkm = sum_fpkm + fpkm
                    fpkm_dict[gene_id] = math.log(fpkm + 1, 2)
                    tpm_dict[gene_id] = fpkm

        if sum_fpkm == 0.0:
            log("Warning: Unable to calculate TPM values as sum of FPKM values is 0")
        else:
            for g in tpm_dict:
                tpm_dict[g] = math.log((tpm_dict[g] / sum_fpkm) * 1e6 + 1, 2)

        return fpkm_dict, tpm_dict
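
    # Worked example of the conversion above (illustrative numbers only): with raw
    # FPKM values {g1: 10.0, g2: 30.0}, sum_fpkm = 40.0, so TPM(g1) = 10.0 / 40.0 * 1e6
    # = 250000.0 and TPM(g2) = 30.0 / 40.0 * 1e6 = 750000.0; both returned dicts store
    # log2(value + 1) rather than the raw FPKM/TPM values.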

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_run_cufflinks_params(self, params):
        """
        _validate_run_cufflinks_params:
                Raises an exception if params are invalid
        """

        log('Start validating run_cufflinks params')

        # check for required parameters
        for p in ['alignment_object_ref', 'workspace_name', 'genome_ref']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """

        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if exitCode == 0:
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)

            raise ValueError(error_msg)

    def _run_gffread(self, gff_path, gtf_path):
        """
        _run_gffread: run gffread script

        ref: http://cole-trapnell-lab.github.io/cufflinks/file_formats/#the-gffread-utility
        """
        log('converting gff to gtf')
        command = self.GFFREAD_TOOLKIT_PATH + '/gffread '
        command += "-E {0} -T -o {1}".format(gff_path, gtf_path)

        self._run_command(command)

    def _create_gtf_annotation_from_genome(self, genome_ref):
        """
         Create reference annotation file from genome
        """
        ref = self.ws.get_object_subset([{
            'ref':
            genome_ref,
            'included': ['contigset_ref', 'assembly_ref']
        }])
        contig_id = None
        if 'contigset_ref' in ref[0]['data']:
            contig_id = ref[0]['data']['contigset_ref']
        elif 'assembly_ref' in ref[0]['data']:
            contig_id = ref[0]['data']['assembly_ref']
        if contig_id is None:
            raise ValueError(
                "Genome at {0} does not have reference to the assembly object".
                format(genome_ref))
        print(contig_id)
        log("Generating GFF file from Genome")
        try:
            ret = self.au.get_assembly_as_fasta({'ref': contig_id})
            output_file = ret['path']
            mapping_filename = c_mapping.create_sanitized_contig_ids(
                output_file)
            os.remove(output_file)
            # get the GFF
            ret = self.gfu.genome_to_gff({'genome_ref': genome_ref})
            genome_gff_file = ret['file_path']
            c_mapping.replace_gff_contig_ids(genome_gff_file,
                                             mapping_filename,
                                             to_modified=True)
            gtf_ext = ".gtf"

            if not genome_gff_file.endswith(gtf_ext):
                gtf_path = os.path.splitext(genome_gff_file)[0] + '.gtf'
                self._run_gffread(genome_gff_file, gtf_path)
            else:
                gtf_path = genome_gff_file

            log("gtf file : " + gtf_path)
        except Exception:
            raise ValueError(
                "Generating GTF file from Genome Annotation object Failed :  {}"
                .format("".join(traceback.format_exc())))
        return gtf_path

    def _get_gtf_file(self, alignment_ref):
        """
        _get_gtf_file: get the reference annotation file (in GTF or GFF3 format)
        """
        result_directory = self.scratch
        alignment_data = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_ref
            }]})['data'][0]['data']

        genome_ref = alignment_data.get('genome_id')
        # genome_name = self.ws.get_object_info([{"ref": genome_ref}], includeMetadata=None)[0][1]
        # ws_gtf = genome_name+"_GTF_Annotation"

        genome_data = self.ws.get_objects2({'objects': [{
            'ref': genome_ref
        }]})['data'][0]['data']

        gff_handle_ref = genome_data.get('gff_handle_ref')

        if gff_handle_ref:
            log('getting reference annotation file from genome')
            annotation_file = self.dfu.shock_to_file({
                'handle_id': gff_handle_ref,
                'file_path': result_directory,
                'unpack': 'unpack'
            })['file_path']
        else:
            annotation_file = self._create_gtf_annotation_from_genome(
                genome_ref)

        return annotation_file

    def _get_gtf_file_from_genome_ref(self, genome_ref):
        """
        _get_gtf_file_from_genome_ref: get the reference annotation file (in GTF or GFF3 format)
        """
        result_directory = self.scratch

        genome_data = self.ws.get_objects2({'objects': [{
            'ref': genome_ref
        }]})['data'][0]['data']

        gff_handle_ref = genome_data.get('gff_handle_ref')

        if gff_handle_ref:
            log('getting reference annotation file from genome')
            annotation_file = self.dfu.shock_to_file({
                'handle_id': gff_handle_ref,
                'file_path': result_directory,
                'unpack': 'unpack'
            })['file_path']
        else:
            annotation_file = self._create_gtf_annotation_from_genome(
                genome_ref)

        return annotation_file

    def _get_input_file(self, alignment_ref):
        """
        _get_input_file: get input BAM file from Alignment object
        """

        bam_file_dir = self.rau.download_alignment(
            {'source_ref': alignment_ref})['destination_dir']

        files = os.listdir(bam_file_dir)
        bam_file_list = [
            file for file in files if re.match(r'.*\_sorted\.bam', file)
        ]
        if not bam_file_list:
            bam_file_list = [
                file for file in files if re.match(r'.*(?<!sorted)\.bam', file)
            ]

        if not bam_file_list:
            raise ValueError('Cannot find .bam file from alignment {}'.format(
                alignment_ref))

        bam_file_name = bam_file_list[0]

        bam_file = os.path.join(bam_file_dir, bam_file_name)

        return bam_file

    def _generate_command(self, params):
        """
        _generate_command: generate cufflinks command
        """
        cufflinks_command = '/opt/cufflinks/cufflinks'
        cufflinks_command += (' -q --no-update-check -p ' +
                              str(params.get('num_threads', 1)))
        if 'max_intron_length' in params and params[
                'max_intron_length'] is not None:
            cufflinks_command += (' --max-intron-length ' +
                                  str(params['max_intron_length']))
        if 'min_intron_length' in params and params[
                'min_intron_length'] is not None:
            cufflinks_command += (' --min-intron-length ' +
                                  str(params['min_intron_length']))
        if 'overhang_tolerance' in params and params[
                'overhang_tolerance'] is not None:
            cufflinks_command += (' --overhang-tolerance ' +
                                  str(params['overhang_tolerance']))

        cufflinks_command += " -o {0} -G {1} {2}".format(
            params['result_directory'], params['gtf_file'],
            params['input_file'])

        log('Generated cufflinks command: {}'.format(cufflinks_command))

        return cufflinks_command
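
    # For example, with default num_threads the generated command is roughly
    # (placeholders stand in for the actual paths):
    #
    #   /opt/cufflinks/cufflinks -q --no-update-check -p 1 \
    #       -o <result_directory> -G <gtf_file> <input_file>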

    def _process_rnaseq_alignment_object(self, params):
        """
        _process_rnaseq_alignment_object: process KBaseRNASeq.RNASeqAlignment type input object
        """
        log('start processing RNASeqAlignment object\nparams:\n{}'.format(
            json.dumps(params, indent=1)))
        alignment_ref = params.get('alignment_ref')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        params['result_directory'] = str(result_directory)

        # input files
        params['input_file'] = self._get_input_file(alignment_ref)
        if not params.get('gtf_file'):
            params['gtf_file'] = self._get_gtf_file(alignment_ref)

        if '/' not in params['genome_ref']:
            params['genome_ref'] = params['workspace_name'] + '/' + params[
                'genome_ref']

        command = self._generate_command(params)
        self._run_command(command)

        expression_obj_ref = self._save_rnaseq_expression(
            result_directory, alignment_ref, params.get('workspace_name'),
            params.get('genome_ref'), params['gtf_file'],
            params['expression_suffix'])

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_obj_ref,
            'alignment_ref': alignment_ref
        }

        expression_name = self.ws.get_object_info([{
            "ref": expression_obj_ref
        }],
                                                  includeMetadata=None)[0][1]

        widget_params = {
            "output": expression_name,
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal

    def _process_kbasesets_alignment_object(self, params):
        """
        _process_kbasesets_alignment_object: process a KBaseSets-style alignment input object
        """
        log('start processing KBaseSets object\nparams:\n{}'.format(
            json.dumps(params, indent=1)))
        alignment_ref = params.get('alignment_ref')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        params['result_directory'] = str(result_directory)

        # input files
        params['input_file'] = self._get_input_file(alignment_ref)
        if not params.get('gtf_file'):
            params['gtf_file'] = self._get_gtf_file(alignment_ref)

        command = self._generate_command(params)
        self._run_command(command)

        expression_obj_ref = self._save_kbasesets_expression(
            result_directory, alignment_ref, params.get('workspace_name'),
            params.get('genome_ref'), params.get('gtf_file'),
            params.get('expression_suffix'))

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_obj_ref,
            'alignment_ref': alignment_ref
        }

        expression_name = self.ws.get_object_info([{
            "ref": expression_obj_ref
        }],
                                                  includeMetadata=None)[0][1]

        widget_params = {
            "output": expression_name,
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal

    def _generate_html_report(self, result_directory, obj_ref):
        """
        _generate_html_report: generate html summary report
        """
        log('Start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        expression_object = self.ws.get_objects2(
            {'objects': [{
                'ref': obj_ref
            }]})['data'][0]

        expression_object_type = expression_object.get('info')[2]

        Overview_Content = ''
        if re.match('KBaseRNASeq.RNASeqExpression-\d.\d',
                    expression_object_type):
            Overview_Content += '<p>Generated Expression Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
        elif re.match('KBaseRNASeq.RNASeqExpressionSet-\d.\d',
                      expression_object_type):
            Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
            Overview_Content += '<br><p>Generated Expression Object:</p>'
            for expression_ref in expression_object['data'][
                    'sample_expression_ids']:
                expression_name = self.ws.get_object_info(
                    [{
                        "ref": expression_ref
                    }], includeMetadata=None)[0][1]
                Overview_Content += '<p>{}</p>'.format(expression_name)
        elif re.match('KBaseSets.ExpressionSet-\d.\d', expression_object_type):
            pprint(expression_object)
            Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
            Overview_Content += '<br><p>Generated Expression Object:</p>'
            for expression_ref in expression_object['data']['items']:
                expression_name = self.ws.get_object_info(
                    [{
                        "ref": expression_ref['ref']
                    }], includeMetadata=None)[0][1]
                condition = expression_ref['label']
                Overview_Content += '<p>condition:{0}; expression_name: {1}</p>'.format(
                    condition, expression_name)

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', Overview_Content)
                result_file.write(report_template)

        html_report.append({
            'path':
            result_file_path,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for Cufflinks App'
        })
        return html_report

    def _save_rnaseq_expression(self, result_directory, alignment_ref,
                                workspace_name, genome_ref, gtf_file,
                                expression_suffix):
        """
        _save_rnaseq_expression: save Expression object to workspace
        """
        log('start saving Expression object')
        alignment_object_name = self.ws.get_object_info(
            [{
                "ref": alignment_ref
            }], includeMetadata=None)[0][1]

        # set expression name
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_ref = self.eu.upload_expression({
            'destination_ref':
            workspace_name + '/' + expression_name,
            'source_dir':
            result_directory,
            'alignment_ref':
            alignment_ref,
            'tool_used':
            self.tool_used,
            'tool_version':
            self.tool_version
        })['obj_ref']

        return expression_ref

    def _save_kbasesets_expression(self, result_directory, alignment_ref,
                                   workspace_name, genome_ref, gtf_file,
                                   expression_suffix):
        """
        _save_kbasesets_expression: save Expression object to workspace using ExpressionUtils
        and SetAPI
        """
        log('start saving Expression object')

        alignment_info = self.ws.get_object_info3(
            {'objects': [{
                "ref": alignment_ref
            }]})
        alignment_object_name = alignment_info['infos'][0][1]

        # set expression name
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_ref = self.eu.upload_expression({
            'destination_ref':
            workspace_name + '/' + expression_name,
            'source_dir':
            result_directory,
            'alignment_ref':
            alignment_ref,
            'tool_used':
            self.tool_used,
            'tool_version':
            self.tool_version
        })['obj_ref']

        return expression_ref

    def _save_rnaseq_expression_set(self, alignment_expression_map,
                                    alignment_set_ref, workspace_name,
                                    expression_set_name):
        """
        _save_rnaseq_expression_set: save ExpressionSet object to workspace
        """
        log('start saving ExpressionSet object')
        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_set_data = self._generate_expression_set_data(
            alignment_expression_map, alignment_set_ref, expression_set_name)

        object_type = 'KBaseRNASeq.RNASeqExpressionSet'
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': expression_set_data,
                'name': expression_set_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        expression_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str(
            dfu_oi[4])

        return expression_set_ref

    def _save_kbasesets_expression_set(self, alignment_expression_map,
                                       alignment_set_ref, workspace_name,
                                       expression_set_name):
        """
        _save_kbasesets_expression_set: save ExpressionSet object to workspace
        """
        log('start saving ExpressionSet object')
        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_set_data = self._generate_expression_set_data(
            alignment_expression_map, alignment_set_ref, expression_set_name)

        object_type = 'KBaseRNASeq.RNASeqExpressionSet'
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': expression_set_data,
                'name': expression_set_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        expression_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str(
            dfu_oi[4])

        return expression_set_ref

    def _generate_report(self,
                         obj_ref,
                         workspace_name,
                         result_directory,
                         exprMatrix_FPKM_ref=None,
                         exprMatrix_TPM_ref=None):
        """
        _generate_report: generate summary report
        """

        log('creating report')

        output_files = self._generate_output_file_list(result_directory)
        output_html_files = self._generate_html_report(result_directory,
                                                       obj_ref)

        expression_object = self.ws.get_objects2(
            {'objects': [{
                'ref': obj_ref
            }]})['data'][0]
        expression_info = expression_object['info']
        expression_data = expression_object['data']

        expression_object_type = expression_info[2]
        if re.match('KBaseRNASeq.RNASeqExpression-\d+.\d+',
                    expression_object_type):
            objects_created = [{
                'ref':
                obj_ref,
                'description':
                'Expression generated by Cufflinks'
            }]
        elif re.match('KBaseRNASeq.RNASeqExpressionSet-\d+.\d+',
                      expression_object_type):
            objects_created = [{
                'ref':
                obj_ref,
                'description':
                'Expression generated by Cufflinks'
            }]
        elif re.match('KBaseSets.ExpressionSet-\d+.\d+',
                      expression_object_type):
            objects_created = [{
                'ref':
                obj_ref,
                'description':
                'ExpressionSet generated by Cufflinks'
            }]
            items = expression_data['items']
            for item in items:
                objects_created.append({
                    'ref':
                    item['ref'],
                    'description':
                    'Expression generated by Cufflinks'
                })
            objects_created.append({
                'ref':
                exprMatrix_FPKM_ref,
                'description':
                'FPKM ExpressionMatrix generated by Cufflinks'
            })
            objects_created.append({
                'ref':
                exprMatrix_TPM_ref,
                'description':
                'TPM ExpressionMatrix generated by Cufflinks'
            })

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'file_links': output_files,
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 366,
            'report_object_name': 'kb_cufflinks_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _parse_FPKMtracking(self, filename, metric):
        result = {}
        pos1 = 0
        if metric == 'FPKM':
            pos2 = 7
        elif metric == 'TPM':
            pos2 = 8
        else:
            raise ValueError('Unsupported metric: {}'.format(metric))

        with open(filename) as f:
            next(f)
            for line in f:
                larr = line.split("\t")
                if larr[pos1] != "":
                    try:
                        result[larr[pos1]] = math.log(float(larr[pos2]) + 1, 2)
                    except ValueError:
                        result[larr[pos1]] = math.log(1, 2)

        return result

    def _generate_output_file_list(self, result_directory):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """
        log('Start packing result files')
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'cufflinks_result.zip')

        with zipfile.ZipFile(result_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if not (file.endswith('.DS_Store')):
                        zip_file.write(
                            os.path.join(root, file),
                            os.path.join(os.path.basename(root), file))

        output_files.append({
            'path': result_file,
            'name': os.path.basename(result_file),
            'label': os.path.basename(result_file),
            'description': 'File(s) generated by Cufflinks App'
        })

        return output_files

    def _generate_expression_data(self, result_directory, alignment_ref,
                                  gtf_file, workspace_name, expression_suffix):
        """
        _generate_expression_data: generate Expression object with cufflinks output files
        """
        alignment_data_object = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_ref
            }]})['data'][0]

        # set expression name
        alignment_object_name = alignment_data_object['info'][1]
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_data = {
            'id': expression_name,
            'type': 'RNA-Seq',
            'numerical_interpretation': 'FPKM',
            'processing_comments': 'log2 Normalized',
            'tool_used': self.tool_used,
            'tool_version': self.tool_version
        }
        alignment_data = alignment_data_object['data']

        condition = alignment_data.get('condition')
        expression_data.update({'condition': condition})

        genome_id = alignment_data.get('genome_id')
        expression_data.update({'genome_id': genome_id})

        read_sample_id = alignment_data.get('read_sample_id')
        expression_data.update(
            {'mapped_rnaseq_alignment': {
                read_sample_id: alignment_ref
            }})

        exp_dict, tpm_exp_dict = self.parse_FPKMtracking_calc_TPM(
            os.path.join(result_directory, 'genes.fpkm_tracking'))

        expression_data.update({'expression_levels': exp_dict})

        expression_data.update({'tpm_expression_levels': tpm_exp_dict})

        handle = self.dfu.file_to_shock({
            'file_path': result_directory,
            'pack': 'zip',
            'make_handle': True
        })['handle']
        expression_data.update({'file': handle})

        return expression_data

    def _generate_expression_set_data(self, alignment_expression_map,
                                      alignment_set_ref, expression_set_name):
        """
        _generate_expression_set_data: generate ExpressionSet object with cufflinks output files
        """
        alignment_set_data_object = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_set_ref
            }]})['data'][0]

        alignment_set_data = alignment_set_data_object['data']

        expression_set_data = {
            'tool_used': self.tool_used,
            'tool_version': self.tool_version,
            'id': expression_set_name,
            'alignmentSet_id': alignment_set_ref,
            'genome_id': alignment_set_data.get('genome_id'),
            'sampleset_id': alignment_set_data.get('sampleset_id')
        }

        sample_expression_ids = []
        mapped_expression_objects = []
        mapped_expression_ids = []

        for alignment_expression in alignment_expression_map:
            alignment_ref = alignment_expression.get('alignment_ref')
            expression_ref = alignment_expression.get('expression_obj_ref')
            sample_expression_ids.append(expression_ref)
            mapped_expression_ids.append({alignment_ref: expression_ref})
            alignment_name = self.ws.get_object_info(
                [{
                    "ref": alignment_ref
                }], includeMetadata=None)[0][1]
            expression_name = self.ws.get_object_info(
                [{
                    "ref": expression_ref
                }], includeMetadata=None)[0][1]
            mapped_expression_objects.append({alignment_name: expression_name})

        expression_set_data['sample_expression_ids'] = sample_expression_ids
        expression_set_data[
            'mapped_expression_objects'] = mapped_expression_objects
        expression_set_data['mapped_expression_ids'] = mapped_expression_ids

        return expression_set_data

    def _process_alignment_set_object(self, params, alignment_object_type):
        """
        _process_alignment_set_object: process KBaseRNASeq.RNASeqAlignmentSet type input object
                                        and KBaseSets.ReadsAlignmentSet type object
        """
        log('start processing KBaseRNASeq.RNASeqAlignmentSet object or KBaseSets.ReadsAlignmentSet object'
            '\nparams:\n{}'.format(json.dumps(params, indent=1)))

        alignment_set_ref = params.get('alignment_set_ref')

        if re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*',
                    alignment_object_type):
            params['gtf_file'] = self._get_gtf_file(alignment_set_ref)
        else:
            if '/' not in params['genome_ref']:
                params['genome_ref'] = params['workspace_name'] + '/' + params[
                    'genome_ref']

            params['gtf_file'] = self._get_gtf_file_from_genome_ref(
                params['genome_ref'])

        alignment_set = self.set_api.get_reads_alignment_set_v1({
            'ref':
            alignment_set_ref,
            'include_item_info':
            0,
            'include_set_item_ref_paths':
            1
        })
        mul_processor_params = []
        for alignment in alignment_set["data"]["items"]:
            alignment_ref = alignment['ref_path']
            alignment_upload_params = params.copy()
            alignment_upload_params['alignment_ref'] = alignment_ref
            mul_processor_params.append(alignment_upload_params)
            # use the following when you want to run the cmd sequentially
            # self._process_kbasesets_alignment_object(mul_processor_params[0])

        cpus = min(params.get('num_threads', 1), multiprocessing.cpu_count())
        pool = Pool(ncpus=cpus)
        log('running _process_alignment_object with {} cpus'.format(cpus))
        alignment_expression_map = pool.map(
            self._process_kbasesets_alignment_object, mul_processor_params)

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        expression_items = list()
        for proc_alignment_return in alignment_expression_map:
            expression_obj_ref = proc_alignment_return.get(
                'expression_obj_ref')
            alignment_ref = proc_alignment_return.get('alignment_ref')
            alignment_info = self.ws.get_object_info3({
                'objects': [{
                    "ref": alignment_ref
                }],
                'includeMetadata':
                1
            })
            condition = alignment_info['infos'][0][10]['condition']
            expression_items.append({
                "ref": expression_obj_ref,
                "label": condition,
            })
            expression_name = self.ws.get_object_info(
                [{
                    "ref": expression_obj_ref
                }], includeMetadata=None)[0][1]
            self._run_command('cp -R {} {}'.format(
                proc_alignment_return.get('result_directory'),
                os.path.join(result_directory, expression_name)))

        expression_set = {
            "description": "generated by kb_cufflinks",
            "items": expression_items
        }

        expression_set_info = self.set_api.save_expression_set_v1({
            "workspace":
            params['workspace_name'],
            "output_object_name":
            params['expression_set_name'],
            "data":
            expression_set
        })

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_set_info['set_ref']
        }

        widget_params = {
            "output": params.get('expression_set_name'),
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal

    def _generate_output_object_name(self, params, alignment_object_type,
                                     alignment_object_name):
        """
        Generates the output object name based on input object type and name and stores it in
        params with key equal to 'expression' or 'expression_set' based on whether the input
        object is an alignment or alignment_set.

        :param params: module input params
        :param alignment_object_type: input alignment object type
        :param alignment_object_name: input alignment object name
        :param alignment_object_data: input alignment object data
        """
        expression_set_suffix = params['expression_set_suffix']
        expression_suffix = params['expression_suffix']

        if re.match('^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type):
            if re.match('.*_[Aa]lignment$', alignment_object_name):
                params['expression_name'] = re.sub('_[Aa]lignment$',
                                                   expression_suffix,
                                                   alignment_object_name)
            else:  # assume user specified suffix
                params[
                    'expression_name'] = alignment_object_name + expression_suffix
        if re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*',
                    alignment_object_type):
            if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name):
                # set expression set name
                params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$',
                                                       expression_set_suffix,
                                                       alignment_object_name)
            else:  # assume user specified suffix
                params[
                    'expression_set_name'] = alignment_object_name + expression_set_suffix
        if re.match('^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type):
            if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name):

                # set expression set name
                params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$',
                                                       expression_set_suffix,
                                                       alignment_object_name)
            else:  # assume user specified suffix
                params[
                    'expression_set_name'] = alignment_object_name + expression_set_suffix

    def _save_expression_matrix(self, expressionset_ref, workspace_name):
        """
        _save_expression_matrix: save FPKM and TPM ExpressionMatrix
        """

        log('start saving ExpressionMatrix object')

        expression_set_name = self.ws.get_object_info(
            [{
                "ref": expressionset_ref
            }], includeMetadata=None)[0][1]

        output_obj_name_prefix = re.sub('_*[Ee]xpression_*[Ss]et', '',
                                        expression_set_name)

        upload_expression_matrix_params = {
            'expressionset_ref': expressionset_ref,
            'output_obj_name': output_obj_name_prefix,
            'workspace_name': workspace_name
        }

        expression_matrix_refs = self.eu.get_expressionMatrix(
            upload_expression_matrix_params)

        return expression_matrix_refs

    def run_cufflinks_app(self, params):
        log('--->\nrunning CufflinksUtil.run_cufflinks_app\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_cufflinks_params(params)

        alignment_object_ref = params.get('alignment_object_ref')
        alignment_object_info = self.ws.get_object_info3(
            {"objects": [{
                "ref": alignment_object_ref
            }]})['infos'][0]

        alignment_object_type = alignment_object_info[2]
        alignment_object_name = alignment_object_info[1]

        # get output object name
        self._generate_output_object_name(params, alignment_object_type,
                                          alignment_object_name)

        log('--->\nalignment object type: \n' +
            '{}'.format(alignment_object_type))

        if re.match('^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type):
            params.update({'alignment_ref': alignment_object_ref})
            returnVal = self._process_rnaseq_alignment_object(params)
            report_output = self._generate_report(
                returnVal.get('expression_obj_ref'),
                params.get('workspace_name'),
                returnVal.get('result_directory'))
            returnVal.update(report_output)
        elif re.match(r'^KBaseRNASeq.RNASeqAlignmentSet-\d*', alignment_object_type) or \
             re.match(r'^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type):
            params.update({'alignment_set_ref': alignment_object_ref})
            returnVal = self._process_alignment_set_object(
                params, alignment_object_type)
            expression_matrix_refs = self._save_expression_matrix(
                returnVal['expression_obj_ref'], params.get('workspace_name'))
            returnVal.update(expression_matrix_refs)

            report_output = self._generate_report(
                returnVal['expression_obj_ref'], params.get('workspace_name'),
                returnVal['result_directory'],
                expression_matrix_refs['exprMatrix_FPKM_ref'],
                expression_matrix_refs['exprMatrix_TPM_ref'])
            returnVal.update(report_output)
        else:
            raise ValueError(
                'Object is not a supported RNASeqAlignment type\nObject info:\n{}'.format(
                    alignment_object_info))

        return returnVal
Example 31
class Bowtie2Aligner(object):
    def __init__(self, scratch_dir, workspace_url, callback_url, srv_wiz_url,
                 provenance):
        self.scratch_dir = scratch_dir
        self.workspace_url = workspace_url
        self.callback_url = callback_url
        self.srv_wiz_url = srv_wiz_url
        self.provenance = provenance

        # from the provenance, extract out the version to run by exact hash if possible
        self.my_version = 'release'
        if len(provenance) > 0:
            if 'subactions' in provenance[0]:
                self.my_version = self.get_version_from_subactions(
                    'kb_Bowtie2', provenance[0]['subactions'])
        print('Running kb_Bowtie2 version = ' + self.my_version)

        self.ws = Workspace(self.workspace_url)
        self.bowtie2 = Bowtie2Runner(self.scratch_dir)
        self.parallel_runner = KBParallel(self.callback_url)
        self.qualimap = kb_QualiMap(self.callback_url)

    def get_version_from_subactions(self, module_name, subactions):
        # go through each subaction looking for the named module and, if found,
        # return its exact commit hash
        if not subactions:
            return 'release'  # default to release if we can't find anything
        for sa in subactions:
            if 'name' in sa:
                if sa['name'] == module_name:
                    # local-docker-image implies that we are running in kb-test, so return 'dev'
                    if sa['commit'] == 'local-docker-image':
                        return 'dev'
                    # to check that it is a valid hash, make sure it is the right
                    # length and made up of valid hash characters
                    if re.match('[a-fA-F0-9]{40}$', sa['commit']):
                        return sa['commit']
        # again, default to setting this to release
        return 'release'
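
    # For reference, a subaction entry that resolves to an exact commit above
    # might look like (hypothetical 40-character hash):
    #   {'name': 'kb_Bowtie2', 'commit': '0123456789abcdef0123456789abcdef01234567'}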

    def align(self, params):
        validated_params = self.validate_params(params)
        input_info = self.determine_input_info(validated_params)
        # input info provides information on the input and tells us if we should
        # run as a single_library or as a set:
        #     input_info = {'run_mode': '', 'info': [..], 'ref': '55/1/2'}

        assembly_or_genome_ref = validated_params['assembly_or_genome_ref']

        if input_info['run_mode'] == 'single_library':
            if 'output_alignment_name' not in validated_params:
                suffix = '_alignment'
                if 'output_alignment_suffix' in validated_params:
                    suffix = validated_params['output_alignment_suffix']
                validated_params['output_alignment_name'] = input_info['info'][1] + suffix
            single_lib_result = self.single_reads_lib_run(
                input_info,
                assembly_or_genome_ref,
                validated_params,
                create_report=validated_params['create_report'])

            return single_lib_result

        if input_info['run_mode'] == 'sample_set':
            reads = self.fetch_reads_refs_from_sampleset(
                input_info['ref'], input_info['info'], validated_params)
            self.build_bowtie2_index(assembly_or_genome_ref,
                                     validated_params['output_workspace'])

            print('Running on set of reads=')
            pprint(reads)

            tasks = []
            for r in reads:
                tasks.append(
                    self.build_single_execution_task(
                        r['ref'], params, r['alignment_output_name'],
                        r['condition']))

            batch_run_params = {
                'tasks': tasks,
                'runner': 'parallel',
                'max_retries': 2
            }
            if validated_params['concurrent_local_tasks'] is not None:
                batch_run_params['concurrent_local_tasks'] = validated_params[
                    'concurrent_local_tasks']
            if validated_params['concurrent_njsw_tasks'] is not None:
                batch_run_params['concurrent_njsw_tasks'] = validated_params[
                    'concurrent_njsw_tasks']
            results = self.parallel_runner.run_batch(batch_run_params)
            print('Batch run results=')
            pprint(results)
            batch_result = self.process_batch_result(results, validated_params,
                                                     reads, input_info['info'])
            return batch_result

        raise ValueError('Improper run mode')

    def build_single_execution_task(self, reads_lib_ref, params, output_name,
                                    condition):
        task_params = copy.deepcopy(params)

        task_params['input_ref'] = reads_lib_ref
        task_params['output_alignment_name'] = output_name
        task_params['create_report'] = 0
        task_params['condition_label'] = condition

        return {
            'module_name': 'kb_Bowtie2',
            'function_name': 'align_reads_to_assembly_app',
            'version': self.my_version,
            'parameters': task_params
        }
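
    # Each task dict above is consumed by KBParallel.run_batch (see align()),
    # which invokes kb_Bowtie2.align_reads_to_assembly_app once per reads library.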

    def single_reads_lib_run(self,
                             read_lib_info,
                             assembly_or_genome_ref,
                             validated_params,
                             create_report=False,
                             bowtie2_index_info=None):
        ''' run the alignment on a single reads library '''

        # download reads and prepare any bowtie2 index files
        input_configuration = self.prepare_single_run(
            read_lib_info, assembly_or_genome_ref, bowtie2_index_info,
            validated_params['output_workspace'])

        # run the actual program
        run_output_info = self.run_bowtie2_align_cli(input_configuration,
                                                     validated_params)

        # process the result and save the output
        upload_results = self.save_read_alignment_output(
            run_output_info, input_configuration, validated_params)
        run_output_info['upload_results'] = upload_results

        report_info = None
        if create_report:
            report_info = self.create_report_for_single_run(
                run_output_info, input_configuration, validated_params)

        self.clean(run_output_info)

        return {'output_info': run_output_info, 'report_info': report_info}

    def build_bowtie2_index(self, assembly_or_genome_ref, ws_for_cache):
        bowtie2IndexBuilder = Bowtie2IndexBuilder(self.scratch_dir,
                                                  self.workspace_url,
                                                  self.callback_url,
                                                  self.srv_wiz_url,
                                                  self.provenance)

        return bowtie2IndexBuilder.get_index({
            'ref': assembly_or_genome_ref,
            'ws_for_cache': ws_for_cache
        })

    def prepare_single_run(self, input_info, assembly_or_genome_ref,
                           bowtie2_index_info, ws_for_cache):
        ''' Given a reads ref and an assembly, setup the bowtie2 index '''
        # first setup the bowtie2 index of the assembly
        input_configuration = {'bowtie2_index_info': bowtie2_index_info}
        if not bowtie2_index_info:
            bowtie2IndexBuilder = Bowtie2IndexBuilder(self.scratch_dir,
                                                      self.workspace_url,
                                                      self.callback_url,
                                                      self.srv_wiz_url,
                                                      self.provenance)

            index_result = bowtie2IndexBuilder.get_index({
                'ref': assembly_or_genome_ref,
                'ws_for_cache': ws_for_cache
            })
            input_configuration['bowtie2_index_info'] = index_result

        # next download the reads
        read_lib_ref = input_info['ref']
        read_lib_info = input_info['info']
        reads_params = {
            'read_libraries': [read_lib_ref],
            'interleaved': 'false',
            'gzipped': None
        }
        ru = ReadsUtils(self.callback_url)
        reads = ru.download_reads(reads_params)['files']
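
        # 'reads' is keyed by library ref; for a paired-end library the local
        # file paths live under ['files']['fwd'] and ['files']['rev'] (see
        # run_bowtie2_align_cli below)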

        input_configuration['reads_lib_type'] = self.get_type_from_obj_info(
            read_lib_info).split('.')[1]
        input_configuration['reads_files'] = reads[read_lib_ref]
        input_configuration['reads_lib_ref'] = read_lib_ref

        return input_configuration

    def run_bowtie2_align_cli(self, input_configuration, validated_params):
        # pprint('======== input_configuration =====')
        # pprint(input_configuration)
        options = []
        run_output_info = {}

        # set the bowtie2 index location
        bt2_index_dir = input_configuration['bowtie2_index_info']['output_dir']
        bt2_index_basename = input_configuration['bowtie2_index_info'][
            'index_files_basename']
        options.extend(['-x', bt2_index_basename])

        # set the input reads
        if input_configuration['reads_lib_type'] == 'SingleEndLibrary':
            options.extend(
                ['-U', input_configuration['reads_files']['files']['fwd']])
            run_output_info['library_type'] = 'single_end'
        elif input_configuration['reads_lib_type'] == 'PairedEndLibrary':
            options.extend(
                ['-1', input_configuration['reads_files']['files']['fwd']])
            options.extend(
                ['-2', input_configuration['reads_files']['files']['rev']])
            run_output_info['library_type'] = 'paired_end'

        # setup the output file name
        output_dir = os.path.join(
            self.scratch_dir,
            'bowtie2_alignment_output_' + str(int(time.time() * 10000)))
        output_sam_file = os.path.join(output_dir, 'reads_alignment.sam')
        os.makedirs(output_dir)
        options.extend(['-S', output_sam_file])
        run_output_info['output_sam_file'] = output_sam_file
        run_output_info['output_dir'] = output_dir

        # parse all the other parameters
        if 'quality_score' in validated_params:
            options.append('--' + str(validated_params['quality_score']))

        if 'alignment_type' in validated_params:
            options.append('--' + str(validated_params['alignment_type']))

        if 'preset_options' in validated_params:
            if validated_params.get('alignment_type') == 'local':
                options.append('--' + str(validated_params['preset_options']) + '-local')
            else:
                options.append('--' + str(validated_params['preset_options']))

        if 'trim5' in validated_params:
            options.extend(['--trim5', str(validated_params['trim5'])])
        if 'trim3' in validated_params:
            options.extend(['--trim3', str(validated_params['trim3'])])
        if 'np' in validated_params:
            options.extend(['--np', str(validated_params['np'])])

        if 'minins' in validated_params:
            options.extend(['--minins', str(validated_params['minins'])])
        if 'maxins' in validated_params:
            options.extend(['--maxins', str(validated_params['maxins'])])
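
        # At this point the assembled call might look like (illustrative values):
        #   bowtie2 -x <index_basename> -1 fwd.fastq -2 rev.fastq \
        #       -S <output_dir>/reads_alignment.sam --phred33 --local --trim5 5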

        # unfortunately, bowtie2 expects the index files to be in the current directory, and
        # you cannot configure it otherwise.  So run bowtie out of the index directory, but
        # place the output SAM file somewhere else
        self.bowtie2.run('bowtie2', options, cwd=bt2_index_dir)

        return run_output_info

    def save_read_alignment_output(self, run_output_info, input_configuration,
                                   validated_params):
        rau = ReadsAlignmentUtils(self.callback_url)
        destination_ref = validated_params['output_workspace'] + '/' + \
            validated_params['output_alignment_name']
        condition = 'unknown'
        if 'condition_label' in validated_params:
            condition = validated_params['condition_label']
        upload_params = {
            'file_path': run_output_info['output_sam_file'],
            'destination_ref': destination_ref,
            'read_library_ref': input_configuration['reads_lib_ref'],
            'assembly_or_genome_ref':
            validated_params['assembly_or_genome_ref'],
            'condition': condition
        }
        upload_results = rau.upload_alignment(upload_params)
        return upload_results

    def clean(self, run_output_info):
        ''' Not really necessary on a single run, but if we are running multiple local subjobs, we
        should clean up files that have already been saved back up to kbase '''
        pass

    def create_report_for_single_run(self, run_output_info,
                                     input_configuration, validated_params):
        # first run qualimap
        qualimap_report = self.qualimap.run_bamqc(
            {'input_ref': run_output_info['upload_results']['obj_ref']})
        qc_result_zip_info = qualimap_report['qc_result_zip_info']

        # create report
        report_text = 'Ran on a single reads library.\n\n'
        alignment_info = self.get_obj_info(
            run_output_info['upload_results']['obj_ref'])
        report_text += 'Created ReadsAlignment: ' + str(
            alignment_info[1]) + '\n'
        report_text += '                        ' + run_output_info[
            'upload_results']['obj_ref'] + '\n'
        kbr = KBaseReport(self.callback_url)
        report_info = kbr.create_extended_report({
            'message': report_text,
            'objects_created': [{
                'ref': run_output_info['upload_results']['obj_ref'],
                'description': 'ReadsAlignment'
            }],
            'report_object_name': 'kb_Bowtie2_' + str(uuid.uuid4()),
            'direct_html_link_index': 0,
            'html_links': [{
                'shock_id': qc_result_zip_info['shock_id'],
                'name': qc_result_zip_info['index_html_file_name'],
                'label': qc_result_zip_info['name']
            }],
            'workspace_name': validated_params['output_workspace']
        })
        return {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

    def process_batch_result(self, batch_result, validated_params, reads,
                             input_set_info):

        n_jobs = len(batch_result['results'])
        n_success = 0
        n_error = 0
        ran_locally = 0
        ran_njsw = 0

        # reads alignment set items
        items = []
        objects_created = []

        for k in range(0, len(batch_result['results'])):
            job = batch_result['results'][k]
            result_package = job['result_package']
            if job['is_error']:
                n_error += 1
            else:
                n_success += 1
                output_info = result_package['result'][0]['output_info']
                ra_ref = output_info['upload_results']['obj_ref']
                # Note: could add a label to the alignment here?
                items.append({'ref': ra_ref, 'label': reads[k]['condition']})
                objects_created.append({'ref': ra_ref})

            if result_package['run_context']['location'] == 'local':
                ran_locally += 1
            if result_package['run_context']['location'] == 'njsw':
                ran_njsw += 1

        # Save the alignment set
        alignment_set_data = {'description': '', 'items': items}
        alignment_set_save_params = {
            'data': alignment_set_data,
            'workspace': validated_params['output_workspace'],
            'output_object_name':
                str(input_set_info[1]) + validated_params['output_obj_name_suffix']
        }

        set_api = SetAPI(self.srv_wiz_url)
        save_result = set_api.save_reads_alignment_set_v1(
            alignment_set_save_params)
        print('Saved ReadsAlignment=')
        pprint(save_result)
        objects_created.append({
            'ref': save_result['set_ref'],
            'description': 'Set of all reads alignments generated'
        })
        set_name = save_result['set_info'][1]

        # run qualimap
        qualimap_report = self.qualimap.run_bamqc(
            {'input_ref': save_result['set_ref']})
        qc_result_zip_info = qualimap_report['qc_result_zip_info']

        # create the report
        report_text = 'Ran on SampleSet or ReadsSet.\n\n'
        report_text += 'Created ReadsAlignmentSet: ' + str(set_name) + '\n\n'
        report_text += 'Total ReadsLibraries = ' + str(n_jobs) + '\n'
        report_text += '        Successful runs = ' + str(n_success) + '\n'
        report_text += '            Failed runs = ' + str(n_error) + '\n'
        report_text += '       Ran on main node = ' + str(ran_locally) + '\n'
        report_text += '   Ran on remote worker = ' + str(ran_njsw) + '\n\n'

        print('Report text=')
        print(report_text)

        kbr = KBaseReport(self.callback_url)
        report_info = kbr.create_extended_report({
            'message': report_text,
            'objects_created': objects_created,
            'report_object_name': 'kb_Bowtie2_' + str(uuid.uuid4()),
            'direct_html_link_index': 0,
            'html_links': [{
                'shock_id': qc_result_zip_info['shock_id'],
                'name': qc_result_zip_info['index_html_file_name'],
                'label': qc_result_zip_info['name']
            }],
            'workspace_name': validated_params['output_workspace']
        })

        result = {
            'report_info': {
                'report_name': report_info['name'],
                'report_ref': report_info['ref']
            }
        }
        result['batch_output_info'] = batch_result

        return result

    def validate_params(self, params):
        validated_params = {}

        required_string_fields = [
            'input_ref', 'assembly_or_genome_ref', 'output_obj_name_suffix',
            'output_workspace'
        ]
        for field in required_string_fields:
            if field in params and params[field]:
                validated_params[field] = params[field]
            else:
                raise ValueError('"' + field +
                                 '" field required to run bowtie2 aligner app')

        optional_fields = [
            'quality_score', 'alignment_type', 'preset_options', 'trim5',
            'trim3', 'condition_label', 'np', 'minins', 'maxins',
            'output_alignment_suffix', 'output_alignment_name'
        ]
        for field in optional_fields:
            if field in params:
                if params[field] is not None:
                    validated_params[field] = params[field]

        validated_params['create_report'] = True
        if 'create_report' in params and params['create_report'] is not None:
            if int(params['create_report']) == 1:
                validated_params['create_report'] = True
            elif int(params['create_report']) == 0:
                validated_params['create_report'] = False
            else:
                raise ValueError(
                    '"create_report" field, if present, should be set to a boolean value: 0 or 1'
                )

        validated_params['concurrent_local_tasks'] = None
        validated_params['concurrent_njsw_tasks'] = None

        if params.get('concurrent_local_tasks') is not None:
            validated_params['concurrent_local_tasks'] = int(
                params['concurrent_local_tasks'])
        if params.get('concurrent_njsw_tasks') is not None:
            validated_params['concurrent_njsw_tasks'] = int(
                params['concurrent_njsw_tasks'])

        return validated_params
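
    # A minimal params dict accepted above might look like (hypothetical refs):
    #   {'input_ref': '55/1/2', 'assembly_or_genome_ref': '55/3/1',
    #    'output_obj_name_suffix': '_alignment_set', 'output_workspace': 'my_workspace'}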

    def fetch_reads_refs_from_sampleset(self, ref, info, validated_params):
        """
        Note: adapted from kbaseapps/kb_hisat2 - file_util.py

        From the given object ref, return a list of all reads objects that are a part of that
        object. E.g., if ref is a ReadsSet, return a list of all PairedEndLibrary or SingleEndLibrary
        refs that are a member of that ReadsSet. This is returned as a list of dictionaries as follows:
        {
            "ref": reads object reference,
            "condition": condition string associated with that reads object
        }
        The only one required is "ref", all other keys may or may not be present, based on the reads
        object or object type in initial ref variable. E.g. a RNASeqSampleSet might have condition info
        for each reads object, but a single PairedEndLibrary may not have that info.
        If ref is already a Reads library, just returns a list with ref as a single element.
        """
        obj_type = self.get_type_from_obj_info(info)
        refs = list()
        refs_for_ws_info = list()
        if "KBaseSets.ReadsSet" in obj_type:
            print("Looking up reads references in ReadsSet object")
            set_api = SetAPI(self.srv_wiz_url)
            reads_set = set_api.get_reads_set_v1({
                'ref': ref,
                'include_item_info': 0
            })
            for reads in reads_set["data"]["items"]:
                refs.append({'ref': reads['ref'], 'condition': reads['label']})
                refs_for_ws_info.append({'ref': reads['ref']})
        elif "KBaseRNASeq.RNASeqSampleSet" in obj_type:
            print("Looking up reads references in RNASeqSampleSet object")
            sample_set = self.ws.get_objects2({"objects": [{
                "ref": ref
            }]})["data"][0]["data"]
            for i in range(len(sample_set["sample_ids"])):
                refs.append({
                    'ref': sample_set["sample_ids"][i],
                    'condition': sample_set["condition"][i]
                })
                refs_for_ws_info.append({'ref': sample_set["sample_ids"][i]})
        else:
            raise ValueError("Unable to fetch reads reference from object {} "
                             "which is a {}".format(ref, obj_type))

        # get object info so we can name things properly
        infos = self.ws.get_object_info3({'objects': refs_for_ws_info})['infos']

        name_ext = '_alignment'
        if 'output_alignment_suffix' in validated_params \
                and validated_params['output_alignment_suffix'] is not None:
            ext = validated_params['output_alignment_suffix'].replace(' ', '')
            if ext:
                name_ext = ext

        unique_name_lookup = {}
        for k in range(0, len(refs)):
            refs[k]['info'] = infos[k]
            name = infos[k][1]
            if name not in unique_name_lookup:
                unique_name_lookup[name] = 1
            else:
                unique_name_lookup[name] += 1
                name = name + '_' + str(unique_name_lookup[name])
            name = name + name_ext
            refs[k]['alignment_output_name'] = name

        return refs

    def determine_input_info(self, validated_params):
        ''' get info on the input_ref object and determine if we run once or run on a set '''
        info = self.get_obj_info(validated_params['input_ref'])
        obj_type = self.get_type_from_obj_info(info)
        if obj_type in [
                'KBaseAssembly.PairedEndLibrary',
                'KBaseAssembly.SingleEndLibrary', 'KBaseFile.PairedEndLibrary',
                'KBaseFile.SingleEndLibrary'
        ]:
            return {
                'run_mode': 'single_library',
                'info': info,
                'ref': validated_params['input_ref']
            }
        if obj_type in ['KBaseRNASeq.RNASeqSampleSet', 'KBaseSets.ReadsSet']:
            return {
                'run_mode': 'sample_set',
                'info': info,
                'ref': validated_params['input_ref']
            }

        raise ValueError('Object type of input_ref is not valid, was: ' +
                         str(obj_type))

    def get_type_from_obj_info(self, info):
        return info[2].split('-')[0]

    def get_obj_info(self, ref):
        return self.ws.get_object_info3({'objects': [{'ref': ref}]})['infos'][0]