Example #1
class VariationToVCF:
    def __init__(self, callback_url, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)

    def is_gz_file(self, filepath):
        # gzip files start with the two magic bytes 0x1f 0x8b
        with open(filepath, 'rb') as test_f:
            return binascii.hexlify(test_f.read(2)) == b'1f8b'

    def export_as_vcf(self, params):
        if 'input_var_ref' not in params:
            raise ValueError('Cannot export Variation - no "input_var_ref" field defined.')

        file = self.variation_to_vcf({'variation_ref': params['input_var_ref']})

        export_dir = os.path.join(self.scratch, file['variation_name'])
        os.makedirs(export_dir)

        try:
            shutil.move(file['path'], os.path.join(export_dir, os.path.basename(file['path'])))
        except shutil.Error as e:
            # propagate the failure instead of killing the process with exit()
            raise ValueError('failed to move VCF file into the export directory: {}'.format(e))

        dfupkg = self.dfu.package_for_download({
             'file_path': export_dir,
             'ws_refs': [params['input_var_ref']]
        })

        return {'shock_id': dfupkg['shock_id']}

    def variation_to_vcf(self, params):
        self.validate_params(params)

        print('downloading ws object data: '+params["variation_ref"])

        variation_obj = self.dfu.get_objects({'object_refs': [params['variation_ref']]})['data'][0]
        ws_type = variation_obj['info'][2]
        obj_name = variation_obj['info'][1]

        if 'KBaseGwasData.Variations' in ws_type:
            dl_path = self.process_vcf(self.scratch, variation_obj['data'])
        else:
            raise ValueError('Cannot write data to VCF; invalid WS type (' + ws_type +
                             ').  Supported type is KBaseGwasData.Variations')

        return {'path': dl_path, 'variation_name': obj_name}

    def process_vcf(self, output_vcf_file_path, data):
        obj = self.dfu.shock_to_file({
            'handle_id': data['vcf_handle_ref'],
            'file_path': output_vcf_file_path,
        })

        return obj['file_path']

    def validate_params(self, params):
        for key in ['variation_ref']:
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')
Example #2
class AssemblyToFasta:
    def __init__(self, callback_url, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)

    def export_as_fasta(self, ctx, params):
        """ Used almost exclusively for download only """
        # validate parameters
        if 'input_ref' not in params:
            raise ValueError(
                'Cannot export Assembly - no "input_ref" field defined.')

        # export to a file
        file = self.assembly_as_fasta(ctx, {'ref': params['input_ref']})

        # create the output directory and move the file there
        export_package_dir = os.path.join(self.scratch, file['assembly_name'])
        os.makedirs(export_package_dir)
        shutil.move(
            file['path'],
            os.path.join(export_package_dir, os.path.basename(file['path'])))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['input_ref']]
        })

        return {'shock_id': package_details['shock_id']}

    def assembly_as_fasta(self, ctx, params):
        """ main function that accepts a ref to an object and writes a file """

        self.validate_params(params)

        print(f'downloading ws object data ({ params["ref"]})')
        assembly_object = self.dfu.get_objects(
            {'object_refs': [params['ref']]})['data'][0]
        ws_type = assembly_object['info'][2]
        obj_name = assembly_object['info'][1]

        if 'filename' in params:
            output_filename = params['filename']
        else:
            output_filename = obj_name + '.fa'

        output_fasta_file_path = os.path.join(self.scratch, output_filename)

        if 'KBaseGenomes.ContigSet' in ws_type:
            self.process_legacy_contigset(output_fasta_file_path,
                                          assembly_object['data'])
        elif 'KBaseGenomeAnnotations.Assembly' in ws_type:
            self.process_assembly(output_fasta_file_path,
                                  assembly_object['data'])

        else:
            raise ValueError(
                'Cannot write data to fasta; invalid WS type (' + ws_type +
                ').  Supported types are KBaseGenomes.ContigSet and ' +
                'KBaseGenomeAnnotations.Assembly')

        return {'path': output_fasta_file_path, 'assembly_name': obj_name}

    def fasta_rows_generator_from_contigset(self, contig_list):
        """ generates SeqRecords iterator for writing from a legacy contigset object """
        for contig in contig_list:
            description = ''
            if 'description' in contig and contig['description']:
                description = contig['description']
            # Biopython 1.78+ removed sequence alphabets; Seq now takes only the string
            yield SeqRecord(Seq(contig['sequence']),
                            id=contig['id'],
                            description=description)

    def process_legacy_contigset(self, output_fasta_path, data):
        SeqIO.write(self.fasta_rows_generator_from_contigset(data['contigs']),
                    output_fasta_path, "fasta")

    def process_assembly(self, output_fasta_path, data):
        self.dfu.shock_to_file({
            'handle_id': data['fasta_handle_ref'],
            'file_path': output_fasta_path,
            'unpack': 'uncompress'
        })

    def validate_params(self, params):
        for key in ['ref']:
            if key not in params:
                raise ValueError('required "' + key +
                                 '" field was not defined')
Example #3
class CorrelationUtil:
    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_compute_corr_matrix_params(self, params):
        """
        _validate_compute_corr_matrix_params:
            validates params passed to compute_correlation_matrix method
        """

        logging.info('start validating compute_correlation_matrix params')

        # check for required parameters
        for p in ['input_obj_ref', 'workspace_name', 'corr_matrix_name']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _validate_compute_correlation_across_matrices_params(self, params):
        """
        _validate_compute_correlation_across_matrices_params:
            validates params passed to compute_correlation_across_matrices method
        """

        logging.info(
            'start validating compute_correlation_across_matrices params')

        # check for required parameters
        for p in [
                'workspace_name', 'corr_matrix_name', 'matrix_ref_1',
                'matrix_ref_2'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _fetch_taxon(self, amplicon_set_ref, amplicon_ids):
        logging.info('start fetching taxon info from AmpliconSet')
        taxons = dict()
        taxons_level = dict()
        amplicon_set_data = self.dfu.get_objects(
            {'object_refs': [amplicon_set_ref]})['data'][0]['data']

        amplicons = amplicon_set_data.get('amplicons')

        for amplicon_id in amplicon_ids:
            scientific_name = 'None'
            level = 'Unknown'
            try:
                scientific_name = amplicons.get(amplicon_id).get(
                    'taxonomy').get('scientific_name')
            except Exception:
                pass

            try:
                level = amplicons.get(amplicon_id).get('taxonomy').get(
                    'taxon_level')
            except Exception:
                pass

            taxons.update({amplicon_id: scientific_name})
            taxons_level.update({amplicon_id: level})

        # default empty taxons and taxons_level
        if set(taxons.values()) == {'None'}:
            taxons = None

        if set(taxons_level.values()) == {'Unknown'}:
            taxons_level = None

        return taxons, taxons_level

    def _build_table_content(self,
                             matrix_2D,
                             output_directory,
                             original_matrix_ref=None,
                             type='corr'):
        """
        _build_table_content: generate HTML table content for FloatMatrix2D object
        """

        # avoid a mutable default argument; None means no original matrix refs
        if original_matrix_ref is None:
            original_matrix_ref = []

        page_content = """\n"""

        table_file_name = '{}_table.html'.format(type)
        data_file_name = '{}_data.json'.format(type)

        page_content += """<iframe height="900px" width="100%" """
        page_content += """src="{}" """.format(table_file_name)
        page_content += """style="border:none;"></iframe>\n"""

        row_ids = matrix_2D.get('row_ids')
        col_ids = matrix_2D.get('col_ids')
        values = matrix_2D.get('values')

        df = pd.DataFrame(values, index=row_ids, columns=col_ids)
        df = df.T
        links = df.stack().reset_index()

        columns = list()
        taxons = None
        taxons_level = None
        if len(original_matrix_ref) == 1:
            res = self.dfu.get_objects(
                {'object_refs': [original_matrix_ref[0]]})['data'][0]
            obj_type = res['info'][2]
            matrix_type = obj_type.split('Matrix')[0].split('.')[-1]
            if matrix_type == 'Amplicon':
                amplicon_set_ref = res['data'].get('amplicon_set_ref')
                if amplicon_set_ref:
                    taxons, taxons_level = self._fetch_taxon(
                        amplicon_set_ref, col_ids)
            columns.extend(
                ['{} 1'.format(matrix_type), '{} 2'.format(matrix_type)])
        elif len(original_matrix_ref) == 2:
            for matrix_ref in original_matrix_ref[::-1]:
                res = self.dfu.get_objects({'object_refs':
                                            [matrix_ref]})['data'][0]
                obj_type = res['info'][2]
                matrix_type = obj_type.split('Matrix')[0].split('.')[-1]
                if matrix_type == 'Amplicon':
                    amplicon_set_ref = res['data'].get('amplicon_set_ref')
                    if amplicon_set_ref:
                        taxons, taxons_level = self._fetch_taxon(
                            amplicon_set_ref, col_ids)
                columns.append(matrix_type)
        else:
            columns.extend(['Variable 1', 'Variable 2'])

        # remove self-comparison
        links = links[links.iloc[:, 0] != links.iloc[:, 1]]

        if type == 'corr':
            columns.append('Correlation')
        elif type == 'sig':
            columns.append('Significance')
        else:
            columns.append('Value')

        links.columns = columns

        if taxons:
            links['Taxon'] = links.iloc[:, 0].map(taxons)

        if taxons_level:
            links['Taxon Level'] = links.iloc[:, 0].map(taxons_level)

        table_headers = links.columns.tolist()
        table_content = """\n"""
        # build header and footer
        table_content += """\n<thead>\n<tr>\n"""
        for table_header in table_headers:
            table_content += """\n <th>{}</th>\n""".format(table_header)
        table_content += """\n</tr>\n</thead>\n"""

        table_content += """\n<tfoot>\n<tr>\n"""
        for table_header in table_headers:
            table_content += """\n <th>{}</th>\n""".format(table_header)
        table_content += """\n</tr>\n</tfoot>\n"""

        logging.info('start generating table json file')
        data_array = links.values.tolist()

        total_rec = len(data_array)
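        # JSON payload in the jQuery DataTables server-side response format
        # (draw / recordsTotal / recordsFiltered / data), loaded by the
        # ajax/deferLoading setup in table_template.html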
        json_dict = {
            'draw': 1,
            'recordsTotal': total_rec,
            'recordsFiltered': total_rec,
            'data': data_array
        }

        with open(os.path.join(output_directory, data_file_name), 'w') as fp:
            json.dump(json_dict, fp)

        logging.info('start generating table html')
        with open(os.path.join(output_directory, table_file_name),
                  'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__), 'templates',
                                 'table_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>table_header</p>', table_content)
                report_template = report_template.replace(
                    'ajax_file_path', data_file_name)
                report_template = report_template.replace(
                    'deferLoading_size', str(total_rec))
                result_file.write(report_template)

        return page_content

    def _generate_visualization_content(self, output_directory,
                                        corr_matrix_obj_ref,
                                        corr_matrix_plot_path,
                                        scatter_plot_path):
        """
        <div class="tab">
            <button class="tablinks" onclick="openTab(event, 'CorrelationMatrix')" id="defaultOpen">Correlation Matrix</button>
        </div>

        <div id="CorrelationMatrix" class="tabcontent">
            <p>CorrelationMatrix_Content</p>
        </div>"""

        tab_def_content = ''
        tab_content = ''

        corr_data = self.dfu.get_objects(
            {'object_refs': [corr_matrix_obj_ref]})['data'][0]['data']

        coefficient_data = corr_data.get('coefficient_data')
        significance_data = corr_data.get('significance_data')
        original_matrix_ref = corr_data.get('original_matrix_ref')

        tab_def_content += """
        <div class="tab">
            <button class="tablinks" onclick="openTab(event, 'CorrelationMatrix')" id="defaultOpen">Correlation Matrix</button>
        """

        corr_table_content = self._build_table_content(
            coefficient_data,
            output_directory,
            original_matrix_ref=original_matrix_ref,
            type='corr')
        tab_content += """
        <div id="CorrelationMatrix" class="tabcontent">{}</div>""".format(
            corr_table_content)

        if significance_data:
            tab_def_content += """
            <button class="tablinks" onclick="openTab(event, 'SignificanceMatrix')">Significance Matrix</button>
            """
            sig_table_content = self._build_table_content(
                significance_data,
                output_directory,
                original_matrix_ref=original_matrix_ref,
                type='sig')
            tab_content += """
            <div id="SignificanceMatrix" class="tabcontent">{}</div>""".format(
                sig_table_content)

        if corr_matrix_plot_path:
            tab_def_content += """
            <button class="tablinks" onclick="openTab(event, 'CorrelationMatrixPlot')">Correlation Matrix Heatmap</button>
            """

            tab_content += """
            <div id="CorrelationMatrixPlot" class="tabcontent">
            """
            if corr_matrix_plot_path.endswith('.png'):
                corr_matrix_plot_name = 'CorrelationMatrixPlot.png'
                corr_matrix_plot_display_name = 'Correlation Matrix Plot'

                shutil.copy2(
                    corr_matrix_plot_path,
                    os.path.join(output_directory, corr_matrix_plot_name))

                tab_content += '<div class="gallery">'
                tab_content += '<a target="_blank" href="{}">'.format(
                    corr_matrix_plot_name)
                tab_content += '<img src="{}" '.format(corr_matrix_plot_name)
                tab_content += 'alt="{}" width="600" height="400">'.format(
                    corr_matrix_plot_display_name)
                tab_content += '</a><div class="desc">{}</div></div>'.format(
                    corr_matrix_plot_display_name)
            elif corr_matrix_plot_path.endswith('.html'):
                corr_matrix_plot_name = 'CorrelationMatrixPlot.html'

                shutil.copy2(
                    corr_matrix_plot_path,
                    os.path.join(output_directory, corr_matrix_plot_name))

                tab_content += '<iframe height="900px" width="100%" '
                tab_content += 'src="{}" '.format(corr_matrix_plot_name)
                tab_content += 'style="border:none;"></iframe>\n<p></p>\n'
            else:
                raise ValueError(
                    'unexpected correlation matrix plot format:\n{}'.format(
                        corr_matrix_plot_path))

            tab_content += """</div>"""

        if scatter_plot_path:

            tab_def_content += """
            <button class="tablinks" onclick="openTab(event, 'ScatterMatrixPlot')">Scatter Matrix Plot</button>
            """

            tab_content += """
            <div id="ScatterMatrixPlot" class="tabcontent">
            """

            scatter_plot_name = 'ScatterMatrixPlot.png'
            scatter_plot_display_name = 'Scatter Matrix Plot'

            shutil.copy2(scatter_plot_path,
                         os.path.join(output_directory, scatter_plot_name))

            tab_content += '<div class="gallery">'
            tab_content += '<a target="_blank" href="{}">'.format(
                scatter_plot_name)
            tab_content += '<img src="{}" '.format(scatter_plot_name)
            tab_content += 'alt="{}" width="600" height="400">'.format(
                scatter_plot_display_name)
            tab_content += '</a><div class="desc">{}</div></div>'.format(
                scatter_plot_display_name)

            tab_content += """</div>"""

        tab_def_content += """</div>"""

        return tab_def_content + tab_content

    def _generate_corr_html_report(self, corr_matrix_obj_ref,
                                   corr_matrix_plot_path, scatter_plot_path):
        """
        _generate_corr_html_report: generate html summary report for correlation
        """

        logging.info('Start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'corr_report.html')

        visualization_content = self._generate_visualization_content(
            output_directory, corr_matrix_obj_ref, corr_matrix_plot_path,
            scatter_plot_path)

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__), 'templates',
                                 'corr_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Visualization_Content</p>', visualization_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id': report_shock_id,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report for Compute Correlation App'
        })
        return html_report

    def _generate_corr_report(self,
                              corr_matrix_obj_ref,
                              workspace_name,
                              corr_matrix_plot_path,
                              scatter_plot_path=None):
        """
        _generate_report: generate summary report
        """
        logging.info('Start creating report')

        output_html_files = self._generate_corr_html_report(
            corr_matrix_obj_ref, corr_matrix_plot_path, scatter_plot_path)

        report_params = {
            'message': '',
            'objects_created': [{
                'ref': corr_matrix_obj_ref,
                'description': 'Correlation Matrix'
            }],
            'workspace_name': workspace_name,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 666,
            'report_object_name': 'compute_correlation_matrix_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _corr_for_matrix(self, input_obj_ref, method, dimension):
        """
        _corr_for_matrix: compute correlation matrix df for KBaseMatrices object
        """
        data_matrix = self.data_util.fetch_data({
            'obj_ref': input_obj_ref
        }).get('data_matrix')
        data_df = pd.read_json(data_matrix)
        data_df = data_df.reindex(index=natsorted(data_df.index))
        data_df = data_df.reindex(columns=natsorted(data_df.columns))

        corr_df = self.df_to_corr(data_df, method=method, dimension=dimension)

        return corr_df, data_df

    def _compute_significance(self, data_df, dimension):
        """
        _compute_significance: compute pairwsie significance dataframe
                               two-sided p-value for a hypothesis test
        """

        logging.info('Start computing significance matrix')
        if dimension == 'row':
            data_df = data_df.T

        data_df = data_df.dropna()._get_numeric_data()
        dfcols = pd.DataFrame(columns=data_df.columns)
        sig_df = dfcols.transpose().join(dfcols, how='outer')

        for r in data_df.columns:
            for c in data_df.columns:
                pvalue = stats.linregress(data_df[r], data_df[c])[3]
                sig_df.loc[c, r] = round(pvalue, 4)

        return sig_df

    def _df_to_list(self, df, threshold=None):
        """
        _df_to_list: convert Dataframe to FloatMatrix2D matrix data
        """

        df.fillna(0, inplace=True)

        if threshold:
            drop_cols = list()
            for col in df.columns:
                if all(df[col] < threshold) and all(df[col] > -threshold):
                    drop_cols.append(col)
            df.drop(columns=drop_cols, inplace=True, errors='ignore')

            drop_idx = list()
            for idx in df.index:
                if all(df.loc[idx] < threshold) and all(
                        df.loc[idx] > -threshold):
                    drop_idx.append(idx)
            df.drop(index=drop_idx, inplace=True, errors='ignore')

        matrix_data = {
            'row_ids': df.index.tolist(),
            'col_ids': df.columns.tolist(),
            'values': df.values.tolist()
        }

        return matrix_data

    def _save_corr_matrix(self,
                          workspace_name,
                          corr_matrix_name,
                          corr_df,
                          sig_df,
                          method,
                          matrix_ref=None,
                          corr_threshold=None):
        """
        _save_corr_matrix: save KBaseExperiments.CorrelationMatrix object
        """
        logging.info('Start saving CorrelationMatrix')

        if not isinstance(workspace_name, int):
            ws_name_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            ws_name_id = workspace_name

        corr_data = {}

        corr_data.update({
            'coefficient_data':
            self._df_to_list(corr_df, threshold=corr_threshold)
        })
        corr_data.update({'correlation_parameters': {'method': method}})
        if matrix_ref:
            corr_data.update({'original_matrix_ref': matrix_ref})

        if sig_df is not None:
            corr_data.update({'significance_data': self._df_to_list(sig_df)})

        obj_type = 'KBaseExperiments.CorrelationMatrix'
        info = self.dfu.save_objects({
            "id": ws_name_id,
            "objects": [{
                "type": obj_type,
                "data": corr_data,
                "name": corr_matrix_name
            }]
        })[0]

        return "%s/%s/%s" % (info[6], info[0], info[4])

    def _Matrix2D_to_df(self, Matrix2D):
        """
        _Matrix2D_to_df: transform a FloatMatrix2D to data frame
        """

        index = Matrix2D.get('row_ids')
        columns = Matrix2D.get('col_ids')
        values = Matrix2D.get('values')

        df = pd.DataFrame(values, index=index, columns=columns)

        return df

    def _corr_to_df(self, corr_matrix_ref):
        """
        retrieve correlation matrix ws object to coefficient_df and significance_df
        """

        corr_data = self.dfu.get_objects({'object_refs': [corr_matrix_ref]
                                          })['data'][0]['data']

        coefficient_data = corr_data.get('coefficient_data')
        significance_data = corr_data.get('significance_data')

        coefficient_df = self._Matrix2D_to_df(coefficient_data)

        significance_df = None
        if significance_data:
            significance_df = self._Matrix2D_to_df(significance_data)

        return coefficient_df, significance_df

    def _corr_df_to_excel(self, coefficient_df, significance_df, result_dir,
                          corr_matrix_ref):
        """
        write correlation matrix dfs into excel
        """

        corr_info = self.dfu.get_objects({'object_refs': [corr_matrix_ref]
                                          })['data'][0]['info']
        corr_name = corr_info[1]

        file_path = os.path.join(result_dir, corr_name + ".xlsx")

        writer = pd.ExcelWriter(file_path)

        coefficient_df.to_excel(writer, "coefficient_data", index=True)

        if significance_df is not None:
            significance_df.to_excel(writer, "significance_data", index=True)

        writer.close()

    def _update_taxonomy_index(self, data_df, amplicon_set_ref):

        logging.info(
            'start updating index with taxonomy info from AmpliconSet')

        amplicon_set_data = self.dfu.get_objects(
            {'object_refs': [amplicon_set_ref]})['data'][0]['data']

        amplicons = amplicon_set_data.get('amplicons')

        index = data_df.index.values

        replace_index = list()

        for idx in index:
            scientific_name = None
            try:
                scientific_name = amplicons.get(idx).get('taxonomy').get(
                    'scientific_name')
            except Exception:
                pass

            if scientific_name:
                replace_index.append(scientific_name + '_' + idx)
            else:
                replace_index.append(idx)

        for idx, val in enumerate(replace_index):
            index[idx] = val

        return data_df

    def _fetch_matrix_data(self, matrix_ref):

        logging.info('start fetching matrix data')

        res = self.dfu.get_objects({'object_refs': [matrix_ref]})['data'][0]
        obj_type = res['info'][2]

        if "KBaseMatrices" in obj_type:
            data_matrix = self.data_util.fetch_data({
                'obj_ref': matrix_ref
            }).get('data_matrix')
            data_df = pd.read_json(data_matrix)
            data_df = data_df.reindex(index=natsorted(data_df.index))
            data_df = data_df.reindex(columns=natsorted(data_df.columns))

            return data_df
        else:
            err_msg = 'Oops! [{}] is not supported.\n'.format(obj_type)
            err_msg += 'Please supply a KBaseMatrices object'
            raise ValueError(err_msg)

    def _compute_metrices_corr(self, df1, df2, method, compute_significance):

        df1.fillna(0, inplace=True)
        df2.fillna(0, inplace=True)

        col_1 = df1.columns
        col_2 = df2.columns
        idx_1 = df1.index
        idx_2 = df2.index

        common_col = col_1.intersection(col_2)
        logging.info('matrices share [{}] common columns'.format(
            common_col.size))

        if common_col.empty:
            raise ValueError('Matrices share no common columns')

        logging.info('start trimming original matrix')
        df1 = df1.loc[:, common_col]
        df2 = df2.loc[:, common_col]

        corr_df = pd.DataFrame(index=idx_1, columns=idx_2)
        sig_df = pd.DataFrame(index=idx_1, columns=idx_2)

        logging.info('start calculating correlation matrix')
        logging.info('sizing {} x {}'.format(idx_1.size, idx_2.size))
        counter = 0
        for idx_value in idx_1:
            for col_value in idx_2:

                if counter % 100000 == 0:
                    logging.info('computed {} corr/sig values'.format(counter))

                value_array_1 = df1.loc[idx_value].tolist()
                value_array_2 = df2.loc[col_value].tolist()

                if method == 'pearson':
                    corr_value, p_value = stats.pearsonr(
                        value_array_1, value_array_2)
                elif method == 'spearman':
                    corr_value, p_value = stats.spearmanr(
                        value_array_1, value_array_2)
                elif method == 'kendall':
                    corr_value, p_value = stats.kendalltau(
                        value_array_1, value_array_2)
                else:
                    err_msg = 'Input correlation method [{}] is not available.\n'.format(
                        method)
                    err_msg += 'Please choose one of {}'.format(CORR_METHOD)
                    raise ValueError(err_msg)

                corr_df.at[idx_value, col_value] = round(corr_value, 4)
                if compute_significance:
                    sig_df.at[idx_value, col_value] = round(p_value, 4)

                counter += 1

        if not compute_significance:
            sig_df = None

        return corr_df, sig_df

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']

        self.data_util = DataUtil(config)
        self.dfu = DataFileUtil(self.callback_url)

        plt.switch_backend('agg')

    def df_to_corr(self, df, method='pearson', dimension='col'):
        """
        Compute pairwise correlation of dimension (col or row)

        method: one of ['pearson', 'kendall', 'spearman']
        """

        logging.info('Computing correlation matrix')

        if method not in CORR_METHOD:
            err_msg = 'Input correlation method [{}] is not available.\n'.format(
                method)
            err_msg += 'Please choose one of {}'.format(CORR_METHOD)
            raise ValueError(err_msg)

        if dimension == 'row':
            df = df.T
        elif dimension != 'col':
            err_msg = 'Input dimension [{}] is not available.\n'.format(
                dimension)
            err_msg += 'Please choose either "col" or "row"'
            raise ValueError(err_msg)

        corr_df = df.corr(method=method).round(4)

        return corr_df

    def plotly_corr_matrix(self, corr_df):
        logging.info('Plotting matrix of correlation')

        result_dir = os.path.join(self.scratch,
                                  str(uuid.uuid4()) + '_corr_matrix_plots')
        self._mkdir_p(result_dir)

        try:
            trace = go.Heatmap(z=corr_df.values,
                               x=corr_df.columns,
                               y=corr_df.index)
            data = [trace]
        except Exception:
            err_msg = 'Running plotly_corr_matrix returned an error:\n{}\n'.format(
                traceback.format_exc())
            raise ValueError(err_msg)
        else:
            corr_matrix_plot_path = os.path.join(result_dir,
                                                 'corr_matrix_plots.html')
            logging.info('Saving plot to:\n{}'.format(corr_matrix_plot_path))
            plot(data, filename=corr_matrix_plot_path)

        return corr_matrix_plot_path

    def plot_corr_matrix(self, corr_df):
        """
        plot_corr_matrix: genreate correlation matrix plot
        """
        logging.info('Plotting matrix of correlation')

        result_dir = os.path.join(self.scratch,
                                  str(uuid.uuid4()) + '_corr_matrix_plots')
        self._mkdir_p(result_dir)

        try:
            plt.clf()
            matrix_size = corr_df.index.size
            figsize = 10 if matrix_size / 5 < 10 else matrix_size / 5
            fig, ax = plt.subplots(figsize=(figsize, figsize))
            cax = ax.matshow(corr_df)
            plt.xticks(list(range(len(corr_df.columns))),
                       corr_df.columns,
                       rotation='vertical',
                       fontstyle='italic')
            plt.yticks(list(range(len(corr_df.columns))),
                       corr_df.columns,
                       fontstyle='italic')
            plt.colorbar(cax)
        except Exception:
            err_msg = 'Running plot_corr_matrix returned an error:\n{}\n'.format(
                traceback.format_exc())
            raise ValueError(err_msg)
        else:
            corr_matrix_plot_path = os.path.join(result_dir,
                                                 'corr_matrix_plots.png')
            logging.info('Saving plot to:\n{}'.format(corr_matrix_plot_path))
            plt.savefig(corr_matrix_plot_path)

        return corr_matrix_plot_path

    def plot_scatter_matrix(self,
                            df,
                            dimension='col',
                            alpha=0.2,
                            diagonal='kde',
                            figsize=(10, 10)):
        """
        plot_scatter_matrix: generate scatter plot for dimension (col or row)
                             ref: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.plotting.scatter_matrix.html
        """
        logging.info('Plotting matrix of scatter')

        result_dir = os.path.join(self.scratch,
                                  str(uuid.uuid4()) + '_scatter_plots')
        self._mkdir_p(result_dir)

        if dimension == 'row':
            df = df.T
        elif dimension != 'col':
            err_msg = 'Input dimension [{}] is not available.\n'.format(
                dimension)
            err_msg += 'Please choose either "col" or "row"'
            raise ValueError(err_msg)

        try:
            plt.clf()
            sm = pd.plotting.scatter_matrix(df,
                                            alpha=alpha,
                                            diagonal=diagonal,
                                            figsize=figsize)

            # Change label rotation
            [s.xaxis.label.set_rotation(45) for s in sm.reshape(-1)]
            [s.yaxis.label.set_rotation(45) for s in sm.reshape(-1)]

            # # May need to offset label when rotating to prevent overlap of figure
            [s.get_yaxis().set_label_coords(-1.5, 0.5) for s in sm.reshape(-1)]

            # Hide all ticks
            [s.set_xticks(()) for s in sm.reshape(-1)]
            [s.set_yticks(()) for s in sm.reshape(-1)]
        except Exception:
            err_msg = 'Running scatter_matrix returned an error:\n{}\n'.format(
                traceback.format_exc())
            raise ValueError(err_msg)
        else:
            scatter_plot_path = os.path.join(result_dir, 'scatter_plots.png')
            logging.info('Saving plot to:\n{}'.format(scatter_plot_path))
            plt.savefig(scatter_plot_path)

        return scatter_plot_path

    def compute_correlation_across_matrices(self, params):
        """
        matrix_ref_1: object reference of a matrix
        matrix_ref_2: object reference of a matrix
        workspace_name: workspace name objects to be saved to
        corr_matrix_name: correlation matrix object name
        method: correlation method, one of ['pearson', 'kendall', 'spearman']
        plot_corr_matrix: plot correlation matrix in report, default False
        compute_significance: also compute Significance in addition to correlation matrix
        """

        logging.info(
            '--->\nrunning CorrelationUtil.compute_correlation_across_matrices\n'
            + 'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_compute_correlation_across_matrices_params(params)

        matrix_ref_1 = params.get('matrix_ref_1')
        matrix_ref_2 = params.get('matrix_ref_2')
        workspace_name = params.get('workspace_name')
        corr_matrix_name = params.get('corr_matrix_name')
        corr_threshold = params.get('corr_threshold')

        method = params.get('method', 'pearson')
        if method not in CORR_METHOD:
            err_msg = 'Input correlation method [{}] is not available.\n'.format(
                method)
            err_msg += 'Please choose one of {}'.format(CORR_METHOD)
            raise ValueError(err_msg)
        plot_corr_matrix = params.get('plot_corr_matrix', False)
        compute_significance = params.get('compute_significance', False)

        matrix_1_type = self.dfu.get_objects({'object_refs': [matrix_ref_1]
                                              })['data'][0]['info'][2]

        # making sure otu_ids are on the column of table
        if "AmpliconMatrix" in matrix_1_type:
            matrix_ref_1, matrix_ref_2 = matrix_ref_2, matrix_ref_1

        df1 = self._fetch_matrix_data(matrix_ref_1)
        df2 = self._fetch_matrix_data(matrix_ref_2)

        corr_df, sig_df = self._compute_metrices_corr(df1, df2, method,
                                                      compute_significance)

        if plot_corr_matrix:
            corr_matrix_plot_path = self.plotly_corr_matrix(corr_df)
        else:
            corr_matrix_plot_path = None

        corr_matrix_obj_ref = self._save_corr_matrix(
            workspace_name,
            corr_matrix_name,
            corr_df,
            sig_df,
            method,
            matrix_ref=[matrix_ref_1, matrix_ref_2],
            corr_threshold=corr_threshold)

        returnVal = {'corr_matrix_obj_ref': corr_matrix_obj_ref}

        report_output = self._generate_corr_report(corr_matrix_obj_ref,
                                                   workspace_name,
                                                   corr_matrix_plot_path)

        returnVal.update(report_output)

        return returnVal

    def compute_correlation_matrix(self, params):
        """
        input_obj_ref: object reference of a matrix
        workspace_name: workspace name objects to be saved to
        dimension: compute correlation on column or row, one of ['col', 'row']
        corr_matrix_name: correlation matrix object name
        method: correlation method, one of ['pearson', 'kendall', 'spearman']
        compute_significance: compute pairwise significance value, default False
        plot_corr_matrix: plot correlation matrix in report, default False
        plot_scatter_matrix: plot scatter matrix in report, default False
        """

        logging.info(
            '--->\nrunning CorrelationUtil.compute_correlation_matrix\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_compute_corr_matrix_params(params)

        input_obj_ref = params.get('input_obj_ref')
        workspace_name = params.get('workspace_name')
        corr_matrix_name = params.get('corr_matrix_name')

        method = params.get('method', 'pearson')
        dimension = params.get('dimension', 'row')
        plot_corr_matrix = params.get('plot_corr_matrix', False)
        plot_scatter_matrix = params.get('plot_scatter_matrix', False)
        compute_significance = params.get('compute_significance', False)

        res = self.dfu.get_objects({'object_refs': [input_obj_ref]})['data'][0]
        obj_type = res['info'][2]

        if "KBaseMatrices" in obj_type:
            corr_df, data_df = self._corr_for_matrix(input_obj_ref, method,
                                                     dimension)
            sig_df = None
            if compute_significance:
                sig_df = self._compute_significance(data_df, dimension)
        else:
            err_msg = 'Oops! [{}] is not supported.\n'.format(obj_type)
            err_msg += 'Please supply a KBaseMatrices object'
            raise ValueError(err_msg)

        if plot_corr_matrix:
            corr_matrix_plot_path = self.plotly_corr_matrix(corr_df)
        else:
            corr_matrix_plot_path = None

        if plot_scatter_matrix:
            scatter_plot_path = self.plot_scatter_matrix(data_df,
                                                         dimension=dimension)
        else:
            scatter_plot_path = None

        corr_matrix_obj_ref = self._save_corr_matrix(
            workspace_name,
            corr_matrix_name,
            corr_df,
            sig_df,
            method,
            matrix_ref=[input_obj_ref])

        returnVal = {'corr_matrix_obj_ref': corr_matrix_obj_ref}

        report_output = self._generate_corr_report(corr_matrix_obj_ref,
                                                   workspace_name,
                                                   corr_matrix_plot_path,
                                                   scatter_plot_path)

        returnVal.update(report_output)

        return returnVal

    def export_corr_matrix_excel(self, params):
        """
        export CorrelationMatrix as Excel
        """

        corr_matrix_ref = params.get('input_ref')

        coefficient_df, significance_df = self._corr_to_df(corr_matrix_ref)

        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)

        self._corr_df_to_excel(coefficient_df, significance_df, result_dir,
                               corr_matrix_ref)

        package_details = self.dfu.package_for_download({
            'file_path': result_dir,
            'ws_refs': [corr_matrix_ref]
        })

        return {'shock_id': package_details['shock_id']}
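
# Hedged usage sketch (not part of the original module): the config values and
# object references below are hypothetical placeholders for a KBase SDK job.
if __name__ == '__main__':
    config = {
        'workspace-url': 'https://kbase.us/services/ws',  # hypothetical endpoint
        'SDK_CALLBACK_URL': 'http://localhost:5000',      # hypothetical callback service
        'KB_AUTH_TOKEN': 'fake-token',                    # placeholder token
        'scratch': '/kb/module/work/tmp'                  # hypothetical scratch directory
    }
    corr_util = CorrelationUtil(config)
    result = corr_util.compute_correlation_matrix({
        'input_obj_ref': '12345/6/7',        # placeholder KBaseMatrices reference
        'workspace_name': 'my_workspace',    # placeholder workspace
        'corr_matrix_name': 'demo_corr_matrix',
        'method': 'spearman',
        'compute_significance': True,
        'plot_corr_matrix': True
    })
    print(result['corr_matrix_obj_ref'], result['report_ref'])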
Example #4
class PCAUtil:
    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_run_pca_params(self, params):
        """
        _validate_run_pca_params:
            validates params passed to run_pca method
        """

        logging.info('start validating run_pca params')

        # check for required parameters
        for p in ['input_obj_ref', 'workspace_name', 'pca_matrix_name']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _df_to_list(self, df):
        """
        _df_to_list: convert Dataframe to FloatMatrix2D matrix data
        """

        df.index = df.index.astype('str')
        df.columns = df.columns.astype('str')
        df.fillna(0, inplace=True)
        matrix_data = {
            'row_ids': df.index.tolist(),
            'col_ids': df.columns.tolist(),
            'values': df.values.tolist()
        }

        return matrix_data

    def _pca_df_to_excel(self, pca_df, components_df, result_dir,
                         pca_matrix_ref):
        """
        write PCA matrix df into excel
        """
        logging.info('writing pca data frame to excel file')
        pca_matrix_obj = self.dfu.get_objects(
            {'object_refs': [pca_matrix_ref]})['data'][0]
        pca_matrix_info = pca_matrix_obj['info']
        pca_matrix_name = pca_matrix_info[1]

        file_path = os.path.join(result_dir, pca_matrix_name + ".xlsx")

        writer = pd.ExcelWriter(file_path)

        pca_df.to_excel(writer, "principal_component_matrix", index=True)
        if components_df is not None:
            components_df.to_excel(writer,
                                   "component_variance_matrix",
                                   index=True)

        writer.close()

    def _Matrix2D_to_df(self, Matrix2D):
        """
        _Matrix2D_to_df: transform a FloatMatrix2D to data frame
        """

        index = Matrix2D.get('row_ids')
        columns = Matrix2D.get('col_ids')
        values = Matrix2D.get('values')

        df = pd.DataFrame(values, index=index, columns=columns)

        return df

    def _pca_to_df(self, pca_matrix_ref):
        """
        retrieve pca matrix ws object to pca_df
        """
        logging.info('converting pca matrix to data frame')
        pca_data = self.dfu.get_objects({'object_refs':
                                         [pca_matrix_ref]})['data'][0]['data']

        rotation_matrix_data = pca_data.get('rotation_matrix')
        components_matrix_data = pca_data.get('components_matrix')

        explained_variance = pca_data.get('explained_variance')
        explained_variance_ratio = pca_data.get('explained_variance_ratio')
        singular_values = pca_data.get('singular_values')
        dimension = pca_data.get('pca_parameters').get('dimension')
        original_matrix_ref = pca_data.get('original_matrix_ref')

        pca_df = self._Matrix2D_to_df(rotation_matrix_data)
        components_df = None
        if components_matrix_data:
            components_df = self._Matrix2D_to_df(components_matrix_data)
            components_df.loc['explained_variance'] = explained_variance
            components_df.loc[
                'explained_variance_ratio'] = explained_variance_ratio
            components_df.loc['singular_values'] = singular_values

        if original_matrix_ref:
            logging.info(
                'appending instance group information to pca data frame')
            obj_data = self.dfu.get_objects(
                {'object_refs': [original_matrix_ref]})['data'][0]['data']

            attributemapping_ref = obj_data.get(
                '{}_attributemapping_ref'.format(dimension))

            am_data = self.dfu.get_objects(
                {'object_refs': [attributemapping_ref]})['data'][0]['data']

            attributes = am_data.get('attributes')
            instances = am_data.get('instances')
            am_df = pd.DataFrame(data=list(instances.values()),
                                 columns=list(
                                     map(lambda x: x.get('attribute'),
                                         attributes)),
                                 index=instances.keys())

            pca_df = pca_df.merge(am_df,
                                  left_index=True,
                                  right_index=True,
                                  how='left',
                                  validate='one_to_one')

        return pca_df, components_df

    def _save_pca_matrix(self, workspace_name, input_obj_ref, pca_matrix_name,
                         rotation_matrix_df, components_df, explained_variance,
                         explained_variance_ratio, singular_values,
                         n_components, dimension):

        logging.info('saving PCAMatrix')

        if not isinstance(workspace_name, int):
            ws_name_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            ws_name_id = workspace_name

        pca_data = {}

        pca_data.update(
            {'rotation_matrix': self._df_to_list(rotation_matrix_df)})
        pca_data.update({'components_matrix': self._df_to_list(components_df)})
        pca_data.update({'explained_variance': explained_variance})
        pca_data.update({'explained_variance_ratio': explained_variance_ratio})
        pca_data.update({'singular_values': singular_values})
        pca_data.update({
            'pca_parameters': {
                'n_components': str(n_components),
                'dimension': str(dimension)
            }
        })
        pca_data.update({'original_matrix_ref': input_obj_ref})

        obj_type = 'KBaseExperiments.PCAMatrix'
        info = self.dfu.save_objects({
            "id": ws_name_id,
            "objects": [{
                "type": obj_type,
                "data": pca_data,
                "name": pca_matrix_name
            }]
        })[0]

        return "%s/%s/%s" % (info[6], info[0], info[4])

    def _creat_biplot(self,
                      xs,
                      ys,
                      coeff,
                      first_component,
                      second_component,
                      bi_plot_path,
                      labels=None):
        plt.clf()
        n = coeff.shape[0]
        scalex = 1.0 / (xs.max() - xs.min())
        scaley = 1.0 / (ys.max() - ys.min())
        plt.scatter(xs * scalex, ys * scaley, s=5)
        for i in range(n):
            plt.arrow(0, 0, coeff[i, 0], coeff[i, 1], color='r', alpha=0.5)
            if labels is None:
                plt.text(coeff[i, 0] * 1.15,
                         coeff[i, 1] * 1.15,
                         "Var" + str(i + 1),
                         color='green',
                         ha='center',
                         va='center')
            else:
                plt.text(coeff[i, 0] * 1.15,
                         coeff[i, 1] * 1.15,
                         labels[i],
                         color='green',
                         ha='center',
                         va='center')

        plt.xlabel("PC{}".format(first_component))
        plt.ylabel("PC{}".format(second_component))
        plt.grid()
        plt.savefig(bi_plot_path)

    def _pca_for_matrix(self, input_obj_ref, n_components, dimension):
        """
        _pca_for_matrix: perform PCA analysis for matrix object
        """

        data_matrix = self.data_util.fetch_data({
            'obj_ref': input_obj_ref
        }).get('data_matrix')

        data_df = pd.read_json(data_matrix)
        data_df.fillna(0, inplace=True)

        if dimension == 'col':
            data_df = data_df.T
        elif dimension != 'row':
            err_msg = 'Input dimension [{}] is not available.\n'.format(
                dimension)
            err_msg += 'Please choose either "col" or "row"'
            raise ValueError(err_msg)

        if n_components > min(data_df.index.size, data_df.columns.size):
            raise ValueError(
                'Number of components cannot exceed min(n_samples, n_features)'
            )

        # normalize sample
        # logging.info("Standardizing the matrix")
        # s_values = StandardScaler().fit_transform(data_df.values)
        # skip normalizing sample
        s_values = data_df.values

        # Projection to ND
        pca = PCA(n_components=n_components, whiten=True)
        principalComponents = pca.fit_transform(s_values)
        explained_variance = list(pca.explained_variance_)
        explained_variance_ratio = list(pca.explained_variance_ratio_)

        components = pca.components_
        singular_values = list(pca.singular_values_)

        col = list()
        for i in range(n_components):
            col.append('principal_component_{}'.format(i + 1))

        rotation_matrix_df = pd.DataFrame(data=principalComponents,
                                          columns=col,
                                          index=data_df.index)

        components_df = pd.DataFrame(data=components,
                                     columns=data_df.columns,
                                     index=col).transpose()

        rotation_matrix_df.fillna(0, inplace=True)

        return (rotation_matrix_df, components_df, explained_variance,
                explained_variance_ratio, singular_values)

    def _generate_pca_html_report(self, score_plots, loading_plots, bi_plots,
                                  n_components):

        logging.info('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'pca_report.html')

        visualization_content = ''
        biplot_content = ''
        loading_content = ''

        for score_plot in score_plots:
            shutil.copy2(
                score_plot,
                os.path.join(output_directory, os.path.basename(score_plot)))
            visualization_content += '<iframe height="900px" width="100%" '
            visualization_content += 'src="{}" '.format(
                os.path.basename(score_plot))
            visualization_content += 'style="border:none;"></iframe>\n<p></p>\n'

        for loading_plot in loading_plots:
            shutil.copy2(
                loading_plot,
                os.path.join(output_directory, os.path.basename(loading_plot)))
            loading_content += '<iframe height="900px" width="100%" '
            loading_content += 'src="{}" '.format(
                os.path.basename(loading_plot))
            loading_content += 'style="border:none;"></iframe>\n<p></p>\n'

        for bi_plot in bi_plots:
            shutil.copy2(
                bi_plot,
                os.path.join(output_directory, os.path.basename(bi_plot)))
            biplot_content += '<iframe height="900px" width="100%" '
            biplot_content += 'src="{}" '.format(os.path.basename(bi_plot))
            biplot_content += 'style="border:none;"></iframe>\n<p></p>\n'

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__), 'templates',
                                 'pca_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Visualization_Content</p>', visualization_content)
                report_template = report_template.replace(
                    'n_components', '{} Components'.format(n_components))
                report_template = report_template.replace(
                    '<p>BiPlot</p>', biplot_content)
                report_template = report_template.replace(
                    '<p>LoadingPlot</p>', loading_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id': report_shock_id,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report for PCA App'
        })
        return html_report

    def _generate_pca_report(self, pca_ref, score_plots, loading_plots,
                             bi_plots, workspace_name, n_components):
        logging.info('creating report')

        output_html_files = self._generate_pca_html_report(
            score_plots, loading_plots, bi_plots, n_components)

        objects_created = list()
        objects_created.append({'ref': pca_ref, 'description': 'PCA Matrix'})

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 1050,
            'report_object_name': 'kb_pca_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _append_instance_group(self, plot_pca_matrix, obj_data, dimension):
        plot_pca_matrix = plot_pca_matrix.copy()

        if dimension == 'row':
            attribute_mapping = obj_data.get('row_mapping')
        elif dimension == 'col':
            attribute_mapping = obj_data.get('col_mapping')
        else:
            raise ValueError('Unexpected dimension')

        if not attribute_mapping:
            logging.warning(
                'Matrix object does not have {}_mapping attribute'.format(
                    dimension))
            # no attribute mapping: return the matrix as-is so the plot uses uniform color and shape
            return plot_pca_matrix
        else:
            # append instance col mapping from row/col_mapping
            plot_pca_matrix['instance'] = plot_pca_matrix.index.map(
                attribute_mapping)

        return plot_pca_matrix

    def _build_size_pca_matrix(self, plot_pca_matrix, obj_data, dimension,
                               attribute_name):
        """
        _build_size_pca_matrix: append attribute value to rotation_matrix
        """
        logging.info('appending attribute value for sizing to rotation matrix')

        plot_pca_matrix = plot_pca_matrix.copy()

        if dimension == 'row':
            attribute_mapping = obj_data.get('row_mapping')
            attribute_mapping_ref = obj_data.get('row_attributemapping_ref')
        elif dimension == 'col':
            attribute_mapping = obj_data.get('col_mapping')
            attribute_mapping_ref = obj_data.get('col_attributemapping_ref')
        else:
            raise ValueError('Unexpected dimension')

        if not attribute_mapping:
            logging.warning(
                'Matrix object does not have {}_mapping attribute'.format(
                    dimension))
            # no attribute mapping: return the matrix as-is so the plot uses uniform color and shape
            return plot_pca_matrix
        else:
            # append instance col mapping from row/col_mapping
            plot_pca_matrix['instance'] = plot_pca_matrix.index.map(
                attribute_mapping)

        res = self.dfu.get_objects({'object_refs':
                                    [attribute_mapping_ref]})['data'][0]
        attri_data = res['data']
        attri_name = res['info'][1]

        attributes = attri_data.get('attributes')

        attr_pos = None
        for idx, attribute in enumerate(attributes):
            if attribute.get('attribute') == attribute_name:
                attr_pos = idx
                break

        if attr_pos is None:
            raise ValueError('Cannot find attribute [{}] in [{}]'.format(
                attribute_name, attri_name))

        instances = attri_data.get('instances')

        plot_pca_matrix['attribute_value_size'] = None
        for instance_name, attri_values in instances.items():
            plot_pca_matrix.loc[
                plot_pca_matrix.instance == instance_name,
                ['attribute_value_size']] = attri_values[attr_pos]

        return plot_pca_matrix

    def _build_color_pca_matrix(self, plot_pca_matrix, obj_data, dimension,
                                attribute_name):
        """
        _build_color_pca_matrix: append attribute value to rotation_matrix
        """
        logging.info(
            'appending attribute value for grouping color to rotation matrix')

        plot_pca_matrix = plot_pca_matrix.copy()

        if dimension == 'row':
            attribute_mapping = obj_data.get('row_mapping')
            attribute_mapping_ref = obj_data.get('row_attributemapping_ref')
        elif dimension == 'col':
            attribute_mapping = obj_data.get('col_mapping')
            attribute_mapping_ref = obj_data.get('col_attributemapping_ref')
        else:
            raise ValueError('Unexpected dimension')

        if not attribute_mapping:
            logging.warning(
                'Matrix object does not have {}_mapping attribute'.format(
                    dimension))
            # no attribute mapping: return the matrix as-is so the plot uses uniform color and shape
            return plot_pca_matrix
        else:
            # append instance col mapping from row/col_mapping
            plot_pca_matrix['instance'] = plot_pca_matrix.index.map(
                attribute_mapping)

        res = self.dfu.get_objects({'object_refs':
                                    [attribute_mapping_ref]})['data'][0]
        attri_data = res['data']
        attri_name = res['info'][1]

        attributes = attri_data.get('attributes')

        attr_pos = None
        for idx, attribute in enumerate(attributes):
            if attribute.get('attribute') == attribute_name:
                attr_pos = idx
                break

        if attr_pos is None:
            raise ValueError('Cannot find attribute [{}] in [{}]'.format(
                attribute_name, attri_name))

        instances = attri_data.get('instances')

        plot_pca_matrix['attribute_value_color'] = None
        for instance_name, attri_values in instances.items():
            plot_pca_matrix.loc[
                plot_pca_matrix.instance == instance_name,
                ['attribute_value_color']] = attri_values[attr_pos]

        return plot_pca_matrix

    def _build_2_comp_trace(self, plot_pca_matrix, components_x, components_y):

        traces = []
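        # Note (assumes the plotly 3.x environment this module was written for):
        # go.Data, go.Marker, go.Line, go.XAxis and go.YAxis are legacy graph_objs
        # that were removed in plotly 4. One scatter trace is built per legend
        # group (color value or instance), falling back to a single trace when no
        # attributes are attached to the matrix.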

        if 'attribute_value_color' in plot_pca_matrix.columns and 'attribute_value_size' in plot_pca_matrix.columns:

            maximum_marker_size = 10
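            # plotly's recommended area scaling:
            #   sizeref = 2 * max(size values) / desired_max_size**2
            # keeps the largest marker near maximum_marker_size when
            # sizemode='area' is used below.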
            try:
                sizeref = 2. * float(
                    max(plot_pca_matrix['attribute_value_size'])) / (
                        maximum_marker_size**2)
            except Exception:
                print('failed to run _build_2_comp_trace')
                print(traceback.format_exc())
                print(sys.exc_info()[2])
                error_msg = "Failed to calculate data point value size. "
                error_msg += "Some data values in your matrix are not numerical."
                raise ValueError(error_msg)

            for name in set(plot_pca_matrix.attribute_value_color):
                attribute_value_size = plot_pca_matrix.loc[plot_pca_matrix[
                    'attribute_value_color'].eq(name)].attribute_value_size
                size_list = list(
                    map(abs, list(map(float, attribute_value_size))))
                for idx, val in enumerate(size_list):
                    if val == 0:
                        size_list[idx] = sys.float_info.min
                trace = go.Scatter(
                    x=list(plot_pca_matrix.loc[plot_pca_matrix[
                        'attribute_value_color'].eq(name)][components_x]),
                    y=list(plot_pca_matrix.loc[plot_pca_matrix[
                        'attribute_value_color'].eq(name)][components_y]),
                    mode='markers',
                    name=name,
                    text=list(plot_pca_matrix.loc[plot_pca_matrix[
                        'attribute_value_color'].eq(name)].index),
                    textposition='bottom center',
                    marker=go.Marker(symbol='circle',
                                     sizemode='area',
                                     sizeref=sizeref,
                                     size=size_list,
                                     sizemin=2,
                                     line=go.Line(
                                         color='rgba(217, 217, 217, 0.14)',
                                         width=0.5),
                                     opacity=0.8))
                traces.append(trace)
        elif 'attribute_value_color' in plot_pca_matrix.columns:
            for name in set(plot_pca_matrix.attribute_value_color):
                trace = go.Scatter(
                    x=list(plot_pca_matrix.loc[plot_pca_matrix[
                        'attribute_value_color'].eq(name)][components_x]),
                    y=list(plot_pca_matrix.loc[plot_pca_matrix[
                        'attribute_value_color'].eq(name)][components_y]),
                    mode='markers',
                    name=name,
                    text=list(plot_pca_matrix.loc[plot_pca_matrix[
                        'attribute_value_color'].eq(name)].index),
                    textposition='bottom center',
                    marker=go.Marker(size=10,
                                     opacity=0.8,
                                     line=go.Line(
                                         color='rgba(217, 217, 217, 0.14)',
                                         width=0.5)))
                traces.append(trace)
        elif 'attribute_value_size' in plot_pca_matrix.columns:

            maximum_marker_size = 10
            try:
                sizeref = 2. * float(
                    max(plot_pca_matrix['attribute_value_size'])) / (
                        maximum_marker_size**2)
            except Exception:
                print('failed to run _build_2_comp_trace')
                print(traceback.format_exc())
                print(sys.exc_info()[2])
                error_msg = "Failed to calculate data point value size. "
                error_msg += "Some data values in your matrix are not numerical."
                raise ValueError(error_msg)

            for name in set(plot_pca_matrix.instance):
                attribute_value_size = plot_pca_matrix.loc[
                    plot_pca_matrix['instance'].eq(name)].attribute_value_size
                size_list = list(
                    map(abs, list(map(float, attribute_value_size))))
                for idx, val in enumerate(size_list):
                    if val == 0:
                        size_list[idx] = sys.float_info.min
                trace = go.Scatter(
                    x=list(plot_pca_matrix.loc[plot_pca_matrix['instance'].eq(
                        name)][components_x]),
                    y=list(plot_pca_matrix.loc[plot_pca_matrix['instance'].eq(
                        name)][components_y]),
                    mode='markers',
                    name=name,
                    text=list(plot_pca_matrix.loc[
                        plot_pca_matrix['instance'].eq(name)].index),
                    textposition='bottom center',
                    marker=go.Marker(symbol='circle',
                                     sizemode='area',
                                     sizeref=sizeref,
                                     size=size_list,
                                     sizemin=2,
                                     line=go.Line(
                                         color='rgba(217, 217, 217, 0.14)',
                                         width=0.5),
                                     opacity=0.8))
                traces.append(trace)
        else:
            trace = go.Scatter(x=list(plot_pca_matrix[components_x]),
                               y=list(plot_pca_matrix[components_y]),
                               mode='markers',
                               name='score plot',
                               text=list(plot_pca_matrix.index),
                               textposition='bottom center',
                               marker=go.Marker(
                                   size=10,
                                   opacity=0.8,
                                   line=go.Line(
                                       color='rgba(217, 217, 217, 0.14)',
                                       width=0.5)))
            traces.append(trace)

        return traces

    def _plot_score_pca_matrix(self, plot_pca_matrix, n_components):

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_paths = []

        all_pairs = list(itertools.combinations(range(1, n_components + 1), 2))

        for pair in all_pairs:
            first_component = pair[0]
            second_component = pair[1]
            result_file_path = os.path.join(
                output_directory,
                'pca_score_plot_{}_{}.html'.format(first_component,
                                                   second_component))

            traces = self._build_2_comp_trace(
                plot_pca_matrix,
                'principal_component_{}'.format(first_component),
                'principal_component_{}'.format(second_component))

            data = go.Data(traces)
            layout = go.Layout(
                xaxis=go.XAxis(title='PC{}'.format(first_component),
                               showline=False),
                yaxis=go.YAxis(title='PC{}'.format(second_component),
                               showline=False))
            fig = go.Figure(data=data, layout=layout)

            plot(fig, filename=result_file_path)

            result_file_paths.append(result_file_path)

        return result_file_paths

    def _plot_loading_pca_matrix(self, components_df, n_components):

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_paths = []

        all_pairs = list(itertools.combinations(range(1, n_components + 1), 2))

        for pair in all_pairs:
            first_component = pair[0]
            second_component = pair[1]
            result_file_path = os.path.join(
                output_directory,
                'pca_loading_plot_{}_{}.html'.format(first_component,
                                                     second_component))

            traces = list()
            data = go.Data(traces)
            layout = go.Layout(
                xaxis=go.XAxis(title='PC{}'.format(first_component),
                               showline=False),
                yaxis=go.YAxis(title='PC{}'.format(second_component),
                               showline=False))
            fig = go.Figure(data=data, layout=layout)

            coeff = list()
            coeff.append(components_df['principal_component_{}'.format(
                first_component)])
            coeff.append(components_df['principal_component_{}'.format(
                second_component)])
            coeff = np.transpose(coeff)

            loading_x = list()
            loading_y = list()
            loading_text = list()
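            # Interleave the origin with each loading coordinate so every loading
            # is drawn as a line segment from (0, 0) to its (PC_x, PC_y) coefficient.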
            for idx, position in enumerate(coeff):
                loading_x.append(0)
                loading_y.append(0)
                loading_text.append('0')

                loading_x.append(position[0])
                loading_y.append(position[1])
                loading_text.append(components_df.index[idx])

            fig.add_trace(
                go.Scatter(x=loading_x,
                           y=loading_y,
                           mode="lines+markers",
                           name="loading plot",
                           text=loading_text,
                           textposition="bottom center"))

            plot(fig, filename=result_file_path)

            result_file_paths.append(result_file_path)

        return result_file_paths

    def _plot_biplot_pca_matrix(self, plot_pca_matrix, components_df,
                                n_components):

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_paths = []

        all_pairs = list(itertools.combinations(range(1, n_components + 1), 2))

        for pair in all_pairs:
            first_component = pair[0]
            second_component = pair[1]
            result_file_path = os.path.join(
                output_directory,
                'pca_biplot_plot_{}_{}.html'.format(first_component,
                                                    second_component))

            traces = self._build_2_comp_trace(
                plot_pca_matrix,
                'principal_component_{}'.format(first_component),
                'principal_component_{}'.format(second_component))

            data = go.Data(traces)
            layout = go.Layout(
                xaxis=go.XAxis(title='PC{}'.format(first_component),
                               showline=False),
                yaxis=go.YAxis(title='PC{}'.format(second_component),
                               showline=False))
            fig = go.Figure(data=data, layout=layout)

            coeff = list()
            coeff.append(components_df['principal_component_{}'.format(
                first_component)])
            coeff.append(components_df['principal_component_{}'.format(
                second_component)])
            coeff = np.transpose(coeff)

            loading_x = list()
            loading_y = list()
            loading_text = list()
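            # Same construction as _plot_loading_pca_matrix: segments from the
            # origin to each loading coefficient, overlaid on the score traces above.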
            for idx, position in enumerate(coeff):
                loading_x.append(0)
                loading_y.append(0)
                loading_text.append('0')

                loading_x.append(position[0])
                loading_y.append(position[1])
                loading_text.append(components_df.index[idx])

            fig.add_trace(
                go.Scatter(x=loading_x,
                           y=loading_y,
                           mode="lines+markers",
                           name="loading plot",
                           text=loading_text,
                           textposition="bottom center"))

            plot(fig, filename=result_file_path)

            result_file_paths.append(result_file_path)

        return result_file_paths

    def _validate_pca_matrix(self, obj_data, dimension, color_marker_by,
                             scale_size_by):

        if dimension == 'row':
            attribute_mapping = obj_data.get('row_mapping')
            attributemapping_ref = obj_data.get('row_attributemapping_ref')
            if not attributemapping_ref:
                # handle Functional Profile
                base_object_ref = obj_data.get('base_object_ref')
                base_object_data = self.dfu.get_objects(
                    {'object_refs': [base_object_ref]})['data'][0]['data']
                attributemapping_ref = base_object_data.get(
                    'row_attributemapping_ref')
                obj_data['row_attributemapping_ref'] = attributemapping_ref
            if not attribute_mapping and attributemapping_ref:
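                # no row_mapping on the matrix: fall back to an identity mapping
                # (each row id maps to itself) built from the AttributeMapping instances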
                am_data = self.dfu.get_objects(
                    {'object_refs': [attributemapping_ref]})['data'][0]['data']
                attribute_mapping = {x: x for x in am_data['instances'].keys()}
                obj_data['row_mapping'] = attribute_mapping
        elif dimension == 'col':
            attribute_mapping = obj_data.get('col_mapping')
            attributemapping_ref = obj_data.get('col_attributemapping_ref')
            if not attributemapping_ref:
                # handle Functional Profile
                base_object_ref = obj_data.get('base_object_ref')
                base_object_data = self.dfu.get_objects(
                    {'object_refs': [base_object_ref]})['data'][0]['data']
                attributemapping_ref = base_object_data.get(
                    'col_attributemapping_ref')
                obj_data['col_attributemapping_ref'] = attributemapping_ref
            if not attribute_mapping and attributemapping_ref:
                am_data = self.dfu.get_objects(
                    {'object_refs': [attributemapping_ref]})['data'][0]['data']
                attribute_mapping = {x: x for x in am_data['instances'].keys()}
                obj_data['col_mapping'] = attribute_mapping
        else:
            raise ValueError('Unexpected dimension')

        if not attribute_mapping:
            if (color_marker_by and color_marker_by.get('attribute_color')[0]) or \
               (scale_size_by and scale_size_by.get('attribute_size')[0]):
                raise ValueError(
                    'Matrix object is not associated with any {} attribute mapping'
                    .format(dimension))

        return obj_data

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']

        self.data_util = DataUtil(config)
        self.dfu = DataFileUtil(self.callback_url)

        plt.switch_backend('agg')
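        # Agg is a non-interactive backend, so matplotlib can render without a
        # display (e.g. inside the SDK docker container).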

    def run_pca(self, params):
        """
        perform PCA analysis on matrix

        input_obj_ref: object reference of a matrix
        workspace_name: the name of the workspace
        pca_matrix_name: name of PCA (KBaseExperiments.PCAMatrix) object

        n_components: number of principal components to compute (default 2)
        dimension: perform PCA on columns or rows, one of ['col', 'row']
        """

        logging.info('--->\nrunning run_pca\n' +
                     'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_pca_params(params)

        input_obj_ref = params.get('input_obj_ref')
        workspace_name = params.get('workspace_name')
        pca_matrix_name = params.get('pca_matrix_name')

        n_components = int(params.get('n_components', 2))
        dimension = params.get('dimension', 'col')

        res = self.dfu.get_objects({'object_refs': [input_obj_ref]})['data'][0]
        obj_data = res['data']
        obj_type = res['info'][2]

        obj_data = self._validate_pca_matrix(obj_data, dimension,
                                             params.get('color_marker_by'),
                                             params.get('scale_size_by'))

        if "KBaseMatrices" in obj_type or 'KBaseProfile' in obj_type:

            (rotation_matrix_df, components_df, explained_variance,
             explained_variance_ratio,
             singular_values) = self._pca_for_matrix(input_obj_ref,
                                                     n_components, dimension)
        else:
            err_msg = 'Oops! [{}] is not supported.\n'.format(obj_type)
            err_msg += 'Please supply a KBaseMatrices or KBaseProfile object'
            raise ValueError(err_msg)

        pca_ref = self._save_pca_matrix(workspace_name, input_obj_ref,
                                        pca_matrix_name, rotation_matrix_df,
                                        components_df, explained_variance,
                                        explained_variance_ratio,
                                        singular_values, n_components,
                                        dimension)

        plot_pca_matrix = self._append_instance_group(
            rotation_matrix_df.copy(), obj_data, dimension)

        if params.get('color_marker_by'):
            plot_pca_matrix = self._build_color_pca_matrix(
                plot_pca_matrix, obj_data, dimension,
                params.get('color_marker_by').get('attribute_color')[0])

        if params.get('scale_size_by'):
            plot_pca_matrix = self._build_size_pca_matrix(
                plot_pca_matrix, obj_data, dimension,
                params.get('scale_size_by').get('attribute_size')[0])

        returnVal = {'pca_ref': pca_ref}

        report_output = self._generate_pca_report(
            pca_ref, self._plot_score_pca_matrix(plot_pca_matrix,
                                                 n_components),
            self._plot_loading_pca_matrix(components_df, n_components),
            self._plot_biplot_pca_matrix(plot_pca_matrix, components_df,
                                         n_components), workspace_name,
            n_components)

        returnVal.update(report_output)

        return returnVal

    def export_pca_matrix_excel(self, params):
        """
        export PCAMatrix as Excel
        """
        logging.info('start exporting pca matrix')
        pca_matrix_ref = params.get('input_ref')

        pca_df, components_df = self._pca_to_df(pca_matrix_ref)

        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)

        self._pca_df_to_excel(pca_df, components_df, result_dir,
                              pca_matrix_ref)

        package_details = self.dfu.package_for_download({
            'file_path':
            result_dir,
            'ws_refs': [pca_matrix_ref]
        })

        return {'shock_id': package_details['shock_id']}
Example #5
0
class Utils:
    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.srv_wiz_url = config['srv-wiz-url']
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(self.callback_url)

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present. Warns if unexpected parameters appear"""
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning("Unexpected parameter {} supplied".format(param))

    def _ws_obj_to_cobra(self, ref):
        ret = self.dfu.get_objects({'object_refs': [ref]})['data'][0]
        name = ret['info'][1]
        
        #old nasty method
        #model = cobrakbase.convert_kmodel(ret['data'])
        #if 'genome_ref' in ret['data']:
        #    logging.info(f"Annotating model with genome information: {ret['data']['genome_ref']}")
        #    genome = self.dfu.get_objects(
        #        {'object_refs': [ret['data']['genome_ref']]})['data'][0]['data']
        #    cobrakbase.annotate_model_with_genome(model, genome)
        
        #fbamodel object wraps json data
        fbamodel = KBaseFBAModel(ret['data'])
        
        builder = KBaseFBAModelToCobraBuilder(fbamodel)
        
        if 'genome_ref' in ret['data']:
            logging.info(f"Annotating model with genome information: {ret['data']['genome_ref']}")
            genome = self.dfu.get_objects(
                {'object_refs': [ret['data']['genome_ref']]})['data'][0]['data']
            #adding Genome to the Builder
            builder.with_genome(KBaseGenome(genome))
                         
        #converts to cobra model object with builder
        model = builder.build()

        modelseed = cobrakbase.modelseed.from_local('/kb/module/data/')
        print(cobrakbase.annotate_model_with_modelseed(model, modelseed))

        return name, model

    def to_sbml(self, params):
        """Convert a FBAModel to a SBML file"""
        files = {}
        _id, cobra_model = self._ws_obj_to_cobra(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'], _id + ".xml")
        cobra.io.write_sbml_model(cobra_model, files['file_path'])

        return _id, files
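    # A hedged usage sketch (hypothetical ref and a scratch destination),
    # chaining to_sbml with the export helper defined below:
    #   _id, files = utils.to_sbml({'input_ref': '1234/5/6',
    #                               'destination_dir': utils.scratch})
    #   shock = utils.export(files['file_path'], _id, '1234/5/6')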

    def export(self, file, name, input_ref):
        """Saves a set of files to SHOCK for export"""
        export_package_dir = os.path.join(self.scratch, name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        shutil.move(file, os.path.join(export_package_dir, os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [input_ref]
        })

        return {'shock_id': package_details['shock_id']}
Example #6
0
class AttributesUtil:
    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.srv_wiz_url = config['srv-wiz-url']
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(self.callback_url)
        self.kbse = KBaseSearchEngine(config['search-url'])
        self.data_util = DataUtil(config)
        self.wsClient = workspaceService(self.ws_url, token=self.token)
        self.DEFAULT_ONTOLOGY_ID = "Custom:Term"
        self.DEFAULT_UNIT_ID = "Custom:Unit"
        self.ONT_LABEL_DEL = " - "
        self.ONT_TERM_DEL = ":"

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present. Warns if unexpected parameters appear"""
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError(
                "Required keys {} not in supplied parameters".format(
                    ", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning(
                    "Unexpected parameter {} supplied".format(param))

    def file_to_attribute_mapping(self, params):
        """Convert a user supplied file to a compound set"""
        if 'input_file_path' in params:
            scratch_file_path = params['input_file_path']
        elif 'input_shock_id' in params:
            scratch_file_path = self.dfu.shock_to_file({
                'shock_id':
                params['input_shock_id'],
                'file_path':
                self.scratch
            }).get('file_path')
        else:
            raise ValueError(
                "Must supply either an input_shock_id or input_file_path")
        attr_mapping = self._file_to_am_obj(scratch_file_path)
        info = self.dfu.save_objects({
            "id":
            params['output_ws_id'],
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": attr_mapping,
                "name": params['output_obj_name']
            }]
        })[0]
        return {
            "attribute_mapping_ref": "%s/%s/%s" % (info[6], info[0], info[4])
        }

    def append_file_to_attribute_mapping(self,
                                         staging_file_subdir_path,
                                         old_am_ref,
                                         output_ws_id,
                                         new_am_name=None):
        """append an attribute mapping file to existing attribute mapping object
        """

        download_staging_file_params = {
            'staging_file_subdir_path': staging_file_subdir_path
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        append_am_data = self._file_to_am_obj(scratch_file_path)

        old_am_obj = self.dfu.get_objects({'object_refs':
                                           [old_am_ref]})['data'][0]

        old_am_info = old_am_obj['info']
        old_am_name = old_am_info[1]
        old_am_data = old_am_obj['data']

        new_am_data = self._check_and_append_am_data(old_am_data,
                                                     append_am_data)

        if not new_am_name:
            current_time = time.localtime()
            new_am_name = old_am_name + time.strftime('_%H_%M_%S_%Y_%m_%d',
                                                      current_time)

        info = self.dfu.save_objects({
            "id":
            output_ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": new_am_data,
                "name": new_am_name
            }]
        })[0]
        return {
            "attribute_mapping_ref": "%s/%s/%s" % (info[6], info[0], info[4])
        }

    def update_matrix_attribute_mapping(self, params):

        dimension = params.get('dimension')
        if dimension not in ['col', 'row']:
            raise ValueError('Please use "col" or "row" for input dimension')

        workspace_name = params.get('workspace_name')

        old_matrix_ref = params.get('input_matrix_ref')
        old_matrix_obj = self.dfu.get_objects(
            {'object_refs': [old_matrix_ref]})['data'][0]
        old_matrix_info = old_matrix_obj['info']
        old_matrix_data = old_matrix_obj['data']

        old_am_ref = old_matrix_data.get(
            '{}_attributemapping_ref'.format(dimension))

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        if not old_am_ref:
            raise ValueError(
                'Matrix object does not have {} attribute mapping'.format(
                    dimension))

        new_am_ref = self.append_file_to_attribute_mapping(
            params['staging_file_subdir_path'], old_am_ref, workspace_id,
            params['output_am_obj_name'])['attribute_mapping_ref']

        old_matrix_data['{}_attributemapping_ref'.format(
            dimension)] = new_am_ref

        info = self.dfu.save_objects({
            "id":
            workspace_id,
            "objects": [{
                "type": old_matrix_info[2],
                "data": old_matrix_data,
                "name": params['output_matrix_obj_name']
            }]
        })[0]

        new_matrix_obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])

        objects_created = [{
            'ref': new_am_ref,
            'description': 'Updated Attribute Mapping'
        }, {
            'ref': new_matrix_obj_ref,
            'description': 'Updated Matrix'
        }]

        report_params = {
            'message': '',
            'objects_created': objects_created,
            'workspace_name': workspace_name,
            'report_object_name':
            'update_matrix_attribute_mapping_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        return {
            'new_matrix_obj_ref': new_matrix_obj_ref,
            'new_attribute_mapping_ref': new_am_ref,
            'report_name': output['name'],
            'report_ref': output['ref']
        }

    def _check_and_append_am_data(self, old_am_data, append_am_data):

        exclude_keys = {'attributes', 'instances'}
        new_am_data = {
            k: old_am_data[k]
            for k in set(list(old_am_data.keys())) - exclude_keys
        }

        old_attrs = old_am_data.get('attributes')
        old_insts = old_am_data.get('instances')

        append_attrs = append_am_data.get('attributes')
        append_insts = append_am_data.get('instances')

        # checking duplicate attributes
        old_attrs_names = [old_attr.get('attribute') for old_attr in old_attrs]
        append_attrs_names = [
            append_attr.get('attribute') for append_attr in append_attrs
        ]

        duplicate_attrs = set(old_attrs_names).intersection(append_attrs_names)

        if duplicate_attrs:
            error_msg = 'Duplicate attribute mappings: [{}]'.format(
                duplicate_attrs)
            raise ValueError(error_msg)

        # checking missing instances
        missing_inst = old_insts.keys() - append_insts.keys()

        if missing_inst:
            error_msg = 'Appended attribute mapping is missing instances: [{}]'.format(
                missing_inst)
            raise ValueError(error_msg)

        new_attrs = old_attrs + append_attrs
        new_am_data['attributes'] = new_attrs

        new_insts = deepcopy(old_insts)

        for inst_name, val in new_insts.items():
            append_val = append_insts.get(inst_name)
            val.extend(append_val)

        new_am_data['instances'] = new_insts

        return new_am_data

    def _am_data_to_df(self, data):
        """
        Converts AttributeMapping object data to a DataFrame
        """

        attributes = pd.DataFrame(data['attributes'])
        attributes = attributes.rename(
            columns=lambda x: x.replace("ont", "ontology").
            capitalize().replace("_", " "))
        instances = pd.DataFrame(data['instances'])
        am_df = attributes.join(instances)

        return am_df

    def _clusterset_data_to_df(self, data):
        """
        Converts a cluster set object data to a dataframe
        """

        original_matrix_ref = data.get('original_data')
        data_matrix = self.data_util.fetch_data({
            'obj_ref': original_matrix_ref
        }).get('data_matrix')

        data_df = pd.read_json(data_matrix)
        clusters = data.get('clusters')

        id_name_list = [
            list(cluster.get('id_to_data_position').keys())
            for cluster in clusters
        ]
        id_names = [item for sublist in id_name_list for item in sublist]

        if set(data_df.columns.tolist()) == set(
                id_names):  # cluster is based on columns
            data_df = data_df.T

        cluster_names = [None] * data_df.index.size

        cluster_id = 0
        for cluster in clusters:
            item_ids = list(cluster.get('id_to_data_position').keys())
            item_idx = [data_df.index.get_loc(item_id) for item_id in item_ids]

            for idx in item_idx:
                cluster_names[idx] = cluster_id

            cluster_id += 1

        data_df['cluster'] = cluster_names

        return data_df

    def _ws_obj_to_df(self, input_ref):
        """Converts workspace obj to a DataFrame"""
        res = self.dfu.get_objects({'object_refs': [input_ref]})['data'][0]
        name = res['info'][1]

        obj_type = res['info'][2]

        if "KBaseExperiments.AttributeMapping" in obj_type:
            cs_df = self._am_data_to_df(res['data'])
        elif "KBaseExperiments.ClusterSet" in obj_type:
            cs_df = self._clusterset_data_to_df(res['data'])
        else:
            err_msg = 'Oops! [{}] is not supported.\n'.format(obj_type)
            err_msg += 'Please supply KBaseExperiments.AttributeMapping or KBaseExperiments.ClusterSet'
            raise ValueError(err_msg)

        return name, cs_df, obj_type

    def _file_to_am_obj(self, scratch_file_path):
        try:
            df = pd.read_excel(scratch_file_path, dtype='str')
        except XLRDError:
            df = pd.read_csv(scratch_file_path, sep=None, dtype='str')
        df = df.replace('nan', '')
        if df.columns[1].lower() == "attribute ontology id":
            am_obj = self._df_to_am_obj(df)
        else:
            am_obj = self._isa_df_to_am_object(df)
        return am_obj

    def _df_to_am_obj(self, am_df):
        """Converts a dataframe from a user file to a compound set object"""
        if not len(am_df):
            raise ValueError("No attributes in supplied files")

        attribute_df = am_df.filter(regex="[Uu]nit|[Aa]ttribute")
        instance_df = am_df.drop(attribute_df.columns, axis=1)
        if not len(instance_df.columns):
            raise ValueError(
                "Unable to find any instance columns in supplied file")

        attribute_df.rename(
            columns=lambda x: x.lower().replace(" ontology ", "_ont_").strip(),
            inplace=True)
        if "attribute" not in attribute_df.columns:
            raise ValueError(
                "Unable to find an 'attribute' column in supplied file")
        attribute_df['source'] = 'upload'
        attribute_fields = ('attribute', 'unit', 'attribute_ont_id',
                            'unit_ont_id', 'source')
        attributes = attribute_df.filter(
            items=attribute_fields).to_dict('records')
        print(attributes)
        self._validate_attribute_values(
            am_df.set_index(attribute_df.attribute).iterrows())

        attribute_mapping = {
            'ontology_mapping_method': "User Curation",
            'attributes': [self._add_ontology_info(f) for f in attributes],
            'instances': instance_df.to_dict('list')
        }

        return attribute_mapping

    def _isa_df_to_am_object(self, isa_df):
        skip_columns = {
            'Raw Data File', 'Derived Data File', 'Array Data File',
            'Image File'
        }
        if 'Sample Name' in isa_df.columns and not any(
                isa_df['Sample Name'].duplicated()):
            isa_df.set_index('Sample Name', inplace=True)
        elif 'Assay Name' in isa_df.columns and not any(
                isa_df['Assay Name'].duplicated()):
            isa_df.set_index('Assay Name', inplace=True)
        elif not any(isa_df[isa_df.columns[0]].duplicated()):
            logging.warning(f'Using {isa_df.columns[0]} as ID column')
            isa_df.set_index(isa_df.columns[0], inplace=True)
        else:
            raise ValueError(
                "Unable to detect an ID column that was unique for each row. "
                f"Considered 'Sample Name', 'Assay Name' and {isa_df.columns[0]}"
            )
        self._validate_attribute_values(isa_df.iteritems())

        attribute_mapping = {
            'ontology_mapping_method': "User Curation - ISA format"
        }
        attribute_mapping[
            'attributes'], new_skip_cols = self._get_attributes_from_isa(
                isa_df, skip_columns)
        reduced_isa = isa_df.drop(columns=new_skip_cols, errors='ignore')
        attribute_mapping['instances'] = reduced_isa.T.to_dict('list')

        return attribute_mapping

    def _validate_attribute_values(self, attribute_series):
        errors = {}
        for attr, vals in attribute_series:
            try:
                validator = getattr(AttributeValidation, attr)
                attr_errors = validator(vals)
                if attr_errors:
                    errors[attr] = attr_errors
            except AttributeError:
                continue

        if errors:
            for attr, attr_errors in errors.items():
                logging.error(
                    f'Attribute {attr} had the following validation errors:\n'
                    + "\n".join(attr_errors) + '\n')
            raise ValueError(
                f'The following attributes failed validation: {", ".join(errors)}'
                f'\nSee the log for details')

    def _get_attributes_from_isa(self, isa_df, skip_columns):
        attributes = []
        # associate attribute columns with the other columns that relate to them
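        # ISA-Tab convention: 'Term Source REF', 'Term Accession Number' and
        # 'Unit' columns qualify the attribute column immediately to their left,
        # so each one is attached to attributes[-1] and added to skip_columns.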
        for i, col in enumerate(isa_df.columns):
            if col.startswith('Term Source REF'):
                skip_columns.add(col)
                last_attr = attributes[-1]
                if '_unit' in last_attr:
                    last_attr['_unit_ont'] = col
                else:
                    last_attr['_val_ont'] = col

            elif col.startswith('Term Accession Number'):
                # If the term Accession is a web link, only grab the last bit
                # Similarly, sometimes the number is prefixed with the term source, e.g. UO_0000012
                isa_df[col] = isa_df[col].map(
                    lambda x: x.split("/")[-1].split("_")[-1])
                skip_columns.add(col)
                last_attr = attributes[-1]
                if '_unit' in last_attr:
                    last_attr['_unit_accession'] = col
                else:
                    last_attr['_val_accession'] = col

            elif col.startswith('Unit'):
                skip_columns.add(col)
                last_attr = attributes[-1]
                if last_attr.get('unit'):
                    raise ValueError(
                        "More than one unit column is supplied for attribute {}"
                        .format(last_attr['attribute']))
                last_attr['_unit'] = col

            elif col not in skip_columns:
                split_col = col.split("|", 1)
                if len(split_col) > 1:
                    attributes.append({
                        "attribute": split_col[0],
                        "attribute_ont_id": split_col[1],
                        "source": "upload"
                    })
                else:
                    attributes.append({"attribute": col, "source": "upload"})

        # handle the categories for each attribute
        for i, attribute in enumerate(attributes):
            if '_val_accession' in attribute:
                category_df = isa_df[[
                    attribute['attribute'],
                    attribute.pop('_val_ont'),
                    attribute.pop('_val_accession')
                ]].drop_duplicates()
                category_df[
                    'attribute_ont_id'] = category_df.iloc[:, 1].str.cat(
                        category_df.iloc[:, 2], ":")
                category_df['value'] = category_df[attribute['attribute']]
                cats = category_df.set_index(attribute['attribute'])[[
                    'value', 'attribute_ont_id'
                ]].to_dict('index')
                attribute['categories'] = {
                    k: self._add_ontology_info(v)
                    for k, v in cats.items()
                }

            if '_unit' in attribute:
                units = isa_df[attribute.pop('_unit')].unique()
                if len(units) > 1:
                    raise ValueError(
                        "More than one unit type is supplied for attribute {}: {}"
                        .format(attribute['attribute'], units))
                attribute['unit'] = units[0]
                if '_unit_ont' in attribute:
                    unit_ont = isa_df[attribute.pop('_unit_ont')].str.cat(
                        isa_df[attribute.pop('_unit_accession')],
                        ":").unique()
                    if len(unit_ont) > 1:
                        raise ValueError(
                            "More than one unit ontology is supplied for attribute "
                            "{}: {}".format(attribute['attribute'], unit_ont))
                    attribute['unit_ont_id'] = unit_ont[0]
            attributes[i] = self._add_ontology_info(attribute)
        return attributes, skip_columns

    def _search_ontologies(self, term, closest=False):
        """
        Match to an existing KBase ontology term
        :param term: Term to match
        :param closest: if false, term must exactly match an ontology ID
        :return: dict(ontology_ref, id)
        """
        params = {
            "object_types": ["OntologyTerm"],
            "match_filter": {
                "lookup_in_keys": {
                    "id": {
                        "value": term
                    }
                }
            },
            "access_filter": {
                "with_private": 0,
                "with_public": 1
            },
            "pagination": {
                "count": 1
            },
            "post_processing": {
                "skip_data": 1
            }
        }
        if closest:
            params['match_filter'] = {"full_text_in_all": term}
        res = self.kbse.search_objects(params)
        if not res['objects']:
            return None
        term = res['objects'][0]
        return {
            "ontology_ref": term['guid'].split(":")[1],
            "id": term['key_props']['id']
        }

    def _add_ontology_info(self, attribute):
        """Searches KBASE ontologies for terms matching the user supplied attributes and units.
        Add the references if found"""
        optionals = {
            "unit",
            "unit_ont_id",
            "unit_ont_ref",
        }
        attribute = {
            k: v
            for k, v in attribute.items() if k not in optionals or v != ""
        }
        ont_info = self._search_ontologies(
            attribute.get('attribute_ont_id', "").replace("_", ":"))
        if ont_info:
            attribute['attribute_ont_ref'] = ont_info['ontology_ref']
            attribute['attribute_ont_id'] = ont_info['id']
        elif not attribute.get(
                'attribute_ont_id') or attribute['attribute_ont_id'] == ":":
            attribute.pop('attribute_ont_id', None)

        if attribute.get('unit'):
            ont_info = self._search_ontologies(
                attribute.get('unit_ont_id', '').replace("_", ":"))
            if ont_info:
                attribute['unit_ont_ref'] = ont_info['ontology_ref']
                attribute['unit_ont_id'] = ont_info['id']
            elif not attribute.get(
                    'unit_ont_id') or attribute['unit_ont_id'] == ":":
                attribute.pop('unit_ont_id', None)

        return attribute

    def to_tsv(self, params):
        """Convert an compound set to TSV file"""
        files = {}

        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".tsv")
        df.to_csv(files['file_path'], sep="\t", index=False)

        return _id, files

    def to_excel(self, params):
        """Convert an compound set to Excel file"""
        files = {}

        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".xlsx")

        writer = pd.ExcelWriter(files['file_path'])

        if "KBaseExperiments.AttributeMapping" in obj_type:
            df.to_excel(writer, "Attributes", index=False)
        elif "KBaseExperiments.ClusterSet" in obj_type:
            df.to_excel(writer, "ClusterSet", index=True)
        # else is checked in `_ws_obj_to_df`

        writer.save()

        return _id, files

    def export(self, file, name, input_ref):
        """Saves a set of files to SHOCK for export"""
        export_package_dir = os.path.join(self.scratch,
                                          name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        shutil.move(file,
                    os.path.join(export_package_dir, os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [input_ref]
        })

        return {'shock_id': package_details['shock_id']}
Example #7
0
class FeatureSetDownload:
    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.gsu = GenomeSearchUtil(os.environ['SDK_CALLBACK_URL'])
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        self.ws = Workspace(config["workspace-url"])

    @staticmethod
    def validate_params(params, expected={"workspace_name", "featureset_name"}):
        expected = set(expected)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))

    def to_tsv(self, params):
        working_dir = os.path.join(self.scratch,
                                   'featureset-download-'+str(uuid.uuid4()))
        os.makedirs(working_dir)
        header = ['Feature Id', 'Aliases', 'Genome', 'Type', 'Function']

        fs_name, fs_dicts = self.make_featureset_dict(params['featureset_ref'])
        files = {'file_path': "{}/{}.tsv".format(working_dir, fs_name)}
        with open(files['file_path'], 'w') as outfile:
            writer = csv.DictWriter(outfile, header, delimiter='\t',
                                    lineterminator='\n')
            writer.writeheader()
            for feat in fs_dicts:
                writer.writerow(feat)
        return fs_name, files

    def make_featureset_dict(self, fs_ref):
        features = []
        ret = self.dfu.get_objects({'object_refs': [fs_ref]})['data'][0]
        feat_set = ret['data']
        fs_name = ret['info'][1]

        feat_by_genome = defaultdict(list)
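        # FeatureSet 'elements' maps feature_id -> [genome_ref, ...]; group the
        # feature ids by their first genome ref so each genome is searched once.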
        for k, v in feat_set['elements'].items():
            feat_by_genome[v[0]].append(k)

        for genome, fids in feat_by_genome.items():
            genome_name = self.ws.get_object_info3({'objects': [{'ref': genome}]})['infos'][0][1]
            res = self.gsu.search({'ref': genome,
                                   'structured_query': {'feature_id': fids},
                                   'sort_by': [['contig_id', 1]],
                                   'start': 0,
                                   'limit': len(fids)
                                   })

            for feat in res['features']:
                features.append({'Feature Id': feat['feature_id'],
                                 'Aliases': ", ".join(sorted(feat['aliases'].keys())),
                                 'Genome': "{} ({})".format(genome_name, genome),
                                 'Type': feat['feature_type'],
                                 'Function': feat['function']
                                 })
        return fs_name, features

    def export(self, files, name, params):
        export_package_dir = os.path.join(self.scratch, name+str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        for file in files:
            shutil.move(file, os.path.join(export_package_dir,
                                           os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['featureset_ref']]
        })

        return {'shock_id': package_details['shock_id']}
    def export_genome_as_genbank(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (input and output
           structure functions for standard downloaders) -> structure:
           parameter "input_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_genome_as_genbank
        print('export_genome_as_genbank -- parameters = ')
        pprint(params)

        # validate parameters
        if 'input_ref' not in params:
            raise ValueError(
                'Cannot run export_genome_as_genbank- no "input_ref" field defined.'
            )

        # get WS metadata to get ws_name and obj_name
        ws = Workspace(url=self.cfg.workspaceURL)
        info = ws.get_object_info_new({
            'objects': [{
                'ref': params['input_ref']
            }],
            'includeMetadata': 0,
            'ignoreErrors': 0
        })[0]

        genome_to_genbank_params = {'genome_ref': params['input_ref']}

        # export to file (building from KBase Genome Object)
        result = self.genome_to_genbank(
            ctx, genome_to_genbank_params)[0]['genbank_file']

        # create the output directory and move the file there
        export_package_dir = os.path.join(self.cfg.sharedFolder, info[1])
        os.makedirs(export_package_dir)
        shutil.move(
            result['file_path'],
            os.path.join(export_package_dir,
                         os.path.basename(result['file_path'])))

        # export original uploaded GenBank file if it existed.
        exporter = GenomeToGenbank(self.cfg)
        original_result_full = exporter.export_original_genbank(
            ctx, genome_to_genbank_params)
        if original_result_full is not None:
            original_result = original_result_full['genbank_file']
            shutil.move(
                original_result['file_path'],
                os.path.join(export_package_dir,
                             os.path.basename(original_result['file_path'])))

        # Make warning file about genes only.
        warning_filename = "README.txt"
        with open(os.path.join(export_package_dir, warning_filename),
                  'w') as temp_file:
            temp_file.write(
                'This directory includes the KBase-derived GenBank file and also '
                + '(if you originally uploaded the genome from an annotated ' +
                'GenBank file) the original GenBank input.')

        # package it up and be done
        dfUtil = DataFileUtil(self.cfg.callbackURL)
        package_details = dfUtil.package_for_download({
            'file_path':
            export_package_dir,
            'ws_refs': [params['input_ref']]
        })

        output = {'shock_id': package_details['shock_id']}

        print('export complete -- result = ')
        pprint(output)
        #END export_genome_as_genbank

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_genome_as_genbank return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example #9
0
class sample_uploader:
    '''
    Module Name:
    sample_uploader

    Module Description:
    A KBase module: sample_uploader
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.12"
    GIT_URL = "https://github.com/kbaseapps/sample_uploader"
    GIT_COMMIT_HASH = "5134b679279c84128b0ca5b684fa75dacf7dba59"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.workspace_url = config['workspace-url']
        self.scratch = config['scratch']
        # janky, but works for now
        self.sw_url = config.get('kbase-endpoint') + '/service_wizard'
        self.dfu = DataFileUtil(url=self.callback_url)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        #END_CONSTRUCTOR
        pass

    def import_samples(self, ctx, params):
        """
        :param params: instance of type "ImportSampleInputs" -> structure:
           parameter "sample_set_ref" of String, parameter "sample_file" of
           String, parameter "workspace_name" of String, parameter
           "workspace_id" of Long, parameter "file_format" of String,
           parameter "description" of String, parameter "set_name" of String,
           parameter "header_row_index" of Long, parameter "id_field" of
           String, parameter "output_format" of String, parameter
           "taxonomy_source" of String, parameter "num_otus" of Long,
           parameter "incl_seq" of Long, parameter "otu_prefix" of String,
           parameter "share_within_workspace" of Long
        :returns: instance of type "ImportSampleOutputs" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "sample_set" of type "SampleSet" -> structure:
           parameter "samples" of list of type "sample_info" -> structure:
           parameter "id" of type "sample_id", parameter "name" of String,
           parameter "description" of String, parameter "sample_set_ref" of
           String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN import_samples
        print(f"Beginning sample import with following parameters:")
        print(f"params -- {params}")
        sample_set = {"samples": []}
        # We subtract by 1 for zero indexing.
        if params.get('sample_set_ref'):
            ret = self.dfu.get_objects(
                {'object_refs': [params['sample_set_ref']]})['data'][0]
            sample_set = ret['data']
            set_name = ret['info'][1]
            save_ws_id = params['sample_set_ref'].split('/')[0]
        else:
            if not params.get('set_name'):
                raise ValueError(
                    "A sample set name is required when creating a new SampleSet object."
                )
            set_name = params['set_name']
            save_ws_id = params.get('workspace_id')
        if params.get('header_row_index'):
            # subtract 1 to convert the 1-based header row number to a zero-based index
            header_row_index = int(params["header_row_index"]) - 1
        else:
            header_row_index = 0
            if params.get('file_format') == "SESAR":
                header_row_index = 1

        username = ctx['user_id']

        if params.get('file_format') == 'ENIGMA':
            # ENIGMA_mappings['verification_mapping'].update(
            #     {key: ("is_string", []) for key in ENIGMA_mappings['basic_columns']}
            # )
            sample_set = import_samples_from_file(
                params, self.sw_url, self.workspace_url, username,
                ctx['token'], ENIGMA_mappings['column_mapping'],
                ENIGMA_mappings.get('groups',
                                    []), ENIGMA_mappings['date_columns'],
                ENIGMA_mappings.get('column_unit_regex',
                                    []), sample_set, header_row_index)
        elif params.get('file_format') == 'SESAR':
            # SESAR_mappings['verification_mapping'].update(
            #     {key: ("is_string", []) for key in SESAR_mappings['basic_columns']}
            # )
            sample_set = import_samples_from_file(
                params, self.sw_url, self.workspace_url, username,
                ctx['token'], SESAR_mappings['column_mapping'],
                SESAR_mappings.get('groups',
                                   []), SESAR_mappings['date_columns'],
                SESAR_mappings.get('column_unit_regex',
                                   []), sample_set, header_row_index)
        elif params.get('file_format') == 'KBASE':
            sample_set = import_samples_from_file(params, self.sw_url,
                                                  self.workspace_url, username,
                                                  ctx['token'], {}, [], [], [],
                                                  sample_set, header_row_index)
        else:
            raise ValueError(
                "Only SESAR, ENIGMA, and KBASE formats are currently supported for importing samples. "
                f"File format {params.get('file_format')} is not supported.")

        obj_info = self.dfu.save_objects({
            'id':
            save_ws_id,
            'objects': [{
                "name": set_name,
                "type": "KBaseSets.SampleSet",
                "data": sample_set
            }]
        })[0]

        sample_set_ref = '/'.join(
            [str(obj_info[6]),
             str(obj_info[0]),
             str(obj_info[4])])
        sample_file_name = os.path.basename(
            params['sample_file']).split('.')[0] + '_OTU'

        # -- Format outputs below --
        # if output file format specified, add one to output
        if params.get('output_format') in ['csv', 'xls']:
            otu_path = sample_set_to_OTU_sheet(sample_set, sample_file_name,
                                               self.scratch, params)
            file_links = [{
                'path':
                otu_path,
                'name':
                os.path.basename(otu_path),
                'label':
                "OTU template file",
                'description':
                "file with each column containing the assigned sample_id and sample "
                "name of each saved sample. Intended for uploading OTU data."
            }]
        else:
            file_links = []

        if params.get('incl_input_in_output'):
            sample_file = params.get('sample_file')
            if not os.path.isfile(sample_file):
                # try prepending '/staging/' to file and check then
                if os.path.isfile(os.path.join('/staging', sample_file)):
                    sample_file = os.path.join('/staging', sample_file)
                else:
                    raise ValueError(
                        f"input file {sample_file} does not exist.")
            sample_file_copy = os.path.join(self.scratch,
                                            os.path.basename(sample_file))
            shutil.copy(sample_file, sample_file_copy)
            file_links.append({
                "path":
                sample_file_copy,
                "name":
                os.path.basename(sample_file_copy),
                "label":
                "Input Sample file",
                "description":
                "Input file provided to create the sample set."
            })

        # create report
        report_client = KBaseReport(self.callback_url)
        report_name = "SampleSet_import_report_" + str(uuid.uuid4())
        report_info = report_client.create_extended_report({
            'message':
            f"SampleSet object named \"{set_name}\" imported.",
            'objects_created': [{
                'ref': sample_set_ref
            }],
            'file_links':
            file_links,
            'report_object_name':
            report_name,
            'workspace_name':
            params['workspace_name']
        })
        output = {
            'report_ref': report_info['ref'],
            'report_name': report_info['name'],
            'sample_set': sample_set,
            'sample_set_ref': sample_set_ref
        }
        #END import_samples

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method import_samples return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
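
    # A minimal sketch (hypothetical values) of the params dict a caller might pass to
    # import_samples for a SESAR-formatted file; the field names follow the
    # ImportSampleInputs structure documented in the docstring above.
    EXAMPLE_IMPORT_PARAMS = {
        'sample_file': 'samples.csv',      # file path or staging-area file name
        'workspace_name': 'my_workspace',  # hypothetical workspace name
        'workspace_id': 12345,             # hypothetical workspace id
        'file_format': 'SESAR',            # 'SESAR', 'ENIGMA', or 'KBASE'
        'set_name': 'my_sample_set',       # required when no sample_set_ref is given
        'header_row_index': 2,             # 1-based header row; converted to 0-based above
        'description': 'Example import',
        'output_format': 'csv',            # optionally also produce an OTU template file
        'incl_input_in_output': 1,         # attach the input file to the report
    }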

    def generate_OTU_sheet(self, ctx, params):
        """
        :param params: instance of type "GenerateOTUSheetParams" (Generate a
           customized OTU worksheet using a SampleSet input to generate the
           appropriate columns.) -> structure: parameter "workspace_name" of
           String, parameter "workspace_id" of Long, parameter
           "sample_set_ref" of String, parameter "output_name" of String,
           parameter "output_format" of String, parameter "num_otus" of Long,
           parameter "taxonomy_source" of String, parameter "incl_seq" of
           Long, parameter "otu_prefix" of String
        :returns: instance of type "GenerateOTUSheetOutputs" -> structure:
           parameter "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN generate_OTU_sheet
        # first we download sampleset
        sample_set_ref = params.get('sample_set_ref')
        ret = self.dfu.get_objects({'object_refs':
                                    [sample_set_ref]})['data'][0]
        sample_set = ret['data']
        if params.get('output_name'):
            output_name = params.get('output_name')
        else:
            # if output_name not specified use name of sample_set as output + "_OTUs"
            output_name = ret['info'][1] + "_OTUs"
        otu_path = sample_set_to_OTU_sheet(sample_set, output_name,
                                           self.scratch, params)
        report_client = KBaseReport(self.callback_url)
        report_name = "Generate_OTU_sheet_report_" + str(uuid.uuid4())
        report_info = report_client.create_extended_report({
            'file_links': [{
                'path':
                otu_path,
                'name':
                os.path.basename(otu_path),
                'label':
                "CSV with headers for OTU",
                'description':
                "CSV file with each column containing the assigned sample_id and sample "
                "name of each saved sample. Intended for uploading OTU data."
            }],
            'report_object_name':
            report_name,
            'workspace_name':
            params['workspace_name']
        })
        output = {
            'report_ref': report_info['ref'],
            'report_name': report_info['name'],
        }

        #END generate_OTU_sheet

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method generate_OTU_sheet return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def update_sample_set_acls(self, ctx, params):
        """
        :param params: instance of type "update_sample_set_acls_params" ->
           structure: parameter "workspace_name" of String, parameter
           "workspace_id" of Long, parameter "sample_set_ref" of String,
           parameter "new_users" of list of String, parameter "is_reader" of
           Long, parameter "is_writer" of Long, parameter "is_admin" of Long,
           parameter "share_within_workspace" of Long
        :returns: instance of type "update_sample_set_acls_output" ->
           structure: parameter "status" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN update_sample_set_acls

        # first get sample_set object
        sample_set_ref = params.get('sample_set_ref')
        ret = self.dfu.get_objects({'object_refs':
                                    [sample_set_ref]})['data'][0]
        sample_set = ret['data']
        sample_url = get_sample_service_url(self.sw_url)

        acls = {'read': [], 'write': [], 'admin': []}

        if params.get('share_within_workspace'):
            acls = get_workspace_user_perms(self.workspace_url,
                                            params.get('workspace_id'),
                                            ctx['token'], ctx['user_id'], acls)

        for new_user in params.get('new_users'):
            if params.get('is_admin'):
                acls['admin'].append(new_user)
            elif params.get('is_writer'):
                acls['write'].append(new_user)
            elif params.get('is_reader'):
                acls['read'].append(new_user)

        for sample in sample_set['samples']:
            sample_id = sample['id']
            status = update_acls(sample_url, sample_id, acls, ctx['token'])
        output = {"status": status}
        #END update_sample_set_acls

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method update_sample_set_acls return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def export_samples(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (export function for
           samples) -> structure: parameter "input_ref" of String, parameter
           "file_format" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_samples
        if not params.get('input_ref'):
            raise ValueError(f"variable input_ref required")
        sample_set_ref = params.get('input_ref')
        output_file_format = params.get('file_format', 'SESAR')

        ret = self.dfu.get_objects({'object_refs':
                                    [sample_set_ref]})['data'][0]
        sample_set = ret['data']
        sample_set_name = ret['info'][1]
        sample_url = get_sample_service_url(self.sw_url)

        export_package_dir = os.path.join(self.scratch, "output")
        if not os.path.isdir(export_package_dir):
            os.mkdir(export_package_dir)
        output_file = os.path.join(export_package_dir,
                                   '_'.join(sample_set_name.split()) + ".csv")

        sample_set_to_output(sample_set, sample_url, ctx['token'], output_file,
                             output_file_format)

        # package it up
        package_details = self.dfu.package_for_download({
            'file_path':
            export_package_dir,
            'ws_refs': [params['input_ref']]
        })

        output = {
            'shock_id': package_details['shock_id'],
            'result_dir': export_package_dir
        }
        #END export_samples

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_samples return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def link_reads(self, ctx, params):
        """
        Create links between samples and reads objects
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN link_reads
        ss = SampleService(self.sw_url, token=ctx['token'], service_ver='beta')
        sample_set_ref = params['sample_set_ref']
        sample_set = SampleSet(self.dfu, sample_set_ref)
        links = [(d['sample_name'], d['reads_ref']) for d in params['links']]

        for sample_name, reads_ref in links:
            node_id, version, sample_id = sample_set.get_sample_info(
                sample_name)
            p = dict(
                upa=reads_ref,
                id=sample_id,
                version=version,
                node=node_id,
                update=1,
            )
            ret = ss.create_data_link(p)

        report_client = KBaseReport(self.callback_url)
        report_info = report_client.create_extended_report({
            'workspace_name':
            params['workspace_name'],
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END link_reads

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method link_reads return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
Example #10
0
class MDSUtils:

    R_BIN = '/kb/deployment/bin'
    MDS_OUT_DIR = 'mds_output'
    PARAM_IN_WS = 'workspace_name'
    PARAM_IN_MATRIX = 'input_obj_ref'
    PARAM_OUT_MATRIX = 'mds_matrix_name'

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_run_mds_params(self, params):
        """
        _validate_run_mds_params:
            validates params passed to run_mds method
        """

        logging.info('start validating run_mds params')

        # check for required parameters
        for p in [self.PARAM_IN_MATRIX, self.PARAM_IN_WS, self.PARAM_OUT_MATRIX]:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

    def _build_rMDS_script(self, params):
        """
        _build_rMDS_script: build a sequence of R command calls according to params
        Note: To run the NMDS, we will use the function metaMDS from the vegan package.
        # The metaMDS function requires only a community-by-species matrix.
        """
        data_file_path = params.get('datafile', None)
        if not data_file_path:
            return ''

        exists = os.path.isfile(os.path.join(self.output_dir, os.path.basename(data_file_path)))
        if not exists:
            shutil.copyfile(data_file_path,
                            os.path.join(self.output_dir, os.path.basename(data_file_path)))

        n_components = params.get('n_components', 2)
        max_iter = params.get('max_iter', 300)
        run_metric = True if params.get('metric', 0) else False
        dist_metric = params.get('distance_metric', 'bray')

        mds_cfg = 'distance="' + dist_metric + '",try=20,trymax=' + str(max_iter) + \
                  ',autotransform=TRUE,noshare=0.1,expand=TRUE,trace=1,' + \
                  'plot=FALSE,engine=c("monoMDS","isoMDS"),k=' + str(n_components)
        if run_metric:
            mds_cfg += ',metric=TRUE'

        mds_scrpt = 'library(vegan)\n'
        mds_scrpt += 'library(jsonlite)\n'
        mds_scrpt += 'vg_data <- read.table("' + data_file_path + \
                     '",header=TRUE,row.names=1,sep="")\n'
        # remove the last (taxonomy) column
        # mds_scrpt += 'vg_data<-vg_data[,1:dim(vg_data)[2]-1]\n'
        # Function metaMDS returns an object of class metaMDS.
        mds_scrpt += 'vg_data.mds <- metaMDS(vg_data,' + mds_cfg + ')\n'
        mds_scrpt += 'vg_data.mds\n'

        # save the results in the memory
        # 1) store species ordination
        mds_scrpt += 'variableScores <- vg_data.mds$species\n'
        # 2) store site ordination
        mds_scrpt += 'sampleScores <- vg_data.mds$points\n'
        # 3) store other ordination results
        mds_scrpt += 'stress <- vg_data.mds$stress\n'
        mds_scrpt += 'dist_metric <- vg_data.mds$distance\n'
        mds_scrpt += 'dist_matrix <- vg_data.mds$diss\n'
        mds_scrpt += 'dist_call <- vg_data.mds$distcall\n'
        mds_scrpt += 'converged <- vg_data.mds$converged\n'
        mds_scrpt += 'dims <- vg_data.mds$ndim\n'
        mds_scrpt += 'tries <- vg_data.mds$tries\n'
        mds_scrpt += 'maxits <- vg_data.mds$maxits\n'
        mds_scrpt += 'func_call <- vg_data.mds$call\n'
        mds_scrpt += 'mds_data <- vg_data.mds$data\n'

        # save the results to the current dir
        # Write CSV in R
        mds_scrpt += 'write.csv(dist_matrix,file="dist_matrix.csv",row.names=TRUE,na="")\n'
        mds_scrpt += 'write.csv(variableScores,file="species_ordination.csv",' + \
                     'row.names=TRUE,na="")\n'
        mds_scrpt += 'write.csv(sampleScores,file="site_ordination.csv",row.names=TRUE,na="")\n'

        # Write JSON in R
        mds_scrpt += 'write_json(toJSON(dist_matrix),path="dist_matrix.json",pretty=TRUE,' + \
                     'auto_unbox=FALSE)\n'
        mds_scrpt += 'write_json(toJSON(variableScores),path="species_ordination.json",' + \
                     'pretty=TRUE,auto_unbox=FALSE)\n'
        mds_scrpt += 'write_json(toJSON(sampleScores),path="site_ordination.json",' + \
                     'pretty=TRUE,auto_unbox=FALSE)\n'
        mds_scrpt += 'item_name=c("stress","distance_metric","dist_call","converged",' + \
                     '"dimesions","trials","maxits")\n'
        mds_scrpt += 'item_value=c(stress,dist_metric,dist_call,converged,dims,tries,maxits)\n'
        mds_scrpt += 'df <- data.frame(item_name,item_value,stringsAsFactors=FALSE)\n'
        mds_scrpt += 'write_json(toJSON(df),path="others.json",pretty=TRUE,auto_unbox=FALSE)\n'

        # save mds plots
        '''
        mds_scrpt += 'bmp(file="saving_mds_plot.bmp",width=580,height=580,units="px",' + \
                     'res=100, pointsize=12)\n'
        mds_scrpt += 'plot(vg_data.mds,type="n",display="sites")\n'
        mds_scrpt += 'points(vg_data.mds)\n'
        mds_scrpt += 'dev.off()\n'
        mds_scrpt += 'pdf(file="saving_mds_plot.pdf",width=6,height=6)\n'
        mds_scrpt += 'plot(vg_data.mds,type="n",display="sites")\n'
        mds_scrpt += 'points(vg_data.mds)\n'
        mds_scrpt += 'dev.off()\n'
        mds_scrpt += 'pdf(file="mds_plot_withlabel.pdf",width=6,height=6)\n'
        mds_scrpt += 'plot(vg_data.mds,type="n",display="sites")\n'
        mds_scrpt += 'ordilabel(vg_data.mds,dis="sites",cex=1.2,font=3,fill="hotpink",col="blue")\n'
        mds_scrpt += 'dev.off()\n'
        mds_scrpt += 'pdf(file="mds_plot_withcolor.pdf",width=6,height=6)\n'
        mds_scrpt += 'fig <- ordiplot(vg_data.mds,type="none")\n'
        mds_scrpt += 'points(fig,"sites",pch=21,col="red",bg="yellow")\n'
        mds_scrpt += 'points(fig,"species",pch=21,col="green",bg="blue")\n'
        # mds_scrpt += 'text(fig, "species", col="blue", cex=0.9)\n'
        mds_scrpt += 'dev.off()\n'
        '''
        # If there is user input plotting script:
        plt_scrpt = params.get('plot_script', '').lower()
        if plt_scrpt and re.match("^plot\(\s*[a-zA-Z]+.*\)$", plt_scrpt):
            arr_plt = plt_scrpt.split(',')
            arr_plt[0] = 'plot(vg_data.mds'  # make sure to pass the correct data
            plt_scrpt = (',').join(arr_plt)
            if len(arr_plt) == 1:
                plt_scrpt += ')'
            plt_type = params.get('plot_type', 'pdf').lower()
            if not plt_type:
                plt_type = 'pdf'

            plt_name = params.get('plot_name', 'usr_plt_name').lower()
            if not plt_name:
                plt_name = 'usr_plt_name'
            plt_name += '.' + plt_type

            if plt_type == 'pdf':
                mds_scrpt += plt_type
                mds_scrpt += '(file="' + plt_name + '",width=6,height=6)\n'
            if plt_type == 'jpg':
                plt_type = 'jpeg'
            if plt_type == 'ps':
                plt_type = 'postscript'
                mds_scrpt += plt_type
                mds_scrpt += '(file="' + plt_name + '")\n'
            if plt_type == 'tiff':
                mds_scrpt += plt_type
                mds_scrpt += '(file="' + plt_name + '",width=4,height=4,units="in",' + \
                             'compression="lzw",res=300)\n'
            if plt_type in ['jpg', 'jpeg', 'bmp', 'png']:
                mds_scrpt += plt_type
                mds_scrpt += '(file="' + plt_name + '",width=580,height=580,units="px",' + \
                             'res=100, pointsize=12)\n'

            mds_scrpt += plt_scrpt + '\n'
            mds_scrpt += 'dev.off()\n'

        logging.info('R script: {}'.format(mds_scrpt))

        mds_rscript = 'mds_script.R'
        rscrpt_file_path = os.path.join(self.output_dir, mds_rscript)

        with open(rscrpt_file_path, 'w') as r_file:
            r_file.write(mds_scrpt)
        return rscrpt_file_path
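
# Standalone sketch (separate from the class above): _build_rMDS_script assembles the
# metaMDS(...) argument list by string concatenation. Building the comma-joined option
# string from keyword arguments makes it harder to drop a separator. The helper below
# is hypothetical and assumes the values are already valid R literals.
def r_options(**opts):
    """Join keyword options into the 'key=value,...' string passed to metaMDS()."""
    return ','.join('{}={}'.format(key, value) for key, value in opts.items())

# r_options(distance='"bray"', trymax=300, k=2, metric='TRUE')
# -> 'distance="bray",trymax=300,k=2,metric=TRUE'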

    def _execute_r_script(self, rfile_name):
        """
        _execute_r_script: Calling the Rscript executable to run the R script in rfile_name
        """
        logging.info('Calling R......')

        result_dir = os.path.dirname(rfile_name)
        if not result_dir:
            result_dir = self.working_dir

        rcmd = [os.path.join(self.R_BIN, 'Rscript')]
        rcmd.append(rfile_name)

        logging.info('Running metaMDS script in current working directory: {}'.format(result_dir))
        exitCode = 0
        try:
            complete_proc = subprocess.run(rcmd, cwd=result_dir, stdin=subprocess.PIPE,
                                           stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                                           close_fds=True)
            exitCode = complete_proc.returncode
            if (exitCode == 0):
                logging.info('\n{}'.format(complete_proc.stdout))
                logging.info('\n{} was executed successfully, exit code was: {}'.format(
                    ' '.join(rcmd), str(exitCode)))
                logging.info("Finished calling R.")
            else:
                logging.info('Error running command: {} Exit Code: {}'.format(
                    ' '.join(rcmd), str(exitCode)))
                logging.info('\n{}'.format(complete_proc.stderr))
        except subprocess.CalledProcessError as sub_e:
            exitCode = -99
            logging.info('Caught subprocess.CalledProcessError {}'.format(sub_e))

        return exitCode
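
# Standalone sketch (separate from the class above) of the capture-and-report pattern
# used by _execute_r_script, stripped of the R specifics; the example command uses the
# current Python interpreter so the sketch runs anywhere.
import subprocess
import sys

def run_and_report(cmd, cwd=None):
    """Run a command, print its combined stdout/stderr, and return the exit code."""
    proc = subprocess.run(cmd, cwd=cwd, stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT)
    print(proc.stdout.decode())
    return proc.returncode

# Example: run_and_report([sys.executable, '-c', 'print("ok")'])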

    def _df_to_list(self, df):
        """
        _df_to_list: convert Dataframe to FloatMatrix2D matrix data
        """

        df.index = df.index.astype('str')
        df.columns = df.columns.astype('str')
        df.fillna(0, inplace=True)
        matrix_data = {'row_ids': df.index.tolist(),
                       'col_ids': df.columns.tolist(),
                       'values': df.values.tolist()}

        return matrix_data

    def _mds_df_to_excel(self, mds_df, distance_df, result_dir, mds_matrix_ref):
        """
        write MDS matrix df into excel
        """
        logging.info('writing mds data frame to excel file')
        mds_matrix_obj = self.dfu.get_objects({'object_refs': [mds_matrix_ref]})['data'][0]
        mds_matrix_info = mds_matrix_obj['info']
        mds_matrix_name = mds_matrix_info[1]

        file_path = os.path.join(result_dir, mds_matrix_name + ".xlsx")
        writer = pd.ExcelWriter(file_path)

        mds_df.to_excel(writer, "mds_matrix", index=True)
        if distance_df is not None:
            distance_df.to_excel(writer, "mds_distance_matrix", index=True)

        writer.close()
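
# Standalone sketch (separate from the class above): _mds_df_to_excel writes one or two
# DataFrames into named sheets of a single workbook. The same pattern with a context
# manager (requires an Excel engine such as openpyxl; names here are hypothetical).
import pandas as pd

def frames_to_excel(file_path, named_frames):
    """Write each (sheet_name, DataFrame) pair into one Excel workbook."""
    with pd.ExcelWriter(file_path) as writer:
        for sheet_name, df in named_frames:
            df.to_excel(writer, sheet_name=sheet_name, index=True)

# frames_to_excel('mds.xlsx', [('mds_matrix', mds_df),
#                              ('mds_distance_matrix', distance_df)])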

    def _Matrix2D_to_df(self, Matrix2D):
        """
        _Matrix2D_to_df: transform a FloatMatrix2D to data frame
        """

        index = Matrix2D.get('row_ids')
        columns = Matrix2D.get('col_ids')
        values = Matrix2D.get('values')

        df = pd.DataFrame(values, index=index, columns=columns)

        return df
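
# Standalone sketch (separate from the class above): _df_to_list and _Matrix2D_to_df are
# inverses of each other. A quick round trip with a toy DataFrame:
import pandas as pd

df = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]], index=['s1', 's2'], columns=['f1', 'f2'])

# DataFrame -> FloatMatrix2D-style dict (what _df_to_list produces)
matrix_data = {'row_ids': df.index.astype('str').tolist(),
               'col_ids': df.columns.astype('str').tolist(),
               'values': df.values.tolist()}

# FloatMatrix2D-style dict -> DataFrame (what _Matrix2D_to_df produces)
df_back = pd.DataFrame(matrix_data['values'],
                       index=matrix_data['row_ids'],
                       columns=matrix_data['col_ids'])
assert df.equals(df_back)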

    def _mds_to_df(self, mds_matrix_ref):
        """
        retrieve MDS matrix ws object to mds_df
        """
        logging.info('converting mds matrix to data frame')
        mds_data = self.dfu.get_objects({'object_refs': [mds_matrix_ref]})['data'][0]['data']

        rotation_matrix_data = mds_data.get('rotation_matrix')
        distance_matrix_data = mds_data.get('distance_matrix')
        original_matrix_ref = mds_data.get('original_matrix_ref')
        dimension = mds_data.get('mds_parameters').get('n_components')

        mds_df = self._Matrix2D_to_df(rotation_matrix_data)
        distance_df = None
        if distance_matrix_data:
            distance_df = self._Matrix2D_to_df(distance_matrix_data)

        if original_matrix_ref:
            logging.info('appending instance group information to mds data frame')
            obj_data = self.dfu.get_objects(
                {'object_refs': [original_matrix_ref]})['data'][0]['data']

            attributemapping_ref = obj_data.get('{}_attributemapping_ref'.format(dimension))

            am_data = self.dfu.get_objects(
                {'object_refs': [attributemapping_ref]})['data'][0]['data']

            attributes = am_data.get('attributes')
            instances = am_data.get('instances')
            am_df = pd.DataFrame(data=list(instances.values()),
                                 columns=list(map(lambda x: x.get('attribute'), attributes)),
                                 index=instances.keys())

            mds_df = mds_df.merge(am_df, left_index=True, right_index=True, how='left',
                                  validate='one_to_one')

        return mds_df, distance_df
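
# Standalone sketch (separate from the class above): _mds_to_df joins attribute-mapping
# metadata onto the ordination scores by row label. A toy version of that index-aligned
# left merge (hypothetical data):
import pandas as pd

scores = pd.DataFrame({'MDS1': [0.1, -0.2], 'MDS2': [0.3, 0.0]},
                      index=['sample_a', 'sample_b'])
attributes = pd.DataFrame({'site': ['lake', 'river']},
                          index=['sample_a', 'sample_b'])

# Left merge on the shared index; validate= guards against duplicated labels.
merged = scores.merge(attributes, left_index=True, right_index=True,
                      how='left', validate='one_to_one')
print(merged)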

    def _save_mds_matrix(self, workspace_name, input_obj_ref, mds_matrix_name,
                         distance_df, mds_params_df, site_ordin_df, species_ordin_df):

        logging.info('Saving MDSMatrix...')

        if not isinstance(workspace_name, int):
            ws_name_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            ws_name_id = workspace_name

        mds_data = {}

        mds_data.update({'distance_matrix': self._df_to_list(distance_df)})
        mds_data.update({'site_ordination': self._df_to_list(site_ordin_df)})
        mds_data.update({'species_ordination': self._df_to_list(species_ordin_df)})
        mds_data.update({'mds_parameters': self._df_to_list(mds_params_df)})
        mds_data.update({'original_matrix_ref': input_obj_ref})
        mds_data.update({'rotation_matrix': self._df_to_list(distance_df)})

        obj_type = 'KBaseExperiments.PCAMatrix'
        info = self.dfu.save_objects({
            "id": ws_name_id,
            "objects": [{
                "type": obj_type,
                "data": mds_data,
                "name": mds_matrix_name
            }]
        })[0]

        return "%s/%s/%s" % (info[6], info[0], info[4])

    def _zip_folder(self, folder_path, output_path):
        """
        _zip_folder: Zip the contents of an entire folder (with that folder included in the
        archive). Empty subfolders are included in the archive as well.
        """
        with zipfile.ZipFile(output_path, 'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as ziph:
            for root, folders, files in os.walk(folder_path):
                # Include all subfolders, including empty ones.
                for folder_name in folders:
                    absolute_fpath = os.path.join(root, folder_name)
                    relative_fpath = os.path.join(os.path.basename(root), folder_name)
                    logging.info("Adding folder {} to archive.".format(absolute_fpath))
                    ziph.write(absolute_fpath, relative_fpath)
                for f in files:
                    absolute_path = os.path.join(root, f)
                    relative_path = os.path.join(os.path.basename(root), f)
                    logging.info("Adding file {} to archive.".format(absolute_path))
                    ziph.write(absolute_path, relative_path)

        logging.info("{} created successfully.".format(output_path))

    def _generate_output_file_list(self, out_dir):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """

        logging.info('Start packing result files from MDS...')

        output_files = list()

        output_dir = os.path.join(self.working_dir, str(uuid.uuid4()))
        self._mkdir_p(output_dir)
        mds_output = os.path.join(output_dir, 'metaMDS_output.zip')
        self._zip_folder(out_dir, mds_output)

        output_files.append({'path': mds_output,
                             'name': os.path.basename(mds_output),
                             'label': os.path.basename(mds_output),
                             'description': 'Output file(s) generated by metaMDS'})
        return output_files

    def _generate_mds_html_report(self, mds_outdir, n_components):

        logging.info('Start generating html report for MDS results...')
        html_report = list()

        mds_plots = list()
        for root, folders, files in os.walk(mds_outdir):
            # Find the image files by their extensions.
            for f in files:
                if re.match(r'^[a-zA-Z]+.*\.(html)$', f):  # collect only the generated html plots
                    absolute_path = os.path.join(root, f)
                    logging.info("Adding file {} to plot archive.".format(absolute_path))
                    mds_plots.append(absolute_path)

        result_dir = os.path.join(self.working_dir, str(uuid.uuid4()))
        self._mkdir_p(result_dir)
        result_file_path = os.path.join(result_dir, 'mds_result.html')

        visualization_content = ''

        for mds_plot in mds_plots:
            shutil.copy2(mds_plot,
                         os.path.join(result_dir, os.path.basename(mds_plot)))
            visualization_content += '<iframe height="900px" width="100%" '
            visualization_content += 'src="{}" '.format(os.path.basename(mds_plot))
            visualization_content += 'style="border:none;"></iframe>\n<p></p>\n'


        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'templates', 'mds_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<p>Visualization_Content</p>',
                                                          visualization_content)
                report_template = report_template.replace('n_components',
                                                          '{} Components'.format(n_components))
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({'file_path': result_dir,
                                                  'pack': 'zip'})['shock_id']

        html_report.append({'shock_id': report_shock_id,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for MDS Matrix App'
                            })
        return html_report

    def _generate_mds_report(self, mds_ref, output_dir, workspace_name, n_components):
        logging.info('Creating MDS report...')

        output_files = self._generate_output_file_list(output_dir)
        output_html_files = self._generate_mds_html_report(output_dir, n_components)

        objects_created = list()
        objects_created.append({'ref': mds_ref,
                                'description': 'MDS Matrix'})

        report_params = {'message': '',
                         'workspace_name': workspace_name,
                         'file_links': output_files,
                         'objects_created': objects_created,
                         'html_links': output_html_files,
                         'direct_html_link_index': 0,
                         'html_window_height': 666,
                         'report_object_name': 'kb_mds_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _get_metadata_from_obj(self):
        """
        Get metadata from obj and return simplified pd.DataFrame
        :return:
        """

        logging.info('Retrieving metadata..')

        # KBase obj data
        mdf = self.dfu.get_objects({'object_refs': [self.attribute_mapping_obj_ref]})
        attr_l = mdf['data'][0]['data']['attributes']

        # Get index location in mdf(metadata) of chosen color and scale
        color_index = None
        size_index = None
        for i in range(len(attr_l)):
            if attr_l[i]['attribute'] == self.color_marker_by:
                color_index = i
            if attr_l[i]['attribute'] == self.scale_size_by:
                size_index = i

        # Make list of color and scale data
        color_data = []
        size_data = []
        mdf_indx = mdf['data'][0]['data']['instances'].keys()
        for sample in mdf_indx:
            if color_index is not None:
                color_data.append(mdf['data'][0]['data']['instances'][sample][color_index])
            if size_index is not None:
                try:
                    size_data.append(float(mdf['data'][0]['data']['instances'][sample][size_index]))
                except (ValueError, TypeError):
                    logging.info('ERROR: scaling value is not numeric; scaling has been dropped')
                    self.scale_size_by = None
                    size_index = None

        # mdf is now new pd.DataFrame that only includes needed data
        mdf = pd.DataFrame(index=mdf_indx, columns=[self.color_marker_by, self.scale_size_by])
        if color_index is not None:
            mdf[self.color_marker_by] = color_data
        if size_index is not None:
            mdf[self.scale_size_by] = size_data

        return mdf

    def _get_metadata_from_file(self):
        """
        Get metadata from file and return simplified pd.DataFrame
        :return:
        """

        logging.info('Retrieving metadata..')

        mdf = pd.read_csv(self.metadata_file, sep='\t', index_col=0)

        logging.info('MDF: {}'.format(mdf))

        mdf = mdf[[self.color_marker_by, self.scale_size_by]]

        return mdf

    def _plot_with_grouping(self):
        logging.info('Plotting with grouping: "{}", and "{}"'.format(self.color_marker_by, self.scale_size_by))

        # Color and scale cannot currently use the same field: the simplified metadata
        # DataFrame built below would otherwise end up with duplicate columns.
        if self.color_marker_by == self.scale_size_by:
            logging.info('ERROR: color and scale are the same field; scale set to None')
            self.scale_size_by = None

        if self.attribute_mapping_obj_ref is not None:
            mdf = self._get_metadata_from_obj()
        elif self.metadata_file is not None:
            mdf = self._get_metadata_from_file()
        else:
            raise FileNotFoundError('No metadata file was specified')

        # Get site data from previously saved file
        site_ordin_df = pd.read_csv(os.path.join(self.output_dir, "site_ordination.csv"), index_col=0)
        logging.info('SITE_ORDIN_DF:\n {}'.format(site_ordin_df))

        # Check if metadata file is valid for this method
        for sample in site_ordin_df.index:
            try:
                mdf.loc[sample]
            except KeyError:
                raise KeyError('One or more samples in site_ordination is not found in chosen metadata obj. If you ran '
                               'this using files, you might need to transpose the data in your files so samples are '
                               'rows and OTU are columns.')

        # Fill site_ordin_df with metadata from mdf
        site_ordin_df['color'] = None
        site_ordin_df['size'] = None
        for ID in site_ordin_df.index:
            site_ordin_df.loc[ID, 'color'] = mdf[self.color_marker_by].loc[ID]
            site_ordin_df.loc[ID, 'size'] = mdf[self.scale_size_by].loc[ID]

        site_ordin_df.fillna('na', inplace=True)

        # Plot
        if self.color_marker_by is not None and self.scale_size_by is not None and all(
                isinstance(x, (int, float)) for x in list(site_ordin_df['size'])):
            fig = px.scatter(site_ordin_df, x="MDS1", y="MDS2", color="color", size="size",
                             hover_name=site_ordin_df.index)
        elif self.color_marker_by is not None:
            fig = px.scatter(site_ordin_df, x="MDS1", y="MDS2", color="color", hover_name=site_ordin_df.index)
        elif self.scale_size_by is not None:
            fig = px.scatter(site_ordin_df, x="MDS1", y="MDS2", size="size", hover_name=site_ordin_df.index)

        # Save plotly_fig.html and return path
        plotly_html_file_path = os.path.join(self.output_dir, "plotly_fig.html")
        plot(fig, filename=plotly_html_file_path)
        return plotly_html_file_path
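
# Standalone sketch (separate from the class above): _plot_with_grouping ultimately hands
# a DataFrame with 'color' and 'size' columns to plotly express. A minimal version of that
# call with toy data (requires plotly):
import pandas as pd
import plotly.express as px
from plotly.offline import plot

site_ordin_df = pd.DataFrame({'MDS1': [0.1, -0.4, 0.2],
                              'MDS2': [0.3, 0.0, -0.1],
                              'color': ['lake', 'river', 'lake'],
                              'size': [1.0, 2.5, 1.5]},
                             index=['s1', 's2', 's3'])

fig = px.scatter(site_ordin_df, x='MDS1', y='MDS2', color='color', size='size',
                 hover_name=site_ordin_df.index)
plot(fig, filename='plotly_fig.html')  # writes a standalone HTML file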

    def __init__(self, config):

        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(self.callback_url)
        self.working_dir = self.scratch

        self.data_util = DataUtil(config)
        self.output_dir = os.path.join(self.working_dir, self.MDS_OUT_DIR)
        self._mkdir_p(self.output_dir)

        # If input is from files, then pd.DataFrame needs to be transposed in run_metaMDS_with_file method
        self.need_to_transpose = True


    def run_metaMDS(self, params):
        """
        run_metaMDS: perform metaMDS analysis on matrix
        :param input_obj_ref: object reference of a matrix
        :param workspace_name: the name of the workspace
        :param mds_matrix_name: name of MDS (KBaseExperiments.MDSMatrix) object
        :param n_components: dimensionality of the reduced space (default 2)
        :param max_iter: maximum iterations allowed
        :param metric: whether to run metric (rather than non-metric) MDS
        :param distance_metric: distance measure the ordination will be performed on; defaults to "bray"
        """

        logging.info('--->\nrunning metaMDS with input\n' +
                     'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_mds_params(params)

        input_obj_ref = params.get(self.PARAM_IN_MATRIX)
        workspace_name = params.get(self.PARAM_IN_WS)
        mds_matrix_name = params.get(self.PARAM_OUT_MATRIX)
        n_components = int(params.get('n_components', 2))

        res = self.dfu.get_objects({'object_refs': [input_obj_ref]})['data'][0]
        obj_data = res['data']
        obj_name = res['info'][1]
        obj_type = res['info'][2]

        max_size = len(obj_data['data']['col_ids'])
        if n_components > max_size:
            raise ValueError('Number of components cannot exceed the number of samples')

        exitCode = -99
        if "KBaseMatrices" in obj_type:
            # create the input file from obj_data
            matrix_tab = obj_data['data']['values']
            row_ids = obj_data['data']['row_ids']
            col_ids = obj_data['data']['col_ids']
            matrix_df = pd.DataFrame(matrix_tab, index=row_ids, columns=col_ids)
            # Transpose DataFrame
            matrix_df = matrix_df.T
            self.need_to_transpose = False

            matrix_data_file = os.path.join(self.output_dir, obj_name + '.csv')
            with open(matrix_data_file, 'w') as m_file:
                matrix_df.to_csv(m_file, sep='\t')

            params['datafile'] = matrix_data_file
            exitCode = self.run_metaMDS_with_file(params)
        else:
            err_msg = 'Oops! [{}] is not supported.\n'.format(obj_type)
            err_msg += 'Please provide a KBaseMatrices object'
            raise ValueError(err_msg)

        if exitCode == -99:
            raise ValueError('Caught subprocess.CalledProcessError while calling R.')

        # saving the mds_matrix object
        # read metaMDS results from files into data frames
        dist_matrix_df = pd.read_csv(os.path.join(self.output_dir, "dist_matrix.csv"))
        mds_params_df = pd.read_json(os.path.join(self.output_dir, "others.json"))
        site_ordin_df = pd.read_csv(os.path.join(self.output_dir, "site_ordination.csv"))
        species_ordin_df = pd.read_csv(os.path.join(self.output_dir, "species_ordination.csv"))

        mds_ref = self._save_mds_matrix(workspace_name, input_obj_ref, mds_matrix_name,
                                        dist_matrix_df, mds_params_df, site_ordin_df,
                                        species_ordin_df)
        returnVal = {'mds_ref': mds_ref}

        # generating report
        report_output = self._generate_mds_report(mds_ref, self.output_dir,
                                                  workspace_name, n_components)

        returnVal.update(report_output)
        return returnVal

    def run_metaMDS_with_file(self, params):
        """
        run_metaMDS_with_file: perform metaMDS analysis on matrix
        :param datafile: a file that contains the matrix data
        :param workspace_name: the name of the workspace
        :param mds_matrix_name: name of MDS (KBaseExperiments.MDSMatrix) object
        :param n_components: dimensionality of the reduced space (default 2)
        :param max_iter: maximum iterations allowed
        :param metric: whether to run metric (rather than non-metric) MDS
        :param distance_metric: distance measure the ordination will be performed on; defaults to "bray"
        """

        # Variables for Grouping Features
        self.attribute_mapping_obj_ref = params.get('attribute_mapping_obj_ref')
        self.metadata_file = params.get('metadata_file')
        self.color_marker_by = params.get('color_marker_by')
        if self.color_marker_by is not None:
            try:
                self.color_marker_by = self.color_marker_by['attribute_color'][0]
            except KeyError:
                raise KeyError('Expected dictionary with key "attribute_color" containing a list of one element. '
                               'Instead found: {}'.format(self.color_marker_by))
        self.scale_size_by = params.get('scale_size_by')
        if self.scale_size_by is not None:
            try:
                self.scale_size_by = self.scale_size_by['attribute_size'][0]
            except KeyError:
                raise KeyError('Expected dictionary with key "attribute_size" containing a list of one element. '
                               'Instead found: {}'.format(self.scale_size_by))

        logging.info('--->\nrunning metaMDS with input \n' +
                     'params:\n{}'.format(json.dumps(params, indent=1)))

        rscrpt_file = self._build_rMDS_script(params)
        logging.info('--->\nR script file has been written to {}'.format(rscrpt_file))

        result = self._execute_r_script(rscrpt_file)

        # Make and save plotly fig
        if self.color_marker_by is not None or self.scale_size_by is not None:
            self._plot_with_grouping()

        return result

    def export_mds_matrix_excel(self, params):
        """
        export MDSMatrix as Excel
        """
        logging.info('start exporting mds matrix')
        mds_matrix_ref = params.get('input_ref')

        mds_df, components_df = self._mds_to_df(mds_matrix_ref)

        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)

        self._mds_df_to_excel(mds_df, components_df, result_dir, mds_matrix_ref)

        package_details = self.dfu.package_for_download({
            'file_path': result_dir,
            'ws_refs': [mds_matrix_ref]
        })

        return {'shock_id': package_details['shock_id']}
Example #11
0
class PCAUtil:

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_run_pca_params(self, params):
        """
        _validate_run_pca_params:
            validates params passed to run_pca method
        """

        logging.info('start validating run_pca params')

        # check for required parameters
        for p in ['input_obj_ref', 'workspace_name', 'pca_matrix_name']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

    def _df_to_list(self, df):
        """
        _df_to_list: convert Dataframe to FloatMatrix2D matrix data
        """

        df.index = df.index.astype('str')
        df.columns = df.columns.astype('str')
        df.fillna(0, inplace=True)
        matrix_data = {'row_ids': df.index.tolist(),
                       'col_ids': df.columns.tolist(),
                       'values': df.values.tolist()}

        return matrix_data

    def _pca_df_to_excel(self, pca_df, components_df, result_dir, pca_matrix_ref):
        """
        write PCA matrix df into excel
        """
        logging.info('writing pca data frame to excel file')
        pca_matrix_obj = self.dfu.get_objects({'object_refs': [pca_matrix_ref]})['data'][0]
        pca_matrix_info = pca_matrix_obj['info']
        pca_matrix_name = pca_matrix_info[1]

        file_path = os.path.join(result_dir, pca_matrix_name + ".xlsx")

        writer = pd.ExcelWriter(file_path)

        pca_df.to_excel(writer, "principal_component_matrix", index=True)
        if components_df is not None:
            components_df.to_excel(writer, "component_variance_matrix", index=True)

        writer.close()

    def _Matrix2D_to_df(self, Matrix2D):
        """
        _Matrix2D_to_df: transform a FloatMatrix2D to data frame
        """

        index = Matrix2D.get('row_ids')
        columns = Matrix2D.get('col_ids')
        values = Matrix2D.get('values')

        df = pd.DataFrame(values, index=index, columns=columns)

        return df

    def _pca_to_df(self, pca_matrix_ref):
        """
        retrieve pca matrix ws object to pca_df
        """
        logging.info('converting pca matrix to data frame')
        pca_data = self.dfu.get_objects({'object_refs': [pca_matrix_ref]})['data'][0]['data']
       
        # rotation_matrix and components_matrix are FloatMatrix2D-style dicts
        # ({'row_ids': [...], 'col_ids': [...], 'values': [[...], ...]});
        # the variance fields are plain lists of floats.
        rotation_matrix_data = pca_data.get('rotation_matrix')
        components_matrix_data = pca_data.get('components_matrix')
        explained_variance = pca_data.get('explained_variance')
        explained_variance_ratio = pca_data.get('explained_variance_ratio')
        singular_values = pca_data.get('singular_values')
        dimension = pca_data.get('pca_parameters').get('dimension')
        original_matrix_ref = pca_data.get('original_matrix_ref')

        pca_df = self._Matrix2D_to_df(rotation_matrix_data)
        components_df = None
        if components_matrix_data:
            components_df = self._Matrix2D_to_df(components_matrix_data)
            components_df.loc['explained_variance'] = explained_variance
            components_df.loc['explained_variance_ratio'] = explained_variance_ratio
            components_df.loc['singular_values'] = singular_values

        if original_matrix_ref:
            logging.info('appending instance group information to pca data frame')
            obj_data = self.dfu.get_objects({'object_refs': [original_matrix_ref]})['data'][0]['data']

            attributemapping_ref = obj_data.get('{}_attributemapping_ref'.format(dimension))

            am_data = self.dfu.get_objects({'object_refs': [attributemapping_ref]})['data'][0]['data']

            attributes = am_data.get('attributes')
            instances = am_data.get('instances')
            am_df = pd.DataFrame(data=list(instances.values()),
                                 columns=list(map(lambda x: x.get('attribute'), attributes)),
                                 index=instances.keys())

            pca_df = pca_df.merge(am_df, left_index=True, right_index=True, how='left',
                                  validate='one_to_one')

        return pca_df, components_df

    def _save_pca_matrix(self, workspace_name, input_obj_ref, pca_matrix_name, rotation_matrix_df,
                         components_df, explained_variance, explained_variance_ratio,
                         singular_values, n_components, dimension):

        logging.info('saving PCAMatrix')

        if not isinstance(workspace_name, int):
            ws_name_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            ws_name_id = workspace_name

        pca_data = {}

        pca_data.update({'rotation_matrix': self._df_to_list(rotation_matrix_df)})
        pca_data.update({'components_matrix': self._df_to_list(components_df)})
        pca_data.update({'explained_variance': explained_variance})
        pca_data.update({'explained_variance_ratio': explained_variance_ratio})
        pca_data.update({'singular_values': singular_values})
        pca_data.update({'pca_parameters': {'n_components': str(n_components),
                                            'dimension': str(dimension)}})
        pca_data.update({'original_matrix_ref': input_obj_ref})
        

        obj_type = 'KBaseExperiments.PCAMatrix'
        info = self.dfu.save_objects({
            "id": ws_name_id,
            "objects": [{
                "type": obj_type,
                "data": pca_data,
                "name": pca_matrix_name
            }]
        })[0]
        
        return "%s/%s/%s" % (info[6], info[0], info[4])

    def _pca_for_matrix(self, input_obj_ref, n_components, dimension):
        """
        _pca_for_matrix: perform PCA analysis for matrix object
        """

        data_matrix = self.data_util.fetch_data({'obj_ref': input_obj_ref}).get('data_matrix')

        data_df = pd.read_json(data_matrix)
        data_df.fillna(0, inplace=True)

        if dimension == 'col':
            data_df = data_df.T
        elif dimension != 'row':
            err_msg = 'Input dimension [{}] is not available.\n'.format(dimension)
            err_msg += 'Please choose either "col" or "row"'
            raise ValueError(err_msg)

        if n_components > min(data_df.index.size, data_df.columns.size):
            raise ValueError('Number of components cannot exceed min(n_samples, n_features)')

        # normalize sample
        # logging.info("Standardizing the matrix")
        # s_values = StandardScaler().fit_transform(data_df.values)
        # skip normalizing sample
        s_values = data_df.values

        # Projection to ND
        pca = PCA(n_components=n_components, whiten=True)
        principalComponents = pca.fit_transform(s_values)
        explained_variance = list(pca.explained_variance_)
        explained_variance_ratio = list(pca.explained_variance_ratio_)

        components = pca.components_
        singular_values = list(pca.singular_values_)
        

        col = list()
        for i in range(n_components):
            col.append('principal_component_{}'.format(i+1))

        rotation_matrix_df = pd.DataFrame(data=principalComponents,
                                          columns=col,
                                          index=data_df.index)

        components_df = pd.DataFrame(data=components,
                                     columns=data_df.columns,
                                     index=col).transpose()

        rotation_matrix_df.fillna(0, inplace=True)

        return (rotation_matrix_df, components_df, explained_variance, explained_variance_ratio,
                singular_values)
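
# Standalone sketch (separate from the class above): _pca_for_matrix is a thin wrapper
# around sklearn's PCA. The core fit/transform step on a toy matrix, producing the same
# rotation and components DataFrames (hypothetical data):
import pandas as pd
from sklearn.decomposition import PCA

data_df = pd.DataFrame([[0.1, 0.5, 0.9],
                        [0.2, 0.6, 1.0],
                        [0.3, 0.8, 1.2],
                        [0.4, 0.7, 1.4]],
                       index=['s1', 's2', 's3', 's4'],
                       columns=['f1', 'f2', 'f3'])

n_components = 2
pca = PCA(n_components=n_components, whiten=True)
principal_components = pca.fit_transform(data_df.values)

cols = ['principal_component_{}'.format(i + 1) for i in range(n_components)]
rotation_matrix_df = pd.DataFrame(principal_components, columns=cols, index=data_df.index)
components_df = pd.DataFrame(pca.components_, columns=data_df.columns,
                             index=cols).transpose()

print(rotation_matrix_df)
print(list(pca.explained_variance_ratio_))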

    def _generate_pca_html_report(self, pca_plots, n_components):

        logging.info('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'pca_report.html')

        visualization_content = ''

        for pca_plot in pca_plots:
            shutil.copy2(pca_plot,
                         os.path.join(output_directory, os.path.basename(pca_plot)))
            visualization_content += '<iframe height="900px" width="100%" '
            visualization_content += 'src="{}" '.format(os.path.basename(pca_plot))
            visualization_content += 'style="border:none;"></iframe>\n<p></p>\n'

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'templates', 'pca_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<p>Visualization_Content</p>',
                                                          visualization_content)
                report_template = report_template.replace('n_components',
                                                          '{} Components'.format(n_components))
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({'file_path': output_directory,
                                                  'pack': 'zip'})['shock_id']

        html_report.append({'shock_id': report_shock_id,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for PCA analysis app'
                            })
        return html_report

    def _generate_pca_report(self, pca_ref, pca_plots, workspace_name, n_components):
        logging.info('creating report')

        output_html_files = self._generate_pca_html_report(pca_plots, n_components)

        objects_created = list()
        objects_created.append({'ref': pca_ref,
                                'description': 'PCA Matrix'})

        report_params = {'message': '',
                         'workspace_name': workspace_name,
                         'objects_created': objects_created,
                         'html_links': output_html_files,
                         'direct_html_link_index': 0,
                         'html_window_height': 666,
                         'report_object_name': 'kb_pca_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _append_instance_group(self, plot_pca_matrix, obj_data, dimension):
        plot_pca_matrix = plot_pca_matrix.copy()
        # Example obj_data (debug dump kept for reference):
        #   {'attributes': {'Instrument': 'Old Faithful', 'Scientist': 'Marie Currie'},
        #    'col_attributemapping_ref': '44071/7/79',
        #    'col_mapping': {'instance_1': 'test_col_instance_1', 'instance_2': 'test_col_instance_2',
        #                    'instance_3': 'test_col_instance_3', 'instance_4': 'test_col_instance_4'},
        #    'col_normalization': 'test_col_normalization',
        #    'data': {'col_ids': ['instance_1', 'instance_2', 'instance_3', 'instance_4'],
        #             'row_ids': ['WRI_RS00010_CDS_1', 'WRI_RS00015_CDS_1', 'WRI_RS00025_CDS_1'],
        #             'values': [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8], [None, None, 1.1, 1.2]]},
        #    'description': 'test_desc',
        #    'row_attributemapping_ref': '44071/8/71',
        #    'row_mapping': {'WRI_RS00010_CDS_1': 'test_row_instance_1',
        #                    'WRI_RS00015_CDS_1': 'test_row_instance_2',
        #                    'WRI_RS00025_CDS_1': 'test_row_instance_3'},
        #    'row_normalization': 'test_row_normalization', 'scale': 'log2',
        #    'search_attributes': ['Scientist | Marie Currie', 'Instrument | Old Faithful']}
        if dimension == 'row':
            attribute_mapping = obj_data.get('row_mapping')
        elif dimension == 'col':
            attribute_mapping = obj_data.get('col_mapping')
        else:
            raise ValueError('Unexpected dimension')

        if not attribute_mapping:
            logging.warning('Matrix object does not have {}_mapping attribute'.format(dimension))
            # no mapping available: plot the matrix with uniform color and shape
            return plot_pca_matrix
        else:
            # append instance col mapping from row/col_mapping
            plot_pca_matrix['instance'] = plot_pca_matrix.index.map(attribute_mapping)
            # Example plot_pca_matrix after mapping:
            #     principal_component_1         ...                      instance
            #     WRI_RS00010_CDS_1              -0.853094         ...           test_row_instance_1
            #     WRI_RS00015_CDS_1              -0.247377         ...           test_row_instance_2
            #     WRI_RS00025_CDS_1               1.100471         ...           test_row_instance_3

        return plot_pca_matrix

    def _build_size_pca_matrix(self, plot_pca_matrix, obj_data, dimension, attribute_name):
        """
        _build_size_pca_matrix: append attribute value to rotation_matrix
        """
        logging.info('appending attribute value for sizing to rotation matrix')

        plot_pca_matrix = plot_pca_matrix.copy()

        if dimension == 'row':
            attribute_mapping = obj_data.get('row_mapping')
            attribute_mapping_ref = obj_data.get('row_attributemapping_ref')
        elif dimension == 'col':
            attribute_mapping = obj_data.get('col_mapping')
            attribute_mapping_ref = obj_data.get('col_attributemapping_ref')
        else:
            raise ValueError('Unexpected dimension')

        if not attribute_mapping:
            logging.warning('Matrix object does not have {}_mapping attribute'.format(dimension))
            # no mapping available: plot the matrix with uniform color and shape
            return plot_pca_matrix
        else:
            # append instance col mapping from row/col_mapping
            # Example: plot_pca_matrix.index.map(attribute_mapping) ->
            #   Index(['test_row_instance_1', 'test_row_instance_2', 'test_row_instance_3'], dtype='object')
            plot_pca_matrix['instance'] = plot_pca_matrix.index.map(attribute_mapping)

        res = self.dfu.get_objects({'object_refs': [attribute_mapping_ref]})['data'][0]
        attri_data = res['data']
        attri_name = res['info'][1]

        attributes = attri_data.get('attributes')
        # Example attributes:
        #   [{'attribute': 'test_attribute_1', 'attribute_ont_id': 'OBI_0500020', 'source': 'upload',
        #     'unit': 'Hour', 'unit_ont_id': 'UO:0000032', 'unit_ont_ref': '6308/15/6'},
        #    {'attribute': 'test_attribute_2', 'attribute_ont_id': 'CHEBI:9168',
        #     'attribute_ont_ref': '6308/19/1', 'source': 'upload', 'unit': 'nanogram per milliliter',
        #     'unit_ont_id': 'UO:0000275', 'unit_ont_ref': '6308/15/6'},
        #    {'attribute': 'test_attribute_3', 'attribute_ont_id': 'CHEBI:9168',
        #     'attribute_ont_ref': '6308/19/1', 'source': 'upload', 'unit': 'nanogram per milliliter',
        #     'unit_ont_id': 'UO:0000275', 'unit_ont_ref': '6308/15/6'}]

        attr_pos = None
        for idx, attribute in enumerate(attributes):
            if attribute.get('attribute') == attribute_name:
                attr_pos = idx
                break

        if attr_pos is None:
            raise ValueError('Cannot find attribute [{}] in [{}]'.format(attribute_name,
                                                                         attri_name))

        instances = attri_data.get('instances')
        # Example instances: {'test_row_instance_1': ['1', '4', '7'],
        #                     'test_row_instance_2': ['3', '4', '8'],
        #                     'test_row_instance_3': ['3', '6', '7']}

        plot_pca_matrix['attribute_value_size'] = None
        for instance_name, attri_values in instances.items():
            plot_pca_matrix.loc[plot_pca_matrix.instance == instance_name,
                                ['attribute_value_size']] = attri_values[attr_pos]
        # Example plot_pca_matrix after sizing:
        #     principal_component_1          ...           attribute_value_size
        #     WRI_RS00010_CDS_1              -0.853094          ...                              1
        #     WRI_RS00015_CDS_1              -0.247377          ...                              3
        #     WRI_RS00025_CDS_1               1.100471          ...                              3
        return plot_pca_matrix

    def _build_color_pca_matrix(self, plot_pca_matrix, obj_data, dimension, attribute_name):
        """
        _build_color_pca_matrix: append attribute value to rotation_matrix
        """
        logging.info('appending attribute value for grouping color to rotation matrix')

        plot_pca_matrix = plot_pca_matrix.copy()

        if dimension == 'row':
            attribute_mapping = obj_data.get('row_mapping')
            attribute_mapping_ref = obj_data.get('row_attributemapping_ref')
        elif dimension == 'col':
            attribute_mapping = obj_data.get('col_mapping')
            attribute_mapping_ref = obj_data.get('col_attributemapping_ref')
        else:
            raise ValueError('Unexpected dimension')

        if not attribute_mapping:
            logging.warning('Matrix object does not have {}_mapping attribute'.format(dimension))
            # no mapping available: plot the matrix with uniform color and shape
            return plot_pca_matrix
        else:
            # append instance col mapping from row/col_mapping
            plot_pca_matrix['instance'] = plot_pca_matrix.index.map(attribute_mapping)

        res = self.dfu.get_objects({'object_refs': [attribute_mapping_ref]})['data'][0]
        attri_data = res['data']
        attri_name = res['info'][1]

        attributes = attri_data.get('attributes')

        attr_pos = None
        for idx, attribute in enumerate(attributes):
            if attribute.get('attribute') == attribute_name:
                attr_pos = idx
                break

        if attr_pos is None:
            raise ValueError('Cannot find attribute [{}] in [{}]'.format(attribute_name,
                                                                         attri_name))

        instances = attri_data.get('instances')
        # Example instances: {'test_row_instance_1': ['1', '4', '7'],
        #                     'test_row_instance_2': ['3', '4', '8'],
        #                     'test_row_instance_3': ['3', '6', '7']}
        plot_pca_matrix['attribute_value_color'] = None
        for instance_name, attri_values in instances.items():
            # e.g. attri_values == ['1', '4', '7']
            plot_pca_matrix.loc[plot_pca_matrix.instance == instance_name,
                                ['attribute_value_color']] = attri_values[attr_pos]

        return plot_pca_matrix

    def _build_2_comp_trace(self, plot_pca_matrix, components_x, components_y):
        # Example inputs:
        #   components_x == 'principal_component_1', components_y == 'principal_component_2'
        #   plot_pca_matrix:
        #     principal_component_1          ...           attribute_value_size
        #     WRI_RS00010_CDS_1              -0.853094          ...                              1
        #     WRI_RS00015_CDS_1              -0.247377          ...                              3
        #     WRI_RS00025_CDS_1               1.100471          ...                              3
        traces = []

        if 'attribute_value_color' in plot_pca_matrix.columns and 'attribute_value_size' in plot_pca_matrix.columns:

            maximum_marker_size = 10
            sizeref = 2.*float(max(plot_pca_matrix['attribute_value_size']))/(maximum_marker_size**2)
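            # Area-based marker scaling (a sketch of the intent, following the commonly
            # recommended Plotly formula): with sizemode='area', marker area is
            # proportional to size/sizeref, so sizeref = 2*max(size)/maximum_marker_size**2
            # caps the largest marker near maximum_marker_size pixels in diameter.
            # E.g. max size 3 with cap 10 gives sizeref = 2*3/100 = 0.06.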

            for name in set(plot_pca_matrix.attribute_value_color):
                attribute_value_size = plot_pca_matrix.loc[plot_pca_matrix['attribute_value_color'].eq(name)].attribute_value_size
                size_list = list(map(abs, list(map(float, attribute_value_size))))
                for idx, val in enumerate(size_list):
                    if val == 0:
                        size_list[idx] = sys.float_info.min
                trace = go.Scatter(
                    x=list(plot_pca_matrix.loc[plot_pca_matrix['attribute_value_color'].eq(name)][components_x]),
                    y=list(plot_pca_matrix.loc[plot_pca_matrix['attribute_value_color'].eq(name)][components_y]),
                    mode='markers',
                    name=name,
                    text=list(plot_pca_matrix.loc[plot_pca_matrix['attribute_value_color'].eq(name)].index),
                    textposition='bottom center',
                    marker=go.Marker(symbol='circle', sizemode='area', sizeref=sizeref,
                                     size=size_list, sizemin=2,
                                     line=go.Line(color='rgba(217, 217, 217, 0.14)', width=0.5),
                                     opacity=0.8))
                traces.append(trace)
        elif 'attribute_value_color' in plot_pca_matrix.columns:
            for name in set(plot_pca_matrix.attribute_value_color):
                trace = go.Scatter(
                    x=list(plot_pca_matrix.loc[plot_pca_matrix['attribute_value_color'].eq(name)][components_x]),
                    y=list(plot_pca_matrix.loc[plot_pca_matrix['attribute_value_color'].eq(name)][components_y]),
                    mode='markers',
                    name=name,
                    text=list(plot_pca_matrix.loc[plot_pca_matrix['attribute_value_color'].eq(name)].index),
                    textposition='bottom center',
                    marker=go.Marker(size=10, opacity=0.8, line=go.Line(color='rgba(217, 217, 217, 0.14)',
                                                                        width=0.5)))
                traces.append(trace)
        elif 'attribute_value_size' in plot_pca_matrix.columns:

            maximum_marker_size = 10
            sizeref = 2.*float(max(plot_pca_matrix['attribute_value_size']))/(maximum_marker_size**2)

            for name in set(plot_pca_matrix.instance):
                attribute_value_size = plot_pca_matrix.loc[plot_pca_matrix['instance'].eq(name)].attribute_value_size
                size_list = list(map(abs, list(map(float, attribute_value_size))))
                for idx, val in enumerate(size_list):
                    if val == 0:
                        size_list[idx] = sys.float_info.min
                trace = go.Scatter(
                    x=list(plot_pca_matrix.loc[plot_pca_matrix['instance'].eq(name)][components_x]),
                    y=list(plot_pca_matrix.loc[plot_pca_matrix['instance'].eq(name)][components_y]),
                    mode='markers',
                    name=name,
                    text=list(plot_pca_matrix.loc[plot_pca_matrix['instance'].eq(name)].index),
                    textposition='bottom center',
                    marker=go.Marker(symbol='circle', sizemode='area', sizeref=sizeref,
                                     size=size_list, sizemin=2,
                                     line=go.Line(color='rgba(217, 217, 217, 0.14)', width=0.5),
                                     opacity=0.8))
                traces.append(trace)
        else:
            trace = go.Scatter(
                x=list(plot_pca_matrix[components_x]),
                y=list(plot_pca_matrix[components_y]),
                mode='markers',
                text=list(plot_pca_matrix.index),
                textposition='bottom center',
                marker=go.Marker(size=10, opacity=0.8,
                                 line=go.Line(color='rgba(217, 217, 217, 0.14)', width=0.5)))
            traces.append(trace)

        return traces

    def _plot_pca_matrix(self, plot_pca_matrix, n_components):

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_paths = []

        all_pairs = list(itertools.combinations(range(1, n_components+1), 2))
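        # e.g. n_components=3 yields pairs [(1, 2), (1, 3), (2, 3)],
        # i.e. one 2-D scatter plot per pair of principal components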

        for pair in all_pairs:
            first_component = pair[0]
            second_component = pair[1]
            result_file_path = os.path.join(output_directory, 'pca_plot_{}_{}.html'.format(
                                                                                first_component,
                                                                                second_component))

            traces = self._build_2_comp_trace(plot_pca_matrix,
                                              'principal_component_{}'.format(first_component),
                                              'principal_component_{}'.format(second_component))

            data = go.Data(traces)
            layout = go.Layout(xaxis=go.XAxis(title='PC{}'.format(first_component), showline=False),
                               yaxis=go.YAxis(title='PC{}'.format(second_component), showline=False))
            fig = go.Figure(data=data, layout=layout)

            plot(fig, filename=result_file_path)

            result_file_paths.append(result_file_path)

        return result_file_paths

    def _validate_pca_matrix(self, obj_data, dimension,
                             color_marker_by, scale_size_by):

        if dimension == 'row':
            attribute_mapping = obj_data.get('row_mapping')
        elif dimension == 'col':
            attribute_mapping = obj_data.get('col_mapping')
        else:
            raise ValueError('Unexpected dimension')

        if not attribute_mapping:
            if (color_marker_by and color_marker_by.get('attribute_color')[0]) or \
               (scale_size_by and scale_size_by.get('attribute_size')[0]):
                raise ValueError('Matrix object is not associated with any {} attribute mapping'.format(dimension))

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']

        self.data_util = DataUtil(config)
        self.dfu = DataFileUtil(self.callback_url)

        plt.switch_backend('agg')

    def run_pca(self, params):
        """
        perform PCA analysis on matrix

        input_obj_ref: object reference of a matrix
        workspace_name: the name of the workspace
        pca_matrix_name: name of PCA (KBaseExperiments.PCAMatrix) object

        n_components - number of components (default 2)
        dimension: compute correlation on column or row, one of ['col', 'row']
        """

        logging.info('--->\nrunning run_pca\n' +
                     'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_pca_params(params)

        input_obj_ref = params.get('input_obj_ref')
        workspace_name = params.get('workspace_name')
        pca_matrix_name = params.get('pca_matrix_name')

        n_components = int(params.get('n_components', 2))
        dimension = params.get('dimension', 'row')

        res = self.dfu.get_objects({'object_refs': [input_obj_ref]})['data'][0]
        obj_data = res['data']
        obj_type = res['info'][2]

        self._validate_pca_matrix(obj_data, dimension,
                                  params.get('color_marker_by'), params.get('scale_size_by'))

        if "KBaseMatrices" in obj_type:

            (rotation_matrix_df, components_df, explained_variance,
             explained_variance_ratio, singular_values) = self._pca_for_matrix(input_obj_ref,
                                                                               n_components,
                                                                               dimension)
        else:
            err_msg = 'Ooops! [{}] is not supported.\n'.format(obj_type)
            err_msg += 'Please supply a KBaseMatrices object'
            raise ValueError(err_msg)

        pca_ref = self._save_pca_matrix(workspace_name, input_obj_ref, pca_matrix_name,
                                        rotation_matrix_df, components_df, explained_variance,
                                        explained_variance_ratio, singular_values,
                                        n_components, dimension)

        plot_pca_matrix = self._append_instance_group(rotation_matrix_df.copy(), obj_data,
                                                      dimension)

        if params.get('color_marker_by'):
            plot_pca_matrix = self._build_color_pca_matrix(
                                            plot_pca_matrix, obj_data, dimension,
                                            params.get('color_marker_by').get('attribute_color')[0])
        # Example plot_pca_matrix after color mapping:
        #     principal_component_1          ...            attribute_value_color
        #     WRI_RS00010_CDS_1              -0.853094          ...                                4
        #     WRI_RS00015_CDS_1              -0.247377          ...                                4
        #     WRI_RS00025_CDS_1               1.100471          ...                                6
        if params.get('scale_size_by'):
            plot_pca_matrix = self._build_size_pca_matrix(
                                            plot_pca_matrix, obj_data, dimension,
                                            params.get('scale_size_by').get('attribute_size')[0])

        returnVal = {'pca_ref': pca_ref}

        report_output = self._generate_pca_report(pca_ref,
                                                  self._plot_pca_matrix(plot_pca_matrix,
                                                                        n_components),
                                                  workspace_name,
                                                  n_components)

        returnVal.update(report_output)
        return returnVal

    def export_pca_matrix_excel(self, params):
        """
        export PCAMatrix as Excel
        """
        logging.info('start exporting pca matrix')
        pca_matrix_ref = params.get('input_ref')  # e.g. '44071/62/8'
        pca_df, components_df = self._pca_to_df(pca_matrix_ref)
        
        # Example pca_df:
        #                        principal_component_1  principal_component_2
        #     WRI_RS00010_CDS_1                  -0.45                   1.06
        #     WRI_RS00015_CDS_1                  -0.69                  -0.92
        #     WRI_RS00025_CDS_1                   1.14                  -0.13
        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)
        # e.g. result_dir == '/kb/module/work/tmp/54e9610e-9b51-4296-a971-333f9f154a1f'

        self._pca_df_to_excel(pca_df, components_df, result_dir, pca_matrix_ref)

        package_details = self.dfu.package_for_download({
            'file_path': result_dir,
            'ws_refs': [pca_matrix_ref]
        })

        return {'shock_id': package_details['shock_id']}
Example #12
0
class FeatureSetDownload:
    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.gsu = GenomeSearchUtil(os.environ['SDK_CALLBACK_URL'])
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        self.ws = Workspace(config["workspace-url"])

    @staticmethod
    def validate_params(params,
                        expected={"workspace_name", "featureset_name"}):
        expected = set(expected)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError(
                "Required keys {} not in supplied parameters".format(
                    ", ".join(expected - pkeys)))

    def to_tsv(self, params):
        working_dir = os.path.join(self.scratch,
                                   'featureset-download-' + str(uuid.uuid4()))
        os.makedirs(working_dir)
        header = ['Feature Id', 'Aliases', 'Genome', 'Type', 'Function']

        fs_name, fs_dicts = self.make_featureset_dict(params['featureset_ref'])
        files = {'file_path': "{}/{}.tsv".format(working_dir, fs_name)}
        with open(files['file_path'], 'w') as tsv_file:
            writer = csv.DictWriter(tsv_file,
                                    header,
                                    delimiter='\t',
                                    lineterminator='\n')
            writer.writeheader()
            for feat in fs_dicts:
                writer.writerow(feat)
        return fs_name, files

    def make_featureset_dict(self, fs_ref):
        features = []
        ret = self.dfu.get_objects({'object_refs': [fs_ref]})['data'][0]
        feat_set = ret['data']
        fs_name = ret['info'][1]

        feat_by_genome = defaultdict(list)
        for k, v in feat_set['elements'].items():
            feat_by_genome[v[0]].append(k)

        for genome, fids in feat_by_genome.items():
            genome_name = self.ws.get_object_info3(
                {'objects': [{
                    'ref': genome
                }]})['infos'][0][1]
            res = self.gsu.search({
                'ref': genome,
                'structured_query': {
                    'feature_id': fids
                },
                'sort_by': [['contig_id', 1]],
                'start': 0,
                'limit': len(fids)
            })

            for feat in res['features']:
                features.append({
                    'Feature Id':
                    feat['feature_id'],
                    'Aliases':
                    ", ".join(sorted(feat['aliases'].keys())),
                    'Genome':
                    "{} ({})".format(genome_name, genome),
                    'Type':
                    feat['feature_type'],
                    'Function':
                    feat['function']
                })
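                # Each appended dict is keyed by the TSV header columns, e.g.
                # (illustrative values only): {'Feature Id': 'b0001', 'Aliases': 'thrL',
                #  'Genome': 'my_genome (1/2/3)', 'Type': 'gene', 'Function': 'thr operon leader'}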
        return fs_name, features

    def export(self, files, name, params):
        export_package_dir = os.path.join(self.scratch,
                                          name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        for file in files:
            shutil.move(
                file, os.path.join(export_package_dir, os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path':
            export_package_dir,
            'ws_refs': [params['featureset_ref']]
        })

        return {'shock_id': package_details['shock_id']}

    def export_genome_as_gff(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (input and output
           structure functions for standard downloaders) -> structure:
           parameter "input_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_genome_as_gff
        if 'input_ref' not in params:
            raise ValueError('Cannot run export_genome_as_gff- no "input_ref" '
                             'field defined.')

        # get WS metadata to get ws_name and obj_name
        ws = Workspace(url=self.cfg.workspaceURL)
        info = ws.get_objects2({
            'objects': [{
                'ref':
                params['input_ref'],
                'included':
                ['/assembly_ref', '/contigset_ref', '/id', '/gff_handle_ref']
            }]
        })['data'][0]['data']

        # export to file (building from KBase Genome Object)
        result = self.genome_to_gff(ctx,
                                    {'genome_ref': params['input_ref']})[0]

        # get assembly
        if 'assembly_ref' in info:
            assembly_ref = info['assembly_ref']
        else:
            assembly_ref = info['contigset_ref']
        print(('Assembly reference = ' + assembly_ref))
        print('Downloading assembly')
        au = AssemblyUtil(self.cfg.callbackURL)
        assembly_file_path = au.get_assembly_as_fasta(
            {'ref': params['input_ref'] + ";" + assembly_ref})['path']

        # create the output directory and move the files there
        export_package_dir = os.path.join(self.cfg.sharedFolder, info['id'])
        os.makedirs(export_package_dir)
        shutil.move(
            result['file_path'],
            os.path.join(
                export_package_dir,
                'KBase_derived_' + os.path.basename(result['file_path'])))
        shutil.move(
            assembly_file_path,
            os.path.join(export_package_dir,
                         os.path.basename(assembly_file_path)))

        # add cached genome if appropriate
        exporter = GenomeToGFF(self.cfg)
        cached = exporter.get_gff_handle(info, export_package_dir)

        # package it up
        dfUtil = DataFileUtil(self.cfg.callbackURL)
        package_details = dfUtil.package_for_download({
            'file_path':
            export_package_dir,
            'ws_refs': [params['input_ref']]
        })

        output = {'shock_id': package_details['shock_id']}
        #END export_genome_as_gff

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_genome_as_gff return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example #14
0
class ReadsAlignmentUtils:
    '''
    Module Name:
    ReadsAlignmentUtils

    Module Description:
    A KBase module: ReadsAlignmentUtils

This module is intended for use by Aligners and Assemblers to upload and download alignment files.
The alignment may be uploaded as a sam or bam file. If a sam file is given, it is converted to
the sorted bam format and saved. Upon downloading, optional parameters may be provided to get files
in sam and bai formats from the downloaded bam file. This utility also generates stats from the
stored alignment.
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.3.6"
    GIT_URL = "https://github.com/kbaseapps/ReadsAlignmentUtils.git"
    GIT_COMMIT_HASH = "75ef2c24694c056dfca71859d6f344ccff7d4725"

    #BEGIN_CLASS_HEADER

    PARAM_IN_FILE = 'file_path'
    PARAM_IN_SRC_REF = 'source_ref'
    PARAM_IN_DST_REF = 'destination_ref'
    PARAM_IN_CONDITION = 'condition'
    PARAM_IN_READ_LIB_REF = 'read_library_ref'
    PARAM_IN_ASM_GEN_REF = 'assembly_or_genome_ref'

    PARAM_IN_ALIGNED_USING = 'aligned_using'
    PARAM_IN_ALIGNER_VER = 'aligner_version'
    PARAM_IN_ALIGNER_OPTS = 'aligner_opts'
    PARAM_IN_REPLICATE_ID = 'replicate_id'
    PARAM_IN_PLATFORM = 'platform'
    PARAM_IN_BOWTIE2_INDEX = 'bowtie2_index'
    PARAM_IN_SAMPLESET_REF = 'sampleset_ref'
    PARAM_IN_MAPPED_SAMPLE_ID = 'mapped_sample_id'

    PARAM_IN_DOWNLOAD_SAM = 'downloadSAM'
    PARAM_IN_DOWNLOAD_BAI = 'downloadBAI'
    PARAM_IN_VALIDATE = 'validate'

    INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]')
    INVALID_WS_NAME_RE = re.compile('[^\\w:._-]')

    def _get_file_path_info(self, file_path):
        """
        Given a file path, returns the directory, file name, file base and file extension
        """
        dir, file_name = os.path.split(file_path)
        file_base, file_ext = os.path.splitext(file_name)
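        # e.g. '/kb/module/work/tmp/sample.bam' ->
        #      ('/kb/module/work/tmp', 'sample.bam', 'sample', '.bam')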

        return dir, file_name, file_base, file_ext

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _check_required_param(self, in_params, param_list):
        """
        Checks if each of the params in the list are in the input params
        """
        for param in param_list:
            if (param not in in_params or not in_params[param]):
                raise ValueError('{} parameter is required'.format(param))

    def _proc_ws_obj_params(self, ctx, params):
        """
        Checks the validity of workspace and object params and returns them
        """
        dst_ref = params.get(self.PARAM_IN_DST_REF)

        ws_name_id, obj_name_id = os.path.split(dst_ref)
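        # e.g. a destination_ref of '12345/my_alignment' (illustrative) splits into
        # ws_name_id == '12345' and obj_name_id == 'my_alignment'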

        if not bool(ws_name_id.strip()) or ws_name_id == '/':
            raise ValueError("Workspace name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not bool(obj_name_id.strip()):
            raise ValueError("Object name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not isinstance(ws_name_id, int):

            try:
                ws_name_id = self.dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)

        self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id))

        return ws_name_id, obj_name_id

    def _get_ws_info(self, obj_ref):

        ws = Workspace(self.ws_url)
        try:
            info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0]
        except WorkspaceError as wse:
            self.__LOGGER.error('Logging workspace exception')
            self.__LOGGER.error(str(wse))
            raise
        return info

    def _proc_upload_alignment_params(self, ctx, params):
        """
        Checks the presence and validity of upload alignment params
        """
        self._check_required_param(params, [
            self.PARAM_IN_DST_REF, self.PARAM_IN_FILE, self.PARAM_IN_CONDITION,
            self.PARAM_IN_READ_LIB_REF, self.PARAM_IN_ASM_GEN_REF
        ])

        ws_name_id, obj_name_id = self._proc_ws_obj_params(ctx, params)

        file_path = params.get(self.PARAM_IN_FILE)

        if not (os.path.isfile(file_path)):
            raise ValueError('File does not exist: ' + file_path)

        lib_type = self._get_ws_info(params.get(self.PARAM_IN_READ_LIB_REF))[2]
        if lib_type.startswith('KBaseFile.SingleEndLibrary') or \
           lib_type.startswith('KBaseFile.PairedEndLibrary') or \
           lib_type.startswith('KBaseAssembly.SingleEndLibrary') or \
           lib_type.startswith('KBaseAssembly.PairedEndLibrary'):
            pass
        else:
            raise ValueError(self.PARAM_IN_READ_LIB_REF +
                             ' parameter should be of type' +
                             ' KBaseFile.SingleEndLibrary or' +
                             ' KBaseFile.PairedEndLibrary or' +
                             ' KBaseAssembly.SingleEndLibrary or' +
                             ' KBaseAssembly.PairedEndLibrary')

        obj_type = self._get_ws_info(params.get(self.PARAM_IN_ASM_GEN_REF))[2]
        if obj_type.startswith('KBaseGenomes.Genome') or \
           obj_type.startswith('KBaseGenomeAnnotations.Assembly') or \
           obj_type.startswith('KBaseGenomes.ContigSet'):
            pass
        else:
            raise ValueError(self.PARAM_IN_ASM_GEN_REF +
                             ' parameter should be of type' +
                             ' KBaseGenomes.Genome or' +
                             ' KBaseGenomeAnnotations.Assembly or' +
                             ' KBaseGenomes.ContigSet')
        return ws_name_id, obj_name_id, file_path, lib_type

    def _get_aligner_stats(self, bam_file):
        """
        Gets the aligner stats from BAM file

        How we compute these stats:

        For each segment (line) in SAM/BAM file:
            we take the first element as `reads_id`
                    the second element as `flag`

            if the last bit (0x1) of flag is `1`:
                we treat this segment as paired end reads
            otherwise:
                we treat this segment as single end reads

            For single end reads:
                if the 3rd last bit (0x4) of flag is `1`:
                    we increment unmapped_reads_count
                else:
                    we treat this `reads_id` as mapped

                for all mapped `reads_ids`:
                    if it appears only once:
                        we treat this `reads_id` as `singletons`
                    else:
                        we treat this `reads_id` as `multiple_alignments`

                lastly, total_reads = unmapped_reads_count + number of unique mapped `reads_id`s

            For paired end reads:
                if the 7th last bit (0x40) of flag is `1`:
                    if the 3rd last bit (0x4) of flag is `1`:
                        we increment unmapped_left_reads_count
                    else:
                        we treat this `reads_id` as mapped

                if the 8th last bit (0x80) of flag is `1`:
                    if the 3rd last bit (0x4) of flag is `1`:
                        we increment unmapped_right_reads_count
                    else:
                        we treat this `reads_id` as mapped

                for all mapped `reads_ids`:
                    if it appears only once:
                        we treat this `reads_id` as `singletons`
                    else:
                        we treat this `reads_id` as `multiple_alignments`

                lastly, total_reads = unmapped_left_reads_count + unmapped_right_reads_count + number of unique mapped `reads_id`s
        """
        path, file = os.path.split(bam_file)

        self.__LOGGER.info('Start to generate aligner stats')
        start_time = time.time()

        infile = pysam.AlignmentFile(bam_file, 'r')

        properly_paired = 0
        unmapped_reads_count = 0
        unmapped_left_reads_count = 0
        unmapped_right_reads_count = 0
        mapped_reads_ids = []
        mapped_left_reads_ids = []
        mapped_right_reads_ids = []
        paired = False
        for alignment in infile:
            seg = alignment.to_string().split('\t')
            reads_id = seg[0]
            flag = "0000000" + "{0:b}".format(int(seg[1]))

            if flag[-1] == '1':
                paired = True

            if paired:  # process paired end sequence

                if flag[-7] == '1':  # first sequence of a pair
                    if flag[-3] == '1':
                        unmapped_left_reads_count += 1
                    else:
                        mapped_left_reads_ids.append(reads_id)

                if flag[-8] == '1':  # second sequence of a pair
                    if flag[-3] == '1':
                        unmapped_right_reads_count += 1
                    else:
                        mapped_right_reads_ids.append(reads_id)

                if flag[-2] == '1':
                    properly_paired += 1
            else:  # process single end sequence
                if flag[-3] == '1':
                    unmapped_reads_count += 1
                else:
                    mapped_reads_ids.append(reads_id)

                if flag[-2] == '1':
                    properly_paired += 1

        infile.close()

        if paired:
            mapped_reads_ids = mapped_left_reads_ids + mapped_right_reads_ids
            unmapped_reads_count = unmapped_left_reads_count + unmapped_right_reads_count

            mapped_reads_ids_counter = Counter(mapped_reads_ids)
            mapped_reads_count = len(list(mapped_reads_ids_counter))

            singletons = list(mapped_reads_ids_counter.values()).count(1)
            multiple_alignments = mapped_reads_count - singletons

            total_reads = unmapped_reads_count + mapped_reads_count

            properly_paired = properly_paired // 2

        else:
            mapped_reads_ids_counter = Counter(mapped_reads_ids)
            mapped_reads_count = len(list(mapped_reads_ids_counter))

            singletons = list(mapped_reads_ids_counter.values()).count(1)
            multiple_alignments = mapped_reads_count - singletons

            total_reads = unmapped_reads_count + mapped_reads_count

        try:
            alignment_rate = round(
                float(mapped_reads_count) / total_reads * 100, 3)
        except ZeroDivisionError:
            alignment_rate = 0

        if alignment_rate > 100:
            alignment_rate = 100.0

        elapsed_time = time.time() - start_time
        self.__LOGGER.info('Used: {}'.format(
            time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

        stats_data = {
            "alignment_rate": alignment_rate,
            "mapped_reads": mapped_reads_count,
            "multiple_alignments": multiple_alignments,
            "properly_paired": properly_paired,
            "singletons": singletons,
            "total_reads": total_reads,
            "unmapped_reads": unmapped_reads_count
        }
        return stats_data

    def _validate(self, params):
        samt = SamTools(self.config, self.__LOGGER)
        if 'ignore' in params:
            path, file = os.path.split(params['file_path'])
            rval = samt.validate(ifile=file,
                                 ipath=path,
                                 ignore=params['ignore'])
        else:
            path, file = os.path.split(params['file_path'])
            rval = samt.validate(ifile=file, ipath=path)

        return rval

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.__LOGGER = logging.getLogger('KBaseRNASeq')
        if 'log_level' in config:
            self.__LOGGER.setLevel(config['log_level'])
        else:
            self.__LOGGER.setLevel(logging.INFO)
        streamHandler = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter(
            "%(asctime)s - %(filename)s - %(lineno)d - \
                                       %(levelname)s - %(message)s")
        formatter.converter = time.gmtime
        streamHandler.setFormatter(formatter)
        self.__LOGGER.addHandler(streamHandler)
        self.__LOGGER.info("Logger was set")

        script_utils.check_sys_stat(self.__LOGGER)

        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.ws_url = config['workspace-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.samtools = SamTools(config)
        #END_CONSTRUCTOR
        pass

    def validate_alignment(self, ctx, params):
        """
        :param params: instance of type "ValidateAlignmentParams" (* Input
           parameters for validating a reads alignment. For validation errors
           to ignore, see
           http://broadinstitute.github.io/picard/command-line-overview.html#V
           alidateSamFile) -> structure: parameter "file_path" of String,
           parameter "ignore" of list of String
        :returns: instance of type "ValidateAlignmentOutput" (* Results from
           validate alignment *) -> structure: parameter "validated" of type
           "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1))
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN validate_alignment

        rval = self._validate(params)

        if rval == 0:
            returnVal = {'validated': True}
        else:
            returnVal = {'validated': False}

        #END validate_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method validate_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def upload_alignment(self, ctx, params):
        """
        Validates and uploads the reads alignment
                How we compute BAM stats:
                For each segment (line) in SAM/BAM file:
                    we take the first element as `reads_id`
                            the second element as `flag`
                    if the last bit (0x1) of flag is `1`:
                        we treat this segment as paired end reads
                    otherwise:
                        we treat this segment as single end reads
                    For single end reads:
                        if the 3rd last bit (0x4) of flag is `1`:
                            we increment unmapped_reads_count
                        else:
                            we treat this `reads_id` as mapped
                        for all mapped `reads_ids`:
                            if it appears only once:
                                we treat this `reads_id` as `singletons`
                            else:
                                we treat this `reads_id` as `multiple_alignments`
                        lastly, total_reads = unmapped_reads_count + number of unique mapped `reads_id`s
                    For paired end reads:
                        if the 7th last bit (0x40) of flag is `1`:
                            if the 3rd last bit (0x4) of flag is `1`:
                                we increment unmapped_left_reads_count
                            else:
                                we treat this `reads_id` as mapped
                        if the 8th last bit (0x80) of flag is `1`:
                            if the 3rd last bit (0x4) of flag is `1`:
                                we increment unmapped_right_reads_count
                            else:
                                we treat this `reads_id` as mapped
                        for all mapped `reads_ids`:
                            if it appears only once:
                                we treat this `reads_id` as `singletons`
                            else:
                                we treat this `reads_id` as `multiple_alignments`
                        lastly, total_reads = unmapped_left_reads_count + unmapped_right_reads_count + number of unique mapped `reads_id`s
        :param params: instance of type "UploadAlignmentParams" (* Required
           input parameters for uploading a reads alignment string
           destination_ref -  object reference of alignment destination. The
           object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id
           is the workspace name or id and obj_name_or_id is the object name
           or id file_path              -  File with the path of the sam or
           bam file to be uploaded. If a sam file is provided, it will be
           converted to the sorted bam format before being saved
           read_library_ref       -  workspace object ref of the read sample
           used to make the alignment file condition              -
           assembly_or_genome_ref -  workspace object ref of genome assembly
           or genome object that was used to build the alignment *) ->
           structure: parameter "destination_ref" of String, parameter
           "file_path" of String, parameter "read_library_ref" of String,
           parameter "condition" of String, parameter
           "assembly_or_genome_ref" of String, parameter "aligned_using" of
           String, parameter "aligner_version" of String, parameter
           "aligner_opts" of mapping from String to String, parameter
           "replicate_id" of String, parameter "platform" of String,
           parameter "bowtie2_index" of type "ws_bowtieIndex_id", parameter
           "sampleset_ref" of type "ws_Sampleset_ref", parameter
           "mapped_sample_id" of mapping from String to mapping from String
           to String, parameter "validate" of type "boolean" (A boolean - 0
           for false, 1 for true. @range (0, 1)), parameter "ignore" of list
           of String
        :returns: instance of type "UploadAlignmentOutput" (*  Output from
           uploading a reads alignment  *) -> structure: parameter "obj_ref"
           of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN upload_alignment

        self.__LOGGER.info(
            'Starting upload Reads Alignment, parsing parameters ')
        pprint(params)

        ws_name_id, obj_name_id, file_path, lib_type = self._proc_upload_alignment_params(
            ctx, params)

        dir, file_name, file_base, file_ext = self._get_file_path_info(
            file_path)

        if self.PARAM_IN_VALIDATE in params and params[
                self.PARAM_IN_VALIDATE] is True:
            if self._validate(params) == 1:
                raise Exception('{0} failed validation'.format(file_path))

        bam_file = file_path
        if file_ext.lower() == '.sam':
            bam_file = os.path.join(dir, file_base + '.bam')
            self.samtools.convert_sam_to_sorted_bam(ifile=file_name,
                                                    ipath=dir,
                                                    ofile=bam_file)

        uploaded_file = self.dfu.file_to_shock({
            'file_path': bam_file,
            'make_handle': 1
        })
        file_handle = uploaded_file['handle']
        file_size = uploaded_file['size']

        aligner_stats = self._get_aligner_stats(file_path)
        aligner_data = {
            'file': file_handle,
            'size': file_size,
            'condition': params.get(self.PARAM_IN_CONDITION),
            'read_sample_id': params.get(self.PARAM_IN_READ_LIB_REF),
            'library_type': lib_type,
            'genome_id': params.get(self.PARAM_IN_ASM_GEN_REF),
            'alignment_stats': aligner_stats
        }
        optional_params = [
            self.PARAM_IN_ALIGNED_USING, self.PARAM_IN_ALIGNER_VER,
            self.PARAM_IN_ALIGNER_OPTS, self.PARAM_IN_REPLICATE_ID,
            self.PARAM_IN_PLATFORM, self.PARAM_IN_BOWTIE2_INDEX,
            self.PARAM_IN_SAMPLESET_REF, self.PARAM_IN_MAPPED_SAMPLE_ID
        ]
        for opt_param in optional_params:
            if opt_param in params and params[opt_param] is not None:
                aligner_data[opt_param] = params[opt_param]

        self.__LOGGER.info('=========  Adding extra_provenance_refs')
        self.__LOGGER.info(params.get(self.PARAM_IN_READ_LIB_REF))
        self.__LOGGER.info(params.get(self.PARAM_IN_ASM_GEN_REF))
        self.__LOGGER.info('=======================================')

        res = self.dfu.save_objects({
            "id":
            ws_name_id,
            "objects": [{
                "type":
                "KBaseRNASeq.RNASeqAlignment",
                "data":
                aligner_data,
                "name":
                obj_name_id,
                "extra_provenance_input_refs": [
                    params.get(self.PARAM_IN_READ_LIB_REF),
                    params.get(self.PARAM_IN_ASM_GEN_REF)
                ]
            }]
        })[0]
        self.__LOGGER.info('save complete')

        returnVal = {
            'obj_ref': str(res[6]) + '/' + str(res[0]) + '/' + str(res[4])
        }

        self.__LOGGER.info('Uploaded object: ')
        self.__LOGGER.info(returnVal)

        #END upload_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method upload_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def download_alignment(self, ctx, params):
        """
        Downloads alignment files in .bam, .sam and .bai formats. Also downloads alignment stats *
        :param params: instance of type "DownloadAlignmentParams" (* Required
           input parameters for downloading a reads alignment string
           source_ref -  object reference of alignment source. The object ref
           is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String, parameter
           "downloadSAM" of type "boolean" (A boolean - 0 for false, 1 for
           true. @range (0, 1)), parameter "downloadBAI" of type "boolean" (A
           boolean - 0 for false, 1 for true. @range (0, 1)), parameter
           "validate" of type "boolean" (A boolean - 0 for false, 1 for true.
           @range (0, 1)), parameter "ignore" of list of String
        :returns: instance of type "DownloadAlignmentOutput" (*  The output
           of the download method.  *) -> structure: parameter
           "destination_dir" of String, parameter "stats" of type
           "AlignmentStats" -> structure: parameter "properly_paired" of
           Long, parameter "multiple_alignments" of Long, parameter
           "singletons" of Long, parameter "alignment_rate" of Double,
           parameter "unmapped_reads" of Long, parameter "mapped_reads" of
           Long, parameter "total_reads" of Long
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN download_alignment

        self.__LOGGER.info('Running download_alignment with params:\n' +
                           pformat(params))

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError('{} parameter is required'.format(
                self.PARAM_IN_SRC_REF))

        try:
            alignment = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        # set the output dir
        uuid_str = str(uuid.uuid4())
        output_dir = os.path.join(self.scratch, 'download_' + uuid_str)
        self._mkdir_p(output_dir)

        file_ret = self.dfu.shock_to_file({
            'shock_id':
            alignment[0]['data']['file']['id'],
            'file_path':
            output_dir
        })
        if zipfile.is_zipfile(file_ret.get('file_path')):
            with zipfile.ZipFile(file_ret.get('file_path')) as z:
                z.extractall(output_dir)

        for f in glob.glob(output_dir + '/*.zip'):
            os.remove(f)

        bam_files = glob.glob(output_dir + '/*.bam')

        if len(bam_files) == 0:
            raise ValueError("Alignment object does not contain a bam file")

        for bam_file_path in bam_files:
            dir, file_name, file_base, file_ext = self._get_file_path_info(
                bam_file_path)
            if params.get(self.PARAM_IN_VALIDATE, False):
                validate_params = {'file_path': bam_file_path}
                if self._validate(validate_params) == 1:
                    raise Exception(
                        '{0} failed validation'.format(bam_file_path))

            if params.get(self.PARAM_IN_DOWNLOAD_BAI, False):
                bai_file = file_base + '.bai'
                bai_file_path = os.path.join(output_dir, bai_file)
                self.samtools.create_bai_from_bam(ifile=file_name,
                                                  ipath=output_dir,
                                                  ofile=bai_file)
                if not os.path.isfile(bai_file_path):
                    raise ValueError('Error creating {}'.format(bai_file_path))

            if params.get(self.PARAM_IN_DOWNLOAD_SAM, False):
                sam_file = file_base + '.sam'
                sam_file_path = os.path.join(output_dir, sam_file)
                self.samtools.convert_bam_to_sam(ifile=file_name,
                                                 ipath=output_dir,
                                                 ofile=sam_file)
                if not os.path.isfile(sam_file_path):
                    raise ValueError('Error creating {}'.format(sam_file_path))

        returnVal = {
            'destination_dir': output_dir,
            'stats': alignment[0]['data']['alignment_stats']
        }

        #END download_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method download_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def export_alignment(self, ctx, params):
        """
        Wrapper function for use by in-narrative downloaders to download alignments from shock *
        :param params: instance of type "ExportParams" (* Required input
           parameters for exporting a reads alignment string source_ref -
           object reference of alignment source. The object ref is
           'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String, parameter
           "exportSAM" of type "boolean" (A boolean - 0 for false, 1 for
           true. @range (0, 1)), parameter "exportBAI" of type "boolean" (A
           boolean - 0 for false, 1 for true. @range (0, 1)), parameter
           "validate" of type "boolean" (A boolean - 0 for false, 1 for true.
           @range (0, 1)), parameter "ignore" of list of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_alignment

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError('{} parameter is required'.format(
                self.PARAM_IN_SRC_REF))

        if params.get(self.PARAM_IN_VALIDATE, False) or \
           params.get('exportBAI', False) or \
           params.get('exportSAM', False):
            """
            Need to validate or convert files. Use download_alignment
            """
            download_params = {}
            for key, val in params.items():
                download_params[key.replace('export', 'download')] = val

            download_retVal = self.download_alignment(ctx, download_params)[0]

            export_dir = download_retVal['destination_dir']

            # package and load to shock
            ret = self.dfu.package_for_download({
                'file_path': export_dir,
                'ws_refs': [inref]
            })
            output = {'shock_id': ret['shock_id']}
        else:
            """
            return shock id from the object
            """
            try:
                alignment = self.dfu.get_objects({'object_refs':
                                                  [inref]})['data']
            except DFUError as e:
                self.__LOGGER.error(
                    'Logging stacktrace from workspace exception:\n' + e.data)
                raise
            output = {'shock_id': alignment[0]['data']['file']['id']}

        #END export_alignment

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_alignment return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
Example #15
0
class BiomUtil:
    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _process_params(self, params):
        logging.info('start validating import_matrix_from_biom params')

        # check for required parameters
        for p in [
                'obj_type', 'matrix_name', 'workspace_name', 'scale',
                'amplicon_set_name'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        obj_type = params.get('obj_type')
        if obj_type not in self.matrix_types:
            raise ValueError('Unknown matrix object type: {}'.format(obj_type))

        scale = params.get('scale')
        if scale not in SCALE_TYPES:
            raise ValueError('Unknown scale type: {}'.format(scale))

        biom_file = None
        tsv_file = None
        fasta_file = None
        metadata_keys = DEFAULT_META_KEYS

        if params.get('biom_tsv'):
            biom_tsv = params.get('biom_tsv')
            biom_file = biom_tsv.get('biom_file_biom_tsv')
            tsv_file = biom_tsv.get('tsv_file_biom_tsv')

            if not (biom_file and tsv_file):
                raise ValueError('missing BIOM or TSV file')

            biom_file = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                biom_file
            }).get('copy_file_path')

            tsv_file = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                tsv_file
            }).get('copy_file_path')
            mode = 'biom_tsv'
        elif params.get('biom_fasta'):
            biom_fasta = params.get('biom_fasta')
            biom_file = biom_fasta.get('biom_file_biom_fasta')
            fasta_file = biom_fasta.get('fasta_file_biom_fasta')

            if not (biom_file and fasta_file):
                raise ValueError('missing BIOM or FASTA file')

            biom_file = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                biom_file
            }).get('copy_file_path')

            fasta_file = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                fasta_file
            }).get('copy_file_path')
            mode = 'biom_fasta'
        elif params.get('tsv_fasta'):
            tsv_fasta = params.get('tsv_fasta')
            tsv_file = tsv_fasta.get('tsv_file_tsv_fasta')
            fasta_file = tsv_fasta.get('fasta_file_tsv_fasta')

            if not (tsv_file and fasta_file):
                raise ValueError('missing TSV or FASTA file')

            tsv_file = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                tsv_file
            }).get('copy_file_path')

            fasta_file = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                fasta_file
            }).get('copy_file_path')

            metadata_keys_str = tsv_fasta.get('metadata_keys_tsv_fasta')
            if metadata_keys_str:
                metadata_keys += [
                    x.strip() for x in metadata_keys_str.split(',')
                ]
            mode = 'tsv_fasta'
        elif params.get('tsv'):
            tsv = params.get('tsv')
            tsv_file = tsv.get('tsv_file_tsv')

            if not tsv_file:
                raise ValueError('missing TSV file')

            tsv_file = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                tsv_file
            }).get('copy_file_path')

            metadata_keys_str = tsv.get('metadata_keys_tsv')
            if metadata_keys_str:
                metadata_keys += [
                    x.strip() for x in metadata_keys_str.split(',')
                ]

            mode = 'tsv'
        else:
            raise ValueError('missing valid file group type in parameters')

        return (biom_file, tsv_file, fasta_file, mode,
                list(set(metadata_keys)))

    def _retrieve_value(self,
                        biom_metadata_dict,
                        tsv_metadata_df,
                        key,
                        required=False):

        if key in biom_metadata_dict:
            return {k.lower(): v
                    for k, v in biom_metadata_dict.items()}.get(key)
        elif key in tsv_metadata_df:
            return {k.lower(): v for k, v in tsv_metadata_df.items()}.get(key)
        elif required:
            raise ValueError('missing necessary [{}] from file'.format(key))
        else:
            return None

    def _search_taxon(self, scientific_name):
        """
        logic borrowed from: GFU.GenomeInterface
        https://github.com/kbaseapps/GenomeFileUtil/blob/master/lib/GenomeFileUtil/core/GenomeInterface.py#L216
        """
        taxon_id = None

        search_params = {
            "object_types": ["taxon"],
            "match_filter": {
                "lookup_in_keys": {
                    "scientific_name": {
                        "value": scientific_name
                    }
                },
                "exclude_subobjects": 1
            },
            "access_filter": {
                "with_private": 0,
                "with_public": 1
            },
            "sorting_rules": [{
                "is_object_property": 0,
                "property": "timestamp",
                "ascending": 0
            }]
        }

        objects = self.kbse.search_objects(search_params)['objects']

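        # fall back to searching taxon aliases if no scientific_name match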
        if not objects:
            search_params['match_filter']['lookup_in_keys'] = {
                "aliases": {
                    "value": scientific_name
                }
            }
            objects = self.kbse.search_objects(search_params)['objects']
        if objects:
            taxon_id = objects[0].get('object_name')

        return taxon_id

    def _fetch_taxon_level(self, taxon_char):

        taxon_level_mapping = {
            'l': 'Life',
            'd': 'Domain',
            'k': 'Kingdom',
            'p': 'Phylum',
            'c': 'Class',
            'o': 'Order',
            'f': 'Family',
            'g': 'Genus',
            's': 'Species'
        }

        return taxon_level_mapping.get(taxon_char[0].lower(), 'Unknown')

    def _fetch_taxonomy(self, datarow):
        lineage = self._retrieve_value([], datarow, 'taxonomy')
        if isinstance(lineage, str):
            delimiter = csv.Sniffer().sniff(lineage).delimiter
            lineage = [x.strip() for x in lineage.split(delimiter)]

        taxonomy = {'lineage': lineage}

        for key in ['score', 'taxonomy_source', 'species_name']:
            val = self._retrieve_value([], datarow, key)
            if val:
                taxonomy[key] = val

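        # walk lineage from the most specific rank; keep first matched taxon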
        for item in lineage[::-1]:
            scientific_name = item.split('_')[-1]
            taxon_level_char = item.split('_')[0]
            if scientific_name:
                taxon_id = self._search_taxon(scientific_name)
                if taxon_id:
                    taxon_ref = f"{self.taxon_wsname}/{taxon_id}"
                    taxon_level = self._fetch_taxon_level(taxon_level_char)

                    taxonomy.update({
                        'taxon_ref': taxon_ref,
                        'taxon_id': taxon_id,
                        'scientific_name': scientific_name,
                        'taxon_level': taxon_level
                    })
                    break

        return taxonomy

    def _retrieve_tsv_amplicon_set_data(self, tsv_file):
        amplicons = dict()

        try:
            logging.info('start parsing TSV file')
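            # sniff the delimiter (sep=None), then re-read with it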
            reader = pd.read_csv(tsv_file, sep=None, iterator=True)
            inferred_sep = reader._engine.data.dialect.delimiter
            df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide a valid TSV file')

        if 'consensus_sequence' not in df.columns.tolist():
            raise ValueError('TSV file does not include consensus_sequence')

        logging.info('start processing each row in TSV')
        for observation_id in df.index:
            taxonomy = self._fetch_taxonomy(df.loc[observation_id])

            amplicon = {
                'consensus_sequence': df.loc[observation_id,
                                             'consensus_sequence'],
                'taxonomy': taxonomy
            }

            amplicons.update({observation_id: amplicon})

        logging.info('finished parsing TSV file')

        return amplicons

    def _retrieve_tsv_fasta_amplicon_set_data(self, tsv_file, fasta_file):
        amplicons = dict()
        try:
            logging.info('start parsing FASTA file')
            fastq_dict = SeqIO.index(fasta_file, "fasta")
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide a valid FASTA file')

        try:
            logging.info('start parsing TSV file')
            reader = pd.read_csv(tsv_file, sep=None, iterator=True)
            inferred_sep = reader._engine.data.dialect.delimiter
            df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide a valid TSV file')

        logging.info('start processing files')
        for observation_id in df.index:
            if observation_id not in fastq_dict:
                raise ValueError('FASTA file does not have [{}] OTU id'.format(
                    observation_id))

            taxonomy = self._fetch_taxonomy(df.loc[observation_id])

            amplicon = {
                'consensus_sequence': str(fastq_dict.get(observation_id).seq),
                'taxonomy': taxonomy
            }
            amplicons.update({observation_id: amplicon})

        logging.info('finished processing files')
        return amplicons

    def _retrieve_biom_fasta_amplicon_set_data(self, biom_file, fasta_file):
        amplicons = dict()
        try:
            logging.info('start parsing FASTA file')
            fastq_dict = SeqIO.index(fasta_file, "fasta")
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide a valid FASTA file')

        logging.info('start parsing BIOM file')
        table = biom.load_table(biom_file)

        observation_ids = table._observation_ids.tolist()
        observation_metadata = table._observation_metadata

        logging.info('start processing files')
        for index, observation_id in enumerate(observation_ids):
            if observation_id not in fastq_dict:
                raise ValueError('FASTA file does not have [{}] OTU id'.format(
                    observation_id))

            taxonomy = self._fetch_taxonomy(observation_metadata[index])

            amplicon = {
                'consensus_sequence': str(fastq_dict.get(observation_id).seq),
                'taxonomy': taxonomy
            }

            amplicons.update({observation_id: amplicon})

        logging.info('finished processing files')
        return amplicons

    def _retrieve_biom_tsv_amplicon_set_data(self, biom_file, tsv_file):
        amplicons = dict()
        try:
            logging.info('start parsing TSV file')
            reader = pd.read_csv(tsv_file, sep=None, iterator=True)
            inferred_sep = reader._engine.data.dialect.delimiter
            df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide a valid TSV file')

        if 'consensus_sequence' not in df.columns.tolist():
            raise ValueError('TSV file does not include consensus_sequence')

        logging.info('start parsing BIOM file')
        table = biom.load_table(biom_file)

        observation_ids = table._observation_ids.tolist()
        observation_metadata = table._observation_metadata

        logging.info('start processing files')
        for index, observation_id in enumerate(observation_ids):
            if observation_id not in df.index:
                raise ValueError('TSV file does not have [{}] OTU id'.format(
                    observation_id))

            taxonomy = self._fetch_taxonomy(df.loc[observation_id])

            amplicon = {
                'consensus_sequence': df.loc[observation_id,
                                             'consensus_sequence'],
                'taxonomy': taxonomy
            }

            amplicons.update({observation_id: amplicon})

        logging.info('finished processing files')
        return amplicons

    def _file_to_amplicon_set_data(self, biom_file, tsv_file, fasta_file, mode,
                                   refs, description, matrix_obj_ref):

        logging.info('start parsing amplicon_set_data')

        amplicon_set_data = dict()

        if mode == 'biom_tsv':
            amplicons = self._retrieve_biom_tsv_amplicon_set_data(
                biom_file, tsv_file)
        elif mode == 'biom_fasta':
            amplicons = self._retrieve_biom_fasta_amplicon_set_data(
                biom_file, fasta_file)
        elif mode == 'tsv_fasta':
            amplicons = self._retrieve_tsv_fasta_amplicon_set_data(
                tsv_file, fasta_file)
        elif mode == 'tsv':
            amplicons = self._retrieve_tsv_amplicon_set_data(tsv_file)
        else:
            raise ValueError(
                'error parsing _file_to_amplicon_set_data, mode: {}'.format(
                    mode))

        amplicon_set_data.update({'amplicons': amplicons})

        if 'reads_set_ref' in refs:
            amplicon_set_data['reads_set_ref'] = refs.get('reads_set_ref')

        if description:
            amplicon_set_data['description'] = description

        matrix_obj_ref_array = matrix_obj_ref.split('/')
        amplicon_set_data['amplicon_matrix_ref'] = '{}/{}'.format(
            matrix_obj_ref_array[0], matrix_obj_ref_array[1])

        return amplicon_set_data

    def _file_to_amplicon_data(self,
                               biom_file,
                               tsv_file,
                               mode,
                               refs,
                               matrix_name,
                               workspace_id,
                               scale,
                               description,
                               metadata_keys=None):

        amplicon_data = refs

        if mode.startswith('biom'):
            logging.info('start parsing BIOM file for matrix data')
            table = biom.load_table(biom_file)
            observation_metadata = table._observation_metadata
            sample_metadata = table._sample_metadata

            matrix_data = {
                'row_ids': table._observation_ids.tolist(),
                'col_ids': table._sample_ids.tolist(),
                'values': table.matrix_data.toarray().tolist()
            }

            logging.info('start building attribute mapping object')
            amplicon_data.update(
                self.get_attribute_mapping("row", observation_metadata,
                                           matrix_data, matrix_name, refs,
                                           workspace_id))
            amplicon_data.update(
                self.get_attribute_mapping("col", sample_metadata, matrix_data,
                                           matrix_name, refs, workspace_id))

            amplicon_data['attributes'] = {}
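            # carry over table-level BIOM attributes, decoding any bytes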
            for k in ('create_date', 'generated_by'):
                val = getattr(table, k)
                if not val:
                    continue
                if isinstance(val, bytes):
                    amplicon_data['attributes'][k] = val.decode('utf-8')
                else:
                    amplicon_data['attributes'][k] = str(val)
        elif mode.startswith('tsv'):
            observation_metadata = None
            sample_metadata = None
            try:
                logging.info('start parsing TSV file for matrix data')
                reader = pd.read_csv(tsv_file, sep=None, iterator=True)
                inferred_sep = reader._engine.data.dialect.delimiter
                df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
            except Exception:
                raise ValueError(
                    'Cannot parse file. Please provide a valid TSV file')
            else:
                metadata_df = None
                if metadata_keys:
                    shared_metadata_keys = list(
                        set(metadata_keys) & set(df.columns))
                    if mode == 'tsv' and 'consensus_sequence' not in shared_metadata_keys:
                        raise ValueError(
                            'TSV file does not include consensus_sequence')
                    if shared_metadata_keys:
                        metadata_df = df[shared_metadata_keys]
                        df.drop(columns=shared_metadata_keys, inplace=True)
                try:
                    df = df.astype(float)
                except ValueError:
                    err_msg = 'Found non-numeric values. The matrix may only contain numeric values.\n'
                    err_msg += 'Please list any non-numeric column names in the Metadata Keys field.'
                    raise ValueError(err_msg)
                df.fillna(0, inplace=True)
                matrix_data = {
                    'row_ids': df.index.tolist(),
                    'col_ids': df.columns.tolist(),
                    'values': df.values.tolist()
                }

            logging.info('start building attribute mapping object')
            amplicon_data.update(
                self.get_attribute_mapping("row", observation_metadata,
                                           matrix_data, matrix_name, refs,
                                           workspace_id, metadata_df))
            amplicon_data.update(
                self.get_attribute_mapping("col", sample_metadata, matrix_data,
                                           matrix_name, refs, workspace_id))

            amplicon_data['attributes'] = {}
        else:
            raise ValueError(
                'error parsing _file_to_amplicon_data, mode: {}'.format(mode))

        amplicon_data.update({'data': matrix_data})

        amplicon_data['search_attributes'] = [
            f'{k}|{v}' for k, v in amplicon_data['attributes'].items()
        ]

        amplicon_data['scale'] = scale
        if description:
            amplicon_data['description'] = description

        return amplicon_data

    def get_attribute_mapping(self,
                              axis,
                              metadata,
                              matrix_data,
                              matrix_name,
                              refs,
                              workspace_id,
                              metadata_df=None):
        mapping_data = {}
        axis_ids = matrix_data[f'{axis}_ids']
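        # a supplied AttributeMapping ref must cover every id in the matrix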
        if refs.get(f'{axis}_attributemapping_ref'):
            am_data = self.dfu.get_objects(
                {'object_refs':
                 [refs[f'{axis}_attributemapping_ref']]})['data'][0]['data']
            unmatched_ids = set(axis_ids) - set(am_data['instances'].keys())
            if unmatched_ids:
                name = "Column" if axis == 'col' else "Row"
                raise ValueError(
                    f"The following {name} IDs from the uploaded matrix do not match "
                    f"the supplied {name} attribute mapping: {', '.join(unmatched_ids)}"
                    f"\nPlease verify the input data or upload an Excel file with a "
                    f"{name} mapping tab.")
            else:
                mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}

        elif metadata:
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[
                f'{axis}_attributemapping_ref'] = self._metadata_to_attribute_mapping(
                    axis_ids, metadata, name, workspace_id)
            # if coming from biom file, metadata and axis IDs are guaranteed to match
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}
        elif metadata_df is not None:
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[
                f'{axis}_attributemapping_ref'] = self._meta_df_to_attribute_mapping(
                    axis_ids, metadata_df, name, workspace_id)
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}

        return mapping_data

    def _meta_df_to_attribute_mapping(self, axis_ids, metadata_df, obj_name,
                                      ws_id):
        data = {'ontology_mapping_method': "TSV file", 'instances': {}}
        attribute_keys = metadata_df.columns.tolist()
        data['attributes'] = [{
            'attribute': key,
            'source': 'upload'
        } for key in attribute_keys]

        for axis_id in axis_ids:
            data['instances'][axis_id] = metadata_df.loc[axis_id].tolist()

        logging.info(
            'start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id":
            ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": data,
                "name": obj_name
            }]
        })[0]

        return f'{info[6]}/{info[0]}/{info[4]}'

    def _metadata_to_attribute_mapping(self, instances, metadata, obj_name,
                                       ws_id):
        data = {'ontology_mapping_method': "BIOM file", 'instances': {}}
        sample_set = metadata[0:min(len(metadata), 25)]
        metadata_keys = sorted(
            set((k for m_dict in sample_set for k in m_dict)))
        data['attributes'] = [{
            'attribute': key,
            'source': 'upload'
        } for key in metadata_keys]
        for inst, meta in zip(instances, metadata):
            data['instances'][inst] = [
                str(meta[attr]) for attr in metadata_keys
            ]

        logging.info(
            'start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id":
            ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": data,
                "name": obj_name
            }]
        })[0]
        return f'{info[6]}/{info[0]}/{info[4]}'

    def _generate_report(self, matrix_obj_ref, amplicon_set_obj_ref,
                         new_row_attr_ref, new_col_attr_ref, workspace_name):
        """
        _generate_report: generate summary report
        """

        objects_created = [{
            'ref': matrix_obj_ref,
            'description': 'Imported Amplicon Matrix'
        }, {
            'ref': amplicon_set_obj_ref,
            'description': 'Imported Amplicon Set'
        }]

        if new_row_attr_ref:
            objects_created.append({
                'ref':
                new_row_attr_ref,
                'description':
                'Imported Amplicons (Row) Attribute Mapping'
            })

        if new_col_attr_ref:
            objects_created.append({
                'ref':
                new_col_attr_ref,
                'description':
                'Imported Samples (Column) Attribute Mapping'
            })

        report_params = {
            'message': '',
            'objects_created': objects_created,
            'workspace_name': workspace_name,
            'report_object_name':
            'import_matrix_from_biom_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _df_to_tsv(self, amplicon_set_df, result_dir, amplicon_set_ref):
        logging.info('writing amplicon set data frame to TSV file')
        amplicon_set_obj = self.dfu.get_objects(
            {'object_refs': [amplicon_set_ref]})['data'][0]
        amplicon_set_info = amplicon_set_obj['info']
        amplicon_set_name = amplicon_set_info[1]

        file_path = os.path.join(result_dir, amplicon_set_name + ".tsv")

        amplicon_set_df.to_csv(file_path, sep='\t', index=True, header=True)

        return file_path

    def _amplicon_set_to_df(self, amplicon_set_ref):
        logging.info('converting amplicon set to data frame')
        am_set_data = self.dfu.get_objects({'object_refs': [amplicon_set_ref]
                                            })['data'][0]['data']

        amplicon_matrix_ref = am_set_data.get('amplicon_matrix_ref')
        matrix_data = self.dfu.get_objects(
            {'object_refs': [amplicon_matrix_ref]})['data'][0]['data']
        matrix_value_data = matrix_data.get('data')

        index = matrix_value_data.get('row_ids')
        columns = matrix_value_data.get('col_ids')
        values = matrix_value_data.get('values')

        df = pd.DataFrame(values, index=index, columns=columns)

        amplicons = am_set_data.get('amplicons')

        meta_index = list()

        meta_columns = [
            'taxonomy', 'taxon_id', 'taxon_ref', 'taxon_level', 'score',
            'taxonomy_source', 'species_name', 'consensus_sequence'
        ]
        meta_values = list()
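        # flatten each amplicon's taxonomy and sequence into one row per OTU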
        for otu_id, amplicon in amplicons.items():
            meta_index.append(otu_id)

            taxonomy_data = amplicon.get('taxonomy')

            taxonomy = taxonomy_data.get('lineage')
            taxon_id = taxonomy_data.get('taxon_id')
            taxon_ref = taxonomy_data.get('taxon_ref')
            taxon_level = taxonomy_data.get('taxon_level')
            score = taxonomy_data.get('score')
            taxonomy_source = taxonomy_data.get('taxonomy_source')
            species_name = taxonomy_data.get('species_name')

            consensus_sequence = amplicon.get('consensus_sequence')

            meta_values.append([
                taxonomy, taxon_id, taxon_ref, taxon_level, score,
                taxonomy_source, species_name, consensus_sequence
            ])

        meta_df = pd.DataFrame(meta_values,
                               index=meta_index,
                               columns=meta_columns)

        merged_df = df.merge(meta_df,
                             left_index=True,
                             right_index=True,
                             how='left',
                             validate='one_to_one')

        return merged_df

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.data_util = DataUtil(config)
        self.attr_util = AttributesUtil(config)
        self.matrix_util = MatrixUtil(config)
        self.matrix_types = [
            x.split(".")[1].split('-')[0]
            for x in self.data_util.list_generic_types()
        ]
        self.taxon_wsname = config['taxon-workspace-name']
        self.kbse = KBaseSearchEngine(config['search-url'])

    def import_matrix_from_biom(self, params):
        """
        arguments:
        obj_type: one of ExpressionMatrix, FitnessMatrix, DifferentialExpressionMatrix
        matrix_name: matrix object name
        workspace_name: workspace name matrix object to be saved to
        input_shock_id: file shock id
        or
        input_file_path: absolute file path
        or
        input_staging_file_path: staging area file path

        optional arguments:
        col_attributemapping_ref: column AttributeMapping reference
        row_attributemapping_ref: row AttributeMapping reference
        genome_ref: genome reference
        matrix_obj_ref: Matrix reference
        """

        (biom_file, tsv_file, fasta_file, mode,
         metadata_keys) = self._process_params(params)

        workspace_name = params.get('workspace_name')
        matrix_name = params.get('matrix_name')
        amplicon_set_name = params.get('amplicon_set_name')
        obj_type = params.get('obj_type')
        scale = params.get('scale')
        description = params.get('description')
        refs = {k: v for k, v in params.items() if "_ref" in k}

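        # accept either a workspace name or a numeric workspace id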
        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        amplicon_data = self._file_to_amplicon_data(biom_file, tsv_file, mode,
                                                    refs, matrix_name,
                                                    workspace_id, scale,
                                                    description, metadata_keys)

        new_row_attr_ref = None
        if not params.get('row_attributemapping_ref'):
            new_row_attr_ref = amplicon_data.get('row_attributemapping_ref')

        new_col_attr_ref = None
        if not params.get('col_attributemapping_ref'):
            new_col_attr_ref = amplicon_data.get('col_attributemapping_ref')

        logging.info('start saving Matrix object: {}'.format(matrix_name))
        matrix_obj_ref = self.data_util.save_object({
            'obj_type':
            'KBaseMatrices.{}'.format(obj_type),
            'obj_name':
            matrix_name,
            'data':
            amplicon_data,
            'workspace_name':
            workspace_id
        })['obj_ref']

        amplicon_set_data = self._file_to_amplicon_set_data(
            biom_file, tsv_file, fasta_file, mode, refs, description,
            matrix_obj_ref)

        logging.info(
            'start saving AmpliconSet object: {}'.format(amplicon_set_name))
        amplicon_set_obj_ref = self.data_util.save_object({
            'obj_type':
            'KBaseExperiments.AmpliconSet',
            'obj_name':
            amplicon_set_name,
            'data':
            amplicon_set_data,
            'workspace_name':
            workspace_id
        })['obj_ref']

        logging.info(
            'start resaving Matrix object with amplicon set: {}'.format(
                matrix_name))
        amplicon_data['amplicon_set_ref'] = '{}/{}'.format(
            workspace_id, amplicon_set_name)
        matrix_obj_ref = self.data_util.save_object({
            'obj_type':
            'KBaseMatrices.{}'.format(obj_type),
            'obj_name':
            matrix_name,
            'data':
            amplicon_data,
            'workspace_name':
            workspace_id
        })['obj_ref']

        returnVal = {
            'matrix_obj_ref': matrix_obj_ref,
            'amplicon_set_obj_ref': amplicon_set_obj_ref
        }

        report_output = self._generate_report(matrix_obj_ref,
                                              amplicon_set_obj_ref,
                                              new_row_attr_ref,
                                              new_col_attr_ref, workspace_name)

        returnVal.update(report_output)

        return returnVal

    def export_amplicon_set_tsv(self, params):
        """
        export AmpliconSet as TSV
        """
        logging.info('start exporting amplicon set object')
        amplicon_set_ref = params.get('input_ref')

        amplicon_set_df = self._amplicon_set_to_df(amplicon_set_ref)

        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)

        self._df_to_tsv(amplicon_set_df, result_dir, amplicon_set_ref)

        package_details = self.dfu.package_for_download({
            'file_path':
            result_dir,
            'ws_refs': [amplicon_set_ref]
        })

        return {'shock_id': package_details['shock_id']}
class sample_uploader:
    '''
    Module Name:
    sample_uploader

    Module Description:
    A KBase module: sample_uploader
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.14"
    GIT_URL = "git@github.com:Tianhao-Gu/sample_uploader.git"
    GIT_COMMIT_HASH = "fddb10ca67368def8437569f8157b71b59f41e1c"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.workspace_url = config['workspace-url']
        self.scratch = config['scratch']
        # janky, but works for now
        self.sw_url = config.get('kbase-endpoint') + '/service_wizard'
        self.dfu = DataFileUtil(url=self.callback_url)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        #END_CONSTRUCTOR
        pass

    def import_samples(self, ctx, params):
        """
        :param params: instance of type "ImportSampleInputs" -> structure:
           parameter "sample_set_ref" of String, parameter "sample_file" of
           String, parameter "workspace_name" of String, parameter
           "workspace_id" of Long, parameter "file_format" of String,
           parameter "description" of String, parameter "set_name" of String,
           parameter "header_row_index" of Long, parameter "id_field" of
           String, parameter "output_format" of String, parameter
           "taxonomy_source" of String, parameter "num_otus" of Long,
           parameter "incl_seq" of Long, parameter "otu_prefix" of String,
           parameter "share_within_workspace" of Long, parameter
           "prevalidate" of Long, parameter "incl_input_in_output" of Long
        :returns: instance of type "ImportSampleOutputs" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "sample_set" of type "SampleSet" -> structure:
           parameter "samples" of list of type "sample_info" -> structure:
           parameter "id" of type "sample_id", parameter "name" of String,
           parameter "description" of String, parameter "sample_set_ref" of
           String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN import_samples
        print("Beginning sample import with the following parameters:")
        print(f"params -- {params}")
        sample_set = {"samples": []}
        # Check if we have an existing Sample Set as input
        # if so, download
        if params.get('sample_set_ref'):
            ret = self.dfu.get_objects(
                {'object_refs': [params['sample_set_ref']]})['data'][0]
            sample_set = ret['data']
            set_name = ret['info'][1]
            save_ws_id = params['sample_set_ref'].split('/')[0]
        else:
            if not params.get('set_name'):
                raise ValueError(
                    "Sample set name is required when a new SampleSet object is created."
                )
            set_name = params['set_name']
            save_ws_id = params.get('workspace_id')
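        # header_row_index is 1-based; SESAR files default to the second row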
        if params.get('header_row_index'):
            header_row_index = int(params["header_row_index"]) - 1
        else:
            header_row_index = 0
            if params.get('file_format') == "SESAR":
                header_row_index = 1

        username = ctx['user_id']

        if params.get('file_format') == 'ENIGMA':
            # ENIGMA_mappings['verification_mapping'].update(
            #     {key: ("is_string", []) for key in ENIGMA_mappings['basic_columns']}
            # )
            sample_set, errors = import_samples_from_file(
                params, self.sw_url, self.workspace_url, username,
                ctx['token'], ENIGMA_mappings['column_mapping'],
                ENIGMA_mappings.get('groups',
                                    []), ENIGMA_mappings['date_columns'],
                ENIGMA_mappings.get('column_unit_regex',
                                    []), sample_set, header_row_index)
        elif params.get('file_format') == 'SESAR':
            # SESAR_mappings['verification_mapping'].update(
            #     {key: ("is_string", []) for key in SESAR_mappings['basic_columns']}
            # )
            sample_set, errors = import_samples_from_file(
                params, self.sw_url, self.workspace_url, username,
                ctx['token'], SESAR_mappings['column_mapping'],
                SESAR_mappings.get('groups',
                                   []), SESAR_mappings['date_columns'],
                SESAR_mappings.get('column_unit_regex',
                                   []), sample_set, header_row_index)
        elif params.get('file_format') == 'KBASE':
            sample_set, errors = import_samples_from_file(
                params, self.sw_url, self.workspace_url, username,
                ctx['token'], {}, [], [], [], sample_set, header_row_index)
        else:
            raise ValueError(
                "Only SESAR, ENIGMA and KBASE formats are currently supported for importing "
                f"samples. File of format {params.get('file_format')} not supported.")

        file_links = []
        sample_set_ref = None
        html_link = None

        if errors:
            # create UI to display the errors clearly
            html_link = _error_ui(errors, self.scratch)
        else:
            # only save object if there are no errors
            obj_info = self.dfu.save_objects({
                'id':
                save_ws_id,
                'objects': [{
                    "name": set_name,
                    "type": "KBaseSets.SampleSet",
                    "data": sample_set
                }]
            })[0]

            sample_set_ref = '/'.join(
                [str(obj_info[6]),
                 str(obj_info[0]),
                 str(obj_info[4])])
            sample_file_name = os.path.basename(
                params['sample_file']).split('.')[0] + '_OTU'

            # -- Format outputs below --
            # if output file format specified, add one to output
            if params.get('output_format') in ['csv', 'xls']:
                otu_path = sample_set_to_OTU_sheet(sample_set,
                                                   sample_file_name,
                                                   self.scratch, params)
                file_links.append({
                    'path':
                    otu_path,
                    'name':
                    os.path.basename(otu_path),
                    'label':
                    "OTU template file",
                    'description':
                    "file with each column containing the assigned sample_id and sample "
                    "name of each saved sample. Intended for uploading OTU data."
                })

        if params.get('incl_input_in_output'):
            sample_file = params.get('sample_file')
            if not os.path.isfile(sample_file):
                # try prepending '/staging/' to file and check then
                if os.path.isfile(os.path.join('/staging', sample_file)):
                    sample_file = os.path.join('/staging', sample_file)
                else:
                    raise ValueError(
                        f"input file {sample_file} does not exist.")
            sample_file_copy = os.path.join(self.scratch,
                                            os.path.basename(sample_file))
            shutil.copy(sample_file, sample_file_copy)
            file_links.append({
                "path":
                sample_file_copy,
                "name":
                os.path.basename(sample_file_copy),
                "label":
                "Input Sample file",
                "description":
                "Input file provided to create the sample set."
            })

        # create report
        report_client = KBaseReport(self.callback_url)
        report_data = {
            'report_object_name':
            "SampleSet_import_report_" + str(uuid.uuid4()),
            'workspace_name': params['workspace_name']
        }
        if file_links:
            report_data['file_links'] = file_links
        if sample_set_ref:
            report_data[
                'message'] = f"SampleSet object named \"{set_name}\" imported."
            report_data['objects_created'] = [{'ref': sample_set_ref}]

        if html_link:
            report_data['html_links'] = [{
                'path':
                html_link,
                'name':
                'index.html',
                'description':
                'Sample Set Import Error UI'
            }]
            report_data['direct_html_link_index'] = 0
        report_info = report_client.create_extended_report(report_data)
        output = {
            'report_ref': report_info['ref'],
            'report_name': report_info['name'],
            'sample_set': sample_set,
            'sample_set_ref': sample_set_ref,
            'errors': errors
        }
        #END import_samples

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method import_samples return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def import_samples_from_IGSN(self, ctx, params):
        """
        :param params: instance of type "ImportSampleIGSNInputs" ->
           structure: parameter "sample_set_ref" of String, parameter "igsns"
           of list of String, parameter "workspace_name" of String, parameter
           "workspace_id" of Long, parameter "description" of String,
           parameter "set_name" of String, parameter "output_format" of
           String, parameter "taxonomy_source" of String, parameter
           "num_otus" of Long, parameter "incl_seq" of Long, parameter
           "otu_prefix" of String, parameter "share_within_workspace" of
           Long, parameter "prevalidate" of Long, parameter
           "incl_input_in_output" of Long
        :returns: instance of type "ImportSampleOutputs" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "sample_set" of type "SampleSet" -> structure:
           parameter "samples" of list of type "sample_info" -> structure:
           parameter "id" of type "sample_id", parameter "name" of String,
           parameter "description" of String, parameter "sample_set_ref" of
           String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN import_samples_from_IGSN

        igsns = params.get('igsns')
        if not igsns:
            raise ValueError('Please provide IGSNs')

        if isinstance(igsns, str):
            if igsns.isalnum():
                # single igsn given e.g. 'IEAWH0001'
                igsns = [igsns]
            else:
                # multiple igsn given e.g. 'IEAWH0001, GEE0000O4' or 'IEAWH0001; GEE0000O4'
                delimiter = csv.Sniffer().sniff(igsns).delimiter
                igsns = [x.strip() for x in igsns.split(delimiter)]

        logging.info('Start importing samples from IGSNs: {}'.format(igsns))

        sample_file_name = 'igsn_sample_{}.csv'.format(str(uuid.uuid4()))
        sample_file_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        os.makedirs(sample_file_dir)
        sample_file = os.path.join(sample_file_dir, sample_file_name)

        igsns_to_csv(igsns, sample_file)

        params['sample_file'] = sample_file
        params['file_format'] = 'SESAR'

        output = self.import_samples(ctx, params)[0]
        #END import_samples_from_IGSN

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method import_samples_from_IGSN return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def generate_OTU_sheet(self, ctx, params):
        """
        :param params: instance of type "GenerateOTUSheetParams" (Generate a
           customized OTU worksheet using a SampleSet input to generate the
           appropriate columns.) -> structure: parameter "workspace_name" of
           String, parameter "workspace_id" of Long, parameter
           "sample_set_ref" of String, parameter "output_name" of String,
           parameter "output_format" of String, parameter "num_otus" of Long,
           parameter "taxonomy_source" of String, parameter "incl_seq" of
           Long, parameter "otu_prefix" of String
        :returns: instance of type "GenerateOTUSheetOutputs" -> structure:
           parameter "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN generate_OTU_sheet
        # first we download sampleset
        sample_set_ref = params.get('sample_set_ref')
        ret = self.dfu.get_objects({'object_refs':
                                    [sample_set_ref]})['data'][0]
        sample_set = ret['data']
        if params.get('output_name'):
            output_name = params.get('output_name')
        else:
            # if output_name not specified use name of sample_set as output + "_OTUs"
            output_name = ret['info'][1] + "_OTUs"
        otu_path = sample_set_to_OTU_sheet(sample_set, output_name,
                                           self.scratch, params)
        report_client = KBaseReport(self.callback_url)
        report_name = "Generate_OTU_sheet_report_" + str(uuid.uuid4())
        report_info = report_client.create_extended_report({
            'file_links': [{
                'path':
                otu_path,
                'name':
                os.path.basename(otu_path),
                'label':
                "CSV with headers for OTU",
                'description':
                "CSV file with each column containing the assigned sample_id and sample "
                "name of each saved sample. Intended for uploading OTU data."
            }],
            'report_object_name':
            report_name,
            'workspace_name':
            params['workspace_name']
        })
        output = {
            'report_ref': report_info['ref'],
            'report_name': report_info['name'],
        }

        #END generate_OTU_sheet

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method generate_OTU_sheet return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def update_sample_set_acls(self, ctx, params):
        """
        :param params: instance of type "update_sample_set_acls_params" ->
           structure: parameter "workspace_name" of String, parameter
           "workspace_id" of Long, parameter "sample_set_ref" of String,
           parameter "new_users" of list of String, parameter "is_reader" of
           Long, parameter "is_writer" of Long, parameter "is_admin" of Long,
           parameter "share_within_workspace" of Long
        :returns: instance of type "update_sample_set_acls_output" ->
           structure: parameter "status" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN update_sample_set_acls

        # first get sample_set object
        sample_set_ref = params.get('sample_set_ref')
        ret = self.dfu.get_objects({'object_refs':
                                    [sample_set_ref]})['data'][0]
        sample_set = ret['data']
        sample_url = get_sample_service_url(self.sw_url)

        acls = {'read': [], 'write': [], 'admin': []}

        if params.get('share_within_workspace'):
            acls = get_workspace_user_perms(self.workspace_url,
                                            params.get('workspace_id'),
                                            ctx['token'], ctx['user_id'], acls)

        for new_user in params.get('new_users'):
            if params.get('is_admin'):
                acls['admin'].append(new_user)
            elif params.get('is_writer'):
                acls['write'].append(new_user)
            elif params.get('is_reader'):
                acls['read'].append(new_user)

        for sample in sample_set['samples']:
            sample_id = sample['id']
            status = update_acls(sample_url, sample_id, acls, ctx['token'])
        output = {"status": status}
        #END update_sample_set_acls

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method update_sample_set_acls return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def export_samples(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (export function for
           samples) -> structure: parameter "input_ref" of String, parameter
           "file_format" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_samples
        if not params.get('input_ref'):
            raise ValueError("'input_ref' parameter is required")
        sample_set_ref = params.get('input_ref')
        output_file_format = params.get('file_format', 'SESAR')

        ret = self.dfu.get_objects({'object_refs':
                                    [sample_set_ref]})['data'][0]
        sample_set = ret['data']
        sample_set_name = ret['info'][1]
        sample_url = get_sample_service_url(self.sw_url)

        export_package_dir = os.path.join(self.scratch, "output")
        if not os.path.isdir(export_package_dir):
            os.mkdir(export_package_dir)
        output_file = os.path.join(export_package_dir,
                                   '_'.join(sample_set_name.split()) + ".csv")

        sample_set_to_output(sample_set, sample_url, ctx['token'], output_file,
                             output_file_format)

        # package it up
        package_details = self.dfu.package_for_download({
            'file_path':
            export_package_dir,
            'ws_refs': [params['input_ref']]
        })

        output = {
            'shock_id': package_details['shock_id'],
            'result_dir': export_package_dir
        }
        #END export_samples

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_samples return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def link_reads(self, ctx, params):
        """
        :param params: instance of type "LinkReadsParams" -> structure:
           parameter "workspace_name" of String, parameter "workspace_id" of
           String, parameter "sample_set_ref" of String, parameter "links" of
           list of type "ReadsLink" (Create links between samples and reads
           objects.) -> structure: parameter "sample_name" of String,
           parameter "reads_ref" of String
        :returns: instance of type "LinkReadsOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String,
           parameter "links" of list of unspecified object
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN link_reads
        logging.info(params)

        ss = SampleService(self.sw_url, service_ver='dev')

        sample_set_ref = params['sample_set_ref']
        sample_set_obj = self.dfu.get_objects(
            {'object_refs': [sample_set_ref]})['data'][0]['data']
        sample_name_2_info = {d['name']: d for d in sample_set_obj['samples']}

        links = [(d['sample_name'][0], d['reads_ref'])
                 for d in params['links']]

        new_data_links = []
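        # link reads to each sample's first node, updating existing links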
        for sample_name, reads_ref in links:
            sample_id = sample_name_2_info[sample_name]['id']
            version = sample_name_2_info[sample_name]['version']
            sample = ss.get_sample({
                'id': sample_id,
                'version': version,
            })
            ret = ss.create_data_link(
                dict(
                    upa=reads_ref,
                    id=sample_id,
                    version=version,
                    node=sample['node_tree'][0]['id'],
                    update=1,
                ))
            new_data_links.append(ret)

        report_client = KBaseReport(self.callback_url)
        report_info = report_client.create_extended_report({
            'workspace_name':
            params['workspace_name'],
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
            'links': new_data_links,
        }
        #END link_reads

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method link_reads return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
Example #17
0
class ParmigeneUtils:

    R_BIN = '/kb/deployment/bin'
    PARMI_OUT_DIR = 'parmigene_output'
    PARAM_IN_WS = 'workspace_name'
    PARAM_IN_MATRIX = 'input_obj_ref'
    PARAM_OUT_MATRIX = 'parmigene_matrix_name'
    OMP_NUM_THREADS = 'num_threads'

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise
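
    # Note (not in the original source): on Python 3 the errno check above is
    # roughly equivalent to os.makedirs(path, exist_ok=True); the explicit form
    # is kept here to preserve the original behavior.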

    def _validate_run_mi_params(self, params):
        """
        _validate_run_mi_params:
            validates params passed to run_mi method
        """

        logging.info('start validating run_mi params')

        # check for required parameters
        for p in [
                self.PARAM_IN_MATRIX, self.PARAM_IN_WS, self.PARAM_OUT_MATRIX
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _build_rParmigene_script(self, mi, algthm, num_threads, eps):
        """
        _build_rParmigene_script: build a sequence of R command calls according to params
        Note: this builds calls to functions from the parmigene package, which require a
        mutual information matrix (mi), the algorithm name (algthm), the number of threads
        to use (num_threads) and, sometimes, a positive numeric criterion (eps) used to
        remove the weakest edge of each triple of nodes.
        """
        parmi_scrpt = 'library(parmigene)\n'
        parmi_scrpt += algthm + '(' + mi + ',' + eps + ')\n'
        # save the results in the memory
        # 1) store species ordination
        parmi_scrpt += 'variableScores <- vg_data.parmi$species\n'
        # 2) store site ordination
        parmi_scrpt += 'sampleScores <- vg_data.parmi$points\n'

        # save the results to the current dir
        # Write CSV in R
        parmi_scrpt += 'write.csv(dist_matrix,file="dist_matrix.csv",row.names=TRUE,na="")\n'
        parmi_scrpt += 'write.csv(variableScores,file="species_ordination.csv",' + \
                       'row.names=TRUE,na="")\n'

        # Write JSON in R
        parmi_scrpt += 'write_json(toJSON(dist_matrix),path="dist_matrix.json",pretty=TRUE,' + \
                       'auto_unbox=FALSE)\n'

        # save Parmigene plot
        parmi_scrpt += 'bmp(file="saving_mi_plot.bmp",width=6,height=4,units="in",res=100)\n'
        parmi_scrpt += 'plot(vg_data.parmi,type="n",display="sites")\n'
        parmi_scrpt += 'points(vg_data.parmi)\n'
        parmi_scrpt += 'dev.off()\n'

        parmi_rscript = 'parmi_script.R'
        rscrpt_file_path = os.path.join(self.output_dir, parmi_rscript)

        with open(rscrpt_file_path, 'w') as r_file:
            r_file.write(parmi_scrpt)
        return rscrpt_file_path
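
    # Illustrative sketch (not part of the original module): a hypothetical call
    # to the script builder above, with placeholder values.  Arguments are passed
    # as strings because they are concatenated directly into the R source.
    #
    #   rscript_path = self._build_rParmigene_script(
    #       mi='mi_matrix',      # name of the MI matrix variable in R
    #       algthm='aracne.a',   # parmigene algorithm to call
    #       num_threads='4',     # threads intended for OMP_NUM_THREADS
    #       eps='0.05',          # tolerance for dropping the weakest edge
    #   )
    #   exit_code = self._execute_r_script(rscript_path)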

    def _execute_r_script(self, rfile_name):
        """
        _execute_r_script: Calling the Rscript executable to run the R script in rfile_name
        """
        logging.info('Calling R......')

        result_dir = os.path.dirname(rfile_name)
        if not result_dir:
            result_dir = self.working_dir

        rcmd = [os.path.join(self.R_BIN, 'Rscript')]
        rcmd.append(rfile_name)

        logging.info(
            'Running Parmigene script in current working directory: {}'.format(
                result_dir))
        exitCode = 0
        try:
            complete_proc = subprocess.run(rcmd,
                                           cwd=result_dir,
                                           stdin=subprocess.PIPE,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.STDOUT,
                                           close_fds=True)
            exitCode = complete_proc.returncode
            if exitCode == 0:
                logging.info('\n{}'.format(complete_proc.stdout))
                logging.info(
                    '\n{} was executed successfully, exit code was: {}'.format(
                        ' '.join(rcmd), str(exitCode)))
                logging.info("Finished calling R.")
            else:
                logging.info('Error running command: {} Exit Code: {}'.format(
                    ' '.join(rcmd), str(exitCode)))
                logging.info('\n{}'.format(complete_proc.stderr))
        except subprocess.CalledProcessError as sub_e:
            exitCode = -99
            logging.info(
                'Caught subprocess.CalledProcessError {}'.format(sub_e))

        return exitCode
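
    # Note (not in the original source): subprocess.run() only raises
    # CalledProcessError when check=True is passed, so with the call above a
    # non-zero exit status is reported through the else branch; the except
    # handler is purely defensive.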

    def _df_to_list(self, df):
        """
        _df_to_list: convert Dataframe to FloatMatrix2D matrix data
        """

        df.index = df.index.astype('str')
        df.columns = df.columns.astype('str')
        df.fillna(0, inplace=True)
        matrix_data = {
            'row_ids': df.index.tolist(),
            'col_ids': df.columns.tolist(),
            'values': df.values.tolist()
        }

        return matrix_data
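
    # Illustrative sketch (not part of the original module): the FloatMatrix2D
    # dict shape produced by _df_to_list and consumed by _Matrix2D_to_df below.
    #
    #   df = pd.DataFrame([[1.0, 0.5], [0.5, 1.0]],
    #                     index=['s1', 's2'], columns=['s1', 's2'])
    #   self._df_to_list(df)
    #   # -> {'row_ids': ['s1', 's2'],
    #   #     'col_ids': ['s1', 's2'],
    #   #     'values': [[1.0, 0.5], [0.5, 1.0]]}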

    def _mi_df_to_excel(self, mi_df, distance_df, result_dir, mi_matrix_ref):
        """
        write mutual information matrix df into excel
        """
        logging.info('writing mi data frame to excel file')
        mi_matrix_obj = self.dfu.get_objects({'object_refs':
                                              [mi_matrix_ref]})['data'][0]
        mi_matrix_info = mi_matrix_obj['info']
        mi_matrix_name = mi_matrix_info[1]

        file_path = os.path.join(result_dir, mi_matrix_name + ".xlsx")
        writer = pd.ExcelWriter(file_path)

        mi_df.to_excel(writer, "mi_matrix", index=True)
        if distance_df is not None:
            distance_df.to_excel(writer, "mi_distance_matrix", index=True)

        writer.close()

    def _Matrix2D_to_df(self, Matrix2D):
        """
        _Matrix2D_to_df: transform a FloatMatrix2D to data frame
        """

        index = Matrix2D.get('row_ids')
        columns = Matrix2D.get('col_ids')
        values = Matrix2D.get('values')

        df = pd.DataFrame(values, index=index, columns=columns)

        return df

    def _mi_to_df(self, mi_matrix_ref):
        """
        retrieve mutual information matrix ws object to mi_df
        """
        logging.info('converting mutual information matrix to data frame')
        mi_data = self.dfu.get_objects({'object_refs':
                                        [mi_matrix_ref]})['data'][0]['data']

        rotation_matrix_data = mi_data.get('rotation_matrix')
        distance_matrix_data = mi_data.get('distance_matrix')
        original_matrix_ref = mi_data.get('original_matrix_ref')
        dimension = mi_data.get('mi_parameters').get('n_components')

        mi_df = self._Matrix2D_to_df(rotation_matrix_data)
        distance_df = None
        if distance_matrix_data:
            distance_df = self._Matrix2D_to_df(distance_matrix_data)

        if original_matrix_ref:
            logging.info(
                'appending instance group information to mutual information data frame'
            )
            obj_data = self.dfu.get_objects(
                {'object_refs': [original_matrix_ref]})['data'][0]['data']

            attributemapping_ref = obj_data.get(
                '{}_attributemapping_ref'.format(dimension))

            am_data = self.dfu.get_objects(
                {'object_refs': [attributemapping_ref]})['data'][0]['data']

            attributes = am_data.get('attributes')
            instances = am_data.get('instances')
            am_df = pd.DataFrame(data=list(instances.values()),
                                 columns=list(
                                     map(lambda x: x.get('attribute'),
                                         attributes)),
                                 index=instances.keys())

            mi_df = mi_df.merge(am_df,
                                left_index=True,
                                right_index=True,
                                how='left',
                                validate='one_to_one')

        return mi_df, distance_df

    def _save_mi_matrix(self, workspace_name, input_obj_ref, mi_matrix_name,
                        distance_df, mi_params_df, site_ordin_df,
                        species_ordin_df):

        logging.info('Saving MIMatrix...')

        if not isinstance(workspace_name, int):
            ws_name_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            ws_name_id = workspace_name

        mi_data = {}

        mi_data.update({'distance_matrix': self._df_to_list(distance_df)})
        mi_data.update({'site_ordination': self._df_to_list(site_ordin_df)})
        mi_data.update(
            {'species_ordination': self._df_to_list(species_ordin_df)})
        mi_data.update({'mi_parameters': self._df_to_list(mi_params_df)})
        mi_data.update({'original_matrix_ref': input_obj_ref})
        mi_data.update({'rotation_matrix': self._df_to_list(distance_df)})

        obj_type = 'KBaseExperiments.PCAMatrix'
        info = self.dfu.save_objects({
            "id":
            ws_name_id,
            "objects": [{
                "type": obj_type,
                "data": mi_data,
                "name": mi_matrix_name
            }]
        })[0]
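        # 'info' is a workspace object_info tuple: info[6] is the workspace id,
        # info[0] the object id and info[4] the version, so the reference
        # returned below has the form "wsid/objid/version".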

        return "%s/%s/%s" % (info[6], info[0], info[4])

    def _zip_folder(self, folder_path, output_path):
        """
        _zip_folder: Zip the contents of an entire folder (with that folder included in the
        archive). Empty subfolders are included in the archive as well, because every
        subfolder is added to the zip explicitly below.
        """
        with zipfile.ZipFile(output_path,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as ziph:
            for root, folders, files in os.walk(folder_path):
                # Include all subfolders, including empty ones.
                for folder_name in folders:
                    absolute_fpath = os.path.join(root, folder_name)
                    relative_fpath = os.path.join(os.path.basename(root),
                                                  folder_name)
                    logging.info(
                        "Adding {} to archive.".format(absolute_fpath))
                    ziph.write(absolute_fpath, relative_fpath)
                for f in files:
                    absolute_path = os.path.join(root, f)
                    relative_path = os.path.join(os.path.basename(root), f)
                    logging.info("Adding {} to archive.".format(absolute_path))
                    ziph.write(absolute_path, relative_path)

        logging.info("{} created successfully.".format(output_path))

    def _generate_output_file_list(self, out_dir):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """

        logging.info('Start packing result files from Parmigene...')

        output_files = list()

        output_dir = os.path.join(self.working_dir, str(uuid.uuid4()))
        self._mkdir_p(output_dir)
        mi_output = os.path.join(output_dir, 'metami_output.zip')
        self._zip_folder(out_dir, mi_output)

        output_files.append({
            'path':
            mi_output,
            'name':
            os.path.basename(mi_output),
            'label':
            os.path.basename(mi_output),
            'description':
            'Output file(s) generated by Parmigene'
        })
        return output_files

    def _generate_mi_html_report(self, mi_outdir, n_components):

        logging.info('Start generating html report for Parmigene results...')
        html_report = list()

        result_dir = os.path.join(self.working_dir, str(uuid.uuid4()))
        self._mkdir_p(result_dir)
        result_file_path = os.path.join(result_dir, 'mi_result.html')

        mi_plots = list()
        for root, folders, files in os.walk(mi_outdir):
            # Find the image files by their extensions.
            for f in files:
                if re.match(r'^[a-zA-Z]+.*\.(jpeg|jpg|bmp|tiff|pdf|ps)$', f):
                    absolute_path = os.path.join(root, f)
                    logging.info(
                        "Adding {} to plot archive.".format(absolute_path))
                    mi_plots.append(absolute_path)

        visualization_content = ''

        for mi_plot in mi_plots:
            shutil.copy2(mi_plot,
                         os.path.join(result_dir, os.path.basename(mi_plot)))
            visualization_content += '<iframe height="900px" width="100%" '
            visualization_content += 'src="{}" '.format(
                os.path.basename(mi_plot))
            visualization_content += 'style="border:none;"></iframe>\n<p></p>\n'

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__), 'templates',
                                 'mi_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Visualization_Content</p>', visualization_content)
                report_template = report_template.replace(
                    'n_components', '{} Components'.format(n_components))
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': result_dir,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for Parmigene Matrix App'
        })
        return html_report

    def _generate_mi_report(self, mi_ref, output_dir, workspace_name,
                            n_components):
        logging.info('Creating Parmigene report...')

        output_files = self._generate_output_file_list(output_dir)
        output_html_files = self._generate_mi_html_report(
            output_dir, n_components)

        objects_created = list()
        objects_created.append({
            'ref': mi_ref,
            'description': 'Mutual Information Matrix'
        })

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'file_links': output_files,
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 666,
            'report_object_name': 'kb_mi_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def __init__(self, config):

        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(self.callback_url, service_ver='release')
        self.working_dir = self.scratch

        self.data_util = DataUtil(config)
        self.dfu = DataFileUtil(self.callback_url)
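        # Note: this DataFileUtil client replaces the one constructed above with
        # service_ver='release'; all later self.dfu calls use this instance.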
        self.output_dir = os.path.join(self.working_dir, self.PARMI_OUT_DIR)
        self._mkdir_p(self.output_dir)

    def run_mi(self, params):
        """
        run_mi: perform Parmigene analysis on matrix
        :param input_obj_ref: object reference of a matrix
        :param workspace_name: the name of the workspace
        :param mi_matrix_name: name of Parmigene (KBaseExperiments.MIMatrix) object
        :param n_components: dimensionality of the reduced space (default 2)
        :param max_iter: maximum iterations allowed
        :param distance_metric: distance metric the ordination will be performed on; defaults to "bray"
        """

        logging.info('--->\nrunning Parmigene with input\n' +
                     'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_mi_params(params)

        input_obj_ref = params.get(self.PARAM_IN_MATRIX)
        workspace_name = params.get(self.PARAM_IN_WS)
        mi_matrix_name = params.get(self.PARAM_OUT_MATRIX)
        n_threads = int(params.get(self.OMP_NUM_THREADS, 2))

        res = self.dfu.get_objects({'object_refs': [input_obj_ref]})['data'][0]
        obj_data = res['data']
        obj_name = res['info'][1]
        obj_type = res['info'][2]

        exitCode = -99
        if "KBaseMatrices" in obj_type:
            # create the input file from obj_data
            matrix_tab = obj_data['data']['values']
            row_ids = obj_data['data']['row_ids']
            col_ids = obj_data['data']['col_ids']
            matrix_df = pd.DataFrame(matrix_tab,
                                     index=row_ids,
                                     columns=col_ids)

            matrix_data_file = os.path.join(self.output_dir, obj_name + '.csv')
            with open(matrix_data_file, 'w') as m_file:
                matrix_df.to_csv(m_file, sep='\t')

            params['datafile'] = matrix_data_file
            exitCode = self.run_mi_with_file(params)
        else:
            err_msg = 'Oops! [{}] is not supported.\n'.format(obj_type)
            err_msg += 'Please provide a KBaseMatrices object'
            raise ValueError(err_msg)

        if exitCode == -99:
            raise ValueError(
                'Caught subprocess.CalledProcessError while calling R.')

        # saving the mi_matrix object
        # read Parmigene results from files into data frames
        dist_matrix_df = pd.read_csv(
            os.path.join(self.output_dir, "dist_matrix.csv"))
        mi_params_df = pd.read_json(
            os.path.join(self.output_dir, "others.json"))
        site_ordin_df = pd.read_csv(
            os.path.join(self.output_dir, "site_ordination.csv"))
        species_ordin_df = pd.read_csv(
            os.path.join(self.output_dir, "species_ordination.csv"))

        mi_ref = self._save_mi_matrix(workspace_name, input_obj_ref,
                                      mi_matrix_name, dist_matrix_df,
                                      mi_params_df, site_ordin_df,
                                      species_ordin_df)
        returnVal = {'mi_ref': mi_ref}

        # generating report
        report_output = self._generate_mi_report(mi_ref, self.output_dir,
                                                 workspace_name, n_threads)

        returnVal.update(report_output)
        return returnVal
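
    # Illustrative sketch (not part of the original module): a hypothetical
    # run_mi parameter dict using the PARAM_* keys defined on this class; the
    # reference and names below are placeholders.
    #
    #   params = {
    #       'workspace_name': 'my_workspace',
    #       'input_obj_ref': '123/7/1',              # a KBaseMatrices object
    #       'parmigene_matrix_name': 'my_mi_matrix',
    #       'num_threads': 4,
    #   }
    #   result = self.run_mi(params)
    #   # -> {'mi_ref': ..., 'report_name': ..., 'report_ref': ...}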

    def run_mi_with_file(self, params):
        """
        run_mi_with_file: perform Parmigene analysis on matrix
        :param datafile: a file that contains the matrix data
        :param workspace_name: the name of the workspace
        :param mi_matrix_name: name of Parmigene (KBaseExperiments.MIMatrix) object
        :param n_components: dimensionality of the reduced space (default 2)
        :param max_iter: maximum iterations allowed
        :param distance_metric: distance metric the ordination will be performed on; defaults to "bray"
        """

        logging.info('--->\nrunning Parmigene with input \n' +
                     'params:\n{}'.format(json.dumps(params, indent=1)))

        rscrpt_file = self._build_rmi_script(params)
        logging.info(
            '--->\nR script file has been written to {}'.format(rscrpt_file))

        return self._execute_r_script(rscrpt_file)

    def export_mi_matrix_excel(self, params):
        """
        export MIMatrix as Excel
        """
        logging.info('start exporting Parmigene matrix')
        mi_matrix_ref = params.get('input_ref')

        mi_df, components_df = self._mi_to_df(mi_matrix_ref)

        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)

        self._mi_df_to_excel(mi_df, components_df, result_dir, mi_matrix_ref)

        package_details = self.dfu.package_for_download({
            'file_path':
            result_dir,
            'ws_refs': [mi_matrix_ref]
        })

        return {'shock_id': package_details['shock_id']}
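
    # Illustrative sketch (not part of the original module): a hypothetical
    # export call; 'input_ref' is the only parameter the method reads, and the
    # reference below is a placeholder.
    #
    #   self.export_mi_matrix_excel({'input_ref': '123/8/1'})
    #   # -> {'shock_id': '...'}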