class VariationToVCF:
    """Export a KBaseGwasData.Variations object as a downloadable VCF package."""

    def __init__(self, callback_url, scratch):
        # scratch: writable staging directory for downloaded/packaged files
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)

    @staticmethod
    def is_gz_file(filepath):
        """Return True if the file at *filepath* starts with the gzip magic bytes.

        Fixed: original definition lacked ``self``/``@staticmethod``, so calling
        it on an instance passed the instance itself as ``filepath``.
        """
        with open(filepath, 'rb') as test_f:
            # 0x1f 0x8b is the gzip magic number (equivalent to the previous
            # binascii.hexlify(...) == b'1f8b' check, without the extra dependency)
            return test_f.read(2) == b'\x1f\x8b'

    def export_as_vcf(self, params):
        """Download the variation as VCF and package it for shock download.

        :param params: dict with required key 'input_var_ref' (workspace ref)
        :returns: {'shock_id': ...} of the packaged export directory
        :raises ValueError: if 'input_var_ref' is missing
        """
        if 'input_var_ref' not in params:
            raise ValueError('Cannot export Variation- no input_var_ref field defined.')

        file = self.variation_to_vcf({'variation_ref': params['input_var_ref']})

        export_dir = os.path.join(self.scratch, file['variation_name'])
        os.makedirs(export_dir)

        # Fixed: original swallowed shutil.Error via exit(e), killing the process
        # with an unhelpful status; propagate the exception instead.
        shutil.move(file['path'],
                    os.path.join(export_dir, os.path.basename(file['path'])))

        dfupkg = self.dfu.package_for_download({
            'file_path': export_dir,
            'ws_refs': [params['input_var_ref']]
        })

        return {'shock_id': dfupkg['shock_id']}

    def variation_to_vcf(self, params):
        """Fetch the Variations object and write its VCF file to scratch.

        :param params: dict with required key 'variation_ref'
        :returns: {'path': <local vcf path>, 'variation_name': <object name>}
        :raises ValueError: on missing params or unsupported workspace type
        """
        self.validate_params(params)

        print('downloading ws object data: ' + params["variation_ref"])

        variation_obj = self.dfu.get_objects(
            {'object_refs': [params['variation_ref']]})['data'][0]
        ws_type = variation_obj['info'][2]
        obj_name = variation_obj['info'][1]

        if 'KBaseGwasData.Variations' in ws_type:
            dl_path = self.process_vcf(self.scratch, variation_obj['data'])
        else:
            raise ValueError('Cannot write data to VCF; invalid WS type (' +
                             ws_type +
                             '). Supported types is KBaseGwasData.Variations')

        return {'path': dl_path, 'variation_name': obj_name}

    def process_vcf(self, output_vcf_file_path, data):
        """Download the object's VCF handle into *output_vcf_file_path* (a directory).

        :returns: local file path of the downloaded VCF
        """
        obj = self.dfu.shock_to_file({
            'handle_id': data['vcf_handle_ref'],
            'file_path': output_vcf_file_path,
        })
        return obj['file_path']

    def validate_params(self, params):
        """Raise ValueError unless every required key is present in *params*."""
        for key in ['variation_ref']:
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')
class AssemblyToFasta:
    """Export KBase Assembly / legacy ContigSet objects as FASTA files."""

    def __init__(self, callback_url, scratch):
        # scratch: writable staging directory for downloaded/packaged files
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)

    def export_as_fasta(self, ctx, params):
        """Used almost exclusively for download only.

        :param params: dict with required key 'input_ref' (workspace ref)
        :returns: {'shock_id': ...} of the packaged export directory
        :raises ValueError: if 'input_ref' is missing
        """
        # validate parameters
        if 'input_ref' not in params:
            # Fixed message: was 'not input_ref field defined'
            raise ValueError(
                'Cannot export Assembly- no input_ref field defined.')

        # export to a file
        file = self.assembly_as_fasta(ctx, {'ref': params['input_ref']})

        # create the output directory and move the file there
        export_package_dir = os.path.join(self.scratch, file['assembly_name'])
        os.makedirs(export_package_dir)
        shutil.move(
            file['path'],
            os.path.join(export_package_dir, os.path.basename(file['path'])))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['input_ref']]
        })

        return {'shock_id': package_details['shock_id']}

    def assembly_as_fasta(self, ctx, params):
        """Main function that accepts a ref to an object and writes a file.

        :param params: dict with required key 'ref'; optional 'filename'
        :returns: {'path': <fasta path>, 'assembly_name': <object name>}
        :raises ValueError: on missing params or unsupported workspace type
        """
        self.validate_params(params)

        print(f'downloading ws object data ({ params["ref"]})')
        assembly_object = self.dfu.get_objects(
            {'object_refs': [params['ref']]})['data'][0]
        ws_type = assembly_object['info'][2]
        obj_name = assembly_object['info'][1]

        if 'filename' in params:
            output_filename = params['filename']
        else:
            output_filename = obj_name + '.fa'

        output_fasta_file_path = os.path.join(self.scratch, output_filename)

        if 'KBaseGenomes.ContigSet' in ws_type:
            self.process_legacy_contigset(output_fasta_file_path,
                                          assembly_object['data'])
        elif 'KBaseGenomeAnnotations.Assembly' in ws_type:
            self.process_assembly(output_fasta_file_path,
                                  assembly_object['data'])
        else:
            raise ValueError(
                'Cannot write data to fasta; invalid WS type (' + ws_type +
                '). Supported types are KBaseGenomes.ContigSet and ' +
                'KBaseGenomeAnnotations.Assembly')

        return {'path': output_fasta_file_path, 'assembly_name': obj_name}

    def fasta_rows_generator_from_contigset(self, contig_list):
        """Generates SeqRecords iterator for writing from a legacy contigset object."""
        for contig in contig_list:
            description = ''
            if 'description' in contig and contig['description']:
                description = contig['description']
            # NOTE(review): passes the SingleLetterAlphabet class (not an
            # instance) — looks like pre-1.78 Biopython usage; confirm against
            # the pinned Biopython version before touching.
            yield SeqRecord(Seq(contig['sequence'], SingleLetterAlphabet),
                            id=contig['id'],
                            description=description)

    def process_legacy_contigset(self, output_fasta_path, data):
        """Write a legacy ContigSet's contigs out as FASTA."""
        SeqIO.write(self.fasta_rows_generator_from_contigset(data['contigs']),
                    output_fasta_path, "fasta")

    def process_assembly(self, output_fasta_path, data):
        """Download (and uncompress) the Assembly's FASTA handle to disk."""
        self.dfu.shock_to_file({
            'handle_id': data['fasta_handle_ref'],
            'file_path': output_fasta_path,
            'unpack': 'uncompress'
        })

    def validate_params(self, params):
        """Raise ValueError unless every required key is present in *params*."""
        for key in ['ref']:
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')
class CorrelationUtil:
    """Compute, save and report correlation matrices for KBase matrix objects."""

    def _mkdir_p(self, path):
        """_mkdir_p: make directory for given path (no-op if it already exists)."""
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            # Tolerate a pre-existing directory; re-raise any other failure.
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_compute_corr_matrix_params(self, params):
        """
        _validate_compute_corr_matrix_params:
            validates params passed to compute_correlation_matrix method
        """
        # Fixed log typo: 'compute_corrrelation_matrix'
        logging.info('start validating compute_correlation_matrix params')

        # check for required parameters
        for p in ['input_obj_ref', 'workspace_name', 'corr_matrix_name']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _validate_compute_correlation_across_matrices_params(self, params):
        """
        _validate_compute_correlation_across_matrices_params:
            validates params passed to compute_correlation_across_matrices method
        """
        logging.info(
            'start validating compute_correlation_across_matrices params')

        # check for required parameters
        for p in [
                'workspace_name', 'corr_matrix_name', 'matrix_ref_1',
                'matrix_ref_2'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _fetch_taxon(self, amplicon_set_ref, amplicon_ids):
        """Look up scientific name and taxon level for each amplicon id.

        Returns (taxons, taxons_level) dicts keyed by amplicon id; each is
        None when no useful information was found for any id.
        """
        logging.info('start fetching taxon info from AmpliconSet')
        taxons = dict()
        taxons_level = dict()

        amplicon_set_data = self.dfu.get_objects(
            {'object_refs': [amplicon_set_ref]})['data'][0]['data']
        amplicons = amplicon_set_data.get('amplicons')

        for amplicon_id in amplicon_ids:
            scientific_name = 'None'
            level = 'Unknown'
            # Best-effort lookups: missing/partial taxonomy keeps the defaults.
            try:
                scientific_name = amplicons.get(amplicon_id).get(
                    'taxonomy').get('scientific_name')
            except Exception:
                pass
            try:
                level = amplicons.get(amplicon_id).get('taxonomy').get(
                    'taxon_level')
            except Exception:
                pass
            taxons[amplicon_id] = scientific_name
            taxons_level[amplicon_id] = level

        # default empty taxons and taxons_level
        if set(taxons.values()) == {'None'}:
            taxons = None
        if set(taxons_level.values()) == {'Unknown'}:
            taxons_level = None

        return taxons, taxons_level
def _build_table_content(self,
                         matrix_2D,
                         output_directory,
                         original_matrix_ref=None,
                         type='corr'):
    """
    _build_table_content: generate HTML table content for FloatMatrix2D object

    Writes <type>_data.json and <type>_table.html into output_directory and
    returns an iframe snippet embedding the table page.
    NOTE: 'type' keeps its original (builtin-shadowing) name for caller
    compatibility.
    """
    # Fixed: mutable default argument ([]) replaced with None sentinel.
    if original_matrix_ref is None:
        original_matrix_ref = []

    page_content = """\n"""

    table_file_name = '{}_table.html'.format(type)
    data_file_name = '{}_data.json'.format(type)

    page_content += """<iframe height="900px" width="100%" """
    page_content += """src="{}" """.format(table_file_name)
    page_content += """style="border:none;"></iframe>\n"""

    row_ids = matrix_2D.get('row_ids')
    col_ids = matrix_2D.get('col_ids')
    values = matrix_2D.get('values')

    df = pd.DataFrame(values, index=row_ids, columns=col_ids)
    df = df.T
    # stack() + reset_index() yields three columns: row label, column label, value
    links = df.stack().reset_index()

    columns = list()
    taxons = None
    taxons_level = None
    if len(original_matrix_ref) == 1:
        res = self.dfu.get_objects(
            {'object_refs': [original_matrix_ref[0]]})['data'][0]
        obj_type = res['info'][2]
        matrix_type = obj_type.split('Matrix')[0].split('.')[-1]
        if matrix_type == 'Amplicon':
            amplicon_set_ref = res['data'].get('amplicon_set_ref')
            if amplicon_set_ref:
                taxons, taxons_level = self._fetch_taxon(
                    amplicon_set_ref, col_ids)
        columns.extend(
            ['{} 1'.format(matrix_type), '{} 2'.format(matrix_type)])
    elif len(original_matrix_ref) == 2:
        for matrix_ref in original_matrix_ref[::-1]:
            res = self.dfu.get_objects({'object_refs':
                                        [matrix_ref]})['data'][0]
            obj_type = res['info'][2]
            matrix_type = obj_type.split('Matrix')[0].split('.')[-1]
            if matrix_type == 'Amplicon':
                amplicon_set_ref = res['data'].get('amplicon_set_ref')
                if amplicon_set_ref:
                    taxons, taxons_level = self._fetch_taxon(
                        amplicon_set_ref, col_ids)
            columns.append(matrix_type)
    else:
        # Fixed: original assigned two names directly to the 3-column
        # stacked frame (ValueError) before the later links.columns
        # assignment; collect the names here like the other branches do.
        columns.extend(['Variable 1', 'Variable 2'])

    # remove self-comparison
    links = links[links.iloc[:, 0] != links.iloc[:, 1]]

    if type == 'corr':
        columns.append('Correlation')
    elif type == 'sig':
        columns.append('Significance')
    else:
        columns.append('Value')

    links.columns = columns

    if taxons:
        links['Taxon'] = links.iloc[:, 0].map(taxons)
    if taxons_level:
        links['Taxon Level'] = links.iloc[:, 0].map(taxons_level)

    table_headers = links.columns.tolist()
    table_content = """\n"""
    # build header and footer
    table_content += """\n<thead>\n<tr>\n"""
    for table_header in table_headers:
        table_content += """\n <th>{}</th>\n""".format(table_header)
    table_content += """\n</tr>\n</thead>\n"""

    table_content += """\n<tfoot>\n<tr>\n"""
    for table_header in table_headers:
        table_content += """\n <th>{}</th>\n""".format(table_header)
    table_content += """\n</tr>\n</tfoot>\n"""

    logging.info('start generating table json file')
    data_array = links.values.tolist()
    total_rec = len(data_array)
    json_dict = {
        'draw': 1,
        'recordsTotal': total_rec,
        'recordsFiltered': total_rec,
        'data': data_array
    }
    with open(os.path.join(output_directory, data_file_name), 'w') as fp:
        json.dump(json_dict, fp)

    logging.info('start generating table html')
    with open(os.path.join(output_directory, table_file_name),
              'w') as result_file:
        with open(
                os.path.join(os.path.dirname(__file__), 'templates',
                             'table_template.html'),
                'r') as report_template_file:
            report_template = report_template_file.read()
            report_template = report_template.replace(
                '<p>table_header</p>', table_content)
            report_template = report_template.replace(
                'ajax_file_path', data_file_name)
            report_template = report_template.replace(
                'deferLoading_size', str(total_rec))
            result_file.write(report_template)

    return page_content


def _generate_visualization_content(self, output_directory,
                                    corr_matrix_obj_ref,
                                    corr_matrix_plot_path,
                                    scatter_plot_path):
    """Build the tab buttons + tab bodies HTML for the correlation report.

    Docstring below shows the expected tab markup shape:

    <div class="tab">
    <button class="tablinks" onclick="openTab(event, 'CorrelationMatrix')"
        id="defaultOpen">Correlation Matrix</button>
    </div>

    <div id="CorrelationMatrix" class="tabcontent">
    <p>CorrelationMatrix_Content</p>
    </div>
    """
    tab_def_content = ''
    tab_content = ''

    corr_data = self.dfu.get_objects(
        {'object_refs': [corr_matrix_obj_ref]})['data'][0]['data']

    coefficient_data = corr_data.get('coefficient_data')
    significance_data = corr_data.get('significance_data')
    original_matrix_ref = corr_data.get('original_matrix_ref')

    tab_def_content += """
    <div class="tab">
    <button class="tablinks" onclick="openTab(event, 'CorrelationMatrix')" id="defaultOpen">Correlation Matrix</button>
    """
    corr_table_content = self._build_table_content(
        coefficient_data,
        output_directory,
        original_matrix_ref=original_matrix_ref,
        type='corr')
    tab_content += """
    <div id="CorrelationMatrix" class="tabcontent">{}</div>""".format(
        corr_table_content)

    if significance_data:
        tab_def_content += """
    <button class="tablinks" onclick="openTab(event, 'SignificanceMatrix')">Significance Matrix</button>
    """
        sig_table_content = self._build_table_content(
            significance_data,
            output_directory,
            original_matrix_ref=original_matrix_ref,
            type='sig')
        tab_content += """
    <div id="SignificanceMatrix" class="tabcontent">{}</div>""".format(
            sig_table_content)

    if corr_matrix_plot_path:
        tab_def_content += """
    <button class="tablinks" onclick="openTab(event, 'CorrelationMatrixPlot')">Correlation Matrix Heatmap</button>
    """
        tab_content += """
    <div id="CorrelationMatrixPlot" class="tabcontent">
    """
        if corr_matrix_plot_path.endswith('.png'):
            corr_matrix_plot_name = 'CorrelationMatrixPlot.png'
            corr_matrix_plot_display_name = 'Correlation Matrix Plot'
            shutil.copy2(
                corr_matrix_plot_path,
                os.path.join(output_directory, corr_matrix_plot_name))
            tab_content += '<div class="gallery">'
            tab_content += '<a target="_blank" href="{}">'.format(
                corr_matrix_plot_name)
            tab_content += '<img src="{}" '.format(corr_matrix_plot_name)
            tab_content += 'alt="{}" width="600" height="400">'.format(
                corr_matrix_plot_display_name)
            tab_content += '</a><div class="desc">{}</div></div>'.format(
                corr_matrix_plot_display_name)
        elif corr_matrix_plot_path.endswith('.html'):
            corr_matrix_plot_name = 'CorrelationMatrixPlot.html'
            shutil.copy2(
                corr_matrix_plot_path,
                os.path.join(output_directory, corr_matrix_plot_name))
            tab_content += '<iframe height="900px" width="100%" '
            tab_content += 'src="{}" '.format(corr_matrix_plot_name)
            tab_content += 'style="border:none;"></iframe>\n<p></p>\n'
        else:
            raise ValueError(
                'unexpected correlation matrix plot format:\n{}'.format(
                    corr_matrix_plot_path))

        tab_content += """</div>"""

    if scatter_plot_path:
        tab_def_content += """
    <button class="tablinks" onclick="openTab(event, 'ScatterMatrixPlot')">Scatter Matrix Plot</button>
    """
        tab_content += """
    <div id="ScatterMatrixPlot" class="tabcontent">
    """
        scatter_plot_name = 'ScatterMatrixPlot.png'
        scatter_plot_display_name = 'Scatter Matrix Plot'
        shutil.copy2(scatter_plot_path,
                     os.path.join(output_directory, scatter_plot_name))
        tab_content += '<div class="gallery">'
        tab_content += '<a target="_blank" href="{}">'.format(
            scatter_plot_name)
        tab_content += '<img src="{}" '.format(scatter_plot_name)
        tab_content += 'alt="{}" width="600" height="400">'.format(
            scatter_plot_display_name)
        tab_content += '</a><div class="desc">{}</div></div>'.format(
            scatter_plot_display_name)
        tab_content += """</div>"""

    tab_def_content += """</div>"""

    return tab_def_content + tab_content


def _generate_corr_html_report(self, corr_matrix_obj_ref,
                               corr_matrix_plot_path, scatter_plot_path):
    """
    _generate_corr_html_report: generate html summary report for correlation
    """
    logging.info('Start generating html report')
    html_report = list()

    output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
    self._mkdir_p(output_directory)
    result_file_path = os.path.join(output_directory, 'corr_report.html')

    visualization_content = self._generate_visualization_content(
        output_directory, corr_matrix_obj_ref, corr_matrix_plot_path,
        scatter_plot_path)

    with open(result_file_path, 'w') as result_file:
        with open(
                os.path.join(os.path.dirname(__file__), 'templates',
                             'corr_template.html'),
                'r') as report_template_file:
            report_template = report_template_file.read()
            report_template = report_template.replace(
                '<p>Visualization_Content</p>', visualization_content)
            result_file.write(report_template)

    report_shock_id = self.dfu.file_to_shock({
        'file_path': output_directory,
        'pack': 'zip'
    })['shock_id']

    html_report.append({
        'shock_id': report_shock_id,
        'name': os.path.basename(result_file_path),
        'label': os.path.basename(result_file_path),
        'description': 'HTML summary report for Compute Correlation App'
    })

    return html_report


def _generate_corr_report(self,
                          corr_matrix_obj_ref,
                          workspace_name,
                          corr_matrix_plot_path,
                          scatter_plot_path=None):
    """_generate_report: generate summary report"""
    logging.info('Start creating report')

    output_html_files = self._generate_corr_html_report(
        corr_matrix_obj_ref, corr_matrix_plot_path, scatter_plot_path)

    report_params = {
        'message': '',
        'objects_created': [{
            'ref': corr_matrix_obj_ref,
            'description': 'Correlation Matrix'
        }],
        'workspace_name': workspace_name,
        'html_links': output_html_files,
        'direct_html_link_index': 0,
        'html_window_height': 666,
        'report_object_name':
        'compute_correlation_matrix_' + str(uuid.uuid4())
    }

    kbase_report_client = KBaseReport(self.callback_url, token=self.token)
    output = kbase_report_client.create_extended_report(report_params)

    report_output = {
        'report_name': output['name'],
        'report_ref': output['ref']
    }

    return report_output


def _corr_for_matrix(self, input_obj_ref, method, dimension):
    """
    _corr_for_matrix: compute correlation matrix df for KBaseMatrices object
    """
    data_matrix = self.data_util.fetch_data({
        'obj_ref': input_obj_ref
    }).get('data_matrix')
    data_df = pd.read_json(data_matrix)
    # natural-sort both axes for stable, human-friendly ordering
    data_df = data_df.reindex(index=natsorted(data_df.index))
    data_df = data_df.reindex(columns=natsorted(data_df.columns))

    corr_df = self.df_to_corr(data_df, method=method, dimension=dimension)

    return corr_df, data_df


def _compute_significance(self, data_df, dimension):
    """
    _compute_significance: compute pairwise significance dataframe
                           two-sided p-value for a hypothesis test
    """
    logging.info('Start computing significance matrix')
    if dimension == 'row':
        data_df = data_df.T

    data_df = data_df.dropna()._get_numeric_data()
    dfcols = pd.DataFrame(columns=data_df.columns)
    sig_df = dfcols.transpose().join(dfcols, how='outer')

    for r in data_df.columns:
        for c in data_df.columns:
            # index 3 of linregress result is the two-sided p-value
            pvalue = stats.linregress(data_df[r], data_df[c])[3]
            sig_df[r][c] = round(pvalue, 4)

    return sig_df


def _df_to_list(self, df, threshold=None):
    """
    _df_to_list: convert Dataframe to FloatMatrix2D matrix data

    When *threshold* is given, drops columns then rows whose values all lie
    strictly inside (-threshold, threshold).
    """
    df.fillna(0, inplace=True)

    if threshold:
        drop_cols = list()
        for col in df.columns:
            if all(df[col] < threshold) and all(df[col] > -threshold):
                drop_cols.append(col)
        df.drop(columns=drop_cols, inplace=True, errors='ignore')

        drop_idx = list()
        for idx in df.index:
            if all(df.loc[idx] < threshold) and all(
                    df.loc[idx] > -threshold):
                drop_idx.append(idx)
        df.drop(index=drop_idx, inplace=True, errors='ignore')

    matrix_data = {
        'row_ids': df.index.tolist(),
        'col_ids': df.columns.tolist(),
        'values': df.values.tolist()
    }

    return matrix_data


def _save_corr_matrix(self,
                      workspace_name,
                      corr_matrix_name,
                      corr_df,
                      sig_df,
                      method,
                      matrix_ref=None,
                      corr_threshold=None):
    """
    _save_corr_matrix: save KBaseExperiments.CorrelationMatrix object
    """
    logging.info('Start saving CorrelationMatrix')

    if not isinstance(workspace_name, int):
        ws_name_id = self.dfu.ws_name_to_id(workspace_name)
    else:
        ws_name_id = workspace_name

    corr_data = {}
    corr_data.update({
        'coefficient_data':
        self._df_to_list(corr_df, threshold=corr_threshold)
    })
    corr_data.update({'correlation_parameters': {'method': method}})
    if matrix_ref:
        corr_data.update({'original_matrix_ref': matrix_ref})
    if sig_df is not None:
        corr_data.update({'significance_data': self._df_to_list(sig_df)})

    obj_type = 'KBaseExperiments.CorrelationMatrix'
    info = self.dfu.save_objects({
        "id": ws_name_id,
        "objects": [{
            "type": obj_type,
            "data": corr_data,
            "name": corr_matrix_name
        }]
    })[0]

    # workspace ref string: wsid/objid/version
    return "%s/%s/%s" % (info[6], info[0], info[4])


def _Matrix2D_to_df(self, Matrix2D):
    """
    _Matrix2D_to_df: transform a FloatMatrix2D to data frame
    """
    index = Matrix2D.get('row_ids')
    columns = Matrix2D.get('col_ids')
    values = Matrix2D.get('values')

    df = pd.DataFrame(values, index=index, columns=columns)

    return df


def _corr_to_df(self, corr_matrix_ref):
    """
    retrieve correlation matrix ws object to coefficient_df and significance_df
    """
    corr_data = self.dfu.get_objects({'object_refs': [corr_matrix_ref]
                                      })['data'][0]['data']

    coefficient_data = corr_data.get('coefficient_data')
    significance_data = corr_data.get('significance_data')

    coefficient_df = self._Matrix2D_to_df(coefficient_data)

    significance_df = None
    if significance_data:
        significance_df = self._Matrix2D_to_df(significance_data)

    return coefficient_df, significance_df


def _corr_df_to_excel(self, coefficient_df, significance_df, result_dir,
                      corr_matrix_ref):
    """
    write correlation matrix dfs into excel
    """
    corr_info = self.dfu.get_objects({'object_refs': [corr_matrix_ref]
                                      })['data'][0]['info']
    corr_name = corr_info[1]

    file_path = os.path.join(result_dir, corr_name + ".xlsx")

    writer = pd.ExcelWriter(file_path)
    coefficient_df.to_excel(writer, "coefficient_data", index=True)

    if significance_df is not None:
        significance_df.to_excel(writer, "significance_data", index=True)

    writer.close()


def _update_taxonomy_index(self, data_df, amplicon_set_ref):
    """Prefix data_df's index labels with each amplicon's scientific name (in place)."""
    logging.info(
        'start updating index with taxonomy info from AmpliconSet')

    amplicon_set_data = self.dfu.get_objects(
        {'object_refs': [amplicon_set_ref]})['data'][0]['data']

    amplicons = amplicon_set_data.get('amplicons')

    index = data_df.index.values
    replace_index = list()
    for idx in index:
        scientific_name = None
        # best-effort: missing taxonomy leaves the original id unchanged
        try:
            scientific_name = amplicons.get(idx).get('taxonomy').get(
                'scientific_name')
        except Exception:
            pass
        if scientific_name:
            replace_index.append(scientific_name + '_' + idx)
        else:
            replace_index.append(idx)

    for idx, val in enumerate(replace_index):
        index[idx] = val

    return data_df


def _fetch_matrix_data(self, matrix_ref):
    """Fetch a KBaseMatrices object as a natural-sorted DataFrame."""
    # Fixed log typo: 'fectching'
    logging.info('start fetching matrix data')

    res = self.dfu.get_objects({'object_refs': [matrix_ref]})['data'][0]
    obj_type = res['info'][2]

    if "KBaseMatrices" in obj_type:
        data_matrix = self.data_util.fetch_data({
            'obj_ref': matrix_ref
        }).get('data_matrix')
        data_df = pd.read_json(data_matrix)
        data_df = data_df.reindex(index=natsorted(data_df.index))
        data_df = data_df.reindex(columns=natsorted(data_df.columns))
        return data_df
    else:
        err_msg = 'Ooops! [{}] is not supported.\n'.format(obj_type)
        err_msg += 'Please supply KBaseMatrices object'
        # Fixed: original raised the literal string "err_msg"
        raise ValueError(err_msg)


def _compute_metrices_corr(self, df1, df2, method, compute_significance):
    """Pairwise correlation (and optional significance) between the rows of
    df1 and df2, computed over their shared columns."""
    df1.fillna(0, inplace=True)
    df2.fillna(0, inplace=True)

    col_1 = df1.columns
    col_2 = df2.columns
    idx_1 = df1.index
    idx_2 = df2.index

    common_col = col_1.intersection(col_2)
    logging.info('matrices share [{}] common columns'.format(
        common_col.size))

    if common_col.empty:
        raise ValueError('Matrices share no common columns')

    logging.info('start trimming original matrix')
    df1 = df1.loc[:][common_col]
    df2 = df2.loc[:][common_col]

    corr_df = pd.DataFrame(index=idx_1, columns=idx_2)
    sig_df = pd.DataFrame(index=idx_1, columns=idx_2)

    logging.info('start calculating correlation matrix')
    logging.info('sizing {} x {}'.format(idx_1.size, idx_2.size))
    counter = 0
    for idx_value in idx_1:
        for col_value in idx_2:
            if counter % 100000 == 0:
                logging.info(
                    'computed {} corr/sig values'.format(counter))

            value_array_1 = df1.loc[idx_value].tolist()
            value_array_2 = df2.loc[col_value].tolist()

            if method == 'pearson':
                corr_value, p_value = stats.pearsonr(
                    value_array_1, value_array_2)
            elif method == 'spearman':
                corr_value, p_value = stats.spearmanr(
                    value_array_1, value_array_2)
            elif method == 'kendall':
                corr_value, p_value = stats.kendalltau(
                    value_array_1, value_array_2)
            else:
                err_msg = 'Input correlation method [{}] is not available.\n'.format(
                    method)
                err_msg += 'Please choose one of {}'.format(CORR_METHOD)
                raise ValueError(err_msg)

            corr_df.at[idx_value, col_value] = round(corr_value, 4)
            if compute_significance:
                sig_df.at[idx_value, col_value] = round(p_value, 4)

            counter += 1

    if not compute_significance:
        sig_df = None

    return corr_df, sig_df
def __init__(self, config):
    """Wire up workspace/callback clients and the scratch directory from SDK config."""
    self.ws_url = config["workspace-url"]
    self.callback_url = config['SDK_CALLBACK_URL']
    self.token = config['KB_AUTH_TOKEN']
    self.scratch = config['scratch']

    self.data_util = DataUtil(config)
    self.dfu = DataFileUtil(self.callback_url)

    # headless rendering for report plots
    plt.switch_backend('agg')


def df_to_corr(self, df, method='pearson', dimension='col'):
    """
    Compute pairwise correlation of dimension (col or row)

    method: one of ['pearson', 'kendall', 'spearman']
    """
    logging.info('Computing correlation matrix')

    if method not in CORR_METHOD:
        err_msg = 'Input correlation method [{}] is not available.\n'.format(
            method)
        err_msg += 'Please choose one of {}'.format(CORR_METHOD)
        raise ValueError(err_msg)

    if dimension == 'row':
        df = df.T
    elif dimension != 'col':
        err_msg = 'Input dimension [{}] is not available.\n'.format(
            dimension)
        err_msg += 'Please choose either "col" or "row"'
        raise ValueError(err_msg)

    corr_df = df.corr(method=method).round(4)

    return corr_df


def plotly_corr_matrix(self, corr_df):
    """Render the correlation matrix as an interactive plotly heatmap (.html)."""
    logging.info('Plotting matrix of correlation')

    result_dir = os.path.join(self.scratch,
                              str(uuid.uuid4()) + '_corr_matrix_plots')
    self._mkdir_p(result_dir)

    try:
        trace = go.Heatmap(z=corr_df.values,
                           x=corr_df.columns,
                           y=corr_df.index)
        data = [trace]
    except Exception:
        err_msg = 'Running plotly_corr_matrix returned an error:\n{}\n'.format(
            traceback.format_exc())
        raise ValueError(err_msg)
    else:
        corr_matrix_plot_path = os.path.join(result_dir,
                                             'corr_matrix_plots.html')
        logging.info(
            'Saving plot to:\n{}'.format(corr_matrix_plot_path))
        plot(data, filename=corr_matrix_plot_path)

    return corr_matrix_plot_path


def plot_corr_matrix(self, corr_df):
    """
    plot_corr_matrix: generate correlation matrix plot (.png via matplotlib)
    """
    logging.info('Plotting matrix of correlation')

    result_dir = os.path.join(self.scratch,
                              str(uuid.uuid4()) + '_corr_matrix_plots')
    self._mkdir_p(result_dir)

    try:
        plt.clf()
        matrix_size = corr_df.index.size
        # scale the figure with the matrix, with a 10-inch floor
        figsize = 10 if matrix_size / 5 < 10 else matrix_size / 5
        fig, ax = plt.subplots(figsize=(figsize, figsize))
        cax = ax.matshow(corr_df)
        plt.xticks(list(range(len(corr_df.columns))),
                   corr_df.columns,
                   rotation='vertical',
                   fontstyle='italic')
        plt.yticks(list(range(len(corr_df.columns))),
                   corr_df.columns,
                   fontstyle='italic')
        plt.colorbar(cax)
    except Exception:
        err_msg = 'Running plot_corr_matrix returned an error:\n{}\n'.format(
            traceback.format_exc())
        raise ValueError(err_msg)
    else:
        corr_matrix_plot_path = os.path.join(result_dir,
                                             'corr_matrix_plots.png')
        logging.info(
            'Saving plot to:\n{}'.format(corr_matrix_plot_path))
        plt.savefig(corr_matrix_plot_path)

    return corr_matrix_plot_path


def plot_scatter_matrix(self,
                        df,
                        dimension='col',
                        alpha=0.2,
                        diagonal='kde',
                        figsize=(10, 10)):
    """
    plot_scatter_matrix: generate scatter plot for dimension (col or row)

    ref: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.plotting.scatter_matrix.html
    """
    logging.info('Plotting matrix of scatter')

    result_dir = os.path.join(self.scratch,
                              str(uuid.uuid4()) + '_scatter_plots')
    self._mkdir_p(result_dir)

    if dimension == 'row':
        df = df.T
    elif dimension != 'col':
        err_msg = 'Input dimension [{}] is not available.\n'.format(
            dimension)
        err_msg += 'Please choose either "col" or "row"'
        raise ValueError(err_msg)

    try:
        plt.clf()
        sm = pd.plotting.scatter_matrix(df,
                                        alpha=alpha,
                                        diagonal=diagonal,
                                        figsize=figsize)

        # Change label rotation
        [s.xaxis.label.set_rotation(45) for s in sm.reshape(-1)]
        [s.yaxis.label.set_rotation(45) for s in sm.reshape(-1)]

        # # May need to offset label when rotating to prevent overlap of figure
        [
            s.get_yaxis().set_label_coords(-1.5, 0.5)
            for s in sm.reshape(-1)
        ]

        # Hide all ticks
        [s.set_xticks(()) for s in sm.reshape(-1)]
        [s.set_yticks(()) for s in sm.reshape(-1)]
    except Exception:
        err_msg = 'Running scatter_matrix returned an error:\n{}\n'.format(
            traceback.format_exc())
        raise ValueError(err_msg)
    else:
        scatter_plot_path = os.path.join(result_dir, 'scatter_plots.png')
        logging.info('Saving plot to:\n{}'.format(scatter_plot_path))
        plt.savefig(scatter_plot_path)

    return scatter_plot_path


def compute_correlation_across_matrices(self, params):
    """
    matrix_ref_1: object reference of a matrix
    matrix_ref_2: object reference of a matrix
    workspace_name: workspace name objects to be saved to
    corr_matrix_name: correlation matrix object name
    method: correlation method, one of ['pearson', 'kendall', 'spearman']
    plot_corr_matrix: plot correlation matrix in report, default False
    compute_significance: also compute Significance in addition to correlation matrix
    """
    logging.info(
        '--->\nrunning CorrelationUtil.compute_correlation_across_matrices\n'
        + 'params:\n{}'.format(json.dumps(params, indent=1)))

    self._validate_compute_correlation_across_matrices_params(params)

    matrix_ref_1 = params.get('matrix_ref_1')
    matrix_ref_2 = params.get('matrix_ref_2')
    workspace_name = params.get('workspace_name')
    corr_matrix_name = params.get('corr_matrix_name')
    corr_threshold = params.get('corr_threshold')
    method = params.get('method', 'pearson')
    if method not in CORR_METHOD:
        err_msg = 'Input correlation method [{}] is not available.\n'.format(
            method)
        err_msg += 'Please choose one of {}'.format(CORR_METHOD)
        raise ValueError(err_msg)
    plot_corr_matrix = params.get('plot_corr_matrix', False)
    compute_significance = params.get('compute_significance', False)

    matrix_1_type = self.dfu.get_objects({'object_refs': [matrix_ref_1]
                                          })['data'][0]['info'][2]
    # making sure otu_ids are on the column of table
    if "AmpliconMatrix" in matrix_1_type:
        matrix_ref_1, matrix_ref_2 = matrix_ref_2, matrix_ref_1

    df1 = self._fetch_matrix_data(matrix_ref_1)
    df2 = self._fetch_matrix_data(matrix_ref_2)

    corr_df, sig_df = self._compute_metrices_corr(df1, df2, method,
                                                  compute_significance)

    if plot_corr_matrix:
        corr_matrix_plot_path = self.plotly_corr_matrix(corr_df)
    else:
        corr_matrix_plot_path = None

    corr_matrix_obj_ref = self._save_corr_matrix(
        workspace_name,
        corr_matrix_name,
        corr_df,
        sig_df,
        method,
        matrix_ref=[matrix_ref_1, matrix_ref_2],
        corr_threshold=corr_threshold)

    returnVal = {'corr_matrix_obj_ref': corr_matrix_obj_ref}

    report_output = self._generate_corr_report(corr_matrix_obj_ref,
                                               workspace_name,
                                               corr_matrix_plot_path)

    returnVal.update(report_output)

    return returnVal


def compute_correlation_matrix(self, params):
    """
    input_obj_ref: object reference of a matrix
    workspace_name: workspace name objects to be saved to
    dimension: compute correlation on column or row, one of ['col', 'row']
    corr_matrix_name: correlation matrix object name
    method: correlation method, one of ['pearson', 'kendall', 'spearman']
    compute_significance: compute pairwise significance value, default False
    plot_corr_matrix: plot correlation matrix in repor, default False
    plot_scatter_matrix: plot scatter matrix in report, default False
    """
    logging.info(
        '--->\nrunning CorrelationUtil.compute_correlation_matrix\n' +
        'params:\n{}'.format(json.dumps(params, indent=1)))

    self._validate_compute_corr_matrix_params(params)

    input_obj_ref = params.get('input_obj_ref')
    workspace_name = params.get('workspace_name')
    corr_matrix_name = params.get('corr_matrix_name')
    method = params.get('method', 'pearson')
    dimension = params.get('dimension', 'row')
    plot_corr_matrix = params.get('plot_corr_matrix', False)
    plot_scatter_matrix = params.get('plot_scatter_matrix', False)
    compute_significance = params.get('compute_significance', False)

    res = self.dfu.get_objects({'object_refs':
                                [input_obj_ref]})['data'][0]
    obj_type = res['info'][2]

    if "KBaseMatrices" in obj_type:
        corr_df, data_df = self._corr_for_matrix(input_obj_ref, method,
                                                 dimension)
        sig_df = None
        if compute_significance:
            sig_df = self._compute_significance(data_df, dimension)
    else:
        err_msg = 'Ooops! [{}] is not supported.\n'.format(obj_type)
        err_msg += 'Please supply KBaseMatrices object'
        # Fixed: original raised the literal string "err_msg"
        raise ValueError(err_msg)

    if plot_corr_matrix:
        corr_matrix_plot_path = self.plotly_corr_matrix(corr_df)
    else:
        corr_matrix_plot_path = None

    if plot_scatter_matrix:
        scatter_plot_path = self.plot_scatter_matrix(data_df,
                                                     dimension=dimension)
    else:
        scatter_plot_path = None

    corr_matrix_obj_ref = self._save_corr_matrix(
        workspace_name,
        corr_matrix_name,
        corr_df,
        sig_df,
        method,
        matrix_ref=[input_obj_ref])

    returnVal = {'corr_matrix_obj_ref': corr_matrix_obj_ref}

    report_output = self._generate_corr_report(corr_matrix_obj_ref,
                                               workspace_name,
                                               corr_matrix_plot_path,
                                               scatter_plot_path)

    returnVal.update(report_output)

    return returnVal


def export_corr_matrix_excel(self, params):
    """
    export CorrelationMatrix as Excel
    """
    corr_matrix_ref = params.get('input_ref')

    coefficient_df, significance_df = self._corr_to_df(corr_matrix_ref)

    result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
    self._mkdir_p(result_dir)

    self._corr_df_to_excel(coefficient_df, significance_df, result_dir,
                           corr_matrix_ref)

    package_details = self.dfu.package_for_download({
        'file_path': result_dir,
        'ws_refs': [corr_matrix_ref]
    })

    return {'shock_id': package_details['shock_id']}
class PCAUtil:
    """Run PCA (via sklearn) on a KBase matrix/profile object, save the result
    as a KBaseExperiments.PCAMatrix object, and build plotly/HTML reports."""

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path

        Silently succeeds if the directory already exists (mkdir -p semantics).
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            # tolerate "already exists"; re-raise anything else
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_run_pca_params(self, params):
        """
        _validate_run_pca_params: validates params passed to run_pca method

        Raises ValueError naming the first missing required key.
        """
        logging.info('start validating run_pca params')
        # check for required parameters
        for p in ['input_obj_ref', 'workspace_name', 'pca_matrix_name']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _df_to_list(self, df):
        """
        _df_to_list: convert Dataframe to FloatMatrix2D matrix data

        NOTE: mutates `df` in place (index/columns cast to str, NaN -> 0).
        """
        df.index = df.index.astype('str')
        df.columns = df.columns.astype('str')
        df.fillna(0, inplace=True)
        matrix_data = {
            'row_ids': df.index.tolist(),
            'col_ids': df.columns.tolist(),
            'values': df.values.tolist()
        }

        return matrix_data

    def _pca_df_to_excel(self, pca_df, components_df, result_dir,
                         pca_matrix_ref):
        """
        write PCA matrix df into excel

        The workbook is named after the PCAMatrix object; the component
        sheet is only written when components_df is present.
        """
        logging.info('writting pca data frame to excel file')
        pca_matrix_obj = self.dfu.get_objects(
            {'object_refs': [pca_matrix_ref]})['data'][0]
        pca_matrix_info = pca_matrix_obj['info']
        pca_matrix_name = pca_matrix_info[1]

        file_path = os.path.join(result_dir, pca_matrix_name + ".xlsx")

        writer = pd.ExcelWriter(file_path)

        pca_df.to_excel(writer, "principal_component_matrix", index=True)
        if components_df is not None:
            components_df.to_excel(writer, "component_variance_matrix",
                                   index=True)

        writer.close()

    def _Matrix2D_to_df(self, Matrix2D):
        """
        _Matrix2D_to_df: transform a FloatMatrix2D to data frame
        """
        index = Matrix2D.get('row_ids')
        columns = Matrix2D.get('col_ids')
        values = Matrix2D.get('values')

        df = pd.DataFrame(values, index=index, columns=columns)

        return df

    def _pca_to_df(self, pca_matrix_ref):
        """
        retrieve pca matrix ws object to pca_df

        Returns (pca_df, components_df); components_df may be None.
        When the original matrix is linked, its attribute-mapping columns are
        merged onto pca_df (one row per instance).
        """
        logging.info('converting pca matrix to data frame')
        pca_data = self.dfu.get_objects(
            {'object_refs': [pca_matrix_ref]})['data'][0]['data']

        rotation_matrix_data = pca_data.get('rotation_matrix')
        components_matrix_data = pca_data.get('components_matrix')
        explained_variance = pca_data.get('explained_variance')
        explained_variance_ratio = pca_data.get('explained_variance_ratio')
        singular_values = pca_data.get('singular_values')
        dimension = pca_data.get('pca_parameters').get('dimension')
        original_matrix_ref = pca_data.get('original_matrix_ref')

        pca_df = self._Matrix2D_to_df(rotation_matrix_data)

        components_df = None
        if components_matrix_data:
            components_df = self._Matrix2D_to_df(components_matrix_data)
            # variance/singular-value summaries are appended as extra rows
            components_df.loc['explained_variance'] = explained_variance
            components_df.loc[
                'explained_variance_ratio'] = explained_variance_ratio
            components_df.loc['singular_values'] = singular_values

        if original_matrix_ref:
            logging.info(
                'appending instance group information to pca data frame')
            obj_data = self.dfu.get_objects(
                {'object_refs': [original_matrix_ref]})['data'][0]['data']
            attributemapping_ref = obj_data.get(
                '{}_attributemapping_ref'.format(dimension))
            am_data = self.dfu.get_objects(
                {'object_refs': [attributemapping_ref]})['data'][0]['data']
            attributes = am_data.get('attributes')
            instances = am_data.get('instances')
            am_df = pd.DataFrame(data=list(instances.values()),
                                 columns=list(
                                     map(lambda x: x.get('attribute'),
                                         attributes)),
                                 index=instances.keys())
            pca_df = pca_df.merge(am_df, left_index=True, right_index=True,
                                  how='left', validate='one_to_one')

        return pca_df, components_df

    def _save_pca_matrix(self, workspace_name, input_obj_ref, pca_matrix_name,
                         rotation_matrix_df, components_df,
                         explained_variance, explained_variance_ratio,
                         singular_values, n_components, dimension):
        """Save PCA results as a KBaseExperiments.PCAMatrix object.

        Returns the object reference as "ws_id/obj_id/version".
        """
        logging.info('saving PCAMatrix')

        # workspace may be given by name (str) or by numeric id
        if not isinstance(workspace_name, int):
            ws_name_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            ws_name_id = workspace_name

        pca_data = {}

        pca_data.update(
            {'rotation_matrix': self._df_to_list(rotation_matrix_df)})
        pca_data.update({'components_matrix':
                         self._df_to_list(components_df)})
        pca_data.update({'explained_variance': explained_variance})
        pca_data.update({'explained_variance_ratio': explained_variance_ratio})
        pca_data.update({'singular_values': singular_values})
        pca_data.update({
            'pca_parameters': {
                'n_components': str(n_components),
                'dimension': str(dimension)
            }
        })
        pca_data.update({'original_matrix_ref': input_obj_ref})

        obj_type = 'KBaseExperiments.PCAMatrix'
        info = self.dfu.save_objects({
            "id": ws_name_id,
            "objects": [{
                "type": obj_type,
                "data": pca_data,
                "name": pca_matrix_name
            }]
        })[0]

        return "%s/%s/%s" % (info[6], info[0], info[4])

    def _creat_biplot(self, xs, ys, coeff, first_component, second_component,
                      bi_plot_path, labels=None):
        """Draw a matplotlib biplot (scores + loading arrows) to bi_plot_path.

        NOTE(review): method name keeps the original 'creat' spelling in case
        of external callers not visible here.
        """
        plt.clf()
        n = coeff.shape[0]
        # rescale scores into a unit box so arrows and points share one scale
        scalex = 1.0 / (xs.max() - xs.min())
        scaley = 1.0 / (ys.max() - ys.min())
        plt.scatter(xs * scalex, ys * scaley, s=5)
        for i in range(n):
            plt.arrow(0, 0, coeff[i, 0], coeff[i, 1], color='r', alpha=0.5)
            if labels is None:
                plt.text(coeff[i, 0] * 1.15, coeff[i, 1] * 1.15,
                         "Var" + str(i + 1),
                         color='green', ha='center', va='center')
            else:
                plt.text(coeff[i, 0] * 1.15, coeff[i, 1] * 1.15, labels[i],
                         color='green', ha='center', va='center')

        plt.xlabel("PC{}".format(first_component))
        plt.ylabel("PC{}".format(second_component))
        plt.grid()
        plt.savefig(bi_plot_path)

    def _pca_for_matrix(self, input_obj_ref, n_components, dimension):
        """
        _pca_for_matrix: perform PCA analysis for matrix object

        Returns (rotation_matrix_df, components_df, explained_variance,
        explained_variance_ratio, singular_values).
        Raises ValueError on a bad dimension or too many components.
        """
        data_matrix = self.data_util.fetch_data({
            'obj_ref': input_obj_ref
        }).get('data_matrix')

        data_df = pd.read_json(data_matrix)
        data_df.fillna(0, inplace=True)

        # PCA is run across rows; transpose when the caller asked for columns
        if dimension == 'col':
            data_df = data_df.T
        elif dimension != 'row':
            err_msg = 'Input dimension [{}] is not available.\n'.format(
                dimension)
            err_msg += 'Please choose either "col" or "row"'
            raise ValueError(err_msg)

        if n_components > min(data_df.index.size, data_df.columns.size):
            raise ValueError(
                'Number of components should be less than min(n_samples, n_features)'
            )

        # normalize sample
        # logging.info("Standardizing the matrix")
        # s_values = StandardScaler().fit_transform(data_df.values)

        # skip normalizing sample
        s_values = data_df.values

        # Projection to ND
        pca = PCA(n_components=n_components, whiten=True)
        principalComponents = pca.fit_transform(s_values)
        explained_variance = list(pca.explained_variance_)
        explained_variance_ratio = list(pca.explained_variance_ratio_)

        components = pca.components_
        singular_values = list(pca.singular_values_)

        col = list()
        for i in range(n_components):
            col.append('principal_component_{}'.format(i + 1))

        rotation_matrix_df = pd.DataFrame(data=principalComponents,
                                          columns=col,
                                          index=data_df.index)
        components_df = pd.DataFrame(data=components,
                                     columns=data_df.columns,
                                     index=col).transpose()

        rotation_matrix_df.fillna(0, inplace=True)

        return (rotation_matrix_df, components_df, explained_variance,
                explained_variance_ratio, singular_values)

    def _generate_pca_html_report(self, score_plots, loading_plots, bi_plots,
                                  n_components):
        """Assemble the per-pair plot HTML files into a single report.

        Copies each plot next to pca_report.html, embeds them as iframes in
        the template, uploads the packed directory to shock, and returns the
        single-element html_links list for KBaseReport.
        """
        logging.info('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'pca_report.html')

        visualization_content = ''
        biplot_content = ''
        loading_content = ''

        for score_plot in score_plots:
            shutil.copy2(
                score_plot,
                os.path.join(output_directory, os.path.basename(score_plot)))
            visualization_content += '<iframe height="900px" width="100%" '
            visualization_content += 'src="{}" '.format(
                os.path.basename(score_plot))
            visualization_content += 'style="border:none;"></iframe>\n<p></p>\n'

        for loading_plot in loading_plots:
            shutil.copy2(
                loading_plot,
                os.path.join(output_directory,
                             os.path.basename(loading_plot)))
            loading_content += '<iframe height="900px" width="100%" '
            loading_content += 'src="{}" '.format(
                os.path.basename(loading_plot))
            loading_content += 'style="border:none;"></iframe>\n<p></p>\n'

        for bi_plot in bi_plots:
            shutil.copy2(
                bi_plot,
                os.path.join(output_directory, os.path.basename(bi_plot)))
            biplot_content += '<iframe height="900px" width="100%" '
            biplot_content += 'src="{}" '.format(os.path.basename(bi_plot))
            biplot_content += 'style="border:none;"></iframe>\n<p></p>\n'

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__), 'templates',
                                 'pca_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Visualization_Content</p>', visualization_content)
                # replaces the literal token 'n_components' in the template
                report_template = report_template.replace(
                    'n_components', '{} Components'.format(n_components))
                report_template = report_template.replace(
                    '<p>BiPlot</p>', biplot_content)
                report_template = report_template.replace(
                    '<p>LoadingPlot</p>', loading_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id': report_shock_id,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report for ExpressionMatrix Cluster App'
        })

        return html_report

    def _generate_pca_report(self, pca_ref, score_plots, loading_plots,
                             bi_plots, workspace_name, n_components):
        """Create a KBaseReport for the saved PCA matrix and its plots.

        Returns {'report_name': ..., 'report_ref': ...}.
        """
        logging.info('creating report')

        output_html_files = self._generate_pca_html_report(
            score_plots, loading_plots, bi_plots, n_components)

        objects_created = list()
        objects_created.append({'ref': pca_ref, 'description': 'PCA Matrix'})

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 1050,
            'report_object_name': 'kb_pca_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _append_instance_group(self, plot_pca_matrix, obj_data, dimension):
        """Add an 'instance' column mapping each row to its attribute group.

        Works on a copy; returns the matrix unchanged (minus the copy) when
        the object has no row/col_mapping.
        """
        plot_pca_matrix = plot_pca_matrix.copy()

        if dimension == 'row':
            attribute_mapping = obj_data.get('row_mapping')
        elif dimension == 'col':
            attribute_mapping = obj_data.get('col_mapping')
        else:
            raise ValueError('Unexpected dimension')

        if not attribute_mapping:
            logging.warning(
                'Matrix object does not have {}_mapping attribute'.format(
                    dimension))
            # build matrix with unify color and shape
            return plot_pca_matrix
        else:
            # append instance col mapping from row/col_mapping
            plot_pca_matrix['instance'] = plot_pca_matrix.index.map(
                attribute_mapping)

        return plot_pca_matrix

    def _build_size_pca_matrix(self, plot_pca_matrix, obj_data, dimension,
                               attribute_name):
        """
        _build_size_pca_matrix: append attribute value to rotation_matrix

        Adds an 'attribute_value_size' column holding the value of
        `attribute_name` for each instance; raises ValueError when the
        attribute does not exist in the attribute mapping.
        """
        logging.info('appending attribute value for sizing to rotation matrix')

        plot_pca_matrix = plot_pca_matrix.copy()

        if dimension == 'row':
            attribute_mapping = obj_data.get('row_mapping')
            attribute_mapping_ref = obj_data.get('row_attributemapping_ref')
        elif dimension == 'col':
            attribute_mapping = obj_data.get('col_mapping')
            attribute_mapping_ref = obj_data.get('col_attributemapping_ref')
        else:
            raise ValueError('Unexpected dimension')

        if not attribute_mapping:
            logging.warning(
                'Matrix object does not have {}_mapping attribute'.format(
                    dimension))
            # build matrix with unify color and shape
            return plot_pca_matrix
        else:
            # append instance col mapping from row/col_mapping
            plot_pca_matrix['instance'] = plot_pca_matrix.index.map(
                attribute_mapping)

        res = self.dfu.get_objects({'object_refs':
                                    [attribute_mapping_ref]})['data'][0]
        attri_data = res['data']
        attri_name = res['info'][1]

        attributes = attri_data.get('attributes')

        # locate the requested attribute's position in the instance vectors
        attr_pos = None
        for idx, attribute in enumerate(attributes):
            if attribute.get('attribute') == attribute_name:
                attr_pos = idx
                break

        if attr_pos is None:
            raise ValueError('Cannot find attribute [{}] in [{}]'.format(
                attribute_name, attri_name))

        instances = attri_data.get('instances')

        plot_pca_matrix['attribute_value_size'] = None
        for instance_name, attri_values in instances.items():
            plot_pca_matrix.loc[
                plot_pca_matrix.instance == instance_name,
                ['attribute_value_size']] = attri_values[attr_pos]

        return plot_pca_matrix

    def _build_color_pca_matrix(self, plot_pca_matrix, obj_data, dimension,
                                attribute_name):
        """
        _build_color_pca_matrix: append attribute value to rotation_matrix

        Same as _build_size_pca_matrix but fills 'attribute_value_color'.
        """
        logging.info(
            'appending attribute value for grouping color to rotation matrix')

        plot_pca_matrix = plot_pca_matrix.copy()

        if dimension == 'row':
            attribute_mapping = obj_data.get('row_mapping')
            attribute_mapping_ref = obj_data.get('row_attributemapping_ref')
        elif dimension == 'col':
            attribute_mapping = obj_data.get('col_mapping')
            attribute_mapping_ref = obj_data.get('col_attributemapping_ref')
        else:
            raise ValueError('Unexpected dimension')

        if not attribute_mapping:
            logging.warning(
                'Matrix object does not have {}_mapping attribute'.format(
                    dimension))
            # build matrix with unify color and shape
            return plot_pca_matrix
        else:
            # append instance col mapping from row/col_mapping
            plot_pca_matrix['instance'] = plot_pca_matrix.index.map(
                attribute_mapping)

        res = self.dfu.get_objects({'object_refs':
                                    [attribute_mapping_ref]})['data'][0]
        attri_data = res['data']
        attri_name = res['info'][1]

        attributes = attri_data.get('attributes')

        attr_pos = None
        for idx, attribute in enumerate(attributes):
            if attribute.get('attribute') == attribute_name:
                attr_pos = idx
                break

        if attr_pos is None:
            raise ValueError('Cannot find attribute [{}] in [{}]'.format(
                attribute_name, attri_name))

        instances = attri_data.get('instances')

        plot_pca_matrix['attribute_value_color'] = None
        for instance_name, attri_values in instances.items():
            plot_pca_matrix.loc[
                plot_pca_matrix.instance == instance_name,
                ['attribute_value_color']] = attri_values[attr_pos]

        return plot_pca_matrix

    def _build_2_comp_trace(self, plot_pca_matrix, components_x, components_y):
        """Build plotly Scatter traces for one pair of principal components.

        Branches on which optional columns are present: both color and size,
        color only, size only, or neither (single unstyled trace).
        Zero sizes are replaced with sys.float_info.min because plotly
        area-sized markers cannot be exactly zero.

        NOTE(review): go.Marker / go.Line / go.Data are legacy plotly
        classes; kept as-is to match the plotly version this module pins.
        """
        traces = []

        if 'attribute_value_color' in plot_pca_matrix.columns \
                and 'attribute_value_size' in plot_pca_matrix.columns:

            maximum_marker_size = 10
            try:
                # plotly 'area' sizing convention for sizeref
                sizeref = 2. * float(
                    max(plot_pca_matrix['attribute_value_size'])) / (
                        maximum_marker_size**2)
            except Exception:
                print('failed to run _build_2_comp_trace')
                print(traceback.format_exc())
                print(sys.exc_info()[2])
                error_msg = "Failed to calculate data point value size."
                error_msg += "Some data value in your matrix is not numerical."
                raise ValueError(error_msg)

            for name in set(plot_pca_matrix.attribute_value_color):
                attribute_value_size = plot_pca_matrix.loc[plot_pca_matrix[
                    'attribute_value_color'].eq(name)].attribute_value_size
                size_list = list(
                    map(abs, list(map(float, attribute_value_size))))
                for idx, val in enumerate(size_list):
                    if val == 0:
                        size_list[idx] = sys.float_info.min
                trace = go.Scatter(
                    x=list(plot_pca_matrix.loc[plot_pca_matrix[
                        'attribute_value_color'].eq(name)][components_x]),
                    y=list(plot_pca_matrix.loc[plot_pca_matrix[
                        'attribute_value_color'].eq(name)][components_y]),
                    mode='markers',
                    name=name,
                    text=list(plot_pca_matrix.loc[plot_pca_matrix[
                        'attribute_value_color'].eq(name)].index),
                    textposition='bottom center',
                    marker=go.Marker(symbol='circle',
                                     sizemode='area',
                                     sizeref=sizeref,
                                     size=size_list,
                                     sizemin=2,
                                     line=go.Line(
                                         color='rgba(217, 217, 217, 0.14)',
                                         width=0.5),
                                     opacity=0.8))
                traces.append(trace)

        elif 'attribute_value_color' in plot_pca_matrix.columns:
            # one fixed-size trace per color group
            for name in set(plot_pca_matrix.attribute_value_color):
                trace = go.Scatter(
                    x=list(plot_pca_matrix.loc[plot_pca_matrix[
                        'attribute_value_color'].eq(name)][components_x]),
                    y=list(plot_pca_matrix.loc[plot_pca_matrix[
                        'attribute_value_color'].eq(name)][components_y]),
                    mode='markers',
                    name=name,
                    text=list(plot_pca_matrix.loc[plot_pca_matrix[
                        'attribute_value_color'].eq(name)].index),
                    textposition='bottom center',
                    marker=go.Marker(size=10,
                                     opacity=0.8,
                                     line=go.Line(
                                         color='rgba(217, 217, 217, 0.14)',
                                         width=0.5)))
                traces.append(trace)

        elif 'attribute_value_size' in plot_pca_matrix.columns:
            # size-only: one trace per instance group
            maximum_marker_size = 10
            try:
                sizeref = 2. * float(
                    max(plot_pca_matrix['attribute_value_size'])) / (
                        maximum_marker_size**2)
            except Exception:
                print('failed to run _build_2_comp_trace')
                print(traceback.format_exc())
                print(sys.exc_info()[2])
                error_msg = "Failed to calculate data point value size."
                error_msg += "Some data value in your matrix is not numerical."
                raise ValueError(error_msg)

            for name in set(plot_pca_matrix.instance):
                attribute_value_size = plot_pca_matrix.loc[
                    plot_pca_matrix['instance'].eq(name)].attribute_value_size
                size_list = list(
                    map(abs, list(map(float, attribute_value_size))))
                for idx, val in enumerate(size_list):
                    if val == 0:
                        size_list[idx] = sys.float_info.min
                trace = go.Scatter(
                    x=list(plot_pca_matrix.loc[plot_pca_matrix['instance'].eq(
                        name)][components_x]),
                    y=list(plot_pca_matrix.loc[plot_pca_matrix['instance'].eq(
                        name)][components_y]),
                    mode='markers',
                    name=name,
                    text=list(plot_pca_matrix.loc[
                        plot_pca_matrix['instance'].eq(name)].index),
                    textposition='bottom center',
                    marker=go.Marker(symbol='circle',
                                     sizemode='area',
                                     sizeref=sizeref,
                                     size=size_list,
                                     sizemin=2,
                                     line=go.Line(
                                         color='rgba(217, 217, 217, 0.14)',
                                         width=0.5),
                                     opacity=0.8))
                traces.append(trace)
        else:
            # no grouping columns: single plain score-plot trace
            trace = go.Scatter(x=list(plot_pca_matrix[components_x]),
                               y=list(plot_pca_matrix[components_y]),
                               mode='markers',
                               name='score plot',
                               text=list(plot_pca_matrix.index),
                               textposition='bottom center',
                               marker=go.Marker(
                                   size=10,
                                   opacity=0.8,
                                   line=go.Line(
                                       color='rgba(217, 217, 217, 0.14)',
                                       width=0.5)))
            traces.append(trace)

        return traces

    def _plot_score_pca_matrix(self, plot_pca_matrix, n_components):
        """Write one score-plot HTML per pair of components; return paths."""
        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_paths = []

        all_pairs = list(itertools.combinations(range(1, n_components + 1), 2))

        for pair in all_pairs:
            first_component = pair[0]
            second_component = pair[1]
            result_file_path = os.path.join(
                output_directory,
                'pca_score_plot_{}_{}.html'.format(first_component,
                                                   second_component))

            traces = self._build_2_comp_trace(
                plot_pca_matrix,
                'principal_component_{}'.format(first_component),
                'principal_component_{}'.format(second_component))

            data = go.Data(traces)
            layout = go.Layout(
                xaxis=go.XAxis(title='PC{}'.format(first_component),
                               showline=False),
                yaxis=go.YAxis(title='PC{}'.format(second_component),
                               showline=False))
            fig = go.Figure(data=data, layout=layout)

            plot(fig, filename=result_file_path)

            result_file_paths.append(result_file_path)

        return result_file_paths

    def _plot_loading_pca_matrix(self, components_df, n_components):
        """Write one loading-plot HTML per pair of components; return paths.

        Each loading is drawn as a line from the origin (the inserted
        0/0/'0' points) to the component coefficients.
        """
        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_paths = []

        all_pairs = list(itertools.combinations(range(1, n_components + 1), 2))

        for pair in all_pairs:
            first_component = pair[0]
            second_component = pair[1]
            result_file_path = os.path.join(
                output_directory,
                'pca_loading_plot_{}_{}.html'.format(first_component,
                                                     second_component))

            traces = list()

            data = go.Data(traces)
            layout = go.Layout(
                xaxis=go.XAxis(title='PC{}'.format(first_component),
                               showline=False),
                yaxis=go.YAxis(title='PC{}'.format(second_component),
                               showline=False))
            fig = go.Figure(data=data, layout=layout)

            coeff = list()
            coeff.append(components_df['principal_component_{}'.format(
                first_component)])
            coeff.append(components_df['principal_component_{}'.format(
                second_component)])
            coeff = np.transpose(coeff)

            loading_x = list()
            loading_y = list()
            loading_text = list()
            for idx, position in enumerate(coeff):
                # origin anchor before every loading point
                loading_x.append(0)
                loading_y.append(0)
                loading_text.append('0')

                loading_x.append(position[0])
                loading_y.append(position[1])
                loading_text.append(components_df.index[idx])

            fig.add_trace(
                go.Scatter(x=loading_x,
                           y=loading_y,
                           mode="lines+markers",
                           name="loading plot",
                           text=loading_text,
                           textposition="bottom center"))

            plot(fig, filename=result_file_path)

            result_file_paths.append(result_file_path)

        return result_file_paths

    def _plot_biplot_pca_matrix(self, plot_pca_matrix, components_df,
                                n_components):
        """Write one biplot HTML (scores + loadings) per component pair."""
        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_paths = []

        all_pairs = list(itertools.combinations(range(1, n_components + 1), 2))

        for pair in all_pairs:
            first_component = pair[0]
            second_component = pair[1]
            result_file_path = os.path.join(
                output_directory,
                'pca_biplot_plot_{}_{}.html'.format(first_component,
                                                    second_component))

            traces = self._build_2_comp_trace(
                plot_pca_matrix,
                'principal_component_{}'.format(first_component),
                'principal_component_{}'.format(second_component))

            data = go.Data(traces)
            layout = go.Layout(
                xaxis=go.XAxis(title='PC{}'.format(first_component),
                               showline=False),
                yaxis=go.YAxis(title='PC{}'.format(second_component),
                               showline=False))
            fig = go.Figure(data=data, layout=layout)

            coeff = list()
            coeff.append(components_df['principal_component_{}'.format(
                first_component)])
            coeff.append(components_df['principal_component_{}'.format(
                second_component)])
            coeff = np.transpose(coeff)

            loading_x = list()
            loading_y = list()
            loading_text = list()
            for idx, position in enumerate(coeff):
                loading_x.append(0)
                loading_y.append(0)
                loading_text.append('0')

                loading_x.append(position[0])
                loading_y.append(position[1])
                loading_text.append(components_df.index[idx])

            fig.add_trace(
                go.Scatter(x=loading_x,
                           y=loading_y,
                           mode="lines+markers",
                           name="loading plot",
                           text=loading_text,
                           textposition="bottom center"))

            plot(fig, filename=result_file_path)

            result_file_paths.append(result_file_path)

        return result_file_paths

    def _validate_pca_matrix(self, obj_data, dimension, color_marker_by,
                             scale_size_by):
        """Ensure obj_data carries a row/col attribute mapping when plotting
        options require one; backfills mappings for Functional Profiles.

        Mutates and returns obj_data. Raises ValueError when color/size
        grouping is requested but no attribute mapping exists.
        """
        if dimension == 'row':
            attribute_mapping = obj_data.get('row_mapping')
            attributemapping_ref = obj_data.get('row_attributemapping_ref')
            if not attributemapping_ref:
                # handle Functional Profile
                base_object_ref = obj_data.get('base_object_ref')
                base_object_data = self.dfu.get_objects(
                    {'object_refs': [base_object_ref]})['data'][0]['data']
                attributemapping_ref = base_object_data.get(
                    'row_attributemapping_ref')
                obj_data['row_attributemapping_ref'] = attributemapping_ref
            if not attribute_mapping and attributemapping_ref:
                am_data = self.dfu.get_objects(
                    {'object_refs': [attributemapping_ref]})['data'][0]['data']
                attribute_mapping = {x: x for x in am_data['instances'].keys()}
                obj_data['row_mapping'] = attribute_mapping
        elif dimension == 'col':
            attribute_mapping = obj_data.get('col_mapping')
            attributemapping_ref = obj_data.get('col_attributemapping_ref')
            if not attributemapping_ref:
                # handle Functional Profile
                base_object_ref = obj_data.get('base_object_ref')
                base_object_data = self.dfu.get_objects(
                    {'object_refs': [base_object_ref]})['data'][0]['data']
                # NOTE(review): this 'col' branch reads and writes
                # 'row_attributemapping_ref' — looks like a copy-paste from
                # the 'row' branch above; confirm against the
                # FunctionalProfile schema before changing.
                attributemapping_ref = base_object_data.get(
                    'row_attributemapping_ref')
                obj_data['row_attributemapping_ref'] = attributemapping_ref
            if not attribute_mapping and attributemapping_ref:
                am_data = self.dfu.get_objects(
                    {'object_refs': [attributemapping_ref]})['data'][0]['data']
                attribute_mapping = {x: x for x in am_data['instances'].keys()}
                obj_data['col_mapping'] = attribute_mapping
        else:
            raise ValueError('Unexpected dimension')

        if not attribute_mapping:
            if (color_marker_by and color_marker_by.get('attribute_color')[0]) or \
                    (scale_size_by and scale_size_by.get('attribute_size')[0]):
                raise ValueError(
                    'Matrix object is not associated with any {} attribute mapping'
                    .format(dimension))

        return obj_data

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']

        self.data_util = DataUtil(config)
        self.dfu = DataFileUtil(self.callback_url)

        # headless backend: matplotlib must not require a display in the job
        plt.switch_backend('agg')

    def run_pca(self, params):
        """
        perform PCA analysis on matrix

        input_obj_ref: object reference of a matrix
        workspace_name: the name of the workspace
        pca_matrix_name: name of PCA (KBaseExperiments.PCAMatrix) object
        n_components - number of components (default 2)
        dimension: compute correlation on column or row, one of ['col', 'row']

        Returns {'pca_ref': ...} plus the report name/ref.
        """
        # NOTE(review): log label says 'NetworkUtil.build_network' — looks
        # copy-pasted; runtime string left untouched here.
        logging.info('--->\nrunning NetworkUtil.build_network\n' +
                     'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_pca_params(params)

        input_obj_ref = params.get('input_obj_ref')
        workspace_name = params.get('workspace_name')
        pca_matrix_name = params.get('pca_matrix_name')

        n_components = int(params.get('n_components', 2))
        dimension = params.get('dimension', 'col')

        res = self.dfu.get_objects({'object_refs': [input_obj_ref]})['data'][0]
        obj_data = res['data']
        obj_type = res['info'][2]

        obj_data = self._validate_pca_matrix(obj_data, dimension,
                                             params.get('color_marker_by'),
                                             params.get('scale_size_by'))

        if "KBaseMatrices" in obj_type or 'KBaseProfile' in obj_type:
            (rotation_matrix_df, components_df, explained_variance,
             explained_variance_ratio,
             singular_values) = self._pca_for_matrix(input_obj_ref,
                                                     n_components, dimension)
        else:
            err_msg = 'Ooops! [{}] is not supported.\n'.format(obj_type)
            err_msg += 'Please supply KBaseMatrices or KBaseProfile object'
            raise ValueError(err_msg)

        pca_ref = self._save_pca_matrix(workspace_name, input_obj_ref,
                                        pca_matrix_name, rotation_matrix_df,
                                        components_df, explained_variance,
                                        explained_variance_ratio,
                                        singular_values, n_components,
                                        dimension)

        plot_pca_matrix = self._append_instance_group(
            rotation_matrix_df.copy(), obj_data, dimension)

        if params.get('color_marker_by'):
            plot_pca_matrix = self._build_color_pca_matrix(
                plot_pca_matrix, obj_data, dimension,
                params.get('color_marker_by').get('attribute_color')[0])

        if params.get('scale_size_by'):
            plot_pca_matrix = self._build_size_pca_matrix(
                plot_pca_matrix, obj_data, dimension,
                params.get('scale_size_by').get('attribute_size')[0])

        returnVal = {'pca_ref': pca_ref}

        report_output = self._generate_pca_report(
            pca_ref,
            self._plot_score_pca_matrix(plot_pca_matrix, n_components),
            self._plot_loading_pca_matrix(components_df, n_components),
            self._plot_biplot_pca_matrix(plot_pca_matrix, components_df,
                                         n_components), workspace_name,
            n_components)

        returnVal.update(report_output)

        return returnVal

    def export_pca_matrix_excel(self, params):
        """
        export PCAMatrix as Excel

        params['input_ref']: PCAMatrix object reference.
        Returns {'shock_id': ...} for the packaged download.
        """
        logging.info('start exporting pca matrix')
        pca_matrix_ref = params.get('input_ref')

        pca_df, components_df = self._pca_to_df(pca_matrix_ref)

        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)

        self._pca_df_to_excel(pca_df, components_df, result_dir,
                              pca_matrix_ref)

        package_details = self.dfu.package_for_download({
            'file_path': result_dir,
            'ws_refs': [pca_matrix_ref]
        })

        return {'shock_id': package_details['shock_id']}
class Utils:
    """Convert KBase FBAModel objects to SBML files and package exports.

    Requires a config dict with workspace/callback/shock/service-wizard URLs,
    an auth token, and a scratch directory.
    """

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.srv_wiz_url = config['srv-wiz-url']
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(self.callback_url)

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present.
        Warns if unexpected parameters appear.

        :param params: dict of supplied parameters
        :param expected: iterable of required keys
        :param opt_param: iterable of allowed optional keys
        :raises ValueError: when any required key is missing
        """
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning(
                    "Unexpected parameter {} supplied".format(param))

    def _ws_obj_to_cobra(self, ref):
        """Fetch an FBAModel object and build a cobra model from it.

        :param ref: workspace reference of the FBAModel
        :returns: (object_name, cobra_model)
        """
        ret = self.dfu.get_objects({'object_refs': [ref]})['data'][0]
        name = ret['info'][1]

        # fbamodel object wraps json data
        fbamodel = KBaseFBAModel(ret['data'])
        builder = KBaseFBAModelToCobraBuilder(fbamodel)

        if 'genome_ref' in ret['data']:
            logging.info(f"Annotating model with genome information: {ret['data']['genome_ref']}")
            genome = self.dfu.get_objects(
                {'object_refs': [ret['data']['genome_ref']]})['data'][0]['data']
            # adding Genome to the Builder
            builder.with_genome(KBaseGenome(genome))

        # converts to cobra model object with builder
        model = builder.build()
        # annotate against the locally bundled ModelSEED database
        modelseed = cobrakbase.modelseed.from_local('/kb/module/data/')
        print(cobrakbase.annotate_model_with_modelseed(model, modelseed))
        return name, model

    def to_sbml(self, params):
        """Convert a FBAModel to a SBML file

        :param params: dict with 'input_ref' (FBAModel ref) and
            'destination_dir' (directory for the .xml output)
        :returns: (object_name, {'file_path': path_to_sbml})
        """
        files = {}
        _id, cobra_model = self._ws_obj_to_cobra(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".xml")
        cobra.io.write_sbml_model(cobra_model, files['file_path'])

        return _id, files

    def export(self, file, name, input_ref):
        """Saves a set of files to SHOCK for export

        :param file: path of the file to package
        :param name: prefix for the unique export directory
        :param input_ref: workspace ref recorded with the package
        :returns: {'shock_id': ...} of the packaged download
        """
        export_package_dir = os.path.join(self.scratch,
                                          name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        shutil.move(file,
                    os.path.join(export_package_dir, os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [input_ref]
        })

        return {'shock_id': package_details['shock_id']}
class AttributesUtil:
    """Utilities for creating, appending to, and exporting
    KBaseExperiments.AttributeMapping objects from user files (plain
    tabular or ISA format)."""

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.srv_wiz_url = config['srv-wiz-url']
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(self.callback_url)
        self.kbse = KBaseSearchEngine(config['search-url'])
        self.data_util = DataUtil(config)
        self.wsClient = workspaceService(self.ws_url, token=self.token)
        self.DEFAULT_ONTOLOGY_ID = "Custom:Term"
        self.DEFAULT_UNIT_ID = "Custom:Unit"
        self.ONT_LABEL_DEL = " - "
        self.ONT_TERM_DEL = ":"

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present. Warns if unexpected parameters appear"""
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError(
                "Required keys {} not in supplied parameters".format(
                    ", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning(
                    "Unexpected parameter {} supplied".format(param))

    def file_to_attribute_mapping(self, params):
        """Convert a user supplied file to an AttributeMapping object.

        Accepts either a scratch path ('input_file_path') or a shock node
        ('input_shock_id'); saves the object and returns its reference.
        """
        if 'input_file_path' in params:
            scratch_file_path = params['input_file_path']
        elif 'input_shock_id' in params:
            scratch_file_path = self.dfu.shock_to_file({
                'shock_id': params['input_shock_id'],
                'file_path': self.scratch
            }).get('file_path')
        else:
            raise ValueError(
                "Must supply either a input_shock_id or input_file_path")
        attr_mapping = self._file_to_am_obj(scratch_file_path)
        info = self.dfu.save_objects({
            "id": params['output_ws_id'],
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": attr_mapping,
                "name": params['output_obj_name']
            }]
        })[0]
        return {
            "attribute_mapping_ref": "%s/%s/%s" % (info[6], info[0], info[4])
        }

    def append_file_to_attribute_mapping(self, staging_file_subdir_path,
                                         old_am_ref, output_ws_id,
                                         new_am_name=None):
        """Append an attribute mapping file to an existing AttributeMapping object.

        A timestamp is appended to the old object name when no new name is given.
        Returns {'attribute_mapping_ref': <ref of the new object>}.
        """
        download_staging_file_params = {
            'staging_file_subdir_path': staging_file_subdir_path
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        append_am_data = self._file_to_am_obj(scratch_file_path)

        old_am_obj = self.dfu.get_objects({'object_refs': [old_am_ref]})['data'][0]
        old_am_info = old_am_obj['info']
        old_am_name = old_am_info[1]
        old_am_data = old_am_obj['data']

        new_am_data = self._check_and_append_am_data(old_am_data, append_am_data)

        if not new_am_name:
            current_time = time.localtime()
            new_am_name = old_am_name + time.strftime('_%H_%M_%S_%Y_%m_%d',
                                                      current_time)

        info = self.dfu.save_objects({
            "id": output_ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": new_am_data,
                "name": new_am_name
            }]
        })[0]
        return {
            "attribute_mapping_ref": "%s/%s/%s" % (info[6], info[0], info[4])
        }

    def update_matrix_attribute_mapping(self, params):
        """Replace the row/col attribute mapping of a matrix with an appended
        version built from a staging file, save both objects and report."""
        dimension = params.get('dimension')
        if dimension not in ['col', 'row']:
            raise ValueError('Please use "col" or "row" for input dimension')

        workspace_name = params.get('workspace_name')
        old_matrix_ref = params.get('input_matrix_ref')
        old_matrix_obj = self.dfu.get_objects(
            {'object_refs': [old_matrix_ref]})['data'][0]
        old_matrix_info = old_matrix_obj['info']
        old_matrix_data = old_matrix_obj['data']

        old_am_ref = old_matrix_data.get(
            '{}_attributemapping_ref'.format(dimension))

        # workspace_name may actually be a numeric workspace id already
        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        if not old_am_ref:
            raise ValueError(
                'Matrix object does not have {} attribute mapping'.format(
                    dimension))

        new_am_ref = self.append_file_to_attribute_mapping(
            params['staging_file_subdir_path'], old_am_ref, workspace_id,
            params['output_am_obj_name'])['attribute_mapping_ref']

        old_matrix_data['{}_attributemapping_ref'.format(
            dimension)] = new_am_ref

        info = self.dfu.save_objects({
            "id": workspace_id,
            "objects": [{
                "type": old_matrix_info[2],
                "data": old_matrix_data,
                "name": params['output_matrix_obj_name']
            }]
        })[0]

        new_matrix_obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])

        objects_created = [{
            'ref': new_am_ref,
            'description': 'Updated Attribute Mapping'
        }, {
            'ref': new_matrix_obj_ref,
            'description': 'Updated Matrix'
        }]

        report_params = {
            'message': '',
            'objects_created': objects_created,
            'workspace_name': workspace_name,
            'report_object_name':
            'import_matrix_from_biom_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        return {
            'new_matrix_obj_ref': new_matrix_obj_ref,
            'new_attribute_mapping_ref': new_am_ref,
            'report_name': output['name'],
            'report_ref': output['ref']
        }

    def _check_and_append_am_data(self, old_am_data, append_am_data):
        """Merge two attribute-mapping datas, rejecting duplicate attributes
        and appended mappings that do not cover every existing instance."""
        exclude_keys = {'attributes', 'instances'}
        new_am_data = {
            k: old_am_data[k]
            for k in set(old_am_data) - exclude_keys
        }

        old_attrs = old_am_data.get('attributes')
        old_insts = old_am_data.get('instances')

        append_attrs = append_am_data.get('attributes')
        append_insts = append_am_data.get('instances')

        # checking duplicate attributes
        old_attrs_names = [old_attr.get('attribute') for old_attr in old_attrs]
        append_attrs_names = [
            append_attr.get('attribute') for append_attr in append_attrs
        ]

        duplicate_attrs = set(old_attrs_names).intersection(append_attrs_names)

        if duplicate_attrs:
            error_msg = 'Duplicate attribute mappings: [{}]'.format(
                duplicate_attrs)
            raise ValueError(error_msg)

        # checking missing instances
        missing_inst = old_insts.keys() - append_insts.keys()

        if missing_inst:
            error_msg = 'Appended attribute mapping misses [{}] instances'.format(
                missing_inst)
            raise ValueError(error_msg)

        new_attrs = old_attrs + append_attrs
        new_am_data['attributes'] = new_attrs

        new_insts = deepcopy(old_insts)

        for inst_name, val in new_insts.items():
            append_val = append_insts.get(inst_name)
            val.extend(append_val)

        new_am_data['instances'] = new_insts

        return new_am_data

    def _am_data_to_df(self, data):
        """
        Converts a compound set object data to a dataframe
        """
        attributes = pd.DataFrame(data['attributes'])
        # BUG FIX: DataFrame.rename returns a new frame; the original code
        # discarded the result, so the prettified column names never applied.
        attributes = attributes.rename(
            columns=lambda x: x.replace("ont", "ontology").
            capitalize().replace("_", " "))
        instances = pd.DataFrame(data['instances'])
        am_df = attributes.join(instances)

        return am_df

    def _clusterset_data_to_df(self, data):
        """
        Converts a cluster set object data to a dataframe
        """
        original_matrix_ref = data.get('original_data')
        data_matrix = self.data_util.fetch_data({
            'obj_ref': original_matrix_ref
        }).get('data_matrix')

        data_df = pd.read_json(data_matrix)
        clusters = data.get('clusters')

        id_name_list = [
            list(cluster.get('id_to_data_position').keys())
            for cluster in clusters
        ]
        id_names = [item for sublist in id_name_list for item in sublist]

        if set(data_df.columns.tolist()) == set(
                id_names):  # cluster is based on columns
            data_df = data_df.T

        cluster_names = [None] * data_df.index.size

        cluster_id = 0
        for cluster in clusters:
            item_ids = list(cluster.get('id_to_data_position').keys())
            item_idx = [data_df.index.get_loc(item_id) for item_id in item_ids]

            for idx in item_idx:
                cluster_names[idx] = cluster_id

            cluster_id += 1

        data_df['cluster'] = cluster_names

        return data_df

    def _ws_obj_to_df(self, input_ref):
        """Converts workspace obj to a DataFrame"""
        res = self.dfu.get_objects({'object_refs': [input_ref]})['data'][0]
        name = res['info'][1]

        obj_type = res['info'][2]

        if "KBaseExperiments.AttributeMapping" in obj_type:
            cs_df = self._am_data_to_df(res['data'])
        elif "KBaseExperiments.ClusterSet" in obj_type:
            cs_df = self._clusterset_data_to_df(res['data'])
        else:
            err_msg = 'Ooops! [{}] is not supported.\n'.format(obj_type)
            err_msg += 'Please supply KBaseExperiments.AttributeMapping or KBaseExperiments.ClusterSet'
            # BUG FIX: previously raised the literal string "err_msg".
            raise ValueError(err_msg)

        return name, cs_df, obj_type

    def _file_to_am_obj(self, scratch_file_path):
        """Parse a user file (Excel or delimited text) into an
        AttributeMapping data dict, auto-detecting plain vs ISA layout."""
        try:
            df = pd.read_excel(scratch_file_path, dtype='str')
        except XLRDError:
            df = pd.read_csv(scratch_file_path, sep=None, dtype='str')
        df = df.replace('nan', '')
        if df.columns[1].lower() == "attribute ontology id":
            am_obj = self._df_to_am_obj(df)
        else:
            am_obj = self._isa_df_to_am_object(df)
        return am_obj

    def _df_to_am_obj(self, am_df):
        """Converts a dataframe from a user file to a compound set object"""
        if not len(am_df):
            raise ValueError("No attributes in supplied files")

        attribute_df = am_df.filter(regex="[Uu]nit|[Aa]ttribute")
        instance_df = am_df.drop(attribute_df.columns, axis=1)
        if not len(instance_df.columns):
            raise ValueError(
                "Unable to find any instance columns in supplied file")

        attribute_df.rename(
            columns=lambda x: x.lower().replace(" ontology ", "_ont_").strip(),
            inplace=True)
        if "attribute" not in attribute_df.columns:
            raise ValueError(
                "Unable to find a 'attribute' column in supplied file")
        attribute_df['source'] = 'upload'
        attribute_fields = ('attribute', 'unit', 'attribute_ont_id',
                            'unit_ont_id', 'source')
        attributes = attribute_df.filter(
            items=attribute_fields).to_dict('records')
        print(attributes)
        self._validate_attribute_values(
            am_df.set_index(attribute_df.attribute).iterrows())

        attribute_mapping = {
            'ontology_mapping_method': "User Curation",
            'attributes': [self._add_ontology_info(f) for f in attributes],
            'instances': instance_df.to_dict('list')
        }

        return attribute_mapping

    def _isa_df_to_am_object(self, isa_df):
        """Convert an ISA-format dataframe to an AttributeMapping data dict.

        Picks the first unique column among 'Sample Name', 'Assay Name' and
        the leftmost column as the instance ID index.
        """
        skip_columns = {
            'Raw Data File', 'Derived Data File', 'Array Data File',
            'Image File'
        }
        if 'Sample Name' in isa_df.columns and not any(
                isa_df['Sample Name'].duplicated()):
            isa_df.set_index('Sample Name', inplace=True)
        elif 'Assay Name' in isa_df.columns and not any(
                isa_df['Assay Name'].duplicated()):
            isa_df.set_index('Assay Name', inplace=True)
        elif not any(isa_df[isa_df.columns[0]].duplicated()):
            logging.warning(f'Using {isa_df.columns[0]} as ID column')
            isa_df.set_index(isa_df.columns[0], inplace=True)
        else:
            raise ValueError(
                "Unable to detect an ID column that was unique for each row. "
                f"Considered 'Sample Names', 'Assay Names' and {isa_df.columns[0]}"
            )
        # items() replaces the removed DataFrame.iteritems() (pandas >= 2.0)
        self._validate_attribute_values(isa_df.items())

        attribute_mapping = {
            'ontology_mapping_method': "User Curation - ISA format"
        }
        attribute_mapping[
            'attributes'], new_skip_cols = self._get_attributes_from_isa(
                isa_df, skip_columns)
        reduced_isa = isa_df.drop(columns=new_skip_cols, errors='ignore')
        attribute_mapping['instances'] = reduced_isa.T.to_dict('list')

        return attribute_mapping

    def _validate_attribute_values(self, attribute_series):
        """Run any matching AttributeValidation validator over each attribute's
        values; log all failures and raise once at the end."""
        errors = {}
        for attr, vals in attribute_series:
            try:
                validator = getattr(AttributeValidation, attr)
                attr_errors = validator(vals)
                if attr_errors:
                    errors[attr] = attr_errors
            except AttributeError:
                # no validator defined for this attribute name
                continue

        if errors:
            for attr, attr_errors in errors.items():
                # BUG FIX: the f-string and "\n" literals were implicitly
                # concatenated, making the whole message the join *separator*.
                logging.error(
                    f'Attribute {attr} had the following validation errors:\n'
                    + "\n".join(attr_errors) + '\n')
            raise ValueError(
                f'The following attributes failed validation: {", ".join(errors)}'
                f'\n See the log for details')

    def _get_attributes_from_isa(self, isa_df, skip_columns):
        """Build attribute dicts from ISA columns, folding the trailing
        'Term Source REF' / 'Term Accession Number' / 'Unit' helper columns
        into the preceding attribute. Returns (attributes, columns_to_skip)."""
        attributes = []
        # associate attribute columns with the other columns that relate to them
        for i, col in enumerate(isa_df.columns):
            if col.startswith('Term Source REF'):
                skip_columns.add(col)
                last_attr = attributes[-1]
                if '_unit' in last_attr:
                    last_attr['_unit_ont'] = col
                else:
                    last_attr['_val_ont'] = col

            elif col.startswith('Term Accession Number'):
                # If the term Accession is a web link only grab the last bit
                # Similarly, sometimes the number is prefixed with the term source e.x. UO_0000012
                isa_df[col] = isa_df[col].map(
                    lambda x: x.split("/")[-1].split("_")[-1])
                skip_columns.add(col)
                last_attr = attributes[-1]
                if '_unit' in last_attr:
                    last_attr['_unit_accession'] = col
                else:
                    last_attr['_val_accession'] = col

            elif col.startswith('Unit'):
                skip_columns.add(col)
                last_attr = attributes[-1]
                if last_attr.get('unit'):
                    raise ValueError(
                        "More than one unit column is supplied for attribute {}"
                        .format(last_attr['attribute']))
                last_attr['_unit'] = col

            elif col not in skip_columns:
                split_col = col.split("|", 1)
                if len(split_col) > 1:
                    attributes.append({
                        "attribute": split_col[0],
                        "attribute_ont_id": split_col[1],
                        "source": "upload"
                    })
                else:
                    attributes.append({"attribute": col, "source": "upload"})

        # handle the categories for each attribute
        for i, attribute in enumerate(attributes):
            if '_val_accession' in attribute:
                category_df = isa_df[[
                    attribute['attribute'],
                    attribute.pop('_val_ont'),
                    attribute.pop('_val_accession')
                ]].drop_duplicates()
                category_df[
                    'attribute_ont_id'] = category_df.iloc[:, 1].str.cat(
                        category_df.iloc[:, 2], ":")
                category_df['value'] = category_df[attribute['attribute']]
                cats = category_df.set_index(attribute['attribute'])[[
                    'value', 'attribute_ont_id'
                ]].to_dict('index')
                attribute['categories'] = {
                    k: self._add_ontology_info(v)
                    for k, v in cats.items()
                }

            if '_unit' in attribute:
                units = isa_df[attribute.pop('_unit')].unique()
                if len(units) > 1:
                    raise ValueError(
                        "More than one unit type is supplied for attribute {}: {}"
                        .format(attribute['attribute'], units))
                attribute['unit'] = units[0]

                if '_unit_ont' in attribute:
                    unit_ont = isa_df[attribute.pop('_unit_ont')].str.cat(
                        isa_df[attribute.pop('_unit_accession')], ":").unique()
                    # BUG FIX: previously tested len(units) instead of the
                    # unit ontology values being validated here.
                    if len(unit_ont) > 1:
                        raise ValueError(
                            "More than one unit ontology is supplied for attribute "
                            "{}: {}".format(attribute['attribute'], unit_ont))
                    attribute['unit_ont_id'] = unit_ont[0]

            attributes[i] = self._add_ontology_info(attribute)

        return attributes, skip_columns

    def _search_ontologies(self, term, closest=False):
        """
        Match to an existing KBase ontology term
        :param term: Test to match
        :param closest: if false, term must exactly match an ontology ID
        :return: dict(ontology_ref, id) or None if no match was found
        """
        params = {
            "object_types": ["OntologyTerm"],
            "match_filter": {
                "lookup_in_keys": {
                    "id": {
                        "value": term
                    }
                }
            },
            "access_filter": {
                "with_private": 0,
                "with_public": 1
            },
            "pagination": {
                "count": 1
            },
            "post_processing": {
                "skip_data": 1
            }
        }
        if closest:
            params['match_filter'] = {"full_text_in_all": term}
        res = self.kbse.search_objects(params)
        if not res['objects']:
            return None
        term = res['objects'][0]
        return {
            "ontology_ref": term['guid'].split(":")[1],
            "id": term['key_props']['id']
        }

    def _add_ontology_info(self, attribute):
        """Searches KBASE ontologies for terms matching the user supplied
        attributes and units. Add the references if found"""
        optionals = {
            "unit",
            "unit_ont_id",
            "unit_ont_ref",
        }
        attribute = {
            k: v
            for k, v in attribute.items() if k not in optionals or v != ""
        }
        ont_info = self._search_ontologies(
            attribute.get('attribute_ont_id', "").replace("_", ":"))
        if ont_info:
            attribute['attribute_ont_ref'] = ont_info['ontology_ref']
            attribute['attribute_ont_id'] = ont_info['id']
        elif not attribute.get(
                'attribute_ont_id') or attribute['attribute_ont_id'] == ":":
            attribute.pop('attribute_ont_id', None)

        if attribute.get('unit'):
            ont_info = self._search_ontologies(
                attribute.get('unit_ont_id', '').replace("_", ":"))
            if ont_info:
                attribute['unit_ont_ref'] = ont_info['ontology_ref']
                attribute['unit_ont_id'] = ont_info['id']
            # BUG FIX: the fallback previously checked 'attribute_ont_id'
            # instead of the unit's own ontology id.
            elif not attribute.get(
                    'unit_ont_id') or attribute['unit_ont_id'] == ":":
                attribute.pop('unit_ont_id', None)

        return attribute

    def to_tsv(self, params):
        """Convert an compound set to TSV file"""
        files = {}

        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".tsv")
        df.to_csv(files['file_path'], sep="\t", index=False)

        return _id, files

    def to_excel(self, params):
        """Convert an compound set to Excel file"""
        files = {}

        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".xlsx")

        # context manager replaces the deprecated/removed writer.save()
        with pd.ExcelWriter(files['file_path']) as writer:
            if "KBaseExperiments.AttributeMapping" in obj_type:
                df.to_excel(writer, sheet_name="Attributes", index=False)
            elif "KBaseExperiments.ClusterSet" in obj_type:
                df.to_excel(writer, sheet_name="ClusterSet", index=True)
            # else is checked in `_ws_obj_to_df`

        return _id, files

    def export(self, file, name, input_ref):
        """Saves a set of files to SHOCK for export"""
        export_package_dir = os.path.join(self.scratch,
                                          name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        shutil.move(file,
                    os.path.join(export_package_dir, os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [input_ref]
        })

        return {'shock_id': package_details['shock_id']}
class FeatureSetDownload:
    """Exports a KBase FeatureSet object as a TSV file and packages it for download."""

    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.gsu = GenomeSearchUtil(os.environ['SDK_CALLBACK_URL'])
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        self.ws = Workspace(config["workspace-url"])

    @staticmethod
    def validate_params(params, expected={"workspace_name", "featureset_name"}):
        """Raise ValueError if any expected key is missing from params.

        The mutable default `expected` set is only read, never mutated.
        """
        expected = set(expected)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))

    def to_tsv(self, params):
        """Write the FeatureSet referenced by params['featureset_ref'] to a
        TSV file under scratch. Returns (featureset_name, {'file_path': path})."""
        working_dir = os.path.join(self.scratch,
                                   'featureset-download-'+str(uuid.uuid4()))
        os.makedirs(working_dir)
        header = ['Feature Id', 'Aliases', 'Genome', 'Type', 'Function']
        fs_name, fs_dicts = self.make_featureset_dict(params['featureset_ref'])
        files = {'file_path': "{}/{}.tsv".format(working_dir, fs_name)}
        # BUG FIX: the file handle was opened inline and never closed; a
        # context manager guarantees the TSV is flushed and closed.
        with open(files['file_path'], 'w') as tsv_file:
            writer = csv.DictWriter(tsv_file, header, delimiter='\t',
                                    lineterminator='\n')
            writer.writeheader()
            for feat in fs_dicts:
                writer.writerow(feat)
        return fs_name, files

    def make_featureset_dict(self, fs_ref):
        """Fetch the FeatureSet and resolve each feature through the genome
        search service. Returns (featureset_name, list of row dicts)."""
        features = []
        ret = self.dfu.get_objects({'object_refs': [fs_ref]})['data'][0]
        feat_set = ret['data']
        fs_name = ret['info'][1]

        # group feature ids by the genome they come from so each genome is
        # fetched and searched only once
        feat_by_genome = defaultdict(list)
        for k, v in feat_set['elements'].items():
            feat_by_genome[v[0]].append(k)

        for genome, fids in feat_by_genome.items():
            genome_name = self.ws.get_object_info3({'objects': [{'ref': genome}]})['infos'][0][1]
            res = self.gsu.search({'ref': genome,
                                   'structured_query': {'feature_id': fids},
                                   'sort_by': [['contig_id', 1]],
                                   'start': 0,
                                   'limit': len(fids)
                                   })

            for feat in res['features']:
                features.append({'Feature Id': feat['feature_id'],
                                 'Aliases': ", ".join(sorted(feat['aliases'].keys())),
                                 'Genome': "{} ({})".format(genome_name, genome),
                                 'Type': feat['feature_type'],
                                 'Function': feat['function']
                                 })
        return fs_name, features

    def export(self, files, name, params):
        """Move the given files into a fresh package directory and upload it
        to SHOCK. Returns {'shock_id': ...}."""
        export_package_dir = os.path.join(self.scratch, name+str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        for file in files:
            shutil.move(file, os.path.join(export_package_dir,
                                           os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['featureset_ref']]
        })

        return {'shock_id': package_details['shock_id']}
def export_genome_as_genbank(self, ctx, params):
    """Export a Genome object as a packaged GenBank download.

    :param params: instance of type "ExportParams" (input and output
       structure functions for standard downloaders) -> structure:
       parameter "input_ref" of String
    :returns: instance of type "ExportOutput" -> structure: parameter
       "shock_id" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN export_genome_as_genbank
    # NOTE(review): "paramaters" is a typo in this log line and the params
    # themselves are never printed — confirm whether a pprint(params) was
    # intended here.
    print('export_genome_as_genbank -- paramaters = ')

    # validate parameters
    if 'input_ref' not in params:
        raise ValueError(
            'Cannot run export_genome_as_genbank- no "input_ref" field defined.'
        )

    # get WS metadata to get ws_name and obj_name
    ws = Workspace(url=self.cfg.workspaceURL)
    info = ws.get_object_info_new({
        'objects': [{
            'ref': params['input_ref']
        }],
        'includeMetadata': 0,
        'ignoreErrors': 0
    })[0]

    genome_to_genbank_params = {'genome_ref': params['input_ref']}

    # export to file (building from KBase Genome Object)
    result = self.genome_to_genbank(
        ctx, genome_to_genbank_params)[0]['genbank_file']

    # create the output directory (named after the object, info[1]) and
    # move the generated GenBank file there
    export_package_dir = os.path.join(self.cfg.sharedFolder, info[1])
    os.makedirs(export_package_dir)
    shutil.move(
        result['file_path'],
        os.path.join(export_package_dir,
                     os.path.basename(result['file_path'])))

    # export original uploaded GenBank file if it existed.
    # export_original_genbank returns None when no original file is cached.
    exporter = GenomeToGenbank(self.cfg)
    original_result_full = exporter.export_original_genbank(
        ctx, genome_to_genbank_params)
    if original_result_full is not None:
        original_result = original_result_full['genbank_file']
        shutil.move(
            original_result['file_path'],
            os.path.join(export_package_dir,
                         os.path.basename(original_result['file_path'])))

    # Make warning file about genes only.
    warning_filename = "README.txt"
    with open(os.path.join(export_package_dir, warning_filename),
              'w') as temp_file:
        temp_file.write(
            'This directory includes the KBase-derived GenBank file and also '
            + '(if you originally uploaded the genome from an annotated '
            + 'GenBank file) the original GenBank input.')

    # package it up and be done: upload the whole directory to SHOCK
    dfUtil = DataFileUtil(self.cfg.callbackURL)
    package_details = dfUtil.package_for_download({
        'file_path': export_package_dir,
        'ws_refs': [params['input_ref']]
    })

    output = {'shock_id': package_details['shock_id']}

    print('export complete -- result = ')
    pprint(output)
    #END export_genome_as_genbank

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method export_genome_as_genbank return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
class sample_uploader:
    '''
    Module Name:
    sample_uploader

    Module Description:
    A KBase module: sample_uploader
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.12"
    GIT_URL = "https://github.com/kbaseapps/sample_uploader"
    GIT_COMMIT_HASH = "5134b679279c84128b0ca5b684fa75dacf7dba59"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.workspace_url = config['workspace-url']
        self.scratch = config['scratch']
        # janky, but works for now
        self.sw_url = config.get('kbase-endpoint') + '/service_wizard'
        self.dfu = DataFileUtil(url=self.callback_url)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        #END_CONSTRUCTOR
        pass

    def import_samples(self, ctx, params):
        """
        :param params: instance of type "ImportSampleInputs" -> structure:
           parameter "sample_set_ref" of String, parameter "sample_file" of
           String, parameter "workspace_name" of String, parameter
           "workspace_id" of Long, parameter "file_format" of String,
           parameter "description" of String, parameter "set_name" of
           String, parameter "header_row_index" of Long, parameter
           "id_field" of String, parameter "output_format" of String,
           parameter "taxonomy_source" of String, parameter "num_otus" of
           Long, parameter "incl_seq" of Long, parameter "otu_prefix" of
           String, parameter "share_within_workspace" of Long
        :returns: instance of type "ImportSampleOutputs" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "sample_set" of type "SampleSet" -> structure:
           parameter "samples" of list of type "sample_info" -> structure:
           parameter "id" of type "sample_id", parameter "name" of String,
           parameter "description" of String, parameter "sample_set_ref" of
           String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN import_samples
        print("Beginning sample import with following parameters:")
        print(f"params -- {params}")
        sample_set = {"samples": []}
        # Either extend an existing SampleSet or start a new named one.
        if params.get('sample_set_ref'):
            ret = self.dfu.get_objects(
                {'object_refs': [params['sample_set_ref']]})['data'][0]
            sample_set = ret['data']
            set_name = ret['info'][1]
            save_ws_id = params['sample_set_ref'].split('/')[0]
        else:
            if not params.get('set_name'):
                raise ValueError(
                    "Sample set name required, when new SampleSet object is created."
                )
            set_name = params['set_name']
            save_ws_id = params.get('workspace_id')
        # header_row_index comes in 1-based; we subtract by 1 for zero indexing.
        if params.get('header_row_index'):
            header_row_index = int(params["header_row_index"]) - 1
        else:
            header_row_index = 0
            if params.get('file_format') == "SESAR":
                # SESAR templates carry an extra title row above the header
                header_row_index = 1

        username = ctx['user_id']

        if params.get('file_format') == 'ENIGMA':
            sample_set = import_samples_from_file(
                params, self.sw_url, self.workspace_url, username,
                ctx['token'], ENIGMA_mappings['column_mapping'],
                ENIGMA_mappings.get('groups', []),
                ENIGMA_mappings['date_columns'],
                ENIGMA_mappings.get('column_unit_regex', []), sample_set,
                header_row_index)
        elif params.get('file_format') == 'SESAR':
            sample_set = import_samples_from_file(
                params, self.sw_url, self.workspace_url, username,
                ctx['token'], SESAR_mappings['column_mapping'],
                SESAR_mappings.get('groups', []),
                SESAR_mappings['date_columns'],
                SESAR_mappings.get('column_unit_regex', []), sample_set,
                header_row_index)
        elif params.get('file_format') == 'KBASE':
            # KBASE format needs no column/date/unit mappings
            sample_set = import_samples_from_file(params, self.sw_url,
                                                  self.workspace_url,
                                                  username, ctx['token'], {},
                                                  [], [], [], sample_set,
                                                  header_row_index)
        else:
            # BUG FIX: the second string was missing its f prefix, so the
            # offending format name was never interpolated into the error.
            raise ValueError(
                "Only SESAR and ENIGMA formats are currently supported for importing samples. "
                f"File of format {params.get('file_format')} not supported.")

        obj_info = self.dfu.save_objects({
            'id': save_ws_id,
            'objects': [{
                "name": set_name,
                "type": "KBaseSets.SampleSet",
                "data": sample_set
            }]
        })[0]
        sample_set_ref = '/'.join(
            [str(obj_info[6]), str(obj_info[0]), str(obj_info[4])])
        sample_file_name = os.path.basename(
            params['sample_file']).split('.')[0] + '_OTU'

        # -- Format outputs below --
        # if output file format specified, add one to output
        if params.get('output_format') in ['csv', 'xls']:
            otu_path = sample_set_to_OTU_sheet(sample_set, sample_file_name,
                                               self.scratch, params)
            file_links = [{
                'path': otu_path,
                'name': os.path.basename(otu_path),
                'label': "OTU template file",
                'description':
                "file with each column containing the assigned sample_id and sample "
                "name of each saved sample. Intended for uploading OTU data."
            }]
        else:
            file_links = []

        if params.get('incl_input_in_output'):
            sample_file = params.get('sample_file')
            if not os.path.isfile(sample_file):
                # try prepending '/staging/' to file and check then
                if os.path.isfile(os.path.join('/staging', sample_file)):
                    sample_file = os.path.join('/staging', sample_file)
                else:
                    raise ValueError(
                        f"input file {sample_file} does not exist.")
            sample_file_copy = os.path.join(self.scratch,
                                            os.path.basename(sample_file))
            shutil.copy(sample_file, sample_file_copy)
            file_links.append({
                "path": sample_file_copy,
                "name": os.path.basename(sample_file_copy),
                "label": "Input Sample file",
                "description":
                "Input file provided to create the sample set."
            })

        # create report
        report_client = KBaseReport(self.callback_url)
        report_name = "SampleSet_import_report_" + str(uuid.uuid4())
        report_info = report_client.create_extended_report({
            'message':
            f"SampleSet object named \"{set_name}\" imported.",
            'objects_created': [{
                'ref': sample_set_ref
            }],
            'file_links': file_links,
            'report_object_name': report_name,
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_ref': report_info['ref'],
            'report_name': report_info['name'],
            'sample_set': sample_set,
            'sample_set_ref': sample_set_ref
        }
        #END import_samples

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method import_samples return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def generate_OTU_sheet(self, ctx, params):
        """
        :param params: instance of type "GenerateOTUSheetParams" (Generate a
           customized OTU worksheet using a SampleSet input to generate the
           appropriate columns.) -> structure: parameter "workspace_name" of
           String, parameter "workspace_id" of Long, parameter
           "sample_set_ref" of String, parameter "output_name" of String,
           parameter "output_format" of String, parameter "num_otus" of
           Long, parameter "taxonomy_source" of String, parameter "incl_seq"
           of Long, parameter "otu_prefix" of String
        :returns: instance of type "GenerateOTUSheetOutputs" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN generate_OTU_sheet
        # first we download sampleset
        sample_set_ref = params.get('sample_set_ref')
        ret = self.dfu.get_objects(
            {'object_refs': [sample_set_ref]})['data'][0]
        sample_set = ret['data']
        if params.get('output_name'):
            output_name = params.get('output_name')
        else:
            # if output_name not specified use name of sample_set as output + "_OTUs"
            output_name = ret['info'][1] + "_OTUs"
        otu_path = sample_set_to_OTU_sheet(sample_set, output_name,
                                           self.scratch, params)
        report_client = KBaseReport(self.callback_url)
        report_name = "Generate_OTU_sheet_report_" + str(uuid.uuid4())
        report_info = report_client.create_extended_report({
            'file_links': [{
                'path': otu_path,
                'name': os.path.basename(otu_path),
                'label': "CSV with headers for OTU",
                'description':
                "CSV file with each column containing the assigned sample_id and sample "
                "name of each saved sample. Intended for uploading OTU data."
            }],
            'report_object_name': report_name,
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_ref': report_info['ref'],
            'report_name': report_info['name'],
        }
        #END generate_OTU_sheet

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method generate_OTU_sheet return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def update_sample_set_acls(self, ctx, params):
        """
        :param params: instance of type "update_sample_set_acls_params" ->
           structure: parameter "workspace_name" of String, parameter
           "workspace_id" of Long, parameter "sample_set_ref" of String,
           parameter "new_users" of list of String, parameter "is_reader" of
           Long, parameter "is_writer" of Long, parameter "is_admin" of
           Long, parameter "share_within_workspace" of Long
        :returns: instance of type "update_sample_set_acls_output" ->
           structure: parameter "status" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN update_sample_set_acls
        # first get sample_set object
        sample_set_ref = params.get('sample_set_ref')
        ret = self.dfu.get_objects(
            {'object_refs': [sample_set_ref]})['data'][0]
        sample_set = ret['data']
        sample_url = get_sample_service_url(self.sw_url)

        acls = {'read': [], 'write': [], 'admin': []}

        if params.get('share_within_workspace'):
            # start from the permissions already granted on the workspace
            acls = get_workspace_user_perms(self.workspace_url,
                                            params.get('workspace_id'),
                                            ctx['token'], ctx['user_id'],
                                            acls)

        for new_user in params.get('new_users'):
            # highest requested privilege wins
            if params.get('is_admin'):
                acls['admin'].append(new_user)
            elif params.get('is_writer'):
                acls['write'].append(new_user)
            elif params.get('is_reader'):
                acls['read'].append(new_user)

        # BUG FIX: status was previously unbound (NameError) when the sample
        # set contained no samples; the reported status is that of the last
        # sample updated.
        status = None
        for sample in sample_set['samples']:
            sample_id = sample['id']
            status = update_acls(sample_url, sample_id, acls, ctx['token'])
        output = {"status": status}
        #END update_sample_set_acls

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method update_sample_set_acls return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def export_samples(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (export function for
           samples) -> structure: parameter "input_ref" of String, parameter
           "file_format" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_samples
        if not params.get('input_ref'):
            raise ValueError("variable input_ref required")
        sample_set_ref = params.get('input_ref')
        output_file_format = params.get('file_format', 'SESAR')

        ret = self.dfu.get_objects(
            {'object_refs': [sample_set_ref]})['data'][0]
        sample_set = ret['data']
        sample_set_name = ret['info'][1]
        sample_url = get_sample_service_url(self.sw_url)

        export_package_dir = os.path.join(self.scratch, "output")
        if not os.path.isdir(export_package_dir):
            os.mkdir(export_package_dir)
        output_file = os.path.join(
            export_package_dir, '_'.join(sample_set_name.split()) + ".csv")

        sample_set_to_output(sample_set, sample_url, ctx['token'],
                             output_file, output_file_format)

        # package it up
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['input_ref']]
        })

        output = {
            'shock_id': package_details['shock_id'],
            'result_dir': export_package_dir
        }
        #END export_samples

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_samples return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def link_reads(self, ctx, params):
        """
        Create links between samples and reads objects
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN link_reads
        ss = SampleService(self.sw_url, token=ctx['token'],
                           service_ver='beta')
        sample_set_ref = params['sample_set_ref']
        sample_set = SampleSet(self.dfu, sample_set_ref)
        links = [(d['sample_name'], d['reads_ref']) for d in params['links']]
        for sample_name, reads_ref in links:
            node_id, version, sample_id = sample_set.get_sample_info(
                sample_name)
            p = dict(
                upa=reads_ref,
                id=sample_id,
                version=version,
                node=node_id,
                update=1,
            )
            ss.create_data_link(p)

        report_client = KBaseReport(self.callback_url)
        report_info = report_client.create_extended_report({
            'workspace_name': params['workspace_name'],
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END link_reads

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method link_reads return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
class MDSUtils:
    """
    Runs non-metric multidimensional scaling (NMDS) on a KBaseMatrices object
    by generating an R script around vegan's metaMDS, executing it with
    Rscript, and saving/reporting the results as a workspace object.
    """

    # Location of the Rscript binary inside the SDK container.
    R_BIN = '/kb/deployment/bin'
    # Subdirectory of scratch where the R script runs and writes its outputs.
    MDS_OUT_DIR = 'mds_output'
    # Parameter-name constants shared with the app spec.
    PARAM_IN_WS = 'workspace_name'
    PARAM_IN_MATRIX = 'input_obj_ref'
    PARAM_OUT_MATRIX = 'mds_matrix_name'

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            # Directory already existing is fine; anything else is re-raised.
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_run_mds_params(self, params):
        """
        _validate_run_mds_params: validates params passed to run_mds method

        Raises ValueError if any of the three required keys is absent.
        """
        logging.info('start validating run_mds params')
        # check for required parameters
        for p in [self.PARAM_IN_MATRIX, self.PARAM_IN_WS, self.PARAM_OUT_MATRIX]:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

    def _build_rMDS_script(self, params):
        """
        _build_rMDS_script: build a sequence of R command calls according to params
        Note: To run the NMDS, we will use the function metaMDS from the vegan package.
        # The metaMDS function requires only a community-by-species matrix.

        Returns the path of the written R script, or '' when params carries
        no 'datafile' entry.
        """
        data_file_path = params.get('datafile', None)
        if not data_file_path:
            return ''
        # Copy the data file next to where the script will run, if not there yet.
        exists = os.path.isfile(os.path.join(self.output_dir,
                                             os.path.basename(data_file_path)))
        if not exists:
            shutil.copyfile(data_file_path,
                            os.path.join(self.output_dir,
                                         os.path.basename(data_file_path)))

        n_components = params.get('n_components', 2)
        max_iter = params.get('max_iter', 300)
        run_metric = True if params.get('metric', 0) else False
        dist_metric = params.get('distance_metric', 'bray')

        # Argument list passed verbatim into the metaMDS(...) call below.
        mds_cfg = 'distance="' + dist_metric + '",try=20,trymax=' + str(max_iter) + \
                  ',autotransform=TRUE,noshare=0.1,expand=TRUE,trace=1,' + \
                  'plot=FALSE,engine=c("monoMDS","isoMDS"),k=' + str(n_components)
        if run_metric:
            # NOTE(review): this appends 'metric=True' directly after 'k=<n>'
            # with no comma, and uses Python-style True instead of R's TRUE —
            # the generated metaMDS call is likely invalid R when metric is
            # requested. Confirm against an actual metric run before fixing.
            mds_cfg += 'metric=True'

        # Build the R script line by line; every '\n' below is a line of R.
        mds_scrpt = 'library(vegan)\n'
        mds_scrpt += 'library(jsonlite)\n'
        mds_scrpt += 'vg_data <- read.table("' + data_file_path + \
                     '",header=TRUE,row.names=1,sep="")\n'
        # remove the last (taxonomy) column
        # mds_scrpt += 'vg_data<-vg_data[,1:dim(vg_data)[2]-1]\n'
        # Function metaMDS returns an object of class metaMDS.
        mds_scrpt += 'vg_data.mds <- metaMDS(vg_data,' + mds_cfg + ')\n'
        mds_scrpt += 'vg_data.mds\n'

        # save the results in the memory
        # 1) store species ordination
        mds_scrpt += 'variableScores <- vg_data.mds$species\n'
        # 2) store site ordination
        mds_scrpt += 'sampleScores <- vg_data.mds$points\n'
        # 3) store other ordination results
        mds_scrpt += 'stress <- vg_data.mds$stress\n'
        mds_scrpt += 'dist_metric <- vg_data.mds$distance\n'
        mds_scrpt += 'dist_matrix <- vg_data.mds$diss\n'
        mds_scrpt += 'dist_call <- vg_data.mds$distcall\n'
        mds_scrpt += 'converged <- vg_data.mds$converged\n'
        mds_scrpt += 'dims <- vg_data.mds$ndim\n'
        mds_scrpt += 'tries <- vg_data.mds$tries\n'
        mds_scrpt += 'maxits <- vg_data.mds$maxits\n'
        mds_scrpt += 'func_call <- vg_data.mds$call\n'
        mds_scrpt += 'mds_data <- vg_data.mds$data\n'

        # save the results to the current dir
        # Write CSV in R
        mds_scrpt += 'write.csv(dist_matrix,file="dist_matrix.csv",row.names=TRUE,na="")\n'
        mds_scrpt += 'write.csv(variableScores,file="species_ordination.csv",' + \
                     'row.names=TRUE,na="")\n'
        mds_scrpt += 'write.csv(sampleScores,file="site_ordination.csv",row.names=TRUE,na="")\n'
        # Write JSON in R
        mds_scrpt += 'write_json(toJSON(dist_matrix),path="dist_matrix.json",pretty=TRUE,' + \
                     'auto_unbox=FALSE)\n'
        mds_scrpt += 'write_json(toJSON(variableScores),path="species_ordination.json",' + \
                     'pretty=TRUE,auto_unbox=FALSE)\n'
        mds_scrpt += 'write_json(toJSON(sampleScores),path="site_ordination.json",' + \
                     'pretty=TRUE,auto_unbox=FALSE)\n'
        mds_scrpt += 'item_name=c("stress","distance_metric","dist_call","converged",' + \
                     '"dimesions","trials","maxits")\n'
        mds_scrpt += 'item_value=c(stress,dist_metric,dist_call,converged,dims,tries,maxits)\n'
        mds_scrpt += 'df <- data.frame(item_name,item_value,stringsAsFactors=FALSE)\n'
        mds_scrpt += 'write_json(toJSON(df),path="others.json",pretty=TRUE,auto_unbox=FALSE)\n'

        # save mds plots
        # (Disabled static-plot generation kept for reference.)
        '''
        mds_scrpt += 'bmp(file="saving_mds_plot.bmp",width=580,height=580,units="px",' + \
                     'res=100, pointsize=12)\n'
        mds_scrpt += 'plot(vg_data.mds,type="n",display="sites")\n'
        mds_scrpt += 'points(vg_data.mds)\n'
        mds_scrpt += 'dev.off()\n'
        mds_scrpt += 'pdf(file="saving_mds_plot.pdf",width=6,height=6)\n'
        mds_scrpt += 'plot(vg_data.mds,type="n",display="sites")\n'
        mds_scrpt += 'points(vg_data.mds)\n'
        mds_scrpt += 'dev.off()\n'
        mds_scrpt += 'pdf(file="mds_plot_withlabel.pdf",width=6,height=6)\n'
        mds_scrpt += 'plot(vg_data.mds,type="n",display="sites")\n'
        mds_scrpt += 'ordilabel(vg_data.mds,dis="sites",cex=1.2,font=3,fill="hotpink",col="blue")\n'
        mds_scrpt += 'dev.off()\n'
        mds_scrpt += 'pdf(file="mds_plot_withcolor.pdf",width=6,height=6)\n'
        mds_scrpt += 'fig <- ordiplot(vg_data.mds,type="none")\n'
        mds_scrpt += 'points(fig,"sites",pch=21,col="red",bg="yellow")\n'
        mds_scrpt += 'points(fig,"species",pch=21,col="green",bg="blue")\n'
        # mds_scrpt += 'text(fig, "species", col="blue", cex=0.9)\n'
        mds_scrpt += 'dev.off()\n'
        '''

        # If there is user input plotting script:
        plt_scrpt = params.get('plot_script', '').lower()
        if plt_scrpt and re.match("^plot\(\s*[a-zA-Z]+.*\)$", plt_scrpt):
            arr_plt = plt_scrpt.split(',')
            arr_plt[0] = 'plot(vg_data.mds'  # make sure to pass the correct data
            plt_scrpt = (',').join(arr_plt)
            if len(arr_plt) == 1:
                plt_scrpt += ')'
            plt_type = params.get('plot_type', 'pdf').lower()
            if not plt_type:
                plt_type = 'pdf'
            plt_name = params.get('plot_name', 'usr_plt_name').lower()
            if not plt_name:
                plt_name = 'usr_plt_name'
            plt_name += '.' + plt_type
            # Map user-facing extensions to the R device function names.
            if plt_type == 'jpg':
                plt_type = 'jpeg'
            if plt_type == 'ps':
                plt_type = 'postscript'
            # Open the graphics device named by plt_type.
            # NOTE(review): the tiff and jpeg/bmp/png branches below open a
            # SECOND device with sizing options on top of this one, while only
            # one dev.off() follows — confirm against upstream history whether
            # these appends were meant to be mutually exclusive.
            mds_scrpt += plt_type
            mds_scrpt += '(file="' + plt_name + '")\n'
            if plt_type == 'tiff':
                mds_scrpt += plt_type
                mds_scrpt += '(file="' + plt_name + '",width=4,height=4,units="in",' + \
                             'compression="lzw",res=300)\n'
            if plt_type in ['jpg', 'jpeg', 'bmp', 'png']:
                mds_scrpt += plt_type
                mds_scrpt += '(file="' + plt_name + '",width=580,height=580,units="px",' + \
                             'res=100, pointsize=12)\n'
            mds_scrpt += plt_scrpt + '\n'
            mds_scrpt += 'dev.off()\n'

        logging.info('R script: {}'.format(mds_scrpt))

        # Write the assembled script into the MDS output directory.
        mds_rscript = 'mds_script.R'
        rscrpt_file_path = os.path.join(self.output_dir, mds_rscript)

        with open(rscrpt_file_path, 'w') as r_file:
            r_file.write(mds_scrpt)
        return rscrpt_file_path

    def _execute_r_script(self, rfile_name):
        """
        _execute_r_script: Calling the Rscript executable to run the R script in rfile_name

        Returns the process exit code; -99 signals a subprocess-level failure.
        """
        logging.info('Calling R......')

        # Run in the script's own directory so its relative output paths
        # (dist_matrix.csv etc.) land in self.output_dir.
        result_dir = os.path.dirname(rfile_name)
        if not result_dir:
            result_dir = self.working_dir

        rcmd = [os.path.join(self.R_BIN, 'Rscript')]
        rcmd.append(rfile_name)

        logging.info('Running metaMDS script in current working directory: {}'.format(result_dir))

        exitCode = 0
        try:
            complete_proc = subprocess.run(rcmd, cwd=result_dir, stdin=subprocess.PIPE,
                                           stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                                           close_fds=True)
            exitCode = complete_proc.returncode

            if (exitCode == 0):
                logging.info('\n{}'.format(complete_proc.stdout))
                logging.info('\n{} was executed successfully, exit code was: {}'.format(
                    ' '.join(rcmd), str(exitCode)))
                logging.info("Finished calling R.")
            else:
                # NOTE(review): this format string has one '{}' but two
                # arguments, so the exit code is never printed; and since
                # stderr is redirected to stdout above, complete_proc.stderr
                # is None here — the useful output is in .stdout.
                logging.info('Error running command: {} Exit Code: '.format(
                    ' '.join(rcmd), str(exitCode)))
                logging.info('\n{}'.format(complete_proc.stderr))
        except subprocess.CalledProcessError as sub_e:
            # subprocess.run without check=True does not raise this; kept as a
            # defensive sentinel that run_metaMDS tests for.
            exitCode = -99
            logging.info('Caught subprocess.CalledProcessError {}'.format(sub_e))

        return exitCode

    def _df_to_list(self, df):
        """
        _df_to_list: convert Dataframe to FloatMatrix2D matrix data

        Mutates df in place (string-casts the index/columns and fills NaN
        with 0) before flattening it into the FloatMatrix2D dict shape.
        """
        df.index = df.index.astype('str')
        df.columns = df.columns.astype('str')
        df.fillna(0, inplace=True)
        matrix_data = {'row_ids': df.index.tolist(),
                       'col_ids': df.columns.tolist(),
                       'values': df.values.tolist()}

        return matrix_data

    def _mds_df_to_excel(self, mds_df, distance_df, result_dir, mds_matrix_ref):
        """
        write MDS matrix df into excel

        The workbook is named after the MDS matrix workspace object.
        """
        logging.info('writting mds data frame to excel file')
        mds_matrix_obj = self.dfu.get_objects({'object_refs': [mds_matrix_ref]})['data'][0]
        mds_matrix_info = mds_matrix_obj['info']
        mds_matrix_name = mds_matrix_info[1]

        file_path = os.path.join(result_dir, mds_matrix_name + ".xlsx")

        writer = pd.ExcelWriter(file_path)

        mds_df.to_excel(writer, "mds_matrix", index=True)
        # NOTE(review): `if distance_df:` on a pandas DataFrame raises
        # ValueError (ambiguous truth value); the PCA counterpart uses
        # `is not None` — confirm and align.
        if distance_df:
            distance_df.to_excel(writer, "mds_distance_matrix", index=True)
        writer.close()

    def _Matrix2D_to_df(self, Matrix2D):
        """
        _Matrix2D_to_df: transform a FloatMatrix2D to data frame
        """
        index = Matrix2D.get('row_ids')
        columns = Matrix2D.get('col_ids')
        values = Matrix2D.get('values')

        df = pd.DataFrame(values, index=index, columns=columns)

        return df

    def _mds_to_df(self, mds_matrix_ref):
        """
        retrieve MDS matrix ws object to mds_df

        Returns (mds_df, distance_df); distance_df is None when the object
        has no distance matrix. When the original matrix is referenced, its
        attribute-mapping instance data is merged onto mds_df by row id.
        """
        logging.info('converting mds matrix to data frame')
        mds_data = self.dfu.get_objects({'object_refs': [mds_matrix_ref]})['data'][0]['data']

        rotation_matrix_data = mds_data.get('rotation_matrix')
        distance_matrix_data = mds_data.get('distance_matrix')
        original_matrix_ref = mds_data.get('original_matrix_ref')
        # NOTE(review): 'dimension' here is read from 'n_components' and then
        # used to build '{}_attributemapping_ref' below, which expects
        # 'row'/'col' — presumably only works by accident or not at all;
        # compare with PCAUtil._pca_to_df which reads 'dimension'.
        dimension = mds_data.get('mds_parameters').get('n_components')

        mds_df = self._Matrix2D_to_df(rotation_matrix_data)

        distance_df = None
        if distance_matrix_data:
            distance_df = self._Matrix2D_to_df(distance_matrix_data)

        if original_matrix_ref:
            logging.info('appending instance group information to mds data frame')
            obj_data = self.dfu.get_objects(
                {'object_refs': [original_matrix_ref]})['data'][0]['data']
            attributemapping_ref = obj_data.get('{}_attributemapping_ref'.format(dimension))
            am_data = self.dfu.get_objects(
                {'object_refs': [attributemapping_ref]})['data'][0]['data']
            attributes = am_data.get('attributes')
            instances = am_data.get('instances')
            am_df = pd.DataFrame(data=list(instances.values()),
                                 columns=list(map(lambda x: x.get('attribute'), attributes)),
                                 index=instances.keys())
            mds_df = mds_df.merge(am_df, left_index=True, right_index=True, how='left',
                                  validate='one_to_one')

        return mds_df, distance_df

    def _save_mds_matrix(self, workspace_name, input_obj_ref, mds_matrix_name, distance_df,
                         mds_params_df, site_ordin_df, species_ordin_df):
        """
        Save the MDS results as a workspace object and return its UPA
        ("wsid/objid/version").
        """
        logging.info('Saving MDSMatrix...')

        # workspace_name may already be a numeric workspace id.
        if not isinstance(workspace_name, int):
            ws_name_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            ws_name_id = workspace_name

        mds_data = {}

        mds_data.update({'distance_matrix': self._df_to_list(distance_df)})
        mds_data.update({'site_ordination': self._df_to_list(site_ordin_df)})
        mds_data.update({'species_ordination': self._df_to_list(species_ordin_df)})
        mds_data.update({'mds_parameters': self._df_to_list(mds_params_df)})
        mds_data.update({'original_matrix_ref': input_obj_ref})
        # NOTE(review): rotation_matrix is populated from distance_df (same
        # data as distance_matrix above) — looks like a copy/paste bug;
        # confirm intended payload before changing the saved object shape.
        mds_data.update({'rotation_matrix': self._df_to_list(distance_df)})

        # NOTE(review): an MDS result saved under the PCAMatrix type —
        # possibly deliberate type reuse; verify against the workspace
        # type registry before renaming.
        obj_type = 'KBaseExperiments.PCAMatrix'
        info = self.dfu.save_objects({
            "id": ws_name_id,
            "objects": [{
                "type": obj_type,
                "data": mds_data,
                "name": mds_matrix_name
            }]
        })[0]

        return "%s/%s/%s" % (info[6], info[0], info[4])

    def _zip_folder(self, folder_path, output_path):
        """
        _zip_folder: Zip the contents of an entire folder (with that folder included in the
        archive). Empty subfolders could be included in the archive as well if the 'Included
        all subfolders, including empty ones' portion. portion is used.
        """
        with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as ziph:
            for root, folders, files in os.walk(folder_path):
                # Include all subfolders, including empty ones.
                for folder_name in folders:
                    absolute_fpath = os.path.join(root, folder_name)
                    relative_fpath = os.path.join(os.path.basename(root), folder_name)
                    logging.info("Adding folder {} to archive.".format(absolute_fpath))
                    ziph.write(absolute_fpath, relative_fpath)
                for f in files:
                    absolute_path = os.path.join(root, f)
                    relative_path = os.path.join(os.path.basename(root), f)
                    logging.info("Adding file {} to archive.".format(absolute_path))
                    ziph.write(absolute_path, relative_path)

        logging.info("{} created successfully.".format(output_path))

    def _generate_output_file_list(self, out_dir):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """
        logging.info('Start packing result files from MDS...')

        output_files = list()

        # Zip into a fresh uuid-named directory to avoid collisions.
        output_dir = os.path.join(self.working_dir, str(uuid.uuid4()))
        self._mkdir_p(output_dir)

        mds_output = os.path.join(output_dir, 'metaMDS_output.zip')
        self._zip_folder(out_dir, mds_output)

        output_files.append({'path': mds_output,
                             'name': os.path.basename(mds_output),
                             'label': os.path.basename(mds_output),
                             'description': 'Output file(s) generated by metaMDS'})
        return output_files

    def _generate_mds_html_report(self, mds_outdir, n_components):
        """
        Collect the HTML plots under mds_outdir into a templated report,
        upload it to Shock, and return the html_links entry for KBaseReport.
        """
        logging.info('Start generating html report for MDS results...')
        html_report = list()

        mds_plots = list()
        for root, folders, files in os.walk(mds_outdir):
            # Find the image files by their extensions.
            for f in files:
                # Only .html plots are picked up today; the commented
                # alternation documents formats once (or to be) supported.
                if re.match('^[a-zA-Z]+.*.(html)$', f):  # jpeg|jpg|bmp|png|tiff|pdf|ps|
                    absolute_path = os.path.join(root, f)
                    logging.info("Adding file {} to plot archive.".format(absolute_path))
                    mds_plots.append(absolute_path)

        result_dir = os.path.join(self.working_dir, str(uuid.uuid4()))
        self._mkdir_p(result_dir)
        result_file_path = os.path.join(result_dir, 'mds_result.html')

        # Each plot is embedded as an iframe in the report template.
        visualization_content = ''

        for mds_plot in mds_plots:
            shutil.copy2(mds_plot,
                         os.path.join(result_dir, os.path.basename(mds_plot)))
            visualization_content += '<iframe height="900px" width="100%" '
            visualization_content += 'src="{}" '.format(os.path.basename(mds_plot))
            visualization_content += 'style="border:none;"></iframe>\n<p></p>\n'

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'templates',
                                   'mds_template.html'), 'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<p>Visualization_Content</p>',
                                                          visualization_content)
                report_template = report_template.replace('n_components',
                                                          '{} Components'.format(n_components))
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({'file_path': result_dir,
                                                  'pack': 'zip'})['shock_id']

        html_report.append({'shock_id': report_shock_id,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for MDS Matrix App'
                            })
        return html_report

    def _generate_mds_report(self, mds_ref, output_dir, workspace_name, n_components):
        """Create the KBaseReport for a finished MDS run."""
        logging.info('Creating MDS report...')

        output_files = self._generate_output_file_list(output_dir)
        output_html_files = self._generate_mds_html_report(output_dir, n_components)

        objects_created = list()
        objects_created.append({'ref': mds_ref, 'description': 'MDS Matrix'})

        report_params = {'message': '',
                         'workspace_name': workspace_name,
                         'file_links': output_files,
                         'objects_created': objects_created,
                         'html_links': output_html_files,
                         'direct_html_link_index': 0,
                         'html_window_height': 666,
                         'report_object_name': 'kb_mds_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _get_metadata_from_obj(self):
        """
        Get metadata from obj and return simplified pd.DataFrame
        :return: DataFrame indexed by instance id with the color/size columns.
        """
        logging.info('Retrieving metadata..')

        # KBase obj data
        mdf = self.dfu.get_objects({'object_refs': [self.attribute_mapping_obj_ref]})
        attr_l = mdf['data'][0]['data']['attributes']

        # Get index location in mdf(metadata) of chosen color and scale
        color_index = None
        size_index = None
        for i in range(len(attr_l)):
            if attr_l[i]['attribute'] == self.color_marker_by:
                color_index = i
            if attr_l[i]['attribute'] == self.scale_size_by:
                size_index = i

        # Make list of color and scale data
        color_data = []
        size_data = []
        mdf_indx = mdf['data'][0]['data']['instances'].keys()
        for sample in mdf_indx:
            if color_index is not None:
                color_data.append(mdf['data'][0]['data']['instances'][sample][color_index])
            if size_index is not None:
                try:
                    size_data.append(float(
                        mdf['data'][0]['data']['instances'][sample][size_index]))
                # NOTE(review): bare except also swallows KeyboardInterrupt
                # etc.; ValueError/TypeError is what a failed float() raises.
                except:
                    logging.info('ERROR: scaling is not int or float. scaling has been dropped')
                    self.scale_size_by = None
                    size_index = None

        # mdf is now new pd.DataFrame that only includes needed data
        mdf = pd.DataFrame(index=mdf_indx,
                           columns=[self.color_marker_by, self.scale_size_by])
        if color_index is not None:
            mdf[self.color_marker_by] = color_data
        if size_index is not None:
            mdf[self.scale_size_by] = size_data

        return mdf

    def _get_metadata_from_file(self):
        """
        Get metadata from file and return simplified pd.DataFrame
        :return: DataFrame restricted to the color/size columns.
        """
        logging.info('Retrieving metadata..')

        # Metadata files are tab-separated with the sample id as first column.
        mdf = pd.read_csv(self.metadata_file, sep='\t', index_col=0)
        logging.info('MDF: {}'.format(mdf))
        mdf = mdf[[self.color_marker_by, self.scale_size_by]]

        return mdf

    def _plot_with_grouping(self):
        """
        Build an interactive plotly scatter of the site ordination, colored
        and/or sized by the chosen metadata attributes, and return the path of
        the written HTML file.
        """
        logging.info('Plotting with grouping: "{}", and "{}"'.format(
            self.color_marker_by, self.scale_size_by))

        # Both can not be the same right now.. mdf is now new pd would lead to problems
        if self.color_marker_by == self.scale_size_by:
            logging.info('ERROR: both color and scale are same field. scale set to None')
            self.scale_size_by = None

        if self.attribute_mapping_obj_ref is not None:
            mdf = self._get_metadata_from_obj()
        elif self.metadata_file is not None:
            mdf = self._get_metadata_from_file()
        else:
            # NOTE(review): missing `raise` — this constructs the exception
            # and discards it, then `mdf` is unbound below. Confirm intent.
            FileNotFoundError('No metadata file was specified')

        # Get site data from previously saved file
        site_ordin_df = pd.read_csv(os.path.join(self.output_dir, "site_ordination.csv"),
                                    index_col=0)
        logging.info('SITE_ORDIN_DF:\n {}'.format(site_ordin_df))

        # Check if metadata file is valid for this method
        for sample in site_ordin_df.index:
            try:
                mdf.loc[sample]
            except KeyError:
                raise KeyError('One or more samples in site_ordination is not found in chosen metadata obj. If you ran '
                               'this using files, you might need to transpose the data in your files so samples are '
                               'rows and OTU are columns.')

        # Fill site_ordin_df with metadata from mdf
        site_ordin_df['color'] = None
        site_ordin_df['size'] = None
        for ID in site_ordin_df.index:
            site_ordin_df['color'].loc[ID] = mdf[self.color_marker_by].loc[ID]
            site_ordin_df['size'].loc[ID] = mdf[self.scale_size_by].loc[ID]
        site_ordin_df.fillna('na', inplace=True)

        # Plot
        # NOTE(review): if both grouping attributes are None, `fig` is never
        # assigned and plot(fig, ...) raises — the caller only invokes this
        # method when at least one is set, so this is currently unreachable.
        if self.color_marker_by is not None and self.scale_size_by is not None and all(
                isinstance(x, (int, float)) for x in list(site_ordin_df['size'])):
            fig = px.scatter(site_ordin_df, x="MDS1", y="MDS2", color="color", size="size",
                             hover_name=site_ordin_df.index)
        elif self.color_marker_by is not None:
            fig = px.scatter(site_ordin_df, x="MDS1", y="MDS2", color="color",
                             hover_name=site_ordin_df.index)
        elif self.scale_size_by is not None:
            fig = px.scatter(site_ordin_df, x="MDS1", y="MDS2", size="size",
                             hover_name=site_ordin_df.index)

        # Save plotly_fig.html and return path
        plotly_html_file_path = os.path.join(self.output_dir, "plotly_fig.html")
        plot(fig, filename=plotly_html_file_path)
        return plotly_html_file_path

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']
        # NOTE(review): self.dfu is assigned twice; the second (default
        # service_ver) assignment wins, making service_ver='release' moot.
        self.dfu = DataFileUtil(self.callback_url, service_ver='release')
        self.working_dir = self.scratch
        self.data_util = DataUtil(config)
        self.dfu = DataFileUtil(self.callback_url)
        self.output_dir = os.path.join(self.working_dir, self.MDS_OUT_DIR)
        self._mkdir_p(self.output_dir)
        # If input is from files, then pd.DataFrame needs to be transposed in run_metaMDS_with_file method
        self.need_to_transpose = True

    def run_metaMDS(self, params):
        """
        run_metaMDS: perform metaMDS analysis on matrix
        :param input_obj_ref: object reference of a matrix
        :param workspace_name: the name of the workspace
        :param mds_matrix_name: name of MDS (KBaseExperiments.MDSMatrix) object
        :param n_components - dimentionality of the reduced space (default 2)
        :param max_iter: maximum iterations allowed
        :param metric: indication of running metric or non-metric MDS
        :param distance_metric: distance the ordination will be performed on, default to "bray"
        """
        logging.info('--->\nrunning metaMDS with input\n' +
                     'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_mds_params(params)

        input_obj_ref = params.get(self.PARAM_IN_MATRIX)
        workspace_name = params.get(self.PARAM_IN_WS)
        mds_matrix_name = params.get(self.PARAM_OUT_MATRIX)
        n_components = int(params.get('n_components', 2))

        res = self.dfu.get_objects({'object_refs': [input_obj_ref]})['data'][0]
        obj_data = res['data']
        obj_name = res['info'][1]
        obj_type = res['info'][2]

        # NOTE(review): the bound is taken from col_ids while the matrix is
        # transposed below so columns become samples — confirm the comparison
        # matches the post-transpose sample count.
        max_size = len(obj_data['data']['col_ids'])
        if n_components > max_size:
            raise ValueError('Number of components should be less than number of samples')

        exitCode = -99
        if "KBaseMatrices" in obj_type:
            # create the input file from obj_data
            matrix_tab = obj_data['data']['values']
            row_ids = obj_data['data']['row_ids']
            col_ids = obj_data['data']['col_ids']
            matrix_df = pd.DataFrame(matrix_tab, index=row_ids, columns=col_ids)
            # Transpose DataFrame
            matrix_df = matrix_df.T
            self.need_to_transpose = False

            # Despite the .csv extension the file is written tab-separated,
            # matching the read.table(sep="") call in the R script.
            matrix_data_file = os.path.join(self.output_dir, obj_name + '.csv')
            with open(matrix_data_file, 'w') as m_file:
                matrix_df.to_csv(m_file, sep='\t')

            params['datafile'] = matrix_data_file
            exitCode = self.run_metaMDS_with_file(params)
        else:
            err_msg = 'Ooops! [{}] is not supported.\n'.format(obj_type)
            err_msg += 'Please provide a KBaseMatrices object'
            raise ValueError(err_msg)

        if exitCode == -99:
            raise ValueError('Caught subprocess.CalledProcessError while calling R.')

        # saving the mds_matrix object
        # read metaMDS results from files into data frames
        dist_matrix_df = pd.read_csv(os.path.join(self.output_dir, "dist_matrix.csv"))
        mds_params_df = pd.read_json(os.path.join(self.output_dir, "others.json"))
        site_ordin_df = pd.read_csv(os.path.join(self.output_dir, "site_ordination.csv"))
        species_ordin_df = pd.read_csv(os.path.join(self.output_dir, "species_ordination.csv"))

        mds_ref = self._save_mds_matrix(workspace_name, input_obj_ref, mds_matrix_name,
                                        dist_matrix_df, mds_params_df, site_ordin_df,
                                        species_ordin_df)
        returnVal = {'mds_ref': mds_ref}

        # generating report
        report_output = self._generate_mds_report(mds_ref, self.output_dir,
                                                  workspace_name, n_components)
        returnVal.update(report_output)

        return returnVal

    def run_metaMDS_with_file(self, params):
        """
        run_metaMDS_with_file: perform metaMDS analysis on matrix
        :param datafile: a file that contains the matrix data
        :param workspace_name: the name of the workspace
        :param mds_matrix_name: name of MDS (KBaseExperiments.MDSMatrix) object
        :param n_components - dimentionality of the reduced space (default 2)
        :param max_iter: maximum iterations allowed
        :param metric: indication of running metric or non-metric MDS
        :param distance_metric: distance the ordination will be performed on, default to "bray"
        """
        # Variables for Grouping Features
        self.attribute_mapping_obj_ref = params.get('attribute_mapping_obj_ref')
        self.metadata_file = params.get('metadata_file')
        # Both grouping params arrive as {'attribute_color': [name]} /
        # {'attribute_size': [name]} dicts; unwrap to the bare attribute name.
        self.color_marker_by = params.get('color_marker_by')
        if self.color_marker_by is not None:
            try:
                self.color_marker_by = self.color_marker_by['attribute_color'][0]
            except KeyError:
                raise KeyError('Expected dictionary with key "attribute_color" containing a list of one element. '
                               'Instead found: {}'.format(self.color_marker_by))
        self.scale_size_by = params.get('scale_size_by')
        if self.scale_size_by is not None:
            try:
                self.scale_size_by = self.scale_size_by['attribute_size'][0]
            except KeyError:
                raise KeyError('Expected dictionary with key "attribute_size" containing a list of one element. '
                               'Instead found: {}'.format(self.scale_size_by))

        logging.info('--->\nrunning metaMDS with input \n' +
                     'params:\n{}'.format(json.dumps(params, indent=1)))

        rscrpt_file = self._build_rMDS_script(params)
        logging.info('--->\nR script file has been written to {}'.format(rscrpt_file))

        result = self._execute_r_script(rscrpt_file)

        # Make and save plotly fig
        if self.color_marker_by is not None or self.scale_size_by is not None:
            self._plot_with_grouping()

        return result

    def export_mds_matrix_excel(self, params):
        """
        export MDSMatrix as Excel

        Returns {'shock_id': ...} for the packaged download.
        """
        logging.info('start exporting mds matrix')
        mds_matrix_ref = params.get('input_ref')

        mds_df, components_df = self._mds_to_df(mds_matrix_ref)

        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)

        self._mds_df_to_excel(mds_df, components_df, result_dir, mds_matrix_ref)

        package_details = self.dfu.package_for_download({
            'file_path': result_dir,
            'ws_refs': [mds_matrix_ref]
        })

        return {'shock_id': package_details['shock_id']}
class PCAUtil: def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def _validate_run_pca_params(self, params): """ _validate_run_pca_params: validates params passed to run_pca method """ logging.info('start validating run_pca params') # check for required parameters for p in ['input_obj_ref', 'workspace_name', 'pca_matrix_name']: if p not in params: raise ValueError('"{}" parameter is required, but missing'.format(p)) def _df_to_list(self, df): """ _df_to_list: convert Dataframe to FloatMatrix2D matrix data """ df.index = df.index.astype('str') df.columns = df.columns.astype('str') df.fillna(0, inplace=True) matrix_data = {'row_ids': df.index.tolist(), 'col_ids': df.columns.tolist(), 'values': df.values.tolist()} return matrix_data def _pca_df_to_excel(self, pca_df, components_df, result_dir, pca_matrix_ref): """ write PCA matrix df into excel """ logging.info('writting pca data frame to excel file') pca_matrix_obj = self.dfu.get_objects({'object_refs': [pca_matrix_ref]})['data'][0] pca_matrix_info = pca_matrix_obj['info'] pca_matrix_name = pca_matrix_info[1] file_path = os.path.join(result_dir, pca_matrix_name + ".xlsx") writer = pd.ExcelWriter(file_path) pca_df.to_excel(writer, "principal_component_matrix", index=True) if components_df is not None: components_df.to_excel(writer, "component_variance_matrix", index=True) writer.close() def _Matrix2D_to_df(self, Matrix2D): """ _Matrix2D_to_df: transform a FloatMatrix2D to data frame """ index = Matrix2D.get('row_ids') columns = Matrix2D.get('col_ids') values = Matrix2D.get('values') df = pd.DataFrame(values, index=index, columns=columns) return df def _pca_to_df(self, pca_matrix_ref): """ retrieve pca matrix ws object to pca_df """ logging.info('converting pca matrix to data frame') pca_data = self.dfu.get_objects({'object_refs': 
[pca_matrix_ref]})['data'][0]['data'] rotation_matrix_data = pca_data.get('rotation_matrix') # exit(rotation_matrix_data) {'col_ids': ['principal_component_1', 'principal_component_2'], 'row_ids': ['WRI_RS00010_CDS_1', 'WRI_RS00015_CDS_1', 'WRI_RS00025_CDS_1'], 'values': [[-0.45, 1.06], [-0.69, -0.92], [1.14, -0.13]]} components_matrix_data = pca_data.get('components_matrix') explained_variance = pca_data.get('explained_variance') #exit(explained_variance) [0.628769688409428, 0.371230311590572] explained_variance_ratio = pca_data.get('explained_variance_ratio') #exit(explained_variance_ratio) [0.628769688409428, 0.371230311590572] singular_values = pca_data.get('singular_values') #exit(singular_values) dimension = pca_data.get('pca_parameters').get('dimension') original_matrix_ref = pca_data.get('original_matrix_ref') pca_df = self._Matrix2D_to_df(rotation_matrix_data) components_df = None if components_matrix_data: components_df = self._Matrix2D_to_df(components_matrix_data) components_df.loc['explained_variance'] = explained_variance components_df.loc['explained_variance_ratio'] = explained_variance_ratio components_df.loc['singular_values'] = singular_values if original_matrix_ref: logging.info('appending instance group information to pca data frame') obj_data = self.dfu.get_objects({'object_refs': [original_matrix_ref]})['data'][0]['data'] attributemapping_ref = obj_data.get('{}_attributemapping_ref'.format(dimension)) am_data = self.dfu.get_objects({'object_refs': [attributemapping_ref]})['data'][0]['data'] attributes = am_data.get('attributes') instances = am_data.get('instances') am_df = pd.DataFrame(data=list(instances.values()), columns=list(map(lambda x: x.get('attribute'), attributes)), index=instances.keys()) pca_df = pca_df.merge(am_df, left_index=True, right_index=True, how='left', validate='one_to_one') return pca_df, components_df def _save_pca_matrix(self, workspace_name, input_obj_ref, pca_matrix_name, rotation_matrix_df, components_df, 
explained_variance, explained_variance_ratio, singular_values, n_components, dimension): logging.info('saving PCAMatrix') if not isinstance(workspace_name, int): ws_name_id = self.dfu.ws_name_to_id(workspace_name) else: ws_name_id = workspace_name pca_data = {} pca_data.update({'rotation_matrix': self._df_to_list(rotation_matrix_df)}) pca_data.update({'components_matrix': self._df_to_list(components_df)}) pca_data.update({'explained_variance': explained_variance}) pca_data.update({'explained_variance_ratio': explained_variance_ratio}) pca_data.update({'singular_values': singular_values}) pca_data.update({'pca_parameters': {'n_components': str(n_components), 'dimension': str(dimension)}}) pca_data.update({'original_matrix_ref': input_obj_ref}) obj_type = 'KBaseExperiments.PCAMatrix' info = self.dfu.save_objects({ "id": ws_name_id, "objects": [{ "type": obj_type, "data": pca_data, "name": pca_matrix_name }] })[0] return "%s/%s/%s" % (info[6], info[0], info[4]) def _pca_for_matrix(self, input_obj_ref, n_components, dimension): """ _pca_for_matrix: perform PCA analysis for matrix object """ data_matrix = self.data_util.fetch_data({'obj_ref': input_obj_ref}).get('data_matrix') data_df = pd.read_json(data_matrix) data_df.fillna(0, inplace=True) if dimension == 'col': data_df = data_df.T elif dimension != 'row': err_msg = 'Input dimension [{}] is not available.\n'.format(dimension) err_msg += 'Please choose either "col" or "row"' raise ValueError(err_msg) if n_components > min(data_df.index.size, data_df.columns.size): raise ValueError('Number of components should be less than min(n_samples, n_features)') # normalize sample # logging.info("Standardizing the matrix") # s_values = StandardScaler().fit_transform(data_df.values) # skip normalizing sample s_values = data_df.values # Projection to ND pca = PCA(n_components=n_components, whiten=True) principalComponents = pca.fit_transform(s_values) explained_variance = list(pca.explained_variance_) explained_variance_ratio = 
list(pca.explained_variance_ratio_) components = pca.components_ singular_values = list(pca.singular_values_) col = list() for i in range(n_components): col.append('principal_component_{}'.format(i+1)) rotation_matrix_df = pd.DataFrame(data=principalComponents, columns=col, index=data_df.index) components_df = pd.DataFrame(data=components, columns=data_df.columns, index=col).transpose() rotation_matrix_df.fillna(0, inplace=True) return (rotation_matrix_df, components_df, explained_variance, explained_variance_ratio, singular_values) def _generate_pca_html_report(self, pca_plots, n_components): logging.info('start generating html report') html_report = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file_path = os.path.join(output_directory, 'pca_report.html') visualization_content = '' for pca_plot in pca_plots: shutil.copy2(pca_plot, os.path.join(output_directory, os.path.basename(pca_plot))) visualization_content += '<iframe height="900px" width="100%" ' visualization_content += 'src="{}" '.format(os.path.basename(pca_plot)) visualization_content += 'style="border:none;"></iframe>\n<p></p>\n' with open(result_file_path, 'w') as result_file: with open(os.path.join(os.path.dirname(__file__), 'templates', 'pca_template.html'), 'r') as report_template_file: report_template = report_template_file.read() report_template = report_template.replace('<p>Visualization_Content</p>', visualization_content) report_template = report_template.replace('n_components', '{} Components'.format(n_components)) result_file.write(report_template) report_shock_id = self.dfu.file_to_shock({'file_path': output_directory, 'pack': 'zip'})['shock_id'] html_report.append({'shock_id': report_shock_id, 'name': os.path.basename(result_file_path), 'label': os.path.basename(result_file_path), 'description': 'HTML summary report for ExpressionMatrix Cluster App' }) return html_report def _generate_pca_report(self, pca_ref, pca_plots, 
                             workspace_name, n_components):
        # (signature continues from the previous line)
        # Creates an extended KBaseReport with the HTML plot report attached.
        logging.info('creating report')

        output_html_files = self._generate_pca_html_report(pca_plots, n_components)

        objects_created = list()
        objects_created.append({'ref': pca_ref,
                                'description': 'PCA Matrix'})

        report_params = {'message': '',
                         'workspace_name': workspace_name,
                         'objects_created': objects_created,
                         'html_links': output_html_files,
                         'direct_html_link_index': 0,
                         'html_window_height': 666,
                         'report_object_name': 'kb_pca_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _append_instance_group(self, plot_pca_matrix, obj_data, dimension):
        # Adds an 'instance' column to the rotation matrix, mapping each
        # row/col id to its attribute-mapping instance name (if any).
        plot_pca_matrix = plot_pca_matrix.copy()

        # exit(obj_data) -- example KBaseMatrices object, kept for reference:
        #   {'attributes': {'Instrument': 'Old Faithful', 'Scientist': 'Marie Currie'},
        #    'col_attributemapping_ref': '44071/7/79',
        #    'col_mapping': {'instance_1': 'test_col_instance_1',
        #                    'instance_2': 'test_col_instance_2',
        #                    'instance_3': 'test_col_instance_3',
        #                    'instance_4': 'test_col_instance_4'},
        #    'col_normalization': 'test_col_normalization',
        #    'data': {'col_ids': ['instance_1', 'instance_2', 'instance_3', 'instance_4'],
        #             'row_ids': ['WRI_RS00010_CDS_1', 'WRI_RS00015_CDS_1', 'WRI_RS00025_CDS_1'],
        #             'values': [[0.1, 0.2, 0.3, 0.4],
        #                        [0.5, 0.6, 0.7, 0.8],
        #                        [None, None, 1.1, 1.2]]},
        #    'description': 'test_desc',
        #    'row_attributemapping_ref': '44071/8/71',
        #    'row_mapping': {'WRI_RS00010_CDS_1': 'test_row_instance_1',
        #                    'WRI_RS00015_CDS_1': 'test_row_instance_2',
        #                    'WRI_RS00025_CDS_1': 'test_row_instance_3'},
        #    'row_normalization': 'test_row_normalization',
        #    'scale': 'log2',
        #    'search_attributes': ['Scientist | Marie Currie', 'Instrument | Old Faithful']}

        if dimension == 'row':
            attribute_mapping = obj_data.get('row_mapping')
        elif dimension == 'col':
            attribute_mapping = obj_data.get('col_mapping')
        else:
            raise ValueError('Unexpected dimension')

        if not attribute_mapping:
            logging.warning('Matrix object does not have {}_mapping attribute'.format(dimension))
            # build matrix with unify color and shape
            return plot_pca_matrix
        else:
            # append instance col mapping from row/col_mapping
            plot_pca_matrix['instance'] = plot_pca_matrix.index.map(attribute_mapping)
            # exit(plot_pca_matrix) -- frame gains an 'instance' column, e.g.
            #                      principal_component_1  ...             instance
            #   WRI_RS00010_CDS_1             -0.853094  ...  test_row_instance_1
            #   WRI_RS00015_CDS_1             -0.247377  ...  test_row_instance_2
            #   WRI_RS00025_CDS_1              1.100471  ...  test_row_instance_3

        return plot_pca_matrix

    def _build_size_pca_matrix(self, plot_pca_matrix, obj_data, dimension, attribute_name):
        """
        _build_size_pca_matrix: append attribute value to rotation_matrix
        """
        logging.info('appending attribute value for sizing to rotation matrix')

        plot_pca_matrix = plot_pca_matrix.copy()

        if dimension == 'row':
            attribute_mapping = obj_data.get('row_mapping')
            attribute_mapping_ref = obj_data.get('row_attributemapping_ref')
        elif dimension == 'col':
            attribute_mapping = obj_data.get('col_mapping')
            attribute_mapping_ref = obj_data.get('col_attributemapping_ref')
        else:
            raise ValueError('Unexpected dimension')

        if not attribute_mapping:
            logging.warning('Matrix object does not have {}_mapping attribute'.format(dimension))
            # build matrix with unify color and shape
            return plot_pca_matrix
        else:
            # append instance col mapping from row/col_mapping
            # exit(plot_pca_matrix.index.map(attribute_mapping)) gives e.g.
            #   Index(['test_row_instance_1', 'test_row_instance_2',
            #          'test_row_instance_3'], dtype='object')
            plot_pca_matrix['instance'] = plot_pca_matrix.index.map(attribute_mapping)

            # fetch the referenced AttributeMapping object to resolve values
            res = self.dfu.get_objects({'object_refs': [attribute_mapping_ref]})['data'][0]
            attri_data = res['data']
            attri_name = res['info'][1]

            attributes = attri_data.get('attributes')
            # exit(attributes) -- list of attribute descriptors, e.g.
            #   [{'attribute': 'test_attribute_1', 'attribute_ont_id': 'OBI_0500020',
            #     'source': 'upload', 'unit': 'Hour', 'unit_ont_id': 'UO:0000032',
            #     'unit_ont_ref': '6308/15/6'},
            #    {'attribute': 'test_attribute_2', 'attribute_ont_id':
            #     'CHEBI:9168', 'attribute_ont_ref': '6308/19/1', 'source': 'upload',
            #     'unit': 'nanogram per milliliter', 'unit_ont_id': 'UO:0000275',
            #     'unit_ont_ref': '6308/15/6'},
            #    {'attribute': 'test_attribute_3', 'attribute_ont_id': 'CHEBI:9168',
            #     'attribute_ont_ref': '6308/19/1', 'source': 'upload',
            #     'unit': 'nanogram per milliliter', 'unit_ont_id': 'UO:0000275',
            #     'unit_ont_ref': '6308/15/6'}]
            #   (continuation of the example started above)

            # locate the position of the requested attribute within the
            # attribute-mapping's attribute list
            attr_pos = None
            for idx, attribute in enumerate(attributes):
                if attribute.get('attribute') == attribute_name:
                    attr_pos = idx
                    break

            if attr_pos is None:
                raise ValueError('Cannot find attribute [{}] in [{}]'.format(attribute_name,
                                                                             attri_name))

            instances = attri_data.get('instances')
            # exit(instances) -- e.g. {'test_row_instance_1': ['1', '4', '7'],
            #                          'test_row_instance_2': ['3', '4', '8'],
            #                          'test_row_instance_3': ['3', '6', '7']}

            plot_pca_matrix['attribute_value_size'] = None
            for instance_name, attri_values in instances.items():
                # attribute values are strings here; they are converted to
                # float later when marker sizes are computed
                plot_pca_matrix.loc[plot_pca_matrix.instance == instance_name,
                                    ['attribute_value_size']] = attri_values[attr_pos]

            # exit(plot_pca_matrix)
            '''
                               principal_component_1  ...  attribute_value_size
            WRI_RS00010_CDS_1              -0.853094  ...                     1
            WRI_RS00015_CDS_1              -0.247377  ...                     3
            WRI_RS00025_CDS_1               1.100471  ...                     3
            '''

        return plot_pca_matrix

    def _build_color_pca_matrix(self, plot_pca_matrix, obj_data, dimension, attribute_name):
        """
        _build_color_pca_matrix: append attribute value to rotation_matrix
        """
        logging.info('appending attribute value for grouping color to rotation matrix')

        plot_pca_matrix = plot_pca_matrix.copy()

        if dimension == 'row':
            attribute_mapping = obj_data.get('row_mapping')
            attribute_mapping_ref = obj_data.get('row_attributemapping_ref')
        elif dimension == 'col':
            attribute_mapping = obj_data.get('col_mapping')
            attribute_mapping_ref = obj_data.get('col_attributemapping_ref')
        else:
            raise ValueError('Unexpected dimension')

        if not attribute_mapping:
            logging.warning('Matrix object does not have {}_mapping attribute'.format(dimension))
            # build matrix with unify color and shape
            return plot_pca_matrix
        else:
            # append instance col mapping from row/col_mapping
            plot_pca_matrix['instance'] = plot_pca_matrix.index.map(attribute_mapping)

            # fetch the referenced AttributeMapping object to resolve values
            res = self.dfu.get_objects({'object_refs': [attribute_mapping_ref]})['data'][0]
            attri_data = res['data']
            attri_name = res['info'][1]

            attributes = attri_data.get('attributes')

            # locate the position of the requested attribute
            attr_pos = None
            for idx, attribute in enumerate(attributes):
                if attribute.get('attribute') == attribute_name:
                    attr_pos = idx
                    break

            if attr_pos is None:
                raise ValueError('Cannot find attribute [{}] in [{}]'.format(attribute_name,
                                                                             attri_name))

            instances = attri_data.get('instances')
            # exit(instances) -- e.g. {'test_row_instance_1': ['1', '4', '7'],
            #                          'test_row_instance_2': ['3', '4', '8'],
            #                          'test_row_instance_3': ['3', '6', '7']}

            plot_pca_matrix['attribute_value_color'] = None
            for instance_name, attri_values in instances.items():
                # exit(attri_values) -- e.g. ['1', '4', '7']
                plot_pca_matrix.loc[plot_pca_matrix.instance == instance_name,
                                    ['attribute_value_color']] = attri_values[attr_pos]

        return plot_pca_matrix

    def _build_2_comp_trace(self, plot_pca_matrix, components_x, components_y):
        # Builds plotly Scatter traces for one (x, y) component pair; marker
        # color and/or size reflect the optional attribute columns.
        # exit(plot_pca_matrix)
        '''
                           principal_component_1  ...  attribute_value_size
        WRI_RS00010_CDS_1              -0.853094  ...
 1
        WRI_RS00015_CDS_1              -0.247377  ...                    3
        WRI_RS00025_CDS_1               1.100471  ...                    3
        '''
        # exit(components_x) principal_component_1
        # exit(components_y) principal_component_2

        traces = []

        if 'attribute_value_color' in plot_pca_matrix.columns and 'attribute_value_size' in plot_pca_matrix.columns:
            # both color grouping and size scaling requested
            maximum_marker_size = 10
            # plotly 'area' sizemode scaling convention
            sizeref = 2.*float(max(plot_pca_matrix['attribute_value_size']))/(maximum_marker_size**2)

            for name in set(plot_pca_matrix.attribute_value_color):
                attribute_value_size = plot_pca_matrix.loc[plot_pca_matrix['attribute_value_color'].eq(name)].attribute_value_size
                size_list = list(map(abs, list(map(float, attribute_value_size))))
                for idx, val in enumerate(size_list):
                    if val == 0:
                        # zero-size markers are invisible; use the smallest
                        # positive float instead
                        size_list[idx] = sys.float_info.min
                trace = go.Scatter(
                    x=list(plot_pca_matrix.loc[plot_pca_matrix['attribute_value_color'].eq(name)][components_x]),
                    y=list(plot_pca_matrix.loc[plot_pca_matrix['attribute_value_color'].eq(name)][components_y]),
                    mode='markers',
                    name=name,
                    text=list(plot_pca_matrix.loc[plot_pca_matrix['attribute_value_color'].eq(name)].index),
                    textposition='bottom center',
                    marker=go.Marker(symbol='circle', sizemode='area',
                                     sizeref=sizeref, size=size_list, sizemin=2,
                                     line=go.Line(color='rgba(217, 217, 217, 0.14)',
                                                  width=0.5),
                                     opacity=0.8))
                traces.append(trace)
        elif 'attribute_value_color' in plot_pca_matrix.columns:
            # color grouping only: fixed marker size
            for name in set(plot_pca_matrix.attribute_value_color):
                trace = go.Scatter(
                    x=list(plot_pca_matrix.loc[plot_pca_matrix['attribute_value_color'].eq(name)][components_x]),
                    y=list(plot_pca_matrix.loc[plot_pca_matrix['attribute_value_color'].eq(name)][components_y]),
                    mode='markers',
                    name=name,
                    text=list(plot_pca_matrix.loc[plot_pca_matrix['attribute_value_color'].eq(name)].index),
                    textposition='bottom center',
                    marker=go.Marker(size=10, opacity=0.8,
                                     line=go.Line(color='rgba(217, 217, 217, 0.14)',
                                                  width=0.5)))
                traces.append(trace)
        elif 'attribute_value_size' in plot_pca_matrix.columns:
            # size scaling only: one trace per instance group
            maximum_marker_size = 10
            sizeref = 2.*float(max(plot_pca_matrix['attribute_value_size']))/(maximum_marker_size**2)

            for name in set(plot_pca_matrix.instance):
                attribute_value_size = plot_pca_matrix.loc[plot_pca_matrix['instance'].eq(name)].attribute_value_size
                size_list = list(map(abs, list(map(float, attribute_value_size))))
                for idx, val in enumerate(size_list):
                    if val == 0:
                        size_list[idx] = sys.float_info.min
                trace = go.Scatter(
                    x=list(plot_pca_matrix.loc[plot_pca_matrix['instance'].eq(name)][components_x]),
                    y=list(plot_pca_matrix.loc[plot_pca_matrix['instance'].eq(name)][components_y]),
                    mode='markers',
                    name=name,
                    text=list(plot_pca_matrix.loc[plot_pca_matrix['instance'].eq(name)].index),
                    textposition='bottom center',
                    marker=go.Marker(symbol='circle', sizemode='area',
                                     sizeref=sizeref, size=size_list, sizemin=2,
                                     line=go.Line(color='rgba(217, 217, 217, 0.14)',
                                                  width=0.5),
                                     opacity=0.8))
                traces.append(trace)
        else:
            # no attribute columns: one uniform trace
            trace = go.Scatter(
                x=list(plot_pca_matrix[components_x]),
                y=list(plot_pca_matrix[components_y]),
                mode='markers',
                text=list(plot_pca_matrix.index),
                textposition='bottom center',
                marker=go.Marker(size=10, opacity=0.8,
                                 line=go.Line(color='rgba(217, 217, 217, 0.14)',
                                              width=0.5)))
            traces.append(trace)

        return traces

    def _plot_pca_matrix(self, plot_pca_matrix, n_components):
        # Renders one 2-D scatter HTML file per pair of principal components
        # and returns the list of file paths.
        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_paths = []

        # all unordered component pairs, e.g. (1,2), (1,3), (2,3) ...
        all_pairs = list(itertools.combinations(range(1, n_components+1), 2))

        for pair in all_pairs:
            first_component = pair[0]
            second_component = pair[1]
            result_file_path = os.path.join(output_directory,
                                            'pca_plot_{}_{}.html'.format(first_component,
                                                                         second_component))

            traces = self._build_2_comp_trace(plot_pca_matrix,
                                              'principal_component_{}'.format(first_component),
                                              'principal_component_{}'.format(second_component))

            data = go.Data(traces)
            layout = go.Layout(xaxis=go.XAxis(title='PC{}'.format(first_component),
                                              showline=False),
                               yaxis=go.YAxis(title='PC{}'.format(second_component),
                                              showline=False))
            fig = go.Figure(data=data, layout=layout)

            plot(fig, filename=result_file_path)

            result_file_paths.append(result_file_path)

        return result_file_paths

    def _validate_pca_matrix(self, obj_data, dimension, color_marker_by, scale_size_by):
        # Ensures an attribute mapping exists when color/size grouping is requested.
        if dimension == 'row':
            attribute_mapping = obj_data.get('row_mapping')
        elif dimension == 'col':
            attribute_mapping = obj_data.get('col_mapping')
        else:
            raise ValueError('Unexpected dimension')

        if not attribute_mapping:
            if (color_marker_by and color_marker_by.get('attribute_color')[0]) or \
                    (scale_size_by and scale_size_by.get('attribute_size')[0]):
                raise ValueError('Matrix object is not associated with any {} attribute mapping'.format(dimension))

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']
        self.data_util = DataUtil(config)
        self.dfu = DataFileUtil(self.callback_url)
        # headless matplotlib backend for server-side rendering
        plt.switch_backend('agg')

    def run_pca(self, params):
        """
        perform PCA analysis on matrix

        input_obj_ref: object reference of a matrix
        workspace_name: the name of the workspace
        pca_matrix_name: name of PCA (KBaseExperiments.PCAMatrix) object
        n_components - number of components (default 2)
        dimension: compute correlation on column or row, one of ['col', 'row']
        """
        logging.info('--->\nrunning NetworkUtil.build_network\n' +
                     'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_pca_params(params)

        input_obj_ref = params.get('input_obj_ref')
        workspace_name = params.get('workspace_name')
        pca_matrix_name = params.get('pca_matrix_name')
        n_components = int(params.get('n_components', 2))
        dimension = params.get('dimension', 'row')

        res = self.dfu.get_objects({'object_refs': [input_obj_ref]})['data'][0]
        obj_data = res['data']
        obj_type = res['info'][2]

        self._validate_pca_matrix(obj_data, dimension,
                                  params.get('color_marker_by'),
                                  params.get('scale_size_by'))

        if "KBaseMatrices" in obj_type:
            (rotation_matrix_df, components_df, explained_variance,
explained_variance_ratio, singular_values) = self._pca_for_matrix(input_obj_ref, n_components, dimension) else: err_msg = 'Ooops! [{}] is not supported.\n'.format(obj_type) err_msg += 'Please supply KBaseMatrices object' raise ValueError("err_msg") pca_ref = self._save_pca_matrix(workspace_name, input_obj_ref, pca_matrix_name, rotation_matrix_df, components_df, explained_variance, explained_variance_ratio, singular_values, n_components, dimension) plot_pca_matrix = self._append_instance_group(rotation_matrix_df.copy(), obj_data, dimension) if params.get('color_marker_by'): plot_pca_matrix = self._build_color_pca_matrix( plot_pca_matrix, obj_data, dimension, params.get('color_marker_by').get('attribute_color')[0]) #exit(plot_pca_matrix) ''' principal_component_1 ... attribute_value_color WRI_RS00010_CDS_1 -0.853094 ... 4 WRI_RS00015_CDS_1 -0.247377 ... 4 WRI_RS00025_CDS_1 1.100471 ... 6 ''' if params.get('scale_size_by'): plot_pca_matrix = self._build_size_pca_matrix( plot_pca_matrix, obj_data, dimension, params.get('scale_size_by').get('attribute_size')[0]) returnVal = {'pca_ref': pca_ref} report_output = self._generate_pca_report(pca_ref, self._plot_pca_matrix(plot_pca_matrix, n_components), workspace_name, n_components) returnVal.update(report_output) return returnVal def export_pca_matrix_excel(self, params): """ export PCAMatrix as Excel """ logging.info('start exporting pca matrix') pca_matrix_ref = params.get('input_ref') #44071/62/8 pca_df, components_df = self._pca_to_df(pca_matrix_ref) #exit(pca_df) ''' principal_component_1 principal_component_2 WRI_RS00010_CDS_1 -0.45 1.06 WRI_RS00015_CDS_1 -0.69 -0.92 WRI_RS00025_CDS_1 1.14 -0.13 ''' result_dir = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_dir) #exit(result_dir) #/kb/module/work/tmp/54e9610e-9b51-4296-a971-333f9f154a1f self._pca_df_to_excel(pca_df, components_df, result_dir, pca_matrix_ref) package_details = self.dfu.package_for_download({ 'file_path': result_dir, 'ws_refs': 
[pca_matrix_ref] }) return {'shock_id': package_details['shock_id']}
class FeatureSetDownload:
    """Exports a FeatureSet object as a TSV file packaged for download."""

    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.gsu = GenomeSearchUtil(os.environ['SDK_CALLBACK_URL'])
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        self.ws = Workspace(config["workspace-url"])

    @staticmethod
    def validate_params(params, expected=("workspace_name", "featureset_name")):
        """Raise ValueError if any expected key is missing from params.

        FIX: the default was a mutable set literal (mutable default argument
        smell); an immutable tuple behaves identically via set(expected).
        """
        expected = set(expected)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError(
                "Required keys {} not in supplied parameters".format(
                    ", ".join(expected - pkeys)))

    def to_tsv(self, params):
        """Write the feature set referenced by params['featureset_ref'] to a
        TSV file in a fresh scratch subdirectory.

        Returns (featureset_name, {'file_path': path_to_tsv}).
        """
        working_dir = os.path.join(self.scratch,
                                   'featureset-download-' + str(uuid.uuid4()))
        os.makedirs(working_dir)
        header = ['Feature Id', 'Aliases', 'Genome', 'Type', 'Function']
        fs_name, fs_dicts = self.make_featureset_dict(params['featureset_ref'])
        files = {'file_path': "{}/{}.tsv".format(working_dir, fs_name)}
        # BUG FIX: the original handed an anonymous open() handle to
        # DictWriter and never closed it; 'with' guarantees flush + close.
        with open(files['file_path'], 'w') as tsv_file:
            writer = csv.DictWriter(tsv_file, header, delimiter='\t',
                                    lineterminator='\n')
            writer.writeheader()
            for feat in fs_dicts:
                writer.writerow(feat)
        return fs_name, files

    def make_featureset_dict(self, fs_ref):
        """Resolve a FeatureSet ref into (name, list-of-row-dicts) suitable
        for to_tsv, querying each member genome for feature details."""
        features = []
        ret = self.dfu.get_objects({'object_refs': [fs_ref]})['data'][0]
        feat_set = ret['data']
        fs_name = ret['info'][1]

        # group feature ids by the genome (first element) they belong to
        feat_by_genome = defaultdict(list)
        for k, v in feat_set['elements'].items():
            feat_by_genome[v[0]].append(k)

        for genome, fids in feat_by_genome.items():
            genome_name = self.ws.get_object_info3({'objects': [{'ref': genome}]})['infos'][0][1]
            # one batched search per genome rather than one call per feature
            res = self.gsu.search({
                'ref': genome,
                'structured_query': {'feature_id': fids},
                'sort_by': [['contig_id', 1]],
                'start': 0,
                'limit': len(fids)
            })

            for feat in res['features']:
                features.append({
                    'Feature Id': feat['feature_id'],
                    'Aliases': ", ".join(sorted(feat['aliases'].keys())),
                    'Genome': "{} ({})".format(genome_name, genome),
                    'Type': feat['feature_type'],
                    'Function': feat['function']
                })
        return fs_name, features

    def export(self, files, name, params):
        """Move the given files into a package directory and zip it into
        shock via DataFileUtil; returns {'shock_id': ...}."""
        export_package_dir = os.path.join(self.scratch, name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        for file in files:
            shutil.move(file, os.path.join(export_package_dir,
                                           os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['featureset_ref']]
        })

        return {'shock_id': package_details['shock_id']}
    def export_genome_as_gff(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (input and output
           structure functions for standard downloaders) -> structure:
           parameter "input_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_genome_as_gff
        if 'input_ref' not in params:
            raise ValueError('Cannot run export_genome_as_gff- no "input_ref" '
                             'field defined.')

        # get WS metadata to get ws_name and obj_name
        ws = Workspace(url=self.cfg.workspaceURL)
        # only pull the few needed fields, not the whole (potentially large) genome
        info = ws.get_objects2({
            'objects': [{
                'ref': params['input_ref'],
                'included': ['/assembly_ref', '/contigset_ref', '/id', '/gff_handle_ref']
            }]
        })['data'][0]['data']

        # export to file (building from KBase Genome Object)
        result = self.genome_to_gff(ctx, {'genome_ref': params['input_ref']})[0]

        # get assembly
        if 'assembly_ref' in info:
            assembly_ref = info['assembly_ref']
        else:
            # legacy genomes reference a ContigSet instead of an Assembly
            assembly_ref = info['contigset_ref']
        print(('Assembly reference = ' + assembly_ref))
        print('Downloading assembly')

        # download the assembly FASTA alongside the GFF
        au = AssemblyUtil(self.cfg.callbackURL)
        assembly_file_path = au.get_assembly_as_fasta(
            {'ref': params['input_ref'] + ";" + assembly_ref})['path']

        # create the output directory and move the files there
        export_package_dir = os.path.join(self.cfg.sharedFolder, info['id'])
        os.makedirs(export_package_dir)
        shutil.move(
            result['file_path'],
            os.path.join(
                export_package_dir,
                'KBase_derived_' + os.path.basename(result['file_path'])))
        shutil.move(
            assembly_file_path,
            os.path.join(export_package_dir,
                         os.path.basename(assembly_file_path)))

        # add cached genome if appropriate
        exporter = GenomeToGFF(self.cfg)
        cached = exporter.get_gff_handle(info, export_package_dir)

        # package it up
        dfUtil = DataFileUtil(self.cfg.callbackURL)
        package_details = dfUtil.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['input_ref']]
        })

        output = {'shock_id': package_details['shock_id']}
        #END export_genome_as_gff

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_genome_as_gff return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
class ReadsAlignmentUtils:
    '''
    Module Name:
    ReadsAlignmentUtils

    Module Description:
    A KBase module: ReadsAlignmentUtils

    This module is intended for use by Aligners and Assemblers to upload and download alignment files.
    The alignment may be uploaded as a sam or bam file. If a sam file is given, it is converted to
    the sorted bam format and saved. Upon downloading, optional parameters may be provided to get files
    in sam and bai formats from the downloaded bam file. This utility also generates stats from the
    stored alignment.
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.3.6"
    GIT_URL = "https://github.com/kbaseapps/ReadsAlignmentUtils.git"
    GIT_COMMIT_HASH = "75ef2c24694c056dfca71859d6f344ccff7d4725"

    #BEGIN_CLASS_HEADER

    # canonical parameter-key names shared by the upload/download methods
    PARAM_IN_FILE = 'file_path'
    PARAM_IN_SRC_REF = 'source_ref'
    PARAM_IN_DST_REF = 'destination_ref'
    PARAM_IN_CONDITION = 'condition'
    PARAM_IN_READ_LIB_REF = 'read_library_ref'
    PARAM_IN_ASM_GEN_REF = 'assembly_or_genome_ref'

    PARAM_IN_ALIGNED_USING = 'aligned_using'
    PARAM_IN_ALIGNER_VER = 'aligner_version'
    PARAM_IN_ALIGNER_OPTS = 'aligner_opts'
    PARAM_IN_REPLICATE_ID = 'replicate_id'
    PARAM_IN_PLATFORM = 'platform'
    PARAM_IN_BOWTIE2_INDEX = 'bowtie2_index'
    PARAM_IN_SAMPLESET_REF = 'sampleset_ref'
    PARAM_IN_MAPPED_SAMPLE_ID = 'mapped_sample_id'

    PARAM_IN_DOWNLOAD_SAM = 'downloadSAM'
    PARAM_IN_DOWNLOAD_BAI = 'downloadBAI'
    PARAM_IN_VALIDATE = 'validate'

    # characters not allowed in workspace object / workspace names
    INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]')
    INVALID_WS_NAME_RE = re.compile('[^\\w:._-]')

    def _get_file_path_info(self, file_path):
        """
        Given a file path, returns the directory, file name, file base and
        file extension
        """
        dir, file_name = os.path.split(file_path)
        file_base, file_ext = os.path.splitext(file_name)

        return dir, file_name, file_base, file_ext

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            # tolerate an already-existing directory; re-raise anything else
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _check_required_param(self, in_params, param_list):
        """
        Checks if each of the params in the list are in the input params
        """
        for param in param_list:
            if (param not in in_params or not in_params[param]):
                raise ValueError('{} parameter is required'.format(param))

    def _proc_ws_obj_params(self, ctx, params):
        """
        Checks the validity of workspace and object params and returns them
        """
        dst_ref = params.get(self.PARAM_IN_DST_REF)

        # destination_ref format: 'ws_name_or_id/obj_name_or_id'
        ws_name_id, obj_name_id = os.path.split(dst_ref)

        if not bool(ws_name_id.strip()) or ws_name_id == '/':
            raise ValueError("Workspace name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not bool(obj_name_id.strip()):
            raise ValueError("Object name or id is required in " +
                             self.PARAM_IN_DST_REF)

        # NOTE(review): ws_name_id comes from os.path.split and is always a
        # str here, so this isinstance check is always True and the
        # name->id lookup always runs — confirm intent
        if not isinstance(ws_name_id, int):
            try:
                ws_name_id = self.dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                # surface only the first sentence of the service error
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)

        self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id))

        return ws_name_id, obj_name_id

    def _get_ws_info(self, obj_ref):
        # Fetches workspace object info for the given ref; logs and
        # re-raises workspace errors.
        ws = Workspace(self.ws_url)
        try:
            info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0]
        except WorkspaceError as wse:
            self.__LOGGER.error('Logging workspace exception')
            self.__LOGGER.error(str(wse))
            raise
        return info

    def _proc_upload_alignment_params(self, ctx, params):
        """
        Checks the presence and validity of upload alignment params
        """
        self._check_required_param(params, [
            self.PARAM_IN_DST_REF, self.PARAM_IN_FILE, self.PARAM_IN_CONDITION,
            self.PARAM_IN_READ_LIB_REF, self.PARAM_IN_ASM_GEN_REF
        ])

        ws_name_id, obj_name_id = self._proc_ws_obj_params(ctx, params)

        file_path = params.get(self.PARAM_IN_FILE)
        if not (os.path.isfile(file_path)):
            raise ValueError('File does not exist: ' + file_path)

        # the read library must be one of the supported reads types
        lib_type = self._get_ws_info(params.get(self.PARAM_IN_READ_LIB_REF))[2]
        if lib_type.startswith('KBaseFile.SingleEndLibrary') or \
                lib_type.startswith('KBaseFile.PairedEndLibrary') or \
                lib_type.startswith('KBaseAssembly.SingleEndLibrary') or \
                lib_type.startswith('KBaseAssembly.PairedEndLibrary'):
            pass
        else:
            raise ValueError(self.PARAM_IN_READ_LIB_REF +
                             ' parameter should be of type' +
                             ' KBaseFile.SingleEndLibrary or' +
                             ' KBaseFile.PairedEndLibrary or' +
                             ' KBaseAssembly.SingleEndLibrary or' +
                             ' KBaseAssembly.PairedEndLibrary')

        # the reference must be a genome, assembly or legacy contigset
        obj_type = self._get_ws_info(params.get(self.PARAM_IN_ASM_GEN_REF))[2]
        if obj_type.startswith('KBaseGenomes.Genome') or \
                obj_type.startswith('KBaseGenomeAnnotations.Assembly') or \
                obj_type.startswith('KBaseGenomes.ContigSet'):
            pass
        else:
            raise ValueError(self.PARAM_IN_ASM_GEN_REF +
                             ' parameter should be of type' +
                             ' KBaseGenomes.Genome or' +
                             ' KBaseGenomeAnnotations.Assembly or' +
                             ' KBaseGenomes.ContigSet')
        return ws_name_id, obj_name_id, file_path, lib_type

    def _get_aligner_stats(self, bam_file):
        """
        Gets the aligner stats from BAM file

        How we compute this stats:

        For each segment (line) in SAM/BAM file:
            we take the first element as `reads_id`
                the second element as `flag`

            if the last bit (0x1) of flag is `1`:
                we treat this segment as paired end reads
            otherwise:
                we treat this segment as single end reads

        For single end reads:
            if the 3rd last bit (0x8) of flag is `1`:
                we increment unmapped_reads_count
            else:
                we treat this `reads_id` as mapped

            for all mapped `reads_ids`"
                if it appears only once:
                    we treat this `reads_id` as `singletons`
                else:
                    we treat this `reads_id` as `multiple_alignments`

            lastly, total_reads = unmapped_reads_count + identical mapped `reads_id`

        For paired end reads:
            if the 7th last bit (0x40) of flag is `1`:
                if the 3rd last bit (0x8) of flag is `1`:
                    we increment unmapped_left_reads_count
                else:
                    we treat this `reads_id` as mapped

            if the 8th last bit ( 0x80) of flag is `1`:
                if the 3rd last bit (0x8) of flag
                is `1`:
                    we increment unmapped_right_reads_count
                else:
                    we treat this `reads_id` as mapped

            for all mapped `reads_ids`"
                if it appears only once:
                    we treat this `reads_id` as `singletons`
                else:
                    we treat this `reads_id` as `multiple_alignments`

            lastly, total_reads = unmapped_left_reads_count +
                unmapped_right_reads_count + identical mapped `reads_id`
        """

        path, file = os.path.split(bam_file)

        self.__LOGGER.info('Start to generate aligner stats')
        start_time = time.time()

        # pysam auto-detects SAM vs BAM with mode 'r'
        infile = pysam.AlignmentFile(bam_file, 'r')

        properly_paired = 0
        unmapped_reads_count = 0
        unmapped_left_reads_count = 0
        unmapped_right_reads_count = 0
        mapped_reads_ids = []
        mapped_left_reads_ids = []
        mapped_right_reads_ids = []
        paired = False
        for alignment in infile:
            seg = alignment.to_string().split('\t')
            reads_id = seg[0]
            # zero-pad the binary flag so fixed negative indices are safe
            flag = "0000000" + "{0:b}".format(int(seg[1]))

            if flag[-1] == '1':
                paired = True

            # NOTE(review): `paired` is sticky — once any segment sets it,
            # all later segments are processed as paired (whole-file
            # assumption); confirm inputs never mix single/paired segments
            if paired:
                # process paired end sequence
                if flag[-7] == '1':  # first sequence of a pair (0x40)
                    if flag[-3] == '1':  # unmapped (0x8)
                        unmapped_left_reads_count += 1
                    else:
                        mapped_left_reads_ids.append(reads_id)

                if flag[-8] == '1':  # second sequence of a pair (0x80)
                    if flag[-3] == '1':
                        unmapped_right_reads_count += 1
                    else:
                        mapped_right_reads_ids.append(reads_id)

                if flag[-2] == '1':  # properly paired (0x2)
                    properly_paired += 1
            else:
                # process single end sequence
                if flag[-3] == '1':
                    unmapped_reads_count += 1
                else:
                    mapped_reads_ids.append(reads_id)

                if flag[-2] == '1':
                    properly_paired += 1

        infile.close()

        if paired:
            mapped_reads_ids = mapped_left_reads_ids + mapped_right_reads_ids
            unmapped_reads_count = unmapped_left_reads_count + unmapped_right_reads_count

            # count distinct read ids; ids seen once are singletons
            mapped_reads_ids_counter = Counter(mapped_reads_ids)
            mapped_reads_count = len(list(mapped_reads_ids_counter))

            singletons = list(mapped_reads_ids_counter.values()).count(1)
            multiple_alignments = mapped_reads_count - singletons

            total_reads = unmapped_reads_count + mapped_reads_count
            # each properly-paired pair was counted twice (once per mate)
            properly_paired = properly_paired // 2
        else:
            mapped_reads_ids_counter = Counter(mapped_reads_ids)
            mapped_reads_count = len(list(mapped_reads_ids_counter))

            singletons = list(mapped_reads_ids_counter.values()).count(1)
            multiple_alignments = mapped_reads_count - singletons

            total_reads = unmapped_reads_count + mapped_reads_count

        try:
            alignment_rate = round(
                float(mapped_reads_count) / total_reads * 100, 3)
        except ZeroDivisionError:
            # empty alignment file
            alignment_rate = 0

        if alignment_rate > 100:
            alignment_rate = 100.0

        elapsed_time = time.time() - start_time
        self.__LOGGER.info('Used: {}'.format(
            time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

        stats_data = {
            "alignment_rate": alignment_rate,
            "mapped_reads": mapped_reads_count,
            "multiple_alignments": multiple_alignments,
            "properly_paired": properly_paired,
            "singletons": singletons,
            "total_reads": total_reads,
            "unmapped_reads": unmapped_reads_count
        }
        return stats_data

    def _validate(self, params):
        # Runs SAM/BAM validation via SamTools; returns its numeric rval
        # (0 indicates success in validate_alignment).
        samt = SamTools(self.config, self.__LOGGER)
        if 'ignore' in params:
            path, file = os.path.split(params['file_path'])
            rval = samt.validate(ifile=file, ipath=path,
                                 ignore=params['ignore'])
        else:
            path, file = os.path.split(params['file_path'])
            rval = samt.validate(ifile=file, ipath=path)

        return rval

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.__LOGGER = logging.getLogger('KBaseRNASeq')
        if 'log_level' in config:
            self.__LOGGER.setLevel(config['log_level'])
        else:
            self.__LOGGER.setLevel(logging.INFO)
        streamHandler = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter(
            "%(asctime)s - %(filename)s - %(lineno)d - \
            %(levelname)s - %(message)s")
        # timestamps in UTC
        formatter.converter = time.gmtime
        streamHandler.setFormatter(formatter)
        self.__LOGGER.addHandler(streamHandler)
        self.__LOGGER.info("Logger was set")

        script_utils.check_sys_stat(self.__LOGGER)

        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.ws_url = config['workspace-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.samtools = SamTools(config)
        #END_CONSTRUCTOR
        pass

    def validate_alignment(self, ctx, params):
        """
        :param params: instance of type "ValidateAlignmentParams" (* Input
           parameters for validating a reads alignment. For validation
           errors to ignore, see
           http://broadinstitute.github.io/picard/command-line-overview.html#V
           alidateSamFile) -> structure: parameter "file_path" of String,
           parameter "ignore" of list of String
        :returns: instance of type "ValidateAlignmentOutput" (* Results from
           validate alignment *) -> structure: parameter "validated" of type
           "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1))
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN validate_alignment

        # SamTools validate returns 0 on success
        rval = self._validate(params)

        if rval == 0:
            returnVal = {'validated': True}
        else:
            returnVal = {'validated': False}

        #END validate_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method validate_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def upload_alignment(self, ctx, params):
        """
        Validates and uploads the reads alignment

        How we compute BAM stats:
        For each segment (line) in SAM/BAM file:
            we take the first element as `reads_id`, the second element as
            `flag`; if the last bit (0x1) of flag is `1` we treat the segment
            as paired end reads, otherwise as single end reads.
        For single end reads:
            if the 3rd last bit (0x8) of flag is `1` we increment
            unmapped_reads_count, else we treat this `reads_id` as mapped;
            mapped `reads_ids` appearing once are `singletons`, otherwise
            `multiple_alignments`; total_reads = unmapped_reads_count +
            identical mapped `reads_id`.
        For paired end reads:
            the 7th last bit (0x40) marks the first sequence of a pair and
            the 8th last bit (0x80) the second; for each, the 3rd last bit
            (0x8) distinguishes unmapped (counted left/right) from mapped;
            singletons/multiple_alignments as above; total_reads =
            unmapped_left_reads_count + unmapped_right_reads_count +
            identical mapped `reads_id`.

        :param params: instance of type "UploadAlignmentParams" (* Required
           input parameters for uploading a reads alignment string
           destination_ref - object reference of alignment destination. The
           object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id
           is the workspace name or id and obj_name_or_id is the object name
           or id file_path - File with the path of the sam or bam file to be
           uploaded. If a sam file is provided, it will be converted to the
           sorted bam format before being saved read_library_ref - workspace
           object ref of the read sample used to make the alignment file
           condition - assembly_or_genome_ref - workspace object ref of
           genome assembly or genome object that was used to build the
           alignment *) -> structure: parameter "destination_ref" of String,
           parameter "file_path" of String, parameter "read_library_ref" of
           String, parameter "condition" of String, parameter
           "assembly_or_genome_ref" of String, parameter "aligned_using" of
           String, parameter "aligner_version" of String, parameter
           "aligner_opts" of mapping from String to String, parameter
           "replicate_id" of String, parameter "platform" of String,
           parameter "bowtie2_index" of type "ws_bowtieIndex_id", parameter
           "sampleset_ref" of type "ws_Sampleset_ref", parameter
           "mapped_sample_id" of mapping from String to mapping from String
           to String, parameter "validate" of type "boolean" (A boolean - 0
           for false, 1 for true. @range (0, 1)), parameter "ignore" of list
           of String
        :returns: instance of type "UploadAlignmentOutput" (* Output from
           uploading a reads alignment *) -> structure: parameter "obj_ref"
           of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN upload_alignment

        self.__LOGGER.info(
            'Starting upload Reads Alignment, parsing parameters ')
        pprint(params)

        ws_name_id, obj_name_id, file_path, lib_type = self._proc_upload_alignment_params(
            ctx, params)

        dir, file_name, file_base, file_ext = self._get_file_path_info(
            file_path)

        # optional pre-upload validation
        if self.PARAM_IN_VALIDATE in params and params[
                self.PARAM_IN_VALIDATE] is True:
            if self._validate(params) == 1:
                raise Exception('{0} failed validation'.format(file_path))

        bam_file = file_path
        if file_ext.lower() == '.sam':
            # convert SAM input to sorted BAM before storing
            bam_file = os.path.join(dir, file_base + '.bam')
            self.samtools.convert_sam_to_sorted_bam(ifile=file_name,
                                                    ipath=dir,
                                                    ofile=bam_file)

        uploaded_file = self.dfu.file_to_shock({
            'file_path': bam_file,
            'make_handle': 1
        })
        file_handle = uploaded_file['handle']
        file_size = uploaded_file['size']

        # NOTE(review): stats are computed on the original input file, not
        # the converted/sorted BAM — confirm this is intended
        aligner_stats = self._get_aligner_stats(file_path)
        aligner_data = {
            'file': file_handle,
            'size': file_size,
            'condition': params.get(self.PARAM_IN_CONDITION),
            'read_sample_id': params.get(self.PARAM_IN_READ_LIB_REF),
            'library_type': lib_type,
            'genome_id': params.get(self.PARAM_IN_ASM_GEN_REF),
            'alignment_stats': aligner_stats
        }

        # copy through any optional params that were supplied
        optional_params = [
            self.PARAM_IN_ALIGNED_USING, self.PARAM_IN_ALIGNER_VER,
            self.PARAM_IN_ALIGNER_OPTS, self.PARAM_IN_REPLICATE_ID,
            self.PARAM_IN_PLATFORM, self.PARAM_IN_BOWTIE2_INDEX,
            self.PARAM_IN_SAMPLESET_REF, self.PARAM_IN_MAPPED_SAMPLE_ID
        ]
        for opt_param in optional_params:
            if opt_param in params and params[opt_param] is not None:
                aligner_data[opt_param] = params[opt_param]

        self.__LOGGER.info('========= Adding extra_provenance_refs')
        self.__LOGGER.info(params.get(self.PARAM_IN_READ_LIB_REF))
        self.__LOGGER.info(params.get(self.PARAM_IN_ASM_GEN_REF))
        self.__LOGGER.info('=======================================')

        res = self.dfu.save_objects({
            "id": ws_name_id,
            "objects": [{
                "type": "KBaseRNASeq.RNASeqAlignment",
                "data": aligner_data,
                "name": obj_name_id,
                "extra_provenance_input_refs": [
                    params.get(self.PARAM_IN_READ_LIB_REF),
                    params.get(self.PARAM_IN_ASM_GEN_REF)
                ]
            }]
        })[0]

        self.__LOGGER.info('save complete')

        # object reference format: wsid/objid/version
        returnVal = {
            'obj_ref': str(res[6]) + '/' + str(res[0]) + '/' + str(res[4])
        }

        self.__LOGGER.info('Uploaded object: ')
        self.__LOGGER.info(returnVal)
        #END upload_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method upload_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def download_alignment(self, ctx, params):
        """
        Downloads alignment files in .bam, .sam and .bai formats. Also
        downloads alignment stats *
        :param params: instance of type "DownloadAlignmentParams" (* Required
           input parameters for downloading a reads alignment string
           source_ref - object reference of alignment source. The object ref
           is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String, parameter
           "downloadSAM" of type "boolean" (A boolean - 0 for false, 1 for
           true. @range (0, 1)), parameter "downloadBAI" of type "boolean" (A
           boolean - 0 for false, 1 for true. @range (0, 1)), parameter
           "validate" of type "boolean" (A boolean - 0 for false, 1 for true.
           @range (0, 1)), parameter "ignore" of list of String
        :returns: instance of type "DownloadAlignmentOutput" (* The output of
           the download method.
*) -> structure: parameter "destination_dir" of String, parameter "stats" of type "AlignmentStats" -> structure: parameter "properly_paired" of Long, parameter "multiple_alignments" of Long, parameter "singletons" of Long, parameter "alignment_rate" of Double, parameter "unmapped_reads" of Long, parameter "mapped_reads" of Long, parameter "total_reads" of Long """ # ctx is the context object # return variables are: returnVal #BEGIN download_alignment self.__LOGGER.info('Running download_alignment with params:\n' + pformat(params)) inref = params.get(self.PARAM_IN_SRC_REF) if not inref: raise ValueError('{} parameter is required'.format( self.PARAM_IN_SRC_REF)) try: alignment = self.dfu.get_objects({'object_refs': [inref]})['data'] except DFUError as e: self.__LOGGER.error( 'Logging stacktrace from workspace exception:\n' + e.data) raise # set the output dir uuid_str = str(uuid.uuid4()) output_dir = os.path.join(self.scratch, 'download_' + uuid_str) self._mkdir_p(output_dir) file_ret = self.dfu.shock_to_file({ 'shock_id': alignment[0]['data']['file']['id'], 'file_path': output_dir }) if zipfile.is_zipfile(file_ret.get('file_path')): with zipfile.ZipFile(file_ret.get('file_path')) as z: z.extractall(output_dir) for f in glob.glob(output_dir + '/*.zip'): os.remove(f) bam_files = glob.glob(output_dir + '/*.bam') if len(bam_files) == 0: raise ValueError("Alignment object does not contain a bam file") for bam_file_path in bam_files: dir, file_name, file_base, file_ext = self._get_file_path_info( bam_file_path) if params.get(self.PARAM_IN_VALIDATE, False): validate_params = {'file_path': bam_file_path} if self._validate(validate_params) == 1: raise Exception( '{0} failed validation'.format(bam_file_path)) if params.get(self.PARAM_IN_DOWNLOAD_BAI, False): bai_file = file_base + '.bai' bai_file_path = os.path.join(output_dir, bai_file) self.samtools.create_bai_from_bam(ifile=file_name, ipath=output_dir, ofile=bai_file) if not os.path.isfile(bai_file_path): raise 
ValueError('Error creating {}'.format(bai_file_path)) if params.get(self.PARAM_IN_DOWNLOAD_SAM, False): sam_file = file_base + '.sam' sam_file_path = os.path.join(output_dir, sam_file) self.samtools.convert_bam_to_sam(ifile=file_name, ipath=output_dir, ofile=sam_file) if not os.path.isfile(sam_file_path): raise ValueError('Error creating {}'.format(sam_file_path)) returnVal = { 'destination_dir': output_dir, 'stats': alignment[0]['data']['alignment_stats'] } #END download_alignment # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method download_alignment return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def export_alignment(self, ctx, params): """ Wrapper function for use by in-narrative downloaders to download alignments from shock * :param params: instance of type "ExportParams" (* Required input parameters for exporting a reads alignment string source_ref - object reference of alignment source. The object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the workspace name or id and obj_name_or_id is the object name or id *) -> structure: parameter "source_ref" of String, parameter "exportSAM" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "exportBAI" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "validate" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "ignore" of list of String :returns: instance of type "ExportOutput" -> structure: parameter "shock_id" of String """ # ctx is the context object # return variables are: output #BEGIN export_alignment inref = params.get(self.PARAM_IN_SRC_REF) if not inref: raise ValueError('{} parameter is required'.format( self.PARAM_IN_SRC_REF)) if params.get(self.PARAM_IN_VALIDATE, False) or \ params.get('exportBAI', False) or \ params.get('exportSAM', False): """ Need to validate or convert files. 
Use download_alignment """ download_params = {} for key, val in params.items(): download_params[key.replace('export', 'download')] = val download_retVal = self.download_alignment(ctx, download_params)[0] export_dir = download_retVal['destination_dir'] # package and load to shock ret = self.dfu.package_for_download({ 'file_path': export_dir, 'ws_refs': [inref] }) output = {'shock_id': ret['shock_id']} else: """ return shock id from the object """ try: alignment = self.dfu.get_objects({'object_refs': [inref]})['data'] except DFUError as e: self.__LOGGER.error( 'Logging stacktrace from workspace exception:\n' + e.data) raise output = {'shock_id': alignment[0]['data']['file']['id']} #END export_alignment # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method export_alignment return value ' + 'output is not type dict as required.') # return the results return [output] def status(self, ctx): #BEGIN_STATUS returnVal = { 'state': "OK", 'message': "", 'version': self.VERSION, 'git_url': self.GIT_URL, 'git_commit_hash': self.GIT_COMMIT_HASH } #END_STATUS return [returnVal]
class BiomUtil:
    """Imports amplicon matrices / amplicon sets from BIOM, TSV and FASTA
    files and exports AmpliconSet objects back to TSV."""

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            # tolerate a pre-existing directory; re-raise anything else
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _process_params(self, params):
        """Validate import params and stage the input files.

        Returns (biom_file, tsv_file, fasta_file, mode, metadata_keys)
        where mode is one of 'biom_tsv', 'biom_fasta', 'tsv_fasta', 'tsv'
        and the file paths are local copies from the staging area (unused
        paths are None).
        """
        logging.info('start validating import_matrix_from_biom params')

        # check for required parameters
        for p in [
                'obj_type', 'matrix_name', 'workspace_name', 'scale',
                'amplicon_set_name'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        obj_type = params.get('obj_type')
        if obj_type not in self.matrix_types:
            raise ValueError('Unknown matrix object type: {}'.format(obj_type))

        scale = params.get('scale')
        if scale not in SCALE_TYPES:
            raise ValueError('Unknown scale type: {}'.format(scale))

        biom_file = None
        tsv_file = None
        fasta_file = None
        # NOTE(review): if DEFAULT_META_KEYS is a module-level list, the
        # `metadata_keys += [...]` below mutates it across calls — confirm
        # and copy it here if so.
        metadata_keys = DEFAULT_META_KEYS

        # exactly one of the four file-group params selects the input mode
        if params.get('biom_tsv'):
            biom_tsv = params.get('biom_tsv')
            biom_file = biom_tsv.get('biom_file_biom_tsv')
            tsv_file = biom_tsv.get('tsv_file_biom_tsv')

            if not (biom_file and tsv_file):
                raise ValueError('missing BIOM or TSV file')

            biom_file = self.dfu.download_staging_file({
                'staging_file_subdir_path': biom_file
            }).get('copy_file_path')

            tsv_file = self.dfu.download_staging_file({
                'staging_file_subdir_path': tsv_file
            }).get('copy_file_path')
            mode = 'biom_tsv'
        elif params.get('biom_fasta'):
            biom_fasta = params.get('biom_fasta')
            biom_file = biom_fasta.get('biom_file_biom_fasta')
            fasta_file = biom_fasta.get('fasta_file_biom_fasta')

            if not (biom_file and fasta_file):
                raise ValueError('missing BIOM or FASTA file')

            biom_file = self.dfu.download_staging_file({
                'staging_file_subdir_path': biom_file
            }).get('copy_file_path')

            fasta_file = self.dfu.download_staging_file({
                'staging_file_subdir_path': fasta_file
            }).get('copy_file_path')
            mode = 'biom_fasta'
        elif params.get('tsv_fasta'):
            tsv_fasta = params.get('tsv_fasta')
            tsv_file = tsv_fasta.get('tsv_file_tsv_fasta')
            fasta_file = tsv_fasta.get('fasta_file_tsv_fasta')

            if not (tsv_file and fasta_file):
                raise ValueError('missing TSV or FASTA file')

            tsv_file = self.dfu.download_staging_file({
                'staging_file_subdir_path': tsv_file
            }).get('copy_file_path')

            fasta_file = self.dfu.download_staging_file({
                'staging_file_subdir_path': fasta_file
            }).get('copy_file_path')

            # user-supplied extra metadata column names, comma separated
            metadata_keys_str = tsv_fasta.get('metadata_keys_tsv_fasta')
            if metadata_keys_str:
                metadata_keys += [
                    x.strip() for x in metadata_keys_str.split(',')
                ]
            mode = 'tsv_fasta'
        elif params.get('tsv'):
            tsv = params.get('tsv')
            tsv_file = tsv.get('tsv_file_tsv')

            if not tsv_file:
                raise ValueError('missing TSV file')

            tsv_file = self.dfu.download_staging_file({
                'staging_file_subdir_path': tsv_file
            }).get('copy_file_path')

            metadata_keys_str = tsv.get('metadata_keys_tsv')
            if metadata_keys_str:
                metadata_keys += [
                    x.strip() for x in metadata_keys_str.split(',')
                ]
            mode = 'tsv'
        else:
            # NOTE(review): 'valide' typo in user-facing message; left as-is
            raise ValueError('missing valide file group type in parameters')

        return (biom_file, tsv_file, fasta_file, mode,
                list(set(metadata_keys)))

    def _retrieve_value(self,
                        biom_metadata_dict,
                        tsv_metadata_df,
                        key,
                        required=False):
        """Fetch `key` from BIOM metadata first, then TSV metadata.

        Returns None for a missing, non-required key; raises ValueError when
        required. NOTE(review): membership is tested against the original
        keys but the lookup uses a lower-cased copy — a mixed-case key will
        pass the `in` check and then .get() returns None; confirm intent.
        """
        if key in biom_metadata_dict:
            return {k.lower(): v
                    for k, v in biom_metadata_dict.items()}.get(key)
        elif key in tsv_metadata_df:
            return {k.lower(): v for k, v in tsv_metadata_df.items()}.get(key)
        elif required:
            raise ValueError('missing necessary [{}] from file'.format(key))
        else:
            return None

    def _search_taxon(self, scientific_name):
        """Look up a taxon object name by scientific name (falling back to
        aliases) via KBaseSearchEngine; returns None when nothing matches.

        logic borrowed from: GFU.GenomeInterface
        https://github.com/kbaseapps/GenomeFileUtil/blob/master/lib/GenomeFileUtil/core/GenomeInterface.py#L216
        """
        taxon_id = None

        search_params = {
            "object_types": ["taxon"],
            "match_filter": {
                "lookup_in_keys": {
                    "scientific_name": {
                        "value": scientific_name
                    }
                },
                "exclude_subobjects": 1
            },
            "access_filter": {
                "with_private": 0,
                "with_public": 1
            },
            "sorting_rules": [{
                "is_object_property": 0,
                # newest matching taxon first
                "property": "timestamp",
                "ascending": 0
            }]
        }

        objects = self.kbse.search_objects(search_params)['objects']

        if not objects:
            # retry the search against taxon aliases
            search_params['match_filter']['lookup_in_keys'] = {
                "aliases": {
                    "value": scientific_name
                }
            }
            objects = self.kbse.search_objects(search_params)['objects']
        if objects:
            taxon_id = objects[0].get('object_name')

        return taxon_id

    def _fetch_taxon_level(self, taxon_char):
        """Map a rank prefix character (e.g. 'g' in 'g__Genus') to its
        taxonomic level name; unknown prefixes map to 'Unknown'."""
        taxon_level_mapping = {
            'l': 'Life',
            'd': 'Domain',
            'k': 'Kingdom',
            'p': 'Phylum',
            'c': 'Class',
            'o': 'Order',
            'f': 'Family',
            'g': 'Genus',
            's': 'Species'
        }

        return taxon_level_mapping.get(taxon_char[0].lower(), 'Unknown')

    def _fetch_taxonomy(self, datarow):
        """Build a taxonomy dict ({'lineage': [...]} plus optional score/
        source/species and resolved taxon_ref) from one metadata row."""
        # first positional arg is the (empty) BIOM metadata dict — only the
        # row (dict/Series) is consulted here
        lineage = self._retrieve_value([], datarow, 'taxonomy')
        if isinstance(lineage, str):
            # lineage may arrive as a single delimited string; sniff the
            # delimiter rather than assume ';'
            delimiter = csv.Sniffer().sniff(lineage).delimiter
            lineage = [x.strip() for x in lineage.split(delimiter)]
        taxonomy = {'lineage': lineage}

        for key in ['score', 'taxonomy_source', 'species_name']:
            val = self._retrieve_value([], datarow, key)
            if val:
                taxonomy[key] = val

        # walk the lineage from most- to least-specific rank and stop at the
        # first name that resolves to a known taxon
        for item in lineage[::-1]:
            scientific_name = item.split('_')[-1]
            taxon_level_char = item.split('_')[0]
            if scientific_name:
                taxon_id = self._search_taxon(scientific_name)
                if taxon_id:
                    taxon_ref = f"{self.taxon_wsname}/{taxon_id}"
                    taxon_level = self._fetch_taxon_level(taxon_level_char)

                    taxonomy.update({
                        'taxon_ref': taxon_ref,
                        'taxon_id': taxon_id,
                        'scientific_name': scientific_name,
                        'taxon_level': taxon_level
                    })
                    break

        return taxonomy

    def _retrieve_tsv_amplicon_set_data(self, tsv_file):
        """Parse a TSV (with a consensus_sequence column) into the
        {observation_id: amplicon} mapping for an AmpliconSet."""
        amplicons = dict()

        try:
            logging.info('start parsing TSV file')
            # sniff the delimiter, then re-read with it
            # NOTE(review): reader._engine.data.dialect is private pandas
            # API and may break across pandas versions
            reader = pd.read_csv(tsv_file, sep=None, iterator=True)
            inferred_sep = reader._engine.data.dialect.delimiter
            df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide valide TSV file')

        if 'consensus_sequence' not in df.columns.tolist():
            raise ValueError('TSV file does not include consensus_sequence')

        logging.info('start processing each row in TSV')
        for observation_id in df.index:
            taxonomy = self._fetch_taxonomy(df.loc[observation_id])

            amplicon = {
                'consensus_sequence': df.loc[observation_id,
                                             'consensus_sequence'],
                'taxonomy': taxonomy
            }

            amplicons.update({observation_id: amplicon})

        logging.info('finished parsing TSV file')

        return amplicons

    def _retrieve_tsv_fasta_amplicon_set_data(self, tsv_file, fasta_file):
        """Parse TSV metadata plus a FASTA of consensus sequences into the
        {observation_id: amplicon} mapping; every TSV row must have a
        matching FASTA record."""
        amplicons = dict()
        try:
            logging.info('start parsing FASTA file')
            fastq_dict = SeqIO.index(fasta_file, "fasta")
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide valide FASTA file')

        try:
            logging.info('start parsing TSV file')
            reader = pd.read_csv(tsv_file, sep=None, iterator=True)
            inferred_sep = reader._engine.data.dialect.delimiter
            df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide valide TSV file')

        logging.info('start processing files')
        for observation_id in df.index:
            if observation_id not in fastq_dict:
                raise ValueError('FASTA file does not have [{}] OTU id'.format(
                    observation_id))

            taxonomy = self._fetch_taxonomy(df.loc[observation_id])

            amplicon = {
                'consensus_sequence': str(fastq_dict.get(observation_id).seq),
                'taxonomy': taxonomy
            }

            amplicons.update({observation_id: amplicon})

        logging.info('finished processing files')
        return amplicons

    def _retrieve_biom_fasta_amplicon_set_data(self, biom_file, fasta_file):
        """Parse a BIOM table plus a FASTA of consensus sequences into the
        {observation_id: amplicon} mapping; every BIOM observation must have
        a matching FASTA record."""
        amplicons = dict()

        try:
            logging.info('start parsing FASTA file')
            fastq_dict = SeqIO.index(fasta_file, "fasta")
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide valide FASTA file')

        logging.info('start parsing BIOM file')
        table = biom.load_table(biom_file)

        # NOTE(review): _observation_ids/_observation_metadata are private
        # biom.Table attributes; the public accessors would be safer
        observation_ids = table._observation_ids.tolist()
        observation_metadata = table._observation_metadata

        logging.info('start processing files')
        for index, observation_id in enumerate(observation_ids):
            if observation_id not in fastq_dict:
                raise ValueError('FASTA file does not have [{}] OTU id'.format(
                    observation_id))

            taxonomy = self._fetch_taxonomy(observation_metadata[index])

            amplicon = {
                'consensus_sequence': str(fastq_dict.get(observation_id).seq),
                'taxonomy': taxonomy
            }

            amplicons.update({observation_id: amplicon})

        logging.info('finished processing files')
        return amplicons

    def _retrieve_biom_tsv_amplicon_set_data(self, biom_file, tsv_file):
        """Parse a BIOM table plus a TSV (with consensus_sequence) into the
        {observation_id: amplicon} mapping; every BIOM observation must have
        a matching TSV row."""
        amplicons = dict()
        try:
            logging.info('start parsing TSV file')
            reader = pd.read_csv(tsv_file, sep=None, iterator=True)
            inferred_sep = reader._engine.data.dialect.delimiter
            df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide valide tsv file')

        if 'consensus_sequence' not in df.columns.tolist():
            raise ValueError('TSV file does not include consensus_sequence')

        logging.info('start parsing BIOM file')
        table = biom.load_table(biom_file)

        observation_ids = table._observation_ids.tolist()
        observation_metadata = table._observation_metadata

        logging.info('start processing files')
        for index, observation_id in enumerate(observation_ids):
            if observation_id not in df.index:
                raise ValueError('TSV file does not have [{}] OTU id'.format(
                    observation_id))

            taxonomy = self._fetch_taxonomy(df.loc[observation_id])

            amplicon = {
                'consensus_sequence': df.loc[observation_id,
                                             'consensus_sequence'],
                'taxonomy': taxonomy
            }

            amplicons.update({observation_id: amplicon})

        logging.info('finished processing files')
        return amplicons

    def _file_to_amplicon_set_data(self, biom_file, tsv_file, fasta_file,
                                   mode, refs, description, matrix_obj_ref):
        """Assemble the AmpliconSet object data dict for the given input
        mode, linking it to the saved matrix via amplicon_matrix_ref."""
        logging.info('start parsing amplicon_set_data')

        amplicon_set_data = dict()

        # dispatch on the input-file combination determined by _process_params
        if mode == 'biom_tsv':
            amplicons = self._retrieve_biom_tsv_amplicon_set_data(
                biom_file, tsv_file)
        elif mode == 'biom_fasta':
            amplicons = self._retrieve_biom_fasta_amplicon_set_data(
                biom_file, fasta_file)
        elif mode == 'tsv_fasta':
            amplicons = self._retrieve_tsv_fasta_amplicon_set_data(
                tsv_file, fasta_file)
        elif mode == 'tsv':
            amplicons = self._retrieve_tsv_amplicon_set_data(tsv_file)
        else:
            raise ValueError(
                'error parsing _file_to_amplicon_set_data, mode: {}'.format(
                    mode))

        amplicon_set_data.update({'amplicons': amplicons})

        if 'reads_set_ref' in refs:
            amplicon_set_data['reads_set_ref'] = refs.get('reads_set_ref')

        if description:
            amplicon_set_data['description'] = description

        # reference the matrix as ws_id/obj_id (version stripped)
        matrix_obj_ref_array = matrix_obj_ref.split('/')
        amplicon_set_data['amplicon_matrix_ref'] = '{}/{}'.format(
            matrix_obj_ref_array[0], matrix_obj_ref_array[1])

        return amplicon_set_data

    def _file_to_amplicon_data(self,
                               biom_file,
                               tsv_file,
                               mode,
                               refs,
                               matrix_name,
                               workspace_id,
                               scale,
                               description,
                               metadata_keys=None):
        """Build the AmpliconMatrix object data (matrix values, attribute
        mappings, attributes, scale) from a BIOM or TSV input."""

        # start from the supplied *_ref entries; attribute-mapping refs and
        # data are layered on top
        amplicon_data = refs

        if mode.startswith('biom'):
            logging.info('start parsing BIOM file for matrix data')
            table = biom.load_table(biom_file)
            observation_metadata = table._observation_metadata
            sample_metadata = table._sample_metadata

            matrix_data = {
                'row_ids': table._observation_ids.tolist(),
                'col_ids': table._sample_ids.tolist(),
                'values': table.matrix_data.toarray().tolist()
            }

            logging.info('start building attribute mapping object')
            amplicon_data.update(
                self.get_attribute_mapping("row", observation_metadata,
                                           matrix_data, matrix_name, refs,
                                           workspace_id))
            amplicon_data.update(
                self.get_attribute_mapping("col", sample_metadata,
                                           matrix_data, matrix_name, refs,
                                           workspace_id))

            amplicon_data['attributes'] = {}
            for k in ('create_date', 'generated_by'):
                val = getattr(table, k)
                if not val:
                    continue
                if isinstance(val, bytes):
                    amplicon_data['attributes'][k] = val.decode('utf-8')
                else:
                    amplicon_data['attributes'][k] = str(val)
        elif mode.startswith('tsv'):
            observation_metadata = None
            sample_metadata = None
            try:
                logging.info('start parsing TSV file for matrix data')
                reader = pd.read_csv(tsv_file, sep=None, iterator=True)
                inferred_sep = reader._engine.data.dialect.delimiter
                df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
            except Exception:
                raise ValueError(
                    'Cannot parse file. Please provide valide tsv file')
            else:
                # split off metadata columns before coercing the matrix to
                # numeric values
                metadata_df = None
                if metadata_keys:
                    shared_metadata_keys = list(
                        set(metadata_keys) & set(df.columns))
                    if mode == 'tsv' and 'consensus_sequence' not in shared_metadata_keys:
                        raise ValueError(
                            'TSV file does not include consensus_sequence')
                    if shared_metadata_keys:
                        metadata_df = df[shared_metadata_keys]
                        df.drop(columns=shared_metadata_keys, inplace=True)

                try:
                    df = df.astype(float)
                except ValueError:
                    err_msg = 'Found some non-float values. Matrix contains only numeric values\n'
                    err_msg += 'Please list any non-numeric column names in Metadata Keys field'
                    raise ValueError(err_msg)
                df.fillna(0, inplace=True)
                matrix_data = {
                    'row_ids': df.index.tolist(),
                    'col_ids': df.columns.tolist(),
                    'values': df.values.tolist()
                }

                logging.info('start building attribute mapping object')
                amplicon_data.update(
                    self.get_attribute_mapping("row", observation_metadata,
                                               matrix_data, matrix_name, refs,
                                               workspace_id, metadata_df))
                amplicon_data.update(
                    self.get_attribute_mapping("col", sample_metadata,
                                               matrix_data, matrix_name, refs,
                                               workspace_id))

                amplicon_data['attributes'] = {}
        else:
            raise ValueError(
                'error parsing _file_to_amplicon_data, mode: {}'.format(mode))

        amplicon_data.update({'data': matrix_data})

        amplicon_data['search_attributes'] = [
            f'{k}|{v}' for k, v in amplicon_data['attributes'].items()
        ]

        amplicon_data['scale'] = scale
        if description:
            amplicon_data['description'] = description

        return amplicon_data

    def get_attribute_mapping(self,
                              axis,
                              metadata,
                              matrix_data,
                              matrix_name,
                              refs,
                              workspace_id,
                              metadata_df=None):
        """Return {axis}_mapping (and possibly {axis}_attributemapping_ref)
        for one matrix axis ('row' or 'col').

        Precedence: a user-supplied attribute-mapping ref (validated against
        the axis ids) > BIOM metadata > TSV metadata_df > identity mapping
        only.
        """
        mapping_data = {}
        axis_ids = matrix_data[f'{axis}_ids']
        if refs.get(f'{axis}_attributemapping_ref'):
            # a pre-existing AttributeMapping was supplied; every axis id
            # must already be an instance in it
            am_data = self.dfu.get_objects(
                {'object_refs': [refs[f'{axis}_attributemapping_ref']]
                 })['data'][0]['data']
            unmatched_ids = set(axis_ids) - set(am_data['instances'].keys())
            if unmatched_ids:
                name = "Column" if axis == 'col' else "Row"
                raise ValueError(
                    f"The following {name} IDs from the uploaded matrix do not match "
                    f"the supplied {name} attribute mapping: {', '.join(unmatched_ids)}"
                    f"\nPlease verify the input data or upload an excel file with a"
                    f"{name} mapping tab.")
            else:
                mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}

        elif metadata:
            # build a new AttributeMapping from BIOM per-axis metadata
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[
                f'{axis}_attributemapping_ref'] = self._metadata_to_attribute_mapping(
                    axis_ids, metadata, name, workspace_id)
            # if coming from biom file, metadata and axis IDs are guaranteed to match
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}
        elif metadata_df is not None:
            # build a new AttributeMapping from the TSV metadata columns
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[
                f'{axis}_attributemapping_ref'] = self._meta_df_to_attribute_mapping(
                    axis_ids, metadata_df, name, workspace_id)
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}

        return mapping_data

    def _meta_df_to_attribute_mapping(self, axis_ids, metadata_df, obj_name,
                                      ws_id):
        """Save a KBaseExperiments.AttributeMapping built from a metadata
        DataFrame; returns its 'ws/obj/ver' reference."""
        data = {'ontology_mapping_method': "TSV file", 'instances': {}}
        attribute_keys = metadata_df.columns.tolist()
        data['attributes'] = [{
            'attribute': key,
            'source': 'upload'
        } for key in attribute_keys]

        for axis_id in axis_ids:
            data['instances'][axis_id] = metadata_df.loc[axis_id].tolist()

        logging.info(
            'start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id": ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": data,
                "name": obj_name
            }]
        })[0]

        return f'{info[6]}/{info[0]}/{info[4]}'

    def _metadata_to_attribute_mapping(self, instances, metadata, obj_name,
                                       ws_id):
        """Save a KBaseExperiments.AttributeMapping built from BIOM per-axis
        metadata; returns its 'ws/obj/ver' reference."""
        data = {'ontology_mapping_method': "BIOM file", 'instances': {}}
        # attribute keys are sampled from (up to) the first 25 entries
        sample_set = metadata[0:min(len(metadata), 25)]
        metadata_keys = sorted(
            set((k for m_dict in sample_set for k in m_dict)))
        data['attributes'] = [{
            'attribute': key,
            'source': 'upload'
        } for key in metadata_keys]
        for inst, meta in zip(instances, metadata):
            data['instances'][inst] = [
                str(meta[attr]) for attr in metadata_keys
            ]

        logging.info(
            'start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id": ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": data,
                "name": obj_name
            }]
        })[0]
        return f'{info[6]}/{info[0]}/{info[4]}'

    def _generate_report(self, matrix_obj_ref, amplicon_set_obj_ref,
                         new_row_attr_ref, new_col_attr_ref, workspace_name):
        """
        _generate_report: generate summary report
        """
        objects_created = [{
            'ref': matrix_obj_ref,
            'description': 'Imported Amplicon Matrix'
        }, {
            'ref': amplicon_set_obj_ref,
            'description': 'Imported Amplicon Set'
        }]

        # only list attribute mappings that were newly created by this import
        if new_row_attr_ref:
            objects_created.append({
                'ref': new_row_attr_ref,
                'description': 'Imported Amplicons(Row) Attribute Mapping'
            })

        if new_col_attr_ref:
            objects_created.append({
                'ref': new_col_attr_ref,
                'description': 'Imported Samples(Column) Attribute Mapping'
            })

        report_params = {
            'message': '',
            'objects_created': objects_created,
            'workspace_name': workspace_name,
            'report_object_name':
            'import_matrix_from_biom_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _df_to_tsv(self, amplicon_set_df, result_dir, amplicon_set_ref):
        """Write an amplicon-set DataFrame to <result_dir>/<obj_name>.tsv
        and return the file path."""
        logging.info('writting amplicon set data frame to tsv file')
        amplicon_set_obj = self.dfu.get_objects(
            {'object_refs': [amplicon_set_ref]})['data'][0]
        amplicon_set_info = amplicon_set_obj['info']
        amplicon_set_name = amplicon_set_info[1]

        file_path = os.path.join(result_dir, amplicon_set_name + ".tsv")

        amplicon_set_df.to_csv(file_path, sep='\t', index=True, header=True)

        return file_path

    def _amplicon_set_to_df(self, amplicon_set_ref):
        """Merge an AmpliconSet's matrix values with per-amplicon taxonomy
        metadata into one DataFrame (one row per OTU id)."""
        logging.info('converting amplicon set to data frame')
        am_set_data = self.dfu.get_objects(
            {'object_refs': [amplicon_set_ref]})['data'][0]['data']

        amplicon_matrix_ref = am_set_data.get('amplicon_matrix_ref')
        matrix_data = self.dfu.get_objects(
            {'object_refs': [amplicon_matrix_ref]})['data'][0]['data']
        matrix_value_data = matrix_data.get('data')

        index = matrix_value_data.get('row_ids')
        columns = matrix_value_data.get('col_ids')
        values = matrix_value_data.get('values')

        df = pd.DataFrame(values, index=index, columns=columns)

        amplicons = am_set_data.get('amplicons')

        meta_index = list()

        meta_columns = [
            'taxonomy', 'taxon_id', 'taxon_ref', 'taxon_level', 'score',
            'taxonomy_source', 'species_name', 'consensus_sequence'
        ]
        meta_values = list()
        for otu_id, amplicon in amplicons.items():
            meta_index.append(otu_id)

            taxonomy_data = amplicon.get('taxonomy')

            taxonomy = taxonomy_data.get('lineage')
            taxon_id = taxonomy_data.get('taxon_id')
            taxon_ref = taxonomy_data.get('taxon_ref')
            taxon_level = taxonomy_data.get('taxon_level')
            score = taxonomy_data.get('score')
            taxonomy_source = taxonomy_data.get('taxonomy_source')
            species_name = taxonomy_data.get('species_name')

            consensus_sequence = amplicon.get('consensus_sequence')

            meta_values.append([
                taxonomy, taxon_id, taxon_ref, taxon_level, score,
                taxonomy_source, species_name, consensus_sequence
            ])

        meta_df = pd.DataFrame(meta_values,
                               index=meta_index,
                               columns=meta_columns)

        # one_to_one validation guarantees OTU ids are unique on both sides
        merged_df = df.merge(meta_df,
                             left_index=True,
                             right_index=True,
                             how='left',
                             validate='one_to_one')

        return merged_df

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.data_util = DataUtil(config)
        self.attr_util = AttributesUtil(config)
        self.matrix_util = MatrixUtil(config)
        # e.g. 'KBaseMatrices.AmpliconMatrix-1.0' -> 'AmpliconMatrix'
        self.matrix_types = [
            x.split(".")[1].split('-')[0]
            for x in self.data_util.list_generic_types()
        ]
        self.taxon_wsname = config['taxon-workspace-name']
        self.kbse = KBaseSearchEngine(config['search-url'])

    def import_matrix_from_biom(self, params):
        """Import an amplicon matrix (and companion AmpliconSet) from BIOM /
        TSV / FASTA inputs and generate a summary report.

        arguments:
        obj_type: one of ExpressionMatrix, FitnessMatrix,
                  DifferentialExpressionMatrix
        matrix_name: matrix object name
        workspace_name: workspace name matrix object to be saved to
        input_shock_id: file shock id
        or
        input_file_path: absolute file path
        or
        input_staging_file_path: staging area file path

        optional arguments:
        col_attributemapping_ref: column AttributeMapping reference
        row_attributemapping_ref: row AttributeMapping reference
        genome_ref: genome reference
        matrix_obj_ref: Matrix reference
        """

        (biom_file, tsv_file, fasta_file, mode,
         metadata_keys) = self._process_params(params)

        workspace_name = params.get('workspace_name')
        matrix_name = params.get('matrix_name')
        amplicon_set_name = params.get('amplicon_set_name')
        obj_type = params.get('obj_type')
        scale = params.get('scale')
        description = params.get('description')
        # any param ending in _ref is treated as a reference to carry through
        refs = {k: v for k, v in params.items() if "_ref" in k}

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        amplicon_data = self._file_to_amplicon_data(biom_file, tsv_file, mode,
                                                    refs, matrix_name,
                                                    workspace_id, scale,
                                                    description,
                                                    metadata_keys)

        # attribute mappings created by this import (vs. supplied by caller)
        # are reported as new objects
        new_row_attr_ref = None
        if not params.get('row_attributemapping_ref'):
            new_row_attr_ref = amplicon_data.get('row_attributemapping_ref')

        new_col_attr_ref = None
        if not params.get('col_attributemapping_ref'):
            new_col_attr_ref = amplicon_data.get('col_attributemapping_ref')

        logging.info('start saving Matrix object: {}'.format(matrix_name))
        matrix_obj_ref = self.data_util.save_object({
            'obj_type': 'KBaseMatrices.{}'.format(obj_type),
            'obj_name': matrix_name,
            'data': amplicon_data,
            'workspace_name': workspace_id
        })['obj_ref']

        amplicon_set_data = self._file_to_amplicon_set_data(
            biom_file, tsv_file, fasta_file, mode, refs, description,
            matrix_obj_ref)

        logging.info(
            'start saving AmpliconSet object: {}'.format(amplicon_set_name))
        amplicon_set_obj_ref = self.data_util.save_object({
            'obj_type': 'KBaseExperiments.AmpliconSet',
            'obj_name': amplicon_set_name,
            'data': amplicon_set_data,
            'workspace_name': workspace_id
        })['obj_ref']

        # re-save the matrix so it back-references the amplicon set
        logging.info(
            'start resaving Matrix object with amplicon set: {}'.format(
                matrix_name))
        amplicon_data['amplicon_set_ref'] = '{}/{}'.format(
            workspace_id, amplicon_set_name)
        matrix_obj_ref = self.data_util.save_object({
            'obj_type': 'KBaseMatrices.{}'.format(obj_type),
            'obj_name': matrix_name,
            'data': amplicon_data,
            'workspace_name': workspace_id
        })['obj_ref']

        returnVal = {
            'matrix_obj_ref': matrix_obj_ref,
            'amplicon_set_obj_ref': amplicon_set_obj_ref
        }

        report_output = self._generate_report(matrix_obj_ref,
                                              amplicon_set_obj_ref,
                                              new_row_attr_ref,
                                              new_col_attr_ref,
                                              workspace_name)

        returnVal.update(report_output)

        return returnVal

    def export_amplicon_set_tsv(self, params):
        """
        export AmpliconSet as TSV
        """
        logging.info('start exporting amplicon set object')
        amplicon_set_ref = params.get('input_ref')

        amplicon_set_df = self._amplicon_set_to_df(amplicon_set_ref)

        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)

        self._df_to_tsv(amplicon_set_df, result_dir, amplicon_set_ref)

        package_details = self.dfu.package_for_download({
            'file_path': result_dir,
            'ws_refs': [amplicon_set_ref]
        })

        return {'shock_id': package_details['shock_id']}
class sample_uploader:
    '''
    Module Name:
    sample_uploader

    Module Description:
    A KBase module: sample_uploader
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.14"
    GIT_URL = "[email protected]:Tianhao-Gu/sample_uploader.git"
    GIT_COMMIT_HASH = "fddb10ca67368def8437569f8157b71b59f41e1c"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.workspace_url = config['workspace-url']
        self.scratch = config['scratch']
        # janky, but works for now
        self.sw_url = config.get('kbase-endpoint') + '/service_wizard'
        self.dfu = DataFileUtil(url=self.callback_url)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        #END_CONSTRUCTOR
        pass

    def import_samples(self, ctx, params):
        """
        :param params: instance of type "ImportSampleInputs" -> structure:
           parameter "sample_set_ref" of String, parameter "sample_file" of
           String, parameter "workspace_name" of String, parameter
           "workspace_id" of Long, parameter "file_format" of String,
           parameter "description" of String, parameter "set_name" of String,
           parameter "header_row_index" of Long, parameter "id_field" of
           String, parameter "output_format" of String, parameter
           "taxonomy_source" of String, parameter "num_otus" of Long,
           parameter "incl_seq" of Long, parameter "otu_prefix" of String,
           parameter "share_within_workspace" of Long, parameter
           "prevalidate" of Long, parameter "incl_input_in_output" of Long
        :returns: instance of type "ImportSampleOutputs" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "sample_set" of type "SampleSet" -> structure:
           parameter "samples" of list of type "sample_info" -> structure:
           parameter "id" of type "sample_id", parameter "name" of String,
           parameter "description" of String, parameter "sample_set_ref" of
           String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN import_samples
        print("Beginning sample import with following parameters:")
        print(f"params -- {params}")
        sample_set = {"samples": []}
        # Check if we have an existing Sample Set as input; if so, download it
        # and extend it instead of starting a fresh set.
        if params.get('sample_set_ref'):
            ret = self.dfu.get_objects(
                {'object_refs': [params['sample_set_ref']]})['data'][0]
            sample_set = ret['data']
            set_name = ret['info'][1]
            # save back into the workspace the existing set came from
            save_ws_id = params['sample_set_ref'].split('/')[0]
        else:
            if not params.get('set_name'):
                raise ValueError(
                    "Sample set name required, when new SampleSet object is created."
                )
            set_name = params['set_name']
            save_ws_id = params.get('workspace_id')
        if params.get('header_row_index'):
            # user supplies a 1-based row index; convert to 0-based
            header_row_index = int(params["header_row_index"]) - 1
        else:
            header_row_index = 0
            # SESAR sheets carry an extra banner row above the header
            if params.get('file_format') == "SESAR":
                header_row_index = 1

        username = ctx['user_id']
        if params.get('file_format') == 'ENIGMA':
            sample_set, errors = import_samples_from_file(
                params,
                self.sw_url,
                self.workspace_url,
                username,
                ctx['token'],
                ENIGMA_mappings['column_mapping'],
                ENIGMA_mappings.get('groups', []),
                ENIGMA_mappings['date_columns'],
                ENIGMA_mappings.get('column_unit_regex', []),
                sample_set,
                header_row_index)
        elif params.get('file_format') == 'SESAR':
            sample_set, errors = import_samples_from_file(
                params,
                self.sw_url,
                self.workspace_url,
                username,
                ctx['token'],
                SESAR_mappings['column_mapping'],
                SESAR_mappings.get('groups', []),
                SESAR_mappings['date_columns'],
                SESAR_mappings.get('column_unit_regex', []),
                sample_set,
                header_row_index)
        elif params.get('file_format') == 'KBASE':
            # KBASE format needs no column/date/unit mappings
            sample_set, errors = import_samples_from_file(
                params, self.sw_url, self.workspace_url, username,
                ctx['token'], {}, [], [], [], sample_set, header_row_index)
        else:
            # BUG FIX: second literal was missing the f prefix, so the
            # placeholder was emitted verbatim instead of interpolated.
            raise ValueError(
                f"Only SESAR and ENIGMA formats are currently supported for importing samples. "
                f"File of format {params.get('file_format')} not supported.")

        file_links = []
        sample_set_ref = None
        html_link = None
        if errors:
            # create UI to display the errors clearly
            html_link = _error_ui(errors, self.scratch)
        else:
            # only save object if there are no errors
            obj_info = self.dfu.save_objects({
                'id': save_ws_id,
                'objects': [{
                    "name": set_name,
                    "type": "KBaseSets.SampleSet",
                    "data": sample_set
                }]
            })[0]
            # ref is wsid/objid/version
            sample_set_ref = '/'.join(
                [str(obj_info[6]), str(obj_info[0]), str(obj_info[4])])
            sample_file_name = os.path.basename(
                params['sample_file']).split('.')[0] + '_OTU'

            # -- Format outputs below --
            # if output file format specified, add one to output
            if params.get('output_format') in ['csv', 'xls']:
                otu_path = sample_set_to_OTU_sheet(sample_set,
                                                   sample_file_name,
                                                   self.scratch, params)
                file_links.append({
                    'path': otu_path,
                    'name': os.path.basename(otu_path),
                    'label': "OTU template file",
                    'description': "file with each column containing the assigned sample_id and sample "
                                   "name of each saved sample. Intended for uploading OTU data."
                })

        if params.get('incl_input_in_output'):
            sample_file = params.get('sample_file')
            if not os.path.isfile(sample_file):
                # try prepending '/staging/' to file and check then
                if os.path.isfile(os.path.join('/staging', sample_file)):
                    sample_file = os.path.join('/staging', sample_file)
                else:
                    raise ValueError(
                        f"input file {sample_file} does not exist.")
            sample_file_copy = os.path.join(self.scratch,
                                            os.path.basename(sample_file))
            shutil.copy(sample_file, sample_file_copy)
            file_links.append({
                "path": sample_file_copy,
                "name": os.path.basename(sample_file_copy),
                "label": "Input Sample file",
                "description": "Input file provided to create the sample set."
            })

        # create report
        report_client = KBaseReport(self.callback_url)
        report_data = {
            'report_object_name': "SampleSet_import_report_" + str(uuid.uuid4()),
            'workspace_name': params['workspace_name']
        }
        if file_links:
            report_data['file_links'] = file_links
        if sample_set_ref:
            report_data[
                'message'] = f"SampleSet object named \"{set_name}\" imported."
            report_data['objects_created'] = [{'ref': sample_set_ref}]
        if html_link:
            report_data['html_links'] = [{
                'path': html_link,
                'name': 'index.html',
                'description': 'Sample Set Import Error ui'
            }]
            report_data['direct_html_link_index'] = 0

        report_info = report_client.create_extended_report(report_data)
        output = {
            'report_ref': report_info['ref'],
            'report_name': report_info['name'],
            'sample_set': sample_set,
            'sample_set_ref': sample_set_ref,
            'errors': errors
        }
        #END import_samples

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method import_samples return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def import_samples_from_IGSN(self, ctx, params):
        """
        :param params: instance of type "ImportSampleIGSNInputs" ->
           structure: parameter "sample_set_ref" of String, parameter
           "igsns" of list of String, parameter "workspace_name" of String,
           parameter "workspace_id" of Long, parameter "description" of
           String, parameter "set_name" of String, parameter "output_format"
           of String, parameter "taxonomy_source" of String, parameter
           "num_otus" of Long, parameter "incl_seq" of Long, parameter
           "otu_prefix" of String, parameter "share_within_workspace" of
           Long, parameter "prevalidate" of Long, parameter
           "incl_input_in_output" of Long
        :returns: instance of type "ImportSampleOutputs" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "sample_set" of type "SampleSet" -> structure:
           parameter "samples" of list of type "sample_info" -> structure:
           parameter "id" of type "sample_id", parameter "name" of String,
           parameter "description" of String, parameter "sample_set_ref" of
           String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN import_samples_from_IGSN
        igsns = params.get('igsns')
        if not igsns:
            raise ValueError('Please provide IGSNs')

        if isinstance(igsns, str):
            if igsns.isalnum():
                # single igsn given e.g. 'IEAWH0001'
                igsns = [igsns]
            else:
                # multiple igsn given e.g. 'IEAWH0001, GEE0000O4' or 'IEAWH0001; GEE0000O4'
                # sniff the delimiter rather than hard-coding ',' or ';'
                delimiter = csv.Sniffer().sniff(igsns).delimiter
                igsns = [x.strip() for x in igsns.split(delimiter)]

        logging.info('Start importing samples from IGSNs: {}'.format(igsns))

        # materialize the IGSNs as a SESAR-format CSV, then delegate to the
        # regular file importer
        sample_file_name = 'isgn_sample_{}.csv'.format(str(uuid.uuid4()))
        sample_file_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        os.makedirs(sample_file_dir)
        sample_file = os.path.join(sample_file_dir, sample_file_name)

        igsns_to_csv(igsns, sample_file)

        params['sample_file'] = sample_file
        params['file_format'] = 'SESAR'

        output = self.import_samples(ctx, params)[0]
        #END import_samples_from_IGSN

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method import_samples_from_IGSN return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def generate_OTU_sheet(self, ctx, params):
        """
        :param params: instance of type "GenerateOTUSheetParams" (Generate a
           customized OTU worksheet using a SampleSet input to generate the
           appropriate columns.) -> structure: parameter "workspace_name" of
           String, parameter "workspace_id" of Long, parameter
           "sample_set_ref" of String, parameter "output_name" of String,
           parameter "output_format" of String, parameter "num_otus" of
           Long, parameter "taxonomy_source" of String, parameter "incl_seq"
           of Long, parameter "otu_prefix" of String
        :returns: instance of type "GenerateOTUSheetOutputs" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN generate_OTU_sheet
        # first we download sampleset
        sample_set_ref = params.get('sample_set_ref')
        ret = self.dfu.get_objects(
            {'object_refs': [sample_set_ref]})['data'][0]
        sample_set = ret['data']
        if params.get('output_name'):
            output_name = params.get('output_name')
        else:
            # if output_name not specified use name of sample_set as output + "_OTUs"
            output_name = ret['info'][1] + "_OTUs"
        otu_path = sample_set_to_OTU_sheet(sample_set, output_name,
                                           self.scratch, params)
        report_client = KBaseReport(self.callback_url)
        report_name = "Generate_OTU_sheet_report_" + str(uuid.uuid4())
        report_info = report_client.create_extended_report({
            'file_links': [{
                'path': otu_path,
                'name': os.path.basename(otu_path),
                'label': "CSV with headers for OTU",
                'description': "CSV file with each column containing the assigned sample_id and sample "
                               "name of each saved sample. Intended for uploading OTU data."
            }],
            'report_object_name': report_name,
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_ref': report_info['ref'],
            'report_name': report_info['name'],
        }
        #END generate_OTU_sheet

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method generate_OTU_sheet return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def update_sample_set_acls(self, ctx, params):
        """
        :param params: instance of type "update_sample_set_acls_params" ->
           structure: parameter "workspace_name" of String, parameter
           "workspace_id" of Long, parameter "sample_set_ref" of String,
           parameter "new_users" of list of String, parameter "is_reader" of
           Long, parameter "is_writer" of Long, parameter "is_admin" of
           Long, parameter "share_within_workspace" of Long
        :returns: instance of type "update_sample_set_acls_output" ->
           structure: parameter "status" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN update_sample_set_acls

        # first get sample_set object
        sample_set_ref = params.get('sample_set_ref')
        ret = self.dfu.get_objects(
            {'object_refs': [sample_set_ref]})['data'][0]
        sample_set = ret['data']

        sample_url = get_sample_service_url(self.sw_url)

        acls = {'read': [], 'write': [], 'admin': []}

        if params.get('share_within_workspace'):
            # extend the ACLs with everyone who already has workspace access
            acls = get_workspace_user_perms(self.workspace_url,
                                            params.get('workspace_id'),
                                            ctx['token'], ctx['user_id'],
                                            acls)

        for new_user in params.get('new_users'):
            # a user gets only the single highest level requested
            if params.get('is_admin'):
                acls['admin'].append(new_user)
            elif params.get('is_writer'):
                acls['write'].append(new_user)
            elif params.get('is_reader'):
                acls['read'].append(new_user)

        for sample in sample_set['samples']:
            sample_id = sample['id']
            status = update_acls(sample_url, sample_id, acls, ctx['token'])
        output = {"status": status}
        #END update_sample_set_acls

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method update_sample_set_acls return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def export_samples(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (export function for
           samples) -> structure: parameter "input_ref" of String, parameter
           "file_format" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_samples
        if not params.get('input_ref'):
            raise ValueError("variable input_ref required")
        sample_set_ref = params.get('input_ref')
        output_file_format = params.get('file_format', 'SESAR')

        ret = self.dfu.get_objects(
            {'object_refs': [sample_set_ref]})['data'][0]
        sample_set = ret['data']
        sample_set_name = ret['info'][1]
        sample_url = get_sample_service_url(self.sw_url)

        export_package_dir = os.path.join(self.scratch, "output")
        if not os.path.isdir(export_package_dir):
            os.mkdir(export_package_dir)
        # object name with whitespace collapsed to underscores for the filename
        output_file = os.path.join(export_package_dir,
                                   '_'.join(sample_set_name.split()) + ".csv")

        sample_set_to_output(sample_set, sample_url, ctx['token'],
                             output_file, output_file_format)

        # package it up
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['input_ref']]
        })

        output = {
            'shock_id': package_details['shock_id'],
            'result_dir': export_package_dir
        }
        #END export_samples

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_samples return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def link_reads(self, ctx, params):
        """
        :param params: instance of type "LinkReadsParams" -> structure:
           parameter "workspace_name" of String, parameter "workspace_id" of
           String, parameter "sample_set_ref" of String, parameter "links"
           of list of type "ReadsLink" (Create links between samples and
           reads objects.) -> structure: parameter "sample_name" of String,
           parameter "reads_ref" of String
        :returns: instance of type "LinkReadsOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String,
           parameter "links" of list of unspecified object
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN link_reads
        logging.info(params)

        ss = SampleService(self.sw_url, service_ver='dev')

        sample_set_ref = params['sample_set_ref']
        sample_set_obj = self.dfu.get_objects(
            {'object_refs': [sample_set_ref]})['data'][0]['data']
        sample_name_2_info = {d['name']: d for d in sample_set_obj['samples']}

        links = [(d['sample_name'][0], d['reads_ref'])
                 for d in params['links']]

        new_data_links = []
        for sample_name, reads_ref in links:
            sample_id = sample_name_2_info[sample_name]['id']
            version = sample_name_2_info[sample_name]['version']
            sample = ss.get_sample({
                'id': sample_id,
                'version': version,
            })
            # link the reads object to the root node of the sample's tree
            ret = ss.create_data_link(
                dict(
                    upa=reads_ref,
                    id=sample_id,
                    version=version,
                    node=sample['node_tree'][0]['id'],
                    update=1,
                ))
            new_data_links.append(ret)

        report_client = KBaseReport(self.callback_url)
        report_info = report_client.create_extended_report({
            'workspace_name': params['workspace_name'],
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
            'links': new_data_links,
        }
        #END link_reads

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method link_reads return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
class ParmigeneUtils:
    """
    Helpers for running the R 'parmigene' mutual-information analysis on a
    KBaseMatrices object, saving the resulting matrix object, and building
    the HTML/file report.
    """

    R_BIN = '/kb/deployment/bin'
    PARMI_OUT_DIR = 'parmigene_output'
    PARAM_IN_WS = 'workspace_name'
    PARAM_IN_MATRIX = 'input_obj_ref'
    PARAM_OUT_MATRIX = 'parmigene_matrix_name'
    OMP_NUM_THREADS = 'num_threads'

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path (no-op if it already exists
        or if path is falsy)
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            # tolerate a pre-existing directory; re-raise anything else
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_run_mi_params(self, params):
        """
        _validate_run_mi_params: validates params passed to run_mi method

        :raises ValueError: when any required parameter is missing
        """
        logging.info('start validating run_mi params')

        # check for required parameters
        for p in [self.PARAM_IN_MATRIX, self.PARAM_IN_WS,
                  self.PARAM_OUT_MATRIX]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _build_rParmigene_script(self, mi, algthm, num_threads, eps):
        """
        _build_rParmigene_script: build a sequence of R command calls
        according to params

        Note: To run the Parmigene functions, we will call different
        functions from the parmigene package, that requires a mutual
        information matrix (mi), the algorithm name (algthm), actual number
        of threads used (num_threads) and, sometimes, a positive numeric
        criteria (eps) to remove the weakest edge of each triple of nodes.

        :returns: path of the generated R script
        """
        parmi_scrpt = 'library(parmigene)\n'
        parmi_scrpt += algthm + '(' + mi + ',' + eps + ')\n'
        # save the results in the memory
        # 1) store species ordination
        parmi_scrpt += 'variableScores <- vg_data.parmi$species\n'
        # 2) store site ordination
        parmi_scrpt += 'sampleScores <- vg_data.parmi$points\n'

        # save the results to the current dir
        # Write CSV in R
        parmi_scrpt += 'write.csv(dist_matrix,file="dist_matrix.csv",row.names=TRUE,na="")\n'
        parmi_scrpt += 'write.csv(variableScores,file="species_ordination.csv",' + \
                       'row.names=TRUE,na="")\n'

        # Write JSON in R
        parmi_scrpt += 'write_json(toJSON(dist_matrix),path="dist_matrix.json",pretty=TRUE,' + \
                       'auto_unbox=FALSE)\n'

        # save Parmigene plot
        parmi_scrpt += 'bmp(file="saving_mi_plot.bmp",width=6,height=4,units="in",res=100)\n'
        parmi_scrpt += 'plot(vg_data.parmi,type="n",display="sites")\n'
        parmi_scrpt += 'points(vg_data.parmi)\n'
        parmi_scrpt += 'dev.off()\n'

        parmi_rscript = 'parmi_script.R'
        rscrpt_file_path = os.path.join(self.output_dir, parmi_rscript)

        with open(rscrpt_file_path, 'w') as r_file:
            r_file.write(parmi_scrpt)
        return rscrpt_file_path

    def _execute_r_script(self, rfile_name):
        """
        _execute_r_script: Calling the Rscript executable to run the R
        script in rfile_name

        :returns: the subprocess exit code, or -99 when the call itself failed
        """
        logging.info('Calling R......')

        result_dir = os.path.dirname(rfile_name)
        if not result_dir:
            result_dir = self.working_dir

        rcmd = [os.path.join(self.R_BIN, 'Rscript')]
        rcmd.append(rfile_name)

        logging.info(
            'Running Parmigene script in current working directory: {}'.format(
                result_dir))

        exitCode = 0
        try:
            complete_proc = subprocess.run(rcmd,
                                           cwd=result_dir,
                                           stdin=subprocess.PIPE,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.STDOUT,
                                           close_fds=True)
            exitCode = complete_proc.returncode

            if (exitCode == 0):
                logging.info('\n{}'.format(complete_proc.stdout))
                logging.info(
                    '\n{} was executed successfully, exit code was: {}'.format(
                        ' '.join(rcmd), str(exitCode)))
                logging.info("Finished calling R.")
            else:
                # BUG FIX: format string had one placeholder for two
                # arguments, so the exit code was never logged.
                logging.info('Error running command: {} Exit Code: {}'.format(
                    ' '.join(rcmd), str(exitCode)))
                logging.info('\n{}'.format(complete_proc.stderr))
        except subprocess.CalledProcessError as sub_e:
            exitCode = -99
            logging.info(
                'Caught subprocess.CalledProcessError {}'.format(sub_e))
        return exitCode

    def _df_to_list(self, df):
        """
        _df_to_list: convert Dataframe to FloatMatrix2D matrix data

        NaNs are replaced with 0 and labels are coerced to strings
        (mutates df in place).
        """
        df.index = df.index.astype('str')
        df.columns = df.columns.astype('str')
        df.fillna(0, inplace=True)
        matrix_data = {
            'row_ids': df.index.tolist(),
            'col_ids': df.columns.tolist(),
            'values': df.values.tolist()
        }

        return matrix_data

    def _mi_df_to_excel(self, mi_df, distance_df, result_dir, mi_matrix_ref):
        """
        write mutual information matrix df into excel
        """
        logging.info('writting mi data frame to excel file')
        mi_matrix_obj = self.dfu.get_objects(
            {'object_refs': [mi_matrix_ref]})['data'][0]
        mi_matrix_info = mi_matrix_obj['info']
        mi_matrix_name = mi_matrix_info[1]

        file_path = os.path.join(result_dir, mi_matrix_name + ".xlsx")

        writer = pd.ExcelWriter(file_path)

        mi_df.to_excel(writer, "mi_matrix", index=True)
        # BUG FIX: 'if distance_df:' raises ValueError on a DataFrame
        # (ambiguous truth value); test for None explicitly instead.
        if distance_df is not None:
            distance_df.to_excel(writer, "mi_distance_matrix", index=True)

        writer.close()

    def _Matrix2D_to_df(self, Matrix2D):
        """
        _Matrix2D_to_df: transform a FloatMatrix2D to data frame
        """
        index = Matrix2D.get('row_ids')
        columns = Matrix2D.get('col_ids')
        values = Matrix2D.get('values')

        df = pd.DataFrame(values, index=index, columns=columns)

        return df

    def _mi_to_df(self, mi_matrix_ref):
        """
        retrieve mutual information matrix ws object to mi_df

        :returns: (mi_df, distance_df) - distance_df may be None
        """
        logging.info('converting mutual information matrix to data frame')
        mi_data = self.dfu.get_objects(
            {'object_refs': [mi_matrix_ref]})['data'][0]['data']

        rotation_matrix_data = mi_data.get('rotation_matrix')
        distance_matrix_data = mi_data.get('distance_matrix')
        original_matrix_ref = mi_data.get('original_matrix_ref')
        # NOTE(review): 'dimension' is read from n_components but is used
        # below to build '{}_attributemapping_ref' (normally 'row'/'col') --
        # confirm this key actually exists on the original matrix object.
        dimension = mi_data.get('mi_parameters').get('n_components')

        mi_df = self._Matrix2D_to_df(rotation_matrix_data)

        distance_df = None
        if distance_matrix_data:
            distance_df = self._Matrix2D_to_df(distance_matrix_data)

        if original_matrix_ref:
            logging.info(
                'appending instance group information to mutual information data frame'
            )
            obj_data = self.dfu.get_objects(
                {'object_refs': [original_matrix_ref]})['data'][0]['data']
            attributemapping_ref = obj_data.get(
                '{}_attributemapping_ref'.format(dimension))

            am_data = self.dfu.get_objects(
                {'object_refs': [attributemapping_ref]})['data'][0]['data']

            attributes = am_data.get('attributes')
            instances = am_data.get('instances')
            am_df = pd.DataFrame(data=list(instances.values()),
                                 columns=list(
                                     map(lambda x: x.get('attribute'),
                                         attributes)),
                                 index=instances.keys())

            mi_df = mi_df.merge(am_df,
                                left_index=True,
                                right_index=True,
                                how='left',
                                validate='one_to_one')

        return mi_df, distance_df

    def _save_mi_matrix(self, workspace_name, input_obj_ref, mi_matrix_name,
                        distance_df, mi_params_df, site_ordin_df,
                        species_ordin_df):
        """
        Save the Parmigene results as a matrix object and return its ref
        (wsid/objid/version).
        """
        logging.info('Saving MIMatrix...')

        if not isinstance(workspace_name, int):
            ws_name_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            ws_name_id = workspace_name

        mi_data = {}

        mi_data.update({'distance_matrix': self._df_to_list(distance_df)})
        mi_data.update({'site_ordination': self._df_to_list(site_ordin_df)})
        mi_data.update(
            {'species_ordination': self._df_to_list(species_ordin_df)})
        mi_data.update({'mi_parameters': self._df_to_list(mi_params_df)})
        mi_data.update({'original_matrix_ref': input_obj_ref})
        # NOTE(review): rotation_matrix is populated from distance_df as
        # well, and the saved type is PCAMatrix rather than an MIMatrix --
        # confirm both are intentional.
        mi_data.update({'rotation_matrix': self._df_to_list(distance_df)})

        obj_type = 'KBaseExperiments.PCAMatrix'
        info = self.dfu.save_objects({
            "id": ws_name_id,
            "objects": [{
                "type": obj_type,
                "data": mi_data,
                "name": mi_matrix_name
            }]
        })[0]

        return "%s/%s/%s" % (info[6], info[0], info[4])

    def _zip_folder(self, folder_path, output_path):
        """
        _zip_folder: Zip the contents of an entire folder (with that folder
        included in the archive). Empty subfolders are included in the
        archive as well.
        """
        with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED,
                             allowZip64=True) as ziph:
            for root, folders, files in os.walk(folder_path):
                # Include all subfolders, including empty ones.
                for folder_name in folders:
                    absolute_fpath = os.path.join(root, folder_name)
                    relative_fpath = os.path.join(os.path.basename(root),
                                                  folder_name)
                    logging.info(
                        "Adding {} to archive.".format(absolute_fpath))
                    ziph.write(absolute_fpath, relative_fpath)
                for f in files:
                    absolute_path = os.path.join(root, f)
                    relative_path = os.path.join(os.path.basename(root), f)
                    logging.info("Adding {} to archive.".format(absolute_path))
                    ziph.write(absolute_path, relative_path)

        logging.info("{} created successfully.".format(output_path))

    def _generate_output_file_list(self, out_dir):
        """
        _generate_output_file_list: zip result files and generate file_links
        for report
        """
        logging.info('Start packing result files from Parmigene...')

        output_files = list()

        output_dir = os.path.join(self.working_dir, str(uuid.uuid4()))
        self._mkdir_p(output_dir)

        mi_output = os.path.join(output_dir, 'metami_output.zip')
        self._zip_folder(out_dir, mi_output)

        output_files.append({
            'path': mi_output,
            'name': os.path.basename(mi_output),
            'label': os.path.basename(mi_output),
            'description': 'Output file(s) generated by Parmigene'
        })
        return output_files

    def _generate_mi_html_report(self, mi_outdir, n_components):
        """
        Collect plot images from mi_outdir into an HTML page based on the
        bundled template and upload it to shock for the report.
        """
        logging.info('Start generating html report for Parmigene results...')
        html_report = list()

        result_dir = os.path.join(self.working_dir, str(uuid.uuid4()))
        self._mkdir_p(result_dir)
        result_file_path = os.path.join(result_dir, 'mi_result.html')

        mi_plots = list()
        for root, folders, files in os.walk(mi_outdir):
            # Find the image files by their extensions.
            for f in files:
                if re.match('^[a-zA-Z]+.*.(jpeg|jpg|bmp|tiff|pdf|ps)$', f):
                    absolute_path = os.path.join(root, f)
                    logging.info(
                        "Adding {} to plot archive.".format(absolute_path))
                    mi_plots.append(absolute_path)

        visualization_content = ''

        for mi_plot in mi_plots:
            shutil.copy2(mi_plot,
                         os.path.join(result_dir, os.path.basename(mi_plot)))
            visualization_content += '<iframe height="900px" width="100%" '
            visualization_content += 'src="{}" '.format(
                os.path.basename(mi_plot))
            visualization_content += 'style="border:none;"></iframe>\n<p></p>\n'

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__), 'templates',
                                 'mi_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Visualization_Content</p>', visualization_content)
                report_template = report_template.replace(
                    'n_components', '{} Components'.format(n_components))
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': result_dir,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id': report_shock_id,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report for Parmigene Matrix App'
        })
        return html_report

    def _generate_mi_report(self, mi_ref, output_dir, workspace_name,
                            n_components):
        """
        Build the extended KBase report (files + HTML) for a finished run.
        """
        logging.info('Creating Parmigene report...')

        output_files = self._generate_output_file_list(output_dir)
        output_html_files = self._generate_mi_html_report(
            output_dir, n_components)

        objects_created = list()
        objects_created.append({
            'ref': mi_ref,
            'description': 'Mutual Information Matrix'
        })

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'file_links': output_files,
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 666,
            'report_object_name': 'kb_mi_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']
        self.working_dir = self.scratch
        self.data_util = DataUtil(config)
        # a duplicate DataFileUtil(..., service_ver='release') assignment was
        # removed; this second construction always overrode it anyway
        self.dfu = DataFileUtil(self.callback_url)
        self.output_dir = os.path.join(self.working_dir, self.PARMI_OUT_DIR)
        self._mkdir_p(self.output_dir)

    def run_mi(self, params):
        """
        run_mi: perform Parmigene analysis on matrix

        :param input_obj_ref: object reference of a matrix
        :param workspace_name: the name of the workspace
        :param mi_matrix_name: name of Parmigene (KBaseExperiments.MIMatrix)
            object
        :param n_components: dimentionality of the reduced space (default 2)
        :param max_iter: maximum iterations allowed
        :param distance_metric: distance the ordination will be performed
            on, default to "bray"
        :raises ValueError: for non-KBaseMatrices input or an R failure
        """
        logging.info('--->\nrunning Parmigene with input\n' +
                     'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_mi_params(params)

        input_obj_ref = params.get(self.PARAM_IN_MATRIX)
        workspace_name = params.get(self.PARAM_IN_WS)
        mi_matrix_name = params.get(self.PARAM_OUT_MATRIX)
        n_threads = int(params.get(self.OMP_NUM_THREADS, 2))

        res = self.dfu.get_objects({'object_refs': [input_obj_ref]})['data'][0]
        obj_data = res['data']
        obj_name = res['info'][1]
        obj_type = res['info'][2]

        exitCode = -99
        if "KBaseMatrices" in obj_type:
            # create the input file from obj_data
            matrix_tab = obj_data['data']['values']
            row_ids = obj_data['data']['row_ids']
            col_ids = obj_data['data']['col_ids']
            matrix_df = pd.DataFrame(matrix_tab, index=row_ids,
                                     columns=col_ids)

            matrix_data_file = os.path.join(self.output_dir, obj_name + '.csv')
            with open(matrix_data_file, 'w') as m_file:
                matrix_df.to_csv(m_file, sep='\t')

            params['datafile'] = matrix_data_file
            exitCode = self.run_mi_with_file(params)
        else:
            err_msg = 'Ooops! [{}] is not supported.\n'.format(obj_type)
            err_msg += 'Please provide a KBaseMatrices object'
            # BUG FIX: previously raised the literal string "err_msg"
            raise ValueError(err_msg)

        if exitCode == -99:
            raise ValueError(
                'Caught subprocess.CalledProcessError while calling R.')

        # saving the mi_matrix object
        # read Parmigene results from files into data frames
        dist_matrix_df = pd.read_csv(
            os.path.join(self.output_dir, "dist_matrix.csv"))
        mi_params_df = pd.read_json(
            os.path.join(self.output_dir, "others.json"))
        site_ordin_df = pd.read_csv(
            os.path.join(self.output_dir, "site_ordination.csv"))
        species_ordin_df = pd.read_csv(
            os.path.join(self.output_dir, "species_ordination.csv"))

        mi_ref = self._save_mi_matrix(workspace_name, input_obj_ref,
                                      mi_matrix_name, dist_matrix_df,
                                      mi_params_df, site_ordin_df,
                                      species_ordin_df)
        returnVal = {'mi_ref': mi_ref}

        # generating report
        report_output = self._generate_mi_report(mi_ref, self.output_dir,
                                                 workspace_name, n_threads)

        returnVal.update(report_output)
        return returnVal

    def run_mi_with_file(self, params):
        """
        run_mi_with_file: perform Parmigene analysis on matrix

        :param datafile: a file that contains the matrix data
        :param workspace_name: the name of the workspace
        :param mi_matrix_name: name of Parmigene (KBaseExperiments.MIMatrix)
            object
        :param n_components: dimentionality of the reduced space (default 2)
        :param max_iter: maximum iterations allowed
        :param distance_metric: distance the ordination will be performed
            on, default to "bray"
        :returns: exit code of the R script run
        """
        logging.info('--->\nrunning Parmigene with input \n' +
                     'params:\n{}'.format(json.dumps(params, indent=1)))

        # NOTE(review): _build_rmi_script is not defined in this class; the
        # only script builder visible is _build_rParmigene_script (with a
        # different signature). Confirm where _build_rmi_script comes from.
        rscrpt_file = self._build_rmi_script(params)
        logging.info(
            '--->\nR script file has been written to {}'.format(rscrpt_file))

        return self._execute_r_script(rscrpt_file)

    def export_mi_matrix_excel(self, params):
        """
        export MIMatrix as Excel

        :param params: dict with 'input_ref' (workspace ref of the matrix)
        :returns: dict with 'shock_id' of the packaged download
        """
        logging.info('start exporting Parmigene matrix')
        mi_matrix_ref = params.get('input_ref')

        mi_df, components_df = self._mi_to_df(mi_matrix_ref)

        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)

        self._mi_df_to_excel(mi_df, components_df, result_dir, mi_matrix_ref)

        package_details = self.dfu.package_for_download({
            'file_path': result_dir,
            'ws_refs': [mi_matrix_ref]
        })

        return {'shock_id': package_details['shock_id']}