def _generate(self) -> ReportResult:
    """
    Runs the contribution analysis of the DeepRC model on the test dataset.

    The test metadata file is converted to HDF5 by the ML method, an evaluation-only
    data loader covering every test example is created, and `compute_contributions`
    is called with the model stored for `self.label`.

    Returns:
        ReportResult named `self.name` with two figures: `self.filename_inputs`
        and `self.filename_kernels`.
    """
    PathBuilder.build(self.result_path)

    encoded_test_data = self.test_dataset.encoded_data
    hdf5_filepath = self.method._metadata_to_hdf5(encoded_test_data.info['metadata_filepath'], [self.label])

    # one index per test example, so the loader iterates over the whole test set
    example_count = len(encoded_test_data.example_ids)
    all_indices = np.array(range(example_count))

    test_loader = self.method.make_data_loader(hdf5_filepath,
                                               pre_loaded_hdf5_file=None,
                                               indices=all_indices,
                                               label=self.label,
                                               eval_only=True,
                                               is_train=False)

    # get_model returns a mapping keyed by label; only the model of this label is analysed
    label_model = self.method.get_model(self.label)[self.label]

    compute_contributions(intgrds_set_loader=test_loader,
                          deeprc_model=label_model,
                          n_steps=self.n_steps,
                          threshold=self.threshold,
                          resdir=self.result_path,
                          filename_inputs=self.filename_inputs,
                          filename_kernels=self.filename_kernels)

    figures = [ReportOutput(self.filename_inputs), ReportOutput(self.filename_kernels)]
    return ReportResult(self.name, output_figures=figures)
def plot_roc(self, optimal_hp_items, label: Label, colors) -> Tuple[ReportOutput, List[ReportOutput]]:
    """
    Plots one ROC curve per dataset into a single figure and exports the curve data.

    Args:
        optimal_hp_items: items exposing `test_predictions_path`; the item at position i
            belongs to `self.instruction_states[i]`.
        label: label whose `<name>_true_class` and `<name>_<positive_class>_proba`
            columns are read from the predictions file.
        colors: sequence of trace colors indexed by dataset position.

    Returns:
        a tuple of (ReportOutput of the HTML figure, list of ReportOutput objects of the
        per-dataset CSV files). Datasets without test predictions are skipped with a warning.
    """
    roc_figure = go.Figure()
    roc_figure.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='baseline',
                                    line=dict(color=PerformanceOverview.PLOTLY_BLACK, dash='dash'),
                                    hoverinfo="skip"))

    curve_data_outputs = []

    for index, item in enumerate(optimal_hp_items):
        dataset_name = self.instruction_states[index].dataset.name

        if item.test_predictions_path is None:
            logging.warning(f'{PerformanceOverview.__name__}: there are no test predictions for dataset '
                            f'{dataset_name}, skipping this dataset when generating performance overview...')
            continue

        predictions = pd.read_csv(item.test_predictions_path)
        y_true = predictions[f"{label.name}_true_class"].values
        y_score = predictions[f"{label.name}_{label.positive_class}_proba"].values

        fpr, tpr, _ = metrics.roc_curve(y_true=y_true, y_score=y_score)
        auc = metrics.roc_auc_score(y_true, y_score)

        # the legend entry (including the AUC suffix) is also used in the CSV file name below
        name = dataset_name + f' (AUC = {round(auc, 2)})'
        roc_figure.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=name,
                                        marker=dict(color=colors[index], line=dict(width=3)),
                                        hoverinfo="skip"))

        data_path = self.result_path + f"roc_curve_data_{name}.csv"
        curve_table = pd.DataFrame({"FPR": fpr, "TPR": tpr})
        curve_table.to_csv(data_path, index=False)
        curve_data_outputs.append(ReportOutput(data_path, f'ROC curve data for dataset {name} (csv)'))

    figure_path = self.result_path + "roc_curve.html"
    roc_figure.update_layout(template='plotly_white',
                             xaxis_title='false positive rate',
                             yaxis_title='true positive rate')
    roc_figure.write_html(figure_path)

    return ReportOutput(figure_path, 'ROC curve'), curve_data_outputs
def _generate(self) -> ReportResult:
    """
    Computes the sequence overlap between the optimal models of all instruction states
    and exports it as a figure and as a csv table.

    Returns:
        ReportResult named `self.name` with one figure and one table output.
    """
    # NOTE(review): result_path is rebound to the report-specific sub-folder, so a second call
    # would build the path from the already extended value - confirm the report is generated once.
    self.result_path = PathBuilder.build(f"{self.result_path}/{self.name}/")

    self._extract_label()

    hp_items = [state.optimal_hp_items[self.label] for state in self.instruction_states]
    overlap_matrix = SequenceAnalysisHelper.compute_overlap_matrix(hp_items)

    # datasets are listed in the same order as the hp items the matrix was computed from
    labels = [state.dataset.name for state in self.instruction_states]
    figure_path = self._make_figure(overlap_matrix, labels)
    data_path = self._export_matrix(overlap_matrix, labels)

    # the report name is passed explicitly so that the result is not anonymous
    return ReportResult(self.name,
                        output_figures=[ReportOutput(figure_path, 'sequence overlap across datasets')],
                        output_tables=[ReportOutput(data_path, 'sequence overlap across datasets (csv)')])
def _discover_motif_in_cluster(self, tcr_rep, index, row, negative_examples=None) -> Tuple[List[ReportOutput], List[ReportOutput]]:
    """
    Computes a palmotif motif for the 'a' and the 'b' chain of one cluster and stores
    each motif as an svg logo and as a csv file under the result path.

    Args:
        tcr_rep: object exposing `clone_df` from which the cluster members are selected
        index: zero-based cluster index; files and names use `index + 1`
        row: cluster row; `row['neighbors_i']` holds the positional indices of the members
        negative_examples: per-chain reference sequences (keys 'a' and 'b'); only used
            when `self.use_reference_sequences` is set

    Returns:
        a tuple (figure outputs, table outputs) with one entry per chain each.
    """
    from tcrdist.adpt_funcs import get_centroid_seq
    from palmotif import compute_pal_motif
    from palmotif import svg_logo

    cluster_number = index + 1
    dfnode = tcr_rep.clone_df.iloc[row['neighbors_i'], ]
    neighbor_count = dfnode.shape[0]

    logging.info(f"{TCRdistMotifDiscovery.__name__}: in cluster {cluster_number}, there are {neighbor_count} neighbors.")

    figure_outputs = []
    table_outputs = []

    for chain in ['a', 'b']:
        sequence_column = f'cdr3_{chain}_aa'

        if neighbor_count > 2:
            # NOTE(review): the centroid call does not receive the chain, so it presumably uses the
            # same (default) CDR3 column for both chains - confirm against the tcrdist documentation.
            centroid, *_ = get_centroid_seq(df=dfnode)
        else:
            centroid = dfnode[sequence_column].to_list()[0]

        references = negative_examples[chain] if self.use_reference_sequences else None
        cluster_sequences = _select(df=tcr_rep.clone_df, iloc_rows=row['neighbors_i'], col=sequence_column)
        motif, stat = compute_pal_motif(seqs=cluster_sequences, centroid=centroid, refs=references)

        figure_path = self.result_path + f"motif_{chain}_{cluster_number}.svg"
        svg_logo(motif, filename=figure_path)

        motif_data_path = self.result_path + f"motif_{chain}_{cluster_number}.csv"
        motif.to_csv(motif_data_path)

        chain_label = Chain.get_chain(chain.upper()).name.lower()
        figure_outputs.append(ReportOutput(figure_path, f'Motif {cluster_number} ({chain_label} chain)'))
        table_outputs.append(ReportOutput(motif_data_path, f'motif {cluster_number} ({chain_label} chain) csv data'))

    return figure_outputs, table_outputs
def _store_dataframes(self, training_dataframe: pd.DataFrame, test_dataframe: pd.DataFrame) -> List[ReportOutput]:
    """
    Stores the training and test performance tables as csv files under the result path.

    Returns:
        two ReportOutput objects (training first, then test) whose names mention `self.feature`.
    """
    exports = [
        (training_dataframe, self.result_path + "training_performance.csv",
         f"Training performance w.r.t. {self.feature} values"),
        (test_dataframe, self.result_path + "test_performance.csv",
         f"Test performance w.r.t. {self.feature} values"),
    ]

    # write both files first, then describe them
    for dataframe, csv_path, _ in exports:
        dataframe.to_csv(csv_path, index=False)

    return [ReportOutput(path=csv_path, name=output_name) for _, csv_path, output_name in exports]
def _store_sequence_distribution_data(self, fig, dfs, chains):
    """
    Writes the sequence length distribution figure as html and one csv table per chain.

    Args:
        fig: figure object providing `write_html`
        dfs: data frames; the data frame at position i is written to the file of `chains[i]`
        chains: chain identifiers used in the csv file names

    Returns:
        a tuple (ReportOutput of the figure, list of ReportOutput objects of the tables).
    """
    figure_path = self.result_path + "sequence_length_distribution.html"
    fig.write_html(figure_path)
    image_output = ReportOutput(figure_path, name="sequence length distribution per chain")

    table_outputs = []
    for position in range(len(chains)):
        table_path = self.result_path + f"sequence_length_distribution_chain_{chains[position]}.csv"
        table_outputs.append(ReportOutput(table_path))

    # tables are matched to chains by position
    for position, chain_df in enumerate(dfs):
        chain_df.to_csv(table_outputs[position].path, index=False)

    return image_output, table_outputs
def _write_repertoire_sizes(self):
    """
    Writes repertoire_sizes.csv with the summed counts (n_reads) and the number of
    count entries (n_clones) per subject and chain.

    Returns:
        ReportOutput pointing to the written csv file.
    """
    encoded_data = self.dataset.encoded_data
    subject_ids = encoded_data.example_ids
    chain_names = sorted(set(encoded_data.feature_annotations["chain"]))

    # one row per (subject, chain) combination; both counters start at zero
    sizes = pd.DataFrame(list(itertools.product(subject_ids, chain_names)), columns=["subject_id", "chain"])
    sizes["n_reads"] = 0
    sizes["n_clones"] = 0

    for repertoire in self.dataset.repertoires:
        counts = repertoire.get_counts()
        chains_in_repertoire = repertoire.get_chains()

        for chain in chain_names:
            chain_counts = counts[chains_in_repertoire == Chain.get_chain(chain.upper())]

            # rows of this repertoire's subject for the current chain
            row_mask = (sizes.subject_id == repertoire.metadata["subject_id"]) & (sizes.chain == chain)
            sizes.loc[row_mask, 'n_reads'] += np.sum(chain_counts)
            sizes.loc[row_mask, 'n_clones'] += len(chain_counts)

    results_path = os.path.join(self.result_path, "repertoire_sizes.csv")
    sizes.to_csv(results_path, index=False)

    return ReportOutput(results_path, "repertoire sizes")
def _write_paired_matches(self, paired_matches_path) -> List[ReportOutput]:
    """
    Writes one csv file with paired matches per example into `paired_matches_path`.

    The file name consists of the example id followed by all label names and the label
    values of that example. The writer is chosen based on the name of the encoding.

    Returns:
        one ReportOutput per example, in the order of the example ids.
    """
    PathBuilder.build(paired_matches_path)

    encoded_data = self.dataset.encoded_data
    report_outputs = []

    for i in range(len(encoded_data.example_ids)):
        example_id = encoded_data.example_ids[i]

        # todo don't mention subject in the name twice
        label_part = "_".join([f"{label}_{values[i]}" for label, values in encoded_data.labels.items()])
        filename = os.path.join(paired_matches_path, f"example_{example_id}_" + label_part + ".csv")

        encoding_name = self.dataset.encoded_data.encoding
        if encoding_name == "MatchedReceptorsEncoder":
            self._write_paired_receptor_matches_for_repertoire(self.dataset.encoded_data.examples[i], filename)
        elif self.dataset.encoded_data.encoding == "MatchedRegexEncoder":
            self._write_paired_regex_matches_for_repertoire(self.dataset.encoded_data.examples[i], filename)

        # an output is registered for every example, regardless of the encoding
        report_outputs.append(ReportOutput(filename, f"example {self.dataset.encoded_data.example_ids[i]} paired matches"))

    return report_outputs
def _plot(self, data) -> ReportOutput:
    """
    Creates the sequencing depth overview by calling `plot_sequencing_depth_overview`
    from the R script SequencingDepthOverview.R through rpy2.

    Args:
        data: data frame containing the column `self.x`, the columns "value", "frame_type",
            "feature" and "id", and all columns listed in `self.facets`

    Returns:
        ReportOutput with the path `<result_path><result_name>.pdf`.
    """
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.packages import STAP

    pandas2ri.activate()

    script_path = EnvironmentSettings.root_path + "source/visualization/SequencingDepthOverview.R"
    with open(script_path) as script_file:
        r_source = script_file.read()

    r_module = STAP(r_source, "plot")

    PathBuilder.build(self.result_path)

    # only the columns used for plotting are handed over to R
    plot_columns = [self.x, "value", "frame_type", "feature", "id"] + self.facets

    r_module.plot_sequencing_depth_overview(data=data[plot_columns],
                                            x=self.x,
                                            color=self.color,
                                            facets=self.facets,
                                            palette=json.dumps(self.palette),
                                            nrow_distributions=self.nrow_distributions,
                                            nrow_scatterplot=self.nrow_scatterplot,
                                            height_distributions=self.height_distributions,
                                            height_scatterplot=self.height_scatterplot,
                                            width=self.width,
                                            result_path=self.result_path,
                                            result_name=self.result_name)

    return ReportOutput(path=f"{self.result_path}{self.result_name}.pdf")
def _export_matrix(self, overlap_matrix, filename, row_col_names) -> ReportOutput:
    """
    Stores the overlap matrix as `<result_path><filename>.csv`, labelling both rows and
    columns with `row_col_names`.

    Returns:
        ReportOutput whose name is the file name with underscores replaced by spaces,
        followed by " data".
    """
    data_path = f"{self.result_path}{filename}.csv"

    matrix_df = pd.DataFrame(overlap_matrix, columns=row_col_names, index=row_col_names)
    matrix_df.to_csv(data_path)

    output_name = filename.replace('_', ' ') + ' data'
    return ReportOutput(data_path, output_name)
def _write_receptor_info(self, receptor_info_path) -> List[ReportOutput]:
    """
    Exports the chains and receptors described by the feature annotations of the encoded dataset.

    The annotations are split by the first two values of the "chain" column and merged into
    receptors on "receptor_id" (and on "clonotype_id" when that column is available).
    Five csv files are written to `receptor_info_path`: all_chains.csv, all_receptors.csv,
    unique_<chain 1>_chains.csv, unique_<chain 2>_chains.csv and unique_receptors.csv.
    Uniqueness is defined by sequence, v_gene and j_gene (of both chains for receptors).

    Args:
        receptor_info_path: folder the files are written to; built if necessary

    Returns:
        list of ReportOutput objects in the order: all receptors, all chains,
        unique receptors, unique first chains, unique second chains.
    """
    PathBuilder.build(receptor_info_path)

    receptor_chains = self.dataset.encoded_data.feature_annotations
    chain_types = receptor_chains["chain"].unique()
    first_type, second_type = chain_types[0], chain_types[1]

    # drop() returns new frames here instead of modifying .loc slices in place, which avoids
    # pandas' SettingWithCopy warnings and never touches the original annotations
    first_chains = receptor_chains.loc[receptor_chains.chain == first_type].drop(columns=["chain"])
    second_chains = receptor_chains.loc[receptor_chains.chain == second_type].drop(columns=["chain"])

    on_cols = ["receptor_id"]
    # membership has to be tested on both frames explicitly: using a column Index itself as a
    # condition raises ValueError because its truth value is ambiguous
    if "clonotype_id" in second_chains.columns and "clonotype_id" in first_chains.columns:
        on_cols.append("clonotype_id")

    receptors = pd.merge(first_chains, second_chains, on=on_cols,
                         suffixes=(f"_{first_type}", f"_{second_type}"))

    chain_key = ["sequence", "v_gene", "j_gene"]
    unique_first_chains = first_chains.drop_duplicates(subset=chain_key)
    unique_second_chains = second_chains.drop_duplicates(subset=chain_key)

    # after the merge the key columns carry the chain type as suffix
    receptor_key = [f"{column}_{chain_type}" for chain_type in (first_type, second_type) for column in chain_key]
    unique_receptors = receptors.drop_duplicates(subset=receptor_key)

    receptor_chains_path = os.path.join(receptor_info_path, "all_chains.csv")
    receptor_chains.to_csv(receptor_chains_path, index=False)

    receptors_path = os.path.join(receptor_info_path, "all_receptors.csv")
    receptors.to_csv(receptors_path, index=False)

    unique_chain1_path = os.path.join(receptor_info_path, f"unique_{first_type}_chains.csv")
    unique_first_chains.to_csv(unique_chain1_path, index=False)

    unique_chain2_path = os.path.join(receptor_info_path, f"unique_{second_type}_chains.csv")
    unique_second_chains.to_csv(unique_chain2_path, index=False)

    unique_receptors_path = os.path.join(receptor_info_path, "unique_receptors.csv")
    unique_receptors.to_csv(unique_receptors_path, index=False)

    output_paths = [receptors_path, receptor_chains_path, unique_receptors_path, unique_chain1_path, unique_chain2_path]
    return [ReportOutput(path) for path in output_paths]
def _plot_fc_figure(self, df, bias):
    """
    Plots the weights of the fully-connected layer as a bar chart next to a table with the bias.

    Args:
        df: data frame with the columns "names" (x axis) and "weights" (y axis)
        bias: values shown in the cells of the bias table

    Returns:
        ReportOutput pointing to fully_connected_layer_weights.html under the result path.
    """
    weights_bar = go.Bar(x=df["names"], y=df["weights"], name="weights",
                         hovertemplate='Weight for %{x}: %{y:.4f}<extra></extra>',
                         hoverlabel={"font_color": "white"},
                         marker_color=px.colors.diverging.Tealrose[0])

    bias_table = go.Table(header={"values": ["bias"]}, cells={"values": bias})
    bias_table.cells.format = [[None], ['.3f']]

    # the bar chart takes 80% of the width, the table the remaining 20%
    fig = make_subplots(rows=1, cols=2, column_widths=[0.8, 0.2],
                        specs=[[{"type": "bar"}, {'type': "table"}]])
    fig.add_trace(weights_bar, row=1, col=1)
    fig.add_trace(bias_table, row=1, col=2)
    fig.update_layout(template="plotly_white")

    fig.write_html(self.result_path + "fully_connected_layer_weights.html")

    return ReportOutput(self.result_path + "fully_connected_layer_weights.html", "fully-connected layer weights")
def plot_precision_recall(self, optimal_hp_items: list, label: Label, colors):
    """
    Plots one precision-recall curve per dataset into a single figure and exports the curve data.

    Args:
        optimal_hp_items: items exposing `test_predictions_path`; the item at position i
            belongs to `self.instruction_states[i]`
        label: label whose `<name>_true_class` and `<name>_<positive_class>_proba`
            columns are read from the predictions file
        colors: sequence of trace colors indexed by dataset position

    Returns:
        a tuple of (ReportOutput of the HTML figure, list of ReportOutput objects of the
        per-dataset csv files). Datasets without test predictions are skipped with a warning.
    """
    import logging  # imported locally so that the method does not depend on a module-level import

    report_data_outputs = []
    figure = go.Figure()

    for index, item in enumerate(optimal_hp_items):
        name = self.instruction_states[index].dataset.name

        # without a predictions file there is nothing to plot; skip the dataset instead of
        # failing when trying to read a missing path
        if item.test_predictions_path is None:
            logging.warning(f"{type(self).__name__}: there are no test predictions for dataset {name}, "
                            f"skipping this dataset when generating performance overview...")
            continue

        df = pd.read_csv(item.test_predictions_path)
        true_class = df[f"{label.name}_true_class"].values
        predicted_proba = df[f"{label.name}_{label.positive_class}_proba"].values

        # arguments are passed positionally (true labels, scores) because the keyword of the
        # second argument is not the same in all scikit-learn releases
        precision, recall, _ = precision_recall_curve(true_class, predicted_proba)

        figure.add_trace(go.Scatter(x=recall, y=precision, mode='lines', name=name,
                                    marker=dict(color=colors[index], line=dict(width=3)),
                                    hoverinfo="skip"))

        data_path = self.result_path + f"precision_recall_data_{name}.csv"
        pd.DataFrame({"precision": precision, "recall": recall}).to_csv(data_path, index=False)
        report_data_outputs.append(ReportOutput(data_path, f'precision-recall curve data for dataset {name}'))

    figure_path = self.result_path + "precision_recall_curve.html"
    figure.update_layout(template='plotly_white', xaxis_title="recall", yaxis_title="precision")
    figure.write_html(figure_path)

    return ReportOutput(figure_path, 'precision-recall curve'), report_data_outputs
def _export_details(self) -> ReportOutput:
    """
    Dumps the feature names, the encoding and the example ids of the encoded dataset
    to encoding_details.yaml under the result path.

    Returns:
        ReportOutput pointing to the written yaml file.
    """
    file_path = f"{self.result_path}encoding_details.yaml"

    with open(file_path, "w") as yaml_file:
        encoded_data = self.dataset.encoded_data
        yaml.dump({"feature_names": encoded_data.feature_names,
                   "encoding": encoded_data.encoding,
                   # example ids are converted to a plain list before dumping
                   "example_ids": list(encoded_data.example_ids)},
                  yaml_file)

    return ReportOutput(file_path, "encoding details")
def _write_match_table(self):
    """
    Writes complete_match_count_table.csv: the repertoire ids, followed by the label columns
    and one column per feature holding the encoded examples.

    Returns:
        ReportOutput pointing to the written csv file.
    """
    encoded_data = self.dataset.encoded_data

    ids = pd.DataFrame({"repertoire_id": encoded_data.example_ids})
    labels = pd.DataFrame(encoded_data.labels)
    matches = pd.DataFrame(encoded_data.examples, columns=encoded_data.feature_names)

    # the three frames are joined on their (positional) index
    complete_table = ids.join(labels).join(matches)

    result_path = os.path.join(self.result_path, "complete_match_count_table.csv")
    complete_table.to_csv(result_path, index=False)

    return ReportOutput(result_path, "All matches")
def _plot(self, sequence_lengths: Counter):
    """
    Draws a bar chart of the sequence length distribution and stores it as html.

    Args:
        sequence_lengths: counter mapping a sequence length to the number of occurrences

    Returns:
        ReportOutput pointing to sequence_length_distribution.html under the result path.
    """
    lengths = list(sequence_lengths.keys())
    occurrences = list(sequence_lengths.values())
    df = pd.DataFrame({"counts": occurrences, 'sequence_lengths': lengths})

    figure = px.bar(df, x="sequence_lengths", y="counts")

    # ticks are placed exactly at the observed lengths and counts
    x_axis = dict(tickmode='array', tickvals=df["sequence_lengths"])
    y_axis = dict(tickmode='array', tickvals=df["counts"])
    figure.update_layout(xaxis=x_axis, yaxis=y_axis, title="Sequence length distribution", template="plotly_white")
    figure.update_traces(marker_color=px.colors.diverging.Tealrose[0])

    PathBuilder.build(self.result_path)
    file_path = self.result_path + "sequence_length_distribution.html"
    figure.write_html(file_path)

    return ReportOutput(path=file_path, name="sequence length distribution plot")
def _generate(self) -> ReportResult:
    """
    Copies the relevant sequences stored by the encoder to the result path with renamed columns.

    The file referenced by `info["relevant_sequence_path"]` of the encoded data is read, its
    columns are renamed according to `_compute_column_mapping` and the table is written to
    relevant_sequences.csv.

    Returns:
        ReportResult named `self.name` with the exported table as its only output.
    """
    source_path = self.dataset.encoded_data.info["relevant_sequence_path"]
    sequences = pd.read_csv(source_path)
    sequences = sequences.rename(columns=self._compute_column_mapping(sequences))

    PathBuilder.build(self.result_path)
    filename = f"{self.result_path}relevant_sequences.csv"
    sequences.to_csv(filename, index=False)

    table_output = ReportOutput(filename, "relevant sequences")
    return ReportResult(self.name, output_tables=[table_output])
def _write_sequence_info(self, sequence_info_path) -> List[ReportOutput]:
    """
    Exports the feature annotations of the encoded dataset to `sequence_info_path`.

    all_chains.csv contains every annotation row, unique_chains.csv the rows that are
    unique with respect to sequence, v_gene and j_gene.

    Returns:
        list with the ReportOutput of all chains followed by that of the unique chains.
    """
    PathBuilder.build(sequence_info_path)

    all_chains = self.dataset.encoded_data.feature_annotations
    deduplicated_chains = all_chains.drop_duplicates(subset=["sequence", "v_gene", "j_gene"])

    all_chains_path = os.path.join(sequence_info_path, "all_chains.csv")
    all_chains.to_csv(all_chains_path, index=False)

    unique_chains_path = os.path.join(sequence_info_path, "unique_chains.csv")
    deduplicated_chains.to_csv(unique_chains_path, index=False)

    return [ReportOutput(all_chains_path), ReportOutput(unique_chains_path)]
def _plot_kernels(self, kernel_name, sequence_alphabet):
    """
    Exports every kernel of one CNN layer as a csv table and as a sequence logo (png).

    Args:
        kernel_name: attribute name of the layer on `self.method.CNN`; "chain_1" / "chain_2"
            in this name are replaced by the chain names of the method for the output names
        sequence_alphabet: column names of the kernel table; only the first
            `len(sequence_alphabet)` columns of the transposed kernel weights are kept

    Returns:
        a tuple (figure outputs, table outputs) with one entry per kernel each.
    """
    chain_names = self.method.chain_names
    friendly_kernel_name = copy(kernel_name).replace("chain_1", chain_names[0]).replace("chain_2", chain_names[1])

    figure_outputs, table_outputs = [], []

    for kernel_index in range(self.method.kernel_count):
        # outputs are numbered starting from 1
        output_name = friendly_kernel_name + f"_{kernel_index + 1}"

        layer = getattr(self.method.CNN, kernel_name)
        weights = layer.weight[kernel_index].detach().numpy().T[:, :len(sequence_alphabet)]
        kernel_df = pd.DataFrame(weights, columns=sequence_alphabet)

        kernel_csv_path = self.result_path + output_name + ".csv"
        kernel_df.to_csv(kernel_csv_path, index=False)
        table_outputs.append(ReportOutput(kernel_csv_path, output_name))

        logo = logomaker.Logo(kernel_df, shade_below=0.5, fade_below=0.5, font_name='Arial Rounded MT Bold',
                              vpad=0.05, vsep=0.01)
        logo_path = self.result_path + output_name + ".png"

        # hide all spines first, then show only the left and the bottom one
        logo.style_spines(visible=False)
        logo.style_spines(spines=('left', 'bottom'), visible=True)
        logo.style_xticks(fmt='%d', anchor=0)

        logo.fig.savefig(logo_path)
        plt.close(logo.fig)
        figure_outputs.append(ReportOutput(logo_path, output_name))

    return figure_outputs, table_outputs
def _plot(self, plotting_data, output_name):
    """
    Draws a bar chart of the coefficients per feature and stores it as `<output_name>.html`.

    Args:
        plotting_data: data frame with the columns 'features' and 'coefficients'
        output_name: file name (without extension); also used in the title with
            underscores replaced by spaces

    Returns:
        ReportOutput of the figure, or None (after logging a warning) if `plotting_data` is empty.
    """
    if plotting_data.empty:
        logging.warning(f"Coefficients: empty data subset specified, skipping {output_name} plot...")
        return None

    filename = f"{self.result_path}{output_name}.html"

    # title: "<method class>[ (<method name>)] - <output name in words>"
    if self.method.name is not None:
        method_part = ' (' + self.method.name + ') - '
    else:
        method_part = ' - '
    title = f"{type(self.method).__name__}{method_part}{' '.join(output_name.split('_'))}"

    figure = px.bar(plotting_data, x='features', y='coefficients', template='plotly_white', title=title)
    figure.update_traces(marker_color=px.colors.sequential.Teal[3])
    figure.write_html(filename)

    return ReportOutput(filename)
def _compute_model_overlap(self, figure_filename, df_filename, encoder, name):
    """
    Compares the sequences extracted from the model with the reference sequences.

    The reference file is read restricted to `self.comparison_attributes`. A venn diagram of
    the reference-only / shared / model-only counts is created and the shared sequences are
    written to `df_filename`.

    Args:
        figure_filename: path handed to `_make_venn_diagram`
        df_filename: path of the csv file listing the overlapping sequences
        encoder: passed to `_extract_from_model`
        name: name given to both report outputs

    Returns:
        a tuple (ReportOutput of the figure, ReportOutput of the table).
    """
    reference_df = pd.read_csv(self.reference_path, usecols=self.comparison_attributes)
    reference_sequences = list(reference_df.to_records(index=False))
    attributes = reference_df.columns.tolist()

    model_sequences = self._extract_from_model(encoder)

    # sequences of the model that are also part of the reference
    overlap_sequences = [seq for seq in model_sequences if seq in reference_sequences]

    reference_only_count = sum(1 for seq in reference_sequences if seq not in model_sequences)
    model_only_count = sum(1 for seq in model_sequences if seq not in reference_sequences)

    self._make_venn_diagram(reference_only_count, len(overlap_sequences), model_only_count,
                            'reference', 'model', figure_filename)
    figure = ReportOutput(figure_filename, name)

    overlap_df = pd.DataFrame.from_records(overlap_sequences, columns=attributes)
    overlap_df.to_csv(df_filename, index=False)
    data = ReportOutput(df_filename, name)

    return figure, data
def _make_figure(self, overlap_matrix, filename, row_col_names) -> ReportOutput:
    """
    Renders the overlap matrix as a heatmap with a fixed color range of 0 to 100 and
    stores it as `<result_path><filename>.html`.

    Returns:
        ReportOutput whose name is the file name with underscores replaced by spaces.
    """
    heatmap = px.imshow(overlap_matrix,
                        x=row_col_names,
                        y=row_col_names,
                        zmin=0,
                        zmax=100,
                        color_continuous_scale=px.colors.sequential.Teal,
                        template='plotly_white')

    hover_text = "Overlap of disease-associated<br>sequences between<br>%{x} and %{y}:<br>%{z}%<extra></extra>"
    heatmap.update_traces(hovertemplate=hover_text)

    figure_path = f"{self.result_path}{filename}.html"
    heatmap.write_html(figure_path)

    return ReportOutput(figure_path, filename.replace('_', ' '))
def _plot(self, plotting_data):
    """
    Plots the performance per ML method and stores the figure as `<result_name>.html`.

    The data is preprocessed first. Depending on `self.single_axis_labels`, either the
    single-axis-label variant or the rescalable variant of the plot is created; the axis
    title contains the optimization metric in title case.

    Returns:
        ReportOutput pointing to the written html file.
    """
    prepared_data = self._preprocess_plotting_data(plotting_data)
    metric_name = self.state.optimization_metric.name.replace("_", " ").title()

    if not self.single_axis_labels:
        # the rescalable variant breaks the axis title into two lines
        figure = self._plot_rescalable(prepared_data, "ML method", f"Performance<br>({metric_name})")
    else:
        figure = self._plot_single_axis_labels(prepared_data, "ML method", f"Performance ({metric_name})")

    file_path = f"{self.result_path}{self.result_name}.html"
    figure.write_html(file_path)

    return ReportOutput(path=file_path)
def _generate(self) -> ReportResult:
    """
    Exports the receptors of the dataset to a tab-separated file (exported_data.tsv) with the
    columns CDR3b, TRBV, TRBJ, CDR3a, subject:condition and count.

    Per receptor:
      - the beta chain provides CDR3b, TRBV, TRBJ and count; when the beta chain has no
        metadata, the genes are left empty and the count defaults to 1,
      - the alpha chain provides CDR3a,
      - "subject:condition" combines the subject id with the metadata value stored under
        `self.condition`; the position of the receptor in the dataset replaces a missing subject id.

    Returns:
        ReportResult named `self.name` with the exported file as its only table.
    """
    PathBuilder.build(self.result_path)

    columns = ["CDR3b", "TRBV", "TRBJ", "CDR3a", "subject:condition", "count"]
    rows = []

    for index, receptor in enumerate(self.dataset.get_data()):
        # the chains are looked up once per receptor instead of once per exported field
        alpha_chain = receptor.get_chain("alpha")
        beta_chain = receptor.get_chain("beta")
        beta_metadata = beta_chain.metadata

        # the receptor metadata is accessed by key below, so for dict metadata the subject id has
        # to be looked up by key as well (getattr never finds dictionary keys); attribute access
        # is kept for any other metadata type
        receptor_metadata = receptor.metadata
        if isinstance(receptor_metadata, dict):
            subject_id = receptor_metadata.get("subject_id", str(index))
        else:
            subject_id = getattr(receptor_metadata, "subject_id", str(index))

        rows.append({
            "CDR3b": beta_chain.amino_acid_sequence,
            # missing beta chain metadata is handled for all fields derived from it
            "TRBV": beta_metadata.v_gene if beta_metadata is not None else None,
            "TRBJ": beta_metadata.j_gene if beta_metadata is not None else None,
            "CDR3a": alpha_chain.amino_acid_sequence,
            "subject:condition": f"{subject_id}:{receptor_metadata[self.condition]}",
            "count": beta_metadata.count if beta_metadata is not None else 1,
        })

    # columns are passed explicitly so that the header is written even for an empty dataset
    df = pd.DataFrame(rows, columns=columns)

    file_path = self.result_path + "exported_data.tsv"
    df.to_csv(file_path, sep="\t", index=False)

    return ReportResult(self.name, output_tables=[ReportOutput(file_path, "exported data in GLIPH2 format")])
def _generate(self) -> ReportResult:
    """
    Clusters the positive examples with TCRdist and discovers motifs in the clusters.

    The first label of the encoded training dataset is used. Distances are computed with
    TCRdistHelper, hierarchical clustering is done on the sum of the alpha and beta distance
    matrices, and motifs are discovered for every cluster with at least
    `self.min_cluster_size` neighbors. A summary of the clusters is stored as tcrdist_summary.csv.

    Returns:
        ReportResult with the motif figures and the motif tables plus the summary table.
    """
    label_names = list(self.train_dataset.encoded_data.labels.keys())
    self.label = label_names[0]

    from source.util.TCRdistHelper import TCRdistHelper
    from tcrdist.rep_diff import hcluster_diff

    PathBuilder.build(self.result_path)

    subsampled_dataset = self._extract_positive_example_dataset()
    reference_sequences = self._extract_reference_sequences()
    tcr_rep = TCRdistHelper.compute_tcr_dist(subsampled_dataset, [self.label], self.cores)

    combined_distances = tcr_rep.pw_alpha + tcr_rep.pw_beta
    tcr_rep.hcluster_df, tcr_rep.Z = hcluster_diff(clone_df=tcr_rep.clone_df, pwmat=combined_distances,
                                                   x_cols=["epitope"], count_col='count')

    logging.info(f'{TCRdistMotifDiscovery.__name__}: created {tcr_rep.hcluster_df.shape[0]} clusters, now discovering motifs in clusters.')

    figures = []
    tables = []

    for index, row in tcr_rep.hcluster_df.iterrows():
        # clusters below the minimum size are ignored
        if len(row['neighbors_i']) < self.min_cluster_size:
            continue

        cluster_figures, cluster_tables = self._discover_motif_in_cluster(tcr_rep, index, row, reference_sequences)
        figures.extend(cluster_figures)
        tables.extend(cluster_tables)

    res_summary = member_summ(res_df=tcr_rep.hcluster_df, clone_df=tcr_rep.clone_df, addl_cols=['epitope'])
    res_summary.to_csv(self.result_path + "tcrdist_summary.csv")
    tables.append(ReportOutput(path=self.result_path + "tcrdist_summary.csv", name="TCRdist summary (csv)"))

    return ReportResult("TCRdist motif discovery", figures, tables)
def _plot(self, upper_limit, lower_limit):
    """
    Plots the beta distributions of the two classes by calling
    `plot_beta_distribution_binary_class` from the R script StatDistributionPlot.R through rpy2.

    Args:
        upper_limit: forwarded to the R function as `upper_limit`
        lower_limit: forwarded to the R function as `lower_limit`

    Returns:
        ReportOutput with the path `<result_path><result_name>.pdf`.
    """
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.packages import STAP

    pandas2ri.activate()

    script_path = EnvironmentSettings.root_path + "source/visualization/StatDistributionPlot.R"
    with open(script_path) as script_file:
        r_source = script_file.read()

    r_module = STAP(r_source, "plot")

    method = self.method
    label_name = method.label_name

    # distribution parameters and class names are taken from the fitted method
    r_module.plot_beta_distribution_binary_class(
        alpha0=method.alpha_0,
        beta0=method.beta_0,
        alpha1=method.alpha_1,
        beta1=method.beta_1,
        x_label=f"probability that receptor sequence is {label_name}-associated",
        label0=f"{label_name} {method.class_mapping[0]}",
        label1=f"{label_name} {method.class_mapping[1]}",
        upper_limit=upper_limit,
        lower_limit=lower_limit,
        result_path=self.result_path,
        result_name=self.result_name)

    return ReportOutput(f"{self.result_path}{self.result_name}.pdf")
def _plot(self, data_long_format) -> ReportOutput:
    """
    Draws a bar plot of the mean of the "value" column per group, with error bars.

    The data is grouped by the distinct, non-empty columns among x, color, facet_row and
    facet_column; "value" is aggregated with 'mean' and `self.std`.

    Args:
        data_long_format: data frame in long format containing the grouping columns and "value"

    Returns:
        ReportOutput pointing to `<result_path><result_name>.html`.
    """
    candidate_cols = [self.x, self.color, self.facet_row, self.facet_column]
    # unset options are dropped and repeated columns collapsed
    groupby_cols = list(set(filter(None, candidate_cols)))

    grouped = data_long_format.groupby(groupby_cols, as_index=False)
    plotting_data = grouped.agg({"value": ['mean', self.std]})

    # flatten the two-level column index, e.g. ("value", "mean") becomes "valuemean"
    plotting_data.columns = plotting_data.columns.map(''.join)

    axis_labels = {"valuemean": self.y_title, self.x: self.x_title}
    figure = px.bar(plotting_data, x=self.x, y="valuemean", color=self.color, barmode="relative",
                    facet_row=self.facet_row, facet_col=self.facet_column, error_y="valuestd",
                    labels=axis_labels, template='plotly_white',
                    color_discrete_sequence=px.colors.diverging.Tealrose)

    file_path = f"{self.result_path}{self.result_name}.html"
    figure.write_html(file_path)

    return ReportOutput(path=file_path, name="feature bar plot")
def _plot(self, training_dataframe, test_dataframe):
    """
    Plots training and test performance against the feature values as two marker traces.

    Args:
        training_dataframe: data frame with the columns "x" and "y" for the training data
        test_dataframe: data frame with the columns "x" and "y" for the test data

    Returns:
        ReportOutput pointing to `<result_path><result_name>.html`.
    """
    optimization_metric = self.state.optimization_metric.name.lower()

    fig = go.Figure()

    # (data, trace name, marker color) of the two traces, training first
    trace_settings = [(training_dataframe, "training", "#CC79A7"),
                      (test_dataframe, "test", "#009E73")]

    for dataframe, trace_name, color in trace_settings:
        fig.add_trace(go.Scatter(x=dataframe["x"], y=dataframe["y"], name=trace_name, mode="markers",
                                 marker_size=11, marker_color=color,
                                 hovertemplate=f"{trace_name} {optimization_metric}" + ": %{y}<extra></extra>",
                                 opacity=0.8))

    fig.update_layout(legend_title_text="Data", title="Performance across feature values", template="plotly_white")
    fig.update_xaxes(title_text=self.feature)

    if self.is_feature_axis_categorical:
        fig.update_xaxes(type='category')

    fig.update_yaxes(title_text=f"performance ({optimization_metric})")
    fig.update_layout(hovermode="x unified")

    file_path = f"{self.result_path}{self.result_name}.html"
    fig.write_html(file_path)

    return ReportOutput(path=file_path)
def _plot(self, data_long_format):
    """
    Plots the distribution of the "value" column by calling `plot_distribution` from the
    R script Distributions.R through rpy2, forwarding the plot settings of the report.

    Args:
        data_long_format: data in long format handed to the R function as `data`

    Returns:
        ReportOutput with the path `<result_path><result_name>.pdf`.
    """
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.packages import STAP

    pandas2ri.activate()

    script_path = EnvironmentSettings.root_path + "source/visualization/Distributions.R"
    with open(script_path) as script_file:
        r_source = script_file.read()

    r_module = STAP(r_source, "plot")

    # keyword arguments of the R function, grouped by purpose
    aesthetics = dict(x=self.grouping_label, y="value", color=self.color, group=self.group, type=self.type)
    faceting = dict(facet_rows=self.facet_rows, facet_columns=self.facet_columns, facet_type=self.facet_type,
                    facet_scales=self.facet_scales, facet_switch=self.facet_switch, nrow=self.nrow, ncol=self.ncol)
    layout = dict(height=self.height, width=self.width, x_lab=self.x_title, y_lab=self.y_title,
                  color_lab=self.color_title, palette=self.palette)

    r_module.plot_distribution(data=data_long_format, **aesthetics, **faceting, **layout,
                               result_path=self.result_path, result_name=self.result_name)

    return ReportOutput(f"{self.result_path}{self.result_name}.pdf", "feature dist plot")
def _generate(self):
    """
    Exports the coefficients of the method as a table and plots the requested subsets.

    The plot data is sorted by the absolute coefficient value (descending). Depending on the
    settings in `self._coefs_to_plot`, figures are created for all coefficients, the nonzero
    coefficients, the coefficients with an absolute value of at least each cutoff, and the
    n coefficients with the largest absolute value for each requested n.

    Returns:
        ReportResult named `self.name` with the coefficient table and all figures that
        were created (empty subsets produce no figure).
    """
    PathBuilder.build(self.result_path)
    self._set_plotting_parameters()

    plot_data = self._retrieve_plot_data()
    plot_data["abs_coefficients"] = plot_data["coefficients"].abs()
    plot_data.sort_values(by="abs_coefficients", inplace=True, ascending=False)

    result_table_path = self._write_results_table(plot_data[["features", "coefficients"]])
    self._write_settings()

    figures = []

    def add_figure(subset, output_name):
        # _plot returns None for empty subsets; those are not reported
        figure_output = self._plot(plotting_data=subset, output_name=output_name)
        if figure_output is not None:
            figures.append(figure_output)

    requested = self._coefs_to_plot

    if CoefficientPlottingSetting.ALL in requested:
        add_figure(plot_data, "all_coefficients")

    if CoefficientPlottingSetting.NONZERO in requested:
        add_figure(plot_data[plot_data["coefficients"] != 0], "nonzero_coefficients")

    if CoefficientPlottingSetting.CUTOFF in requested:
        for cutoff_val in self._cutoff:
            add_figure(plot_data[plot_data["abs_coefficients"] >= cutoff_val], f"cutoff_{cutoff_val}_coefficients")

    if CoefficientPlottingSetting.N_LARGEST in requested:
        for n_val in self._n_largest:
            add_figure(plot_data.nlargest(n=n_val, columns=["abs_coefficients"]), f"largest_{n_val}_coefficients")

    return ReportResult(self.name,
                        output_tables=[ReportOutput(result_table_path, "features and coefficients csv")],
                        output_figures=figures)