def _generate(self) -> ReportResult:
    """Plot the ROC curve of the method on the test dataset and export it.

    Writes an HTML figure and a CSV table of the curve into
    ``self.result_path`` (built via ``PathBuilder`` first); both files are
    named after ``self.name``.

    Returns:
        ReportResult listing the HTML figure and the CSV table.
    """
    encoded_data = self.test_dataset.encoded_data
    y_score = self.method.predict_proba(encoded_data, self.label)[self.label]

    # NOTE(review): column 0 of the probability matrix is taken as the score
    # passed to roc_curve -- confirm it corresponds to the positive class.
    fpr, tpr, _ = roc_curve(encoded_data.labels[self.label], y_score[:, 0])
    roc_auc = auc(fpr, tpr)

    roc_trace = go.Scatter(x=fpr, y=tpr, mode='lines',
                           line=dict(color='darkorange', width=2),
                           name=f"ROC curve (area = {roc_auc})")
    diagonal_trace = go.Scatter(x=[0, 1], y=[0, 1], mode='lines',
                                line=dict(color='navy', width=2, dash='dash'),
                                showlegend=False)
    layout = go.Layout(title='Receiver operating characteristic example',
                       xaxis=dict(title='False Positive Rate'),
                       yaxis=dict(title='True Positive Rate'))
    fig = go.Figure(data=[roc_trace, diagonal_trace], layout=layout)

    PathBuilder.build(self.result_path)
    path_htm = self.result_path / f"{self.name}.html"
    path_csv = self.result_path / f"{self.name}.csv"

    fig.write_html(str(path_htm))

    # One row per point of the curve with comma-separated columns, so the
    # table matches its "fpr,tpr" header. comments="" keeps numpy from
    # prefixing the header line with "# ".
    np.savetxt(str(path_csv), np.column_stack((fpr, tpr)),
               delimiter=",", header="fpr,tpr", comments="")

    return ReportResult(self.name,
                        output_figures=[ReportOutput(path_htm)],
                        output_tables=[ReportOutput(path_csv)])
def _generate(self) -> ReportResult:
    """Report the sequence overlap between the datasets of all instruction states.

    The result directory is moved to a sub-folder named after the report,
    the label is extracted, and the overlap matrix of the optimal
    hyperparameter items (one per state) is exported as figure and table.

    Returns:
        ReportResult with one figure and one table output.
    """
    self.result_path = PathBuilder.build(self.result_path / self.name)
    self._extract_label()

    hp_items = []
    for state in self.instruction_states:
        hp_items.append(state.optimal_hp_items[self.label.name])

    overlap_matrix = SequenceAnalysisHelper.compute_overlap_matrix(hp_items)
    dataset_names = [state.dataset.name for state in self.instruction_states]

    figure_output = ReportOutput(self._make_figure(overlap_matrix, dataset_names),
                                 'sequence overlap across datasets')
    table_output = ReportOutput(self._export_matrix(overlap_matrix, dataset_names),
                                'sequence overlap across datasets (csv)')

    return ReportResult(
        name=self.name,
        info="A heatmap showing the overlap of disease-associated sequences produced by SequenceAbundance encoders between multiple datasets of different sizes.",
        output_figures=[figure_output],
        output_tables=[table_output])
def _discover_motif_in_cluster(self, tcr_rep, index, row, negative_examples=None) -> Tuple[List[ReportOutput], List[ReportOutput]]:
    """Compute a motif for each chain ('a' and 'b') of one cluster and store it.

    For every chain an SVG logo and a CSV with the motif data are written to
    ``self.result_path``; file names contain the chain and ``index + 1``.

    Args:
        tcr_rep: object exposing ``clone_df``, the data frame of all clones.
        index: zero-based index of the cluster.
        row: cluster record; ``row['neighbors_i']`` holds the positional
            indices of the cluster members in ``tcr_rep.clone_df``.
        negative_examples: per-chain reference sequences, indexed by chain;
            only read when ``self.use_reference_sequences`` is set.

    Returns:
        Tuple ``(figure_outputs, table_outputs)`` with one entry per chain each.
    """
    from tcrdist.adpt_funcs import get_centroid_seq
    from tcrdist.summarize import _select
    from palmotif import compute_pal_motif
    from palmotif import svg_logo

    neighbor_rows = row['neighbors_i']
    cluster_df = tcr_rep.clone_df.iloc[neighbor_rows,]
    cluster_number = index + 1
    figure_outputs, table_outputs = [], []

    logging.info(f"{TCRdistMotifDiscovery.__name__}: in cluster {cluster_number}, there are {cluster_df.shape[0]} neighbors.")

    for chain in ['a', 'b']:
        cdr3_column = f'cdr3_{chain}_aa'

        if cluster_df.shape[0] > 2:
            # NOTE(review): the call does not depend on `chain`, so the same
            # centroid is requested for both chains -- confirm this is intended.
            centroid, *_ = get_centroid_seq(df=cluster_df)
        else:
            centroid = cluster_df[cdr3_column].to_list()[0]

        motif, _ = compute_pal_motif(
            seqs=_select(df=tcr_rep.clone_df, iloc_rows=neighbor_rows, col=cdr3_column),
            centroid=centroid,
            refs=negative_examples[chain] if self.use_reference_sequences else None)

        figure_path = self.result_path / f"motif_{chain}_{cluster_number}.svg"
        svg_logo(motif, filename=figure_path)

        motif_data_path = self.result_path / f"motif_{chain}_{cluster_number}.csv"
        motif.to_csv(motif_data_path)

        figure_name = f'Motif {cluster_number} ({Chain.get_chain(chain.upper()).name.lower()} chain)'
        figure_outputs.append(ReportOutput(figure_path, figure_name))

        table_name = f'motif {cluster_number} ({Chain.get_chain(chain.upper()).name.lower()} chain) csv data'
        table_outputs.append(ReportOutput(motif_data_path, table_name))

    return figure_outputs, table_outputs
def _generate(self) -> ReportResult:
    """Plot FP and FN values per metadata label and export them as tables.

    A subplot grid with one row per metadata label and one column per metric
    ("FP", "FN") is written to ``plots.html``; the per-label data frames are
    passed to ``self._write_results_table``.

    Returns:
        ReportResult with the figure and the first element of the value
        returned by ``_write_results_table`` as table output.
    """
    PathBuilder.build(self.result_path)
    figure_outputs = []

    # make predictions; label = disease
    predictions = self.method.predict(self.test_dataset.encoded_data, self.label)[self.label]
    true_labels = self.test_dataset.get_metadata(self.metadata_labels + [self.label])

    metrics = ["FP", "FN"]
    plot = make_subplots(rows=len(self.metadata_labels), cols=2)
    tables = []

    for plot_row, meta_label in enumerate(self.metadata_labels, start=1):
        csv_data = {}

        for plot_col, metric in enumerate(metrics, start=1):
            plotting_data = self._metrics(metric=metric, label=self.label, meta_label=meta_label,
                                          predictions=predictions, true_labels=true_labels)
            csv_data[metric] = plotting_data[metric]

            bars = go.Bar(x=plotting_data[meta_label], y=plotting_data[metric])
            plot.add_trace(bars, row=plot_row, col=plot_col)
            plot.update_xaxes(title_text=f"{meta_label}", row=plot_row, col=plot_col, type='category')
            plot.update_yaxes(title_text=metric, row=plot_row, col=plot_col,
                              rangemode="nonnegative", tick0=0, dtick=1)

        # the metadata column is taken from the data of the last metric
        csv_data[f"{meta_label}"] = plotting_data[f"{meta_label}"]
        tables.append(pd.DataFrame(csv_data))

    plot.update_traces(marker_color=px.colors.sequential.Teal[3], showlegend=False)

    filename = self.result_path / "plots.html"
    plot.write_html(str(filename))
    figure_outputs.append(ReportOutput(filename))

    result_table_path = self._write_results_table(tables, self.metadata_labels)

    return ReportResult(name=self.name,
                        output_figures=figure_outputs,
                        output_tables=[ReportOutput(result_table_path[0])])
def _generate(self) -> ReportResult:
    """Compute contributions for the test dataset with ``compute_contributions``.

    The test metadata is converted to HDF5 by the method, a data loader over
    all test examples is created, and the model stored for ``self.label`` is
    passed to ``compute_contributions`` together with ``self.result_path``
    as result directory.

    Returns:
        ReportResult whose figures point to ``self.filename_inputs`` and
        ``self.filename_kernels``.
    """
    PathBuilder.build(self.result_path)

    encoded_test_data = self.test_dataset.encoded_data
    metadata_path = encoded_test_data.info['metadata_filepath']
    hdf5_filepath = self.method._metadata_to_hdf5(metadata_path, [self.label])

    example_count = len(encoded_test_data.example_ids)
    all_indices = np.array(range(example_count))

    dataloader = self.method.make_data_loader(hdf5_filepath,
                                              pre_loaded_hdf5_file=None,
                                              indices=all_indices,
                                              label=self.label,
                                              eval_only=True,
                                              is_train=False)
    model = self.method.get_model(self.label)[self.label]

    compute_contributions(intgrds_set_loader=dataloader,
                          deeprc_model=model,
                          n_steps=self.n_steps,
                          threshold=self.threshold,
                          resdir=self.result_path,
                          filename_inputs=self.filename_inputs,
                          filename_kernels=self.filename_kernels)

    # NOTE(review): the outputs reference the bare file names, not paths below
    # self.result_path -- confirm where compute_contributions stores its files.
    figure_outputs = [ReportOutput(self.filename_inputs),
                      ReportOutput(self.filename_kernels)]
    return ReportResult(self.name, output_figures=figure_outputs)
def _store_sequence_distribution_data(self, fig, dfs, chains):
    """Write the sequence length distribution figure and one CSV table per chain.

    Args:
        fig: figure object providing ``write_html``.
        dfs: data frames; the i-th frame is written to the table of the i-th chain.
        chains: chain identifiers used in the table file names.

    Returns:
        Tuple of the figure output and the list of table outputs.
    """
    figure_path = self.result_path / "sequence_length_distribution.html"
    fig.write_html(str(figure_path))
    image_output = ReportOutput(figure_path, name="sequence length distribution per chain")

    table_outputs = []
    for position in range(len(chains)):
        table_path = self.result_path / f"sequence_length_distribution_chain_{chains[position]}.csv"
        table_outputs.append(ReportOutput(table_path))

    for position, df in enumerate(dfs):
        df.to_csv(table_outputs[position].path, index=False)

    return image_output, table_outputs
def plot_roc(self, optimal_hp_items, label: Label, colors) -> Tuple[ReportOutput, List[ReportOutput]]:
    """Draw the ROC curves of all datasets in one figure and export the curve data.

    Items without a test predictions file are skipped with a warning. For the
    others the predictions CSV is read, the curve and its AUC are computed
    and the FPR/TPR values are stored in a CSV file per dataset.

    Args:
        optimal_hp_items: items providing ``test_predictions_path``; the i-th
            item belongs to ``self.instruction_states[i]``.
        label: label whose ``name`` and ``positive_class`` select the columns
            of the predictions file.
        colors: colors indexed by item position.

    Returns:
        Tuple of the figure output and the list of per-dataset table outputs.
    """
    report_data_outputs = []
    figure = go.Figure()

    baseline = go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='baseline',
                          line=dict(color=PerformanceOverview.PLOTLY_BLACK, dash='dash'),
                          hoverinfo="skip")
    figure.add_trace(baseline)

    for index, item in enumerate(optimal_hp_items):
        if item.test_predictions_path is None:
            logging.warning(f'{PerformanceOverview.__name__}: there are no test predictions for dataset '
                            f'{self.instruction_states[index].dataset.name}, skipping this dataset when generating performance overview...')
            continue

        predictions = pd.read_csv(item.test_predictions_path)
        true_class = predictions[f"{label.name}_true_class"].values
        predicted_class = predictions[f"{label.name}_{label.positive_class}_proba"].values

        fpr, tpr, _ = metrics.roc_curve(y_true=true_class, y_score=predicted_class)
        auc_value = metrics.roc_auc_score(true_class, predicted_class)

        # the legend entry (including the AUC) is also used in the file name
        name = self.instruction_states[index].dataset.name + f' (AUC = {round(auc_value, 2)})'

        curve = go.Scatter(x=fpr, y=tpr, mode='lines', name=name,
                           marker=dict(color=colors[index], line=dict(width=3)),
                           hoverinfo="skip")
        figure.add_trace(curve)

        data_path = self.result_path / f"roc_curve_data_{name}.csv"
        curve_table = pd.DataFrame({"FPR": fpr, "TPR": tpr})
        curve_table.to_csv(data_path, index=False)
        report_data_outputs.append(ReportOutput(data_path, f'ROC curve data for dataset {name} (csv)'))

    figure_path = self.result_path / "roc_curve.html"
    figure.update_layout(template='plotly_white',
                         xaxis_title='false positive rate',
                         yaxis_title='true positive rate')
    figure.write_html(str(figure_path))

    return ReportOutput(figure_path, 'ROC curve'), report_data_outputs
def _generate(self) -> ReportResult:
    """Report the sequence overlap between the datasets of all instruction states.

    The result directory is moved to a sub-folder named after the report,
    the label is extracted, and the overlap matrix of the optimal
    hyperparameter items (one per state) is exported as figure and table.

    Returns:
        ReportResult named after the report, with one figure and one table.
    """
    self.result_path = PathBuilder.build(self.result_path / self.name)
    self._extract_label()

    hp_items = [state.optimal_hp_items[self.label] for state in self.instruction_states]
    overlap_matrix = SequenceAnalysisHelper.compute_overlap_matrix(hp_items)
    labels = [state.dataset.name for state in self.instruction_states]

    figure_path = self._make_figure(overlap_matrix, labels)
    data_path = self._export_matrix(overlap_matrix, labels)

    # The result carries the report name, like the results of the other
    # reports; it was left unset here before.
    return ReportResult(
        name=self.name,
        output_figures=[ReportOutput(figure_path, 'sequence overlap across datasets')],
        output_tables=[ReportOutput(data_path, 'sequence overlap across datasets (csv)')])
def _store_dataframes(self, training_dataframe: pd.DataFrame, test_dataframe: pd.DataFrame) -> List[ReportOutput]:
    """Write the training and test performance tables as CSV files.

    Args:
        training_dataframe: written to ``training_performance.csv``.
        test_dataframe: written to ``test_performance.csv``.

    Returns:
        Two outputs, training first, then test.
    """
    stored = (
        (training_dataframe, "training_performance.csv", f"Training performance w.r.t. {self.feature} values"),
        (test_dataframe, "test_performance.csv", f"Test performance w.r.t. {self.feature} values"),
    )

    paths = [self.result_path / file_name for _, file_name, _ in stored]
    for (dataframe, _, _), path in zip(stored, paths):
        dataframe.to_csv(path, index=False)

    return [ReportOutput(path=path, name=description)
            for path, (_, _, description) in zip(paths, stored)]
def _generate_heatmap(self, x, y, z, metric, output, xlabel='Prediction', ylabel='Ground Truth', zlabel='Count'):
    """Store a heatmap of ``z`` as HTML and the matrix itself as CSV.

    The figure shows ``z`` flipped vertically (``np.flipud``) against the
    given ``y`` values, while the CSV contains ``z`` as passed in.

    Args:
        x: values along the x axis; also used for the CSV column names.
        y: values along the y axis; also used for the CSV index names.
        z: matrix of values.
        metric: metric name, used in title, file names and output names.
        output: dict with 'tables' and 'figures' lists that are extended in place.
        xlabel, ylabel, zlabel: axis and hover labels.
    """
    metric_key = metric.lower()
    path_csv = self.result_path / f"{self.name}_{metric_key}.csv"
    path_html = self.result_path / f"{self.name}_{metric_key}.html"

    z_flip = np.flipud(z)
    hovertext = [
        [f"{xlabel}: {xx}<br />{ylabel}: {yy}<br />{zlabel}: {z_flip[yi][xi]}" for xi, xx in enumerate(x)]
        for yi, yy in enumerate(y)
    ]

    heatmap = go.Heatmap(z=z_flip, x=x, y=y, hoverongaps=False, colorscale='burgyl',
                         hoverinfo='text', text=hovertext)
    layout = go.Layout(title=f'Evaluation: {metric} ({self.label})',
                       xaxis=dict(title=xlabel),
                       yaxis=dict(title=ylabel))
    go.Figure(data=[heatmap], layout=layout).write_html(str(path_html))

    z_df = pd.DataFrame(z)
    z_df.columns = f'{xlabel} (' + pd.Index(map(str, x)) + ')'
    z_df.index = f'{ylabel} (' + pd.Index(map(str, y)) + ')'
    z_df.to_csv(path_csv)

    output['tables'].append(ReportOutput(path_csv, f"TrainingPerformance table ({metric_key})"))
    output['figures'].append(ReportOutput(path_html, f"TrainingPerformance html ({metric_key})"))
def _plot(self, plotting_data, output_name):
    """Draw a box plot of coefficients per maximum seed overlap.

    Args:
        plotting_data: data frame with the columns "max_seed_overlap" and
            "coefficients".
        output_name: file name (without extension) of the HTML figure.

    Returns:
        The figure output, or None when ``plotting_data`` is empty (a warning
        is logged in that case and nothing is written).
    """
    import plotly.express as px

    if plotting_data.empty:
        logging.warning(f"Coefficients: empty data subset specified, skipping {output_name} plot...")
        return None

    axis_titles = {"max_seed_overlap": self._x_axis_title,
                   "coefficients": self._y_axis_title}
    figure = px.box(plotting_data,
                    x="max_seed_overlap",
                    y="coefficients",
                    labels=axis_titles,
                    template='plotly_white',
                    color_discrete_sequence=px.colors.diverging.Tealrose)

    filename = self.result_path / f"{output_name}.html"
    figure.write_html(str(filename))

    description = f"Overlap between implanted motif seeds and features versus {self._y_axis_title.lower()}"
    return ReportOutput(filename, description)
def _write_paired_matches(self, paired_matches_path: Path) -> List[ReportOutput]:
    """Write one paired-matches CSV file per example of the encoded dataset.

    The file name is built from the example id and all label/value pairs of
    that example. The writer is chosen by the encoding name; for any other
    encoding no file is written, but the output is still listed.

    Args:
        paired_matches_path: directory for the files (created if missing).

    Returns:
        One output per example, in example order.
    """
    PathBuilder.build(paired_matches_path)
    report_outputs = []

    # TODO: don't mention subject in the name twice
    for i, example_id in enumerate(self.dataset.encoded_data.example_ids):
        label_part = "_".join(f"{label}_{values[i]}"
                              for label, values in self.dataset.encoded_data.labels.items())
        file_path = paired_matches_path / f"example_{example_id}_{label_part}.csv"

        encoding = self.dataset.encoded_data.encoding
        if encoding == "MatchedReceptorsEncoder":
            self._write_paired_receptor_matches_for_repertoire(self.dataset.encoded_data.examples[i], file_path)
        elif encoding == "MatchedRegexEncoder":
            self._write_paired_regex_matches_for_repertoire(self.dataset.encoded_data.examples[i], file_path)

        report_outputs.append(ReportOutput(file_path, f"Example {example_id} paired matches"))

    return report_outputs
def _write_repertoire_sizes(self):
    """
    Writes the repertoire sizes (# clones & # reads) per subject, per chain.

    The table has one row for every combination of example id and chain
    found in the feature annotations; counts are accumulated over all
    repertoires of the dataset and stored in ``repertoire_sizes.csv``.
    """
    subject_ids = self.dataset.encoded_data.example_ids
    chain_names = sorted(set(self.dataset.encoded_data.feature_annotations["chain"]))

    results_df = pd.DataFrame(list(itertools.product(subject_ids, chain_names)),
                              columns=["subject_id", "chain"])
    results_df["n_reads"] = 0
    results_df["n_clones"] = 0

    for repertoire in self.dataset.repertoires:
        counts = repertoire.get_counts()
        chains = repertoire.get_chains()

        for chain in chain_names:
            chain_counts = counts[chains == Chain.get_chain(chain.upper())]
            # rows of this subject/chain combination in the result table
            target_rows = (results_df.subject_id == repertoire.metadata["subject_id"]) & (results_df.chain == chain)

            results_df.loc[target_rows, 'n_reads'] += np.sum(chain_counts)
            results_df.loc[target_rows, 'n_clones'] += len(chain_counts)

    results_path = self.result_path / "repertoire_sizes.csv"
    results_df.to_csv(results_path, index=False)

    return ReportOutput(results_path, "Repertoire sizes")
def _plot(self, data_long_format) -> ReportOutput:
    """Draw a bar plot of the mean feature value per group.

    Values are grouped by the distinct, non-empty columns among x, color and
    the facets, and aggregated with 'mean' and ``self.std``. Error bars are
    only shown when ``self.show_error_bar`` is set.

    Args:
        data_long_format: data frame with a "value" column and the grouping columns.

    Returns:
        Output pointing to the HTML figure named after ``self.result_name``.
    """
    candidates = (self.x, self.color, self.facet_row, self.facet_column)
    groupby_cols = list({column for column in candidates if column})

    grouped = data_long_format.groupby(groupby_cols, as_index=False)
    plotting_data = grouped.agg({"value": ['mean', self.std]})
    # flatten the two-level column index, e.g. ("value", "mean") -> "valuemean"
    plotting_data.columns = plotting_data.columns.map(''.join)

    if self.show_error_bar:
        error_y = "valuestd"
    else:
        error_y = None

    axis_titles = {"valuemean": self.y_title, self.x: self.x_title}
    figure = px.bar(plotting_data,
                    x=self.x,
                    y="valuemean",
                    color=self.color,
                    barmode="relative",
                    facet_row=self.facet_row,
                    facet_col=self.facet_column,
                    error_y=error_y,
                    labels=axis_titles,
                    template='plotly_white',
                    color_discrete_sequence=px.colors.diverging.Tealrose)

    file_path = self.result_path / f"{self.result_name}.html"
    figure.write_html(str(file_path))

    return ReportOutput(path=file_path, name="Average feature values")
def _export_matrix(self) -> ReportOutput:
    """Create a file for the design matrix in the desired format.

    ``self.file_format`` is the format name, optionally followed by ".zip".
    "hdf5" is written with h5py and "csv" (for data with at most two
    dimensions) with numpy; every other case is stored as ".npy". With a
    ".zip" format the file is compressed and the uncompressed file removed.

    Returns:
        Output pointing to the written (possibly zipped) file.
    """
    data = self._get_data()
    ext = os.path.splitext(self.file_format)[0]
    file_path = (self.result_path / "design_matrix").with_suffix('.' + ext)

    if ext == "hdf5":
        # the dataset inside the HDF5 file is named after the file path
        with h5py.File(str(file_path), 'w') as hf_object:
            hf_object.create_dataset(str(file_path), data=data)
    elif len(data.shape) <= 2 and ext == "csv":
        feature_names = self.dataset.encoded_data.feature_names
        if feature_names is not None:
            header = ",".join(str(name) for name in feature_names)
        else:
            header = ""
        np.savetxt(fname=str(file_path), X=data, delimiter=",", comments='', header=header)
    else:
        if ext != "npy":
            logging.info('The selected Report format is not compatible, .npy is used instead')
            file_path = file_path.with_suffix(".npy")
            ext = "npy"
        np.save(str(file_path), data)

    if not self.file_format.endswith(".zip"):
        return ReportOutput(file_path, "design matrix")

    # compress into <name>.<ext>.zip and drop the uncompressed file
    file_path_zip = file_path.with_suffix('.' + ext + '.zip')
    with zipfile.ZipFile(str(file_path_zip), 'w') as zipped_file:
        zipped_file.write(str(file_path), compress_type=zipfile.ZIP_DEFLATED)
    os.remove(str(file_path))

    return ReportOutput(file_path_zip, "design matrix")
def _plot_fc_figure(self, df, bias):
    """Plot the fully-connected layer weights as bars next to a table with the bias.

    Args:
        df: data frame with the columns "names" and "weights".
        bias: values shown in the cells of the "bias" table.

    Returns:
        Output pointing to ``fully_connected_layer_weights.html``.
    """
    figure_path = self.result_path / "fully_connected_layer_weights.html"

    subplot_specs = [[{"type": "bar"}, {'type': "table"}]]
    fig = make_subplots(rows=1, cols=2, column_widths=[0.8, 0.2], specs=subplot_specs)

    weight_bars = go.Bar(x=df["names"],
                         y=df["weights"],
                         name="weights",
                         hovertemplate='Weight for %{x}: %{y:.4f}<extra></extra>',
                         hoverlabel={"font_color": "white"},
                         marker_color=px.colors.diverging.Tealrose[0])
    fig.add_trace(weight_bars, row=1, col=1)

    bias_table = go.Table(header={"values": ["bias"]}, cells={"values": bias})
    bias_table.cells.format = [[None], ['.3f']]
    fig.add_trace(bias_table, row=1, col=2)

    fig.update_layout(template="plotly_white")
    fig.write_html(str(figure_path))

    return ReportOutput(figure_path, "fully-connected layer weights")
def _plot_sparse(self, data_long_format) -> ReportOutput:
    """Draw box plots of the non-zero feature values.

    Rows with a zero "value" are dropped, the per-group sum is attached as
    "value_sum", and the remaining values are plotted against ``self.x``,
    optionally split by color and facets.

    Args:
        data_long_format: data frame with a "value" column and the columns
            named by ``self.x`` and the optional color/facet settings.

    Returns:
        Output pointing to the HTML figure named after ``self.result_name``.
    """
    optional_columns = [self.color, self.facet_row, self.facet_column]
    # Grouping columns without duplicates (the same column may serve e.g. as
    # x and as color); dict.fromkeys keeps the original order.
    group_columns = list(dict.fromkeys(
        [self.x] + [column for column in optional_columns if column is not None]))

    nonzero_data = data_long_format.loc[data_long_format.value != 0,
                                        [group_columns[0], "value"] + group_columns[1:]]

    total_counts = nonzero_data.groupby(group_columns, as_index=False).agg({"value": 'sum'})

    # Merge on every grouping column: the totals hold one row per group, so
    # joining on x alone would duplicate each data row once per group that
    # shares its x value.
    nonzero_data = nonzero_data.merge(total_counts, on=group_columns, how="left", suffixes=('', '_sum')) \
        .fillna(0) \
        .sort_values(by=self.x) \
        .reset_index(drop=True)

    figure = px.box(nonzero_data,
                    x=self.x,
                    y="value",
                    color=self.color,
                    facet_row=self.facet_row,
                    facet_col=self.facet_column,
                    labels={
                        # the plotted column is "value"; keying the title on
                        # "valuemean" left the y axis untitled
                        "value": self.y_title,
                        self.x: self.x_title,
                    },
                    template='plotly_white',
                    color_discrete_sequence=px.colors.diverging.Tealrose)

    file_path = self.result_path / f"{self.result_name}.html"
    figure.write_html(str(file_path))

    return ReportOutput(path=file_path, name="feature boxplots")
def _generate(self) -> ReportResult:
    """Export the receptors of the dataset as a tab-separated GLIPH2 input table.

    Each receptor contributes one row with the beta and alpha chain
    sequences, the beta V and J genes, a "subject:condition" entry and a
    count. The table is written to ``exported_data.tsv``.

    Returns:
        ReportResult with the exported table as its only output.
    """
    PathBuilder.build(self.result_path)

    rows = []
    for index, receptor in enumerate(self.dataset.get_data()):
        alpha_chain = receptor.get_chain("alpha")
        beta_chain = receptor.get_chain("beta")
        metadata = receptor.metadata

        # The condition is read by subscription below, so the metadata is
        # mapping-like and a plain getattr never finds a "subject_id" key.
        # Look it up as attribute first and as key second; fall back to the
        # position of the receptor when neither is available.
        subject_id = getattr(metadata, 'subject_id', None)
        if subject_id is None:
            try:
                subject_id = metadata['subject_id']
            except (KeyError, IndexError, TypeError):
                subject_id = None
        if subject_id is None:
            subject_id = str(index)

        # a missing count is exported as 1
        beta_metadata = beta_chain.metadata
        if beta_metadata is not None and beta_metadata.count is not None:
            count = beta_metadata.count
        else:
            count = 1

        rows.append({
            "CDR3b": beta_chain.amino_acid_sequence,
            "TRBV": beta_chain.metadata.v_gene,
            "TRBJ": beta_chain.metadata.j_gene,
            "CDR3a": alpha_chain.amino_acid_sequence,
            "subject:condition": f"{subject_id}:{metadata[self.condition]}",
            "count": count,
        })

    df = pd.DataFrame(rows, columns=["CDR3b", "TRBV", "TRBJ", "CDR3a", "subject:condition", "count"])

    file_path = self.result_path / "exported_data.tsv"
    df.to_csv(file_path, sep="\t", index=False)

    return ReportResult(self.name,
                        output_tables=[ReportOutput(file_path, "exported data in GLIPH2 format")])
def _generate(self) -> ReportResult:
    """Write a textual description of the dataset to ``dataset_description.txt``.

    The generic text is followed by a section for repertoire, receptor or
    sequence datasets, depending on the dataset class. The output is named
    after the dataset name, or its identifier when the name is None.

    Returns:
        ReportResult with the text file as its only output.
    """
    PathBuilder.build(self.result_path)
    text_path = self.result_path / "dataset_description.txt"

    if self.dataset.name is not None:
        dataset_name = self.dataset.name
    else:
        dataset_name = self.dataset.identifier

    sections = [self._get_generic_dataset_text()]
    if isinstance(self.dataset, RepertoireDataset):
        sections.append(self._get_repertoire_dataset_text())
    elif isinstance(self.dataset, ReceptorDataset):
        sections.append(self._get_receptor_dataset_text())
    elif isinstance(self.dataset, SequenceDataset):
        sections.append(self._get_sequence_dataset_text())

    output_text = sections[0]
    for section in sections[1:]:
        output_text += section
    text_path.write_text(output_text)

    return ReportResult(
        name=self.name,
        info="A simple text-based overview of the properties of any dataset, including the dataset name, size, and metadata labels.",
        output_text=[ReportOutput(text_path, f"Description of dataset {dataset_name}")])
def _generate(self) -> ReportResult:
    """Cluster the positive examples by TCRdist and discover motifs per cluster.

    Distances are computed for the dataset of positive examples, clusters
    are built with ``hcluster_diff`` on the summed alpha and beta distance
    matrices, and motifs are discovered for every cluster with at least
    ``self.min_cluster_size`` members. A cluster summary is written to
    ``tcrdist_summary.csv``.

    Returns:
        ReportResult with the motif figures, the motif tables and the summary.
    """
    from immuneML.util.TCRdistHelper import TCRdistHelper
    from tcrdist.rep_diff import hcluster_diff
    from tcrdist.summarize import member_summ

    PathBuilder.build(self.result_path)

    subsampled_dataset = self._extract_positive_example_dataset()
    reference_sequences = self._extract_reference_sequences()
    tcr_rep = TCRdistHelper.compute_tcr_dist(subsampled_dataset, [self.label.name], self.cores)

    tcr_rep.hcluster_df, tcr_rep.Z = hcluster_diff(clone_df=tcr_rep.clone_df,
                                                   pwmat=tcr_rep.pw_alpha + tcr_rep.pw_beta,
                                                   x_cols=["epitope"],
                                                   count_col='count')

    figures, tables = [], []
    logging.info(f'{TCRdistMotifDiscovery.__name__}: created {tcr_rep.hcluster_df.shape[0]} clusters, now discovering motifs in clusters.')

    for index, row in tcr_rep.hcluster_df.iterrows():
        if len(row['neighbors_i']) < self.min_cluster_size:
            continue  # cluster too small for motif discovery
        cluster_figures, cluster_tables = self._discover_motif_in_cluster(tcr_rep, index, row, reference_sequences)
        figures.extend(cluster_figures)
        tables.extend(cluster_tables)

    summary_path = self.result_path / "tcrdist_summary.csv"
    res_summary = member_summ(res_df=tcr_rep.hcluster_df, clone_df=tcr_rep.clone_df, addl_cols=['epitope'])
    res_summary.to_csv(summary_path)
    tables.append(ReportOutput(path=summary_path, name="TCRdist summary (csv)"))

    return ReportResult(name=self.name, info="TCRdist motif discovery",
                        output_figures=figures, output_tables=tables)
def _store_fc_table(self, df, bias):
    """Store the fully-connected layer weights, followed by the bias, as CSV.

    Args:
        df: data frame with the columns "weights" and "names"; it is not
            modified.
        bias: bias value, added as the last row under the name "bias".

    Returns:
        Output pointing to ``fully_connected_layer_weights.csv``.
    """
    import pandas as pd

    # DataFrame.append returned a new frame that was thrown away, so the bias
    # row never reached the file (and the method no longer exists in
    # pandas 2). Build the extended table explicitly instead.
    bias_row = pd.DataFrame([{"weights": bias, "names": "bias"}])
    table = pd.concat([df, bias_row], ignore_index=True)

    table_path = self.result_path / "fully_connected_layer_weights.csv"
    table.to_csv(table_path, index=False)

    return ReportOutput(table_path, "fully-connected layer weights")
def _export_matrix(self, overlap_matrix, filename, row_col_names) -> ReportOutput:
    """Store the overlap matrix as ``<filename>.csv``.

    Args:
        overlap_matrix: square matrix of values.
        filename: file name without extension; its underscore-separated
            parts followed by "data" form the output name.
        row_col_names: names used for both rows and columns.

    Returns:
        Output pointing to the CSV file.
    """
    matrix_df = pd.DataFrame(overlap_matrix, columns=row_col_names, index=row_col_names)

    data_path = self.result_path / f"{filename}.csv"
    matrix_df.to_csv(data_path)

    name_parts = filename.split('_')
    name_parts.append('data')
    return ReportOutput(data_path, " ".join(name_parts))
def _generate_barplot(self, df, output):
    """Store the metrics table as CSV and as a bar plot.

    Args:
        df: data frame indexed by metric, with a column named after the label.
        output: dict with 'tables' and 'figures' lists that are extended in place.
    """
    import plotly.express as px

    path_csv = self.result_path / f"{self.name}.csv"
    path_html = self.result_path / f"{self.name}.html"

    df.to_csv(path_csv)

    plot_settings = dict(x=df.index,
                         y=self.label.name,
                         labels={'index': "metrics"},
                         template='plotly_white',
                         color_discrete_sequence=px.colors.diverging.Tealrose,
                         title=f"Evaluation metrics ({self.label})")
    px.bar(df, **plot_settings).write_html(str(path_html))

    output['tables'].append(ReportOutput(path_csv, "training performance in csv"))
    output['figures'].append(ReportOutput(path_html, "training performance on selected metrics"))
def export_receptorlist(self, receptors, result_path: Path):
    """Export the chain pairs of the receptors as network files.

    Four files are written to ``result_path``: node and edge metadata
    (tab-separated, with header) and two SIF files without header, one with
    all chain pairs and one with the pairs whose first or second chain
    occurs in more than one pair.

    Args:
        receptors: receptors providing ``get_chain`` for both names in
            ``self.chains``.
        result_path: existing directory for the files.

    Returns:
        Outputs for node metadata, edge metadata, all chains and shared
        chains, in this order.
    """
    first_chain_type, second_chain_type = self.chains[0], self.chains[1]

    export_list = []
    node_metadata_list = []
    edge_metadata_list = []

    for receptor in receptors:
        first_chain = receptor.get_chain(first_chain_type)
        second_chain = receptor.get_chain(second_chain_type)
        first_chain_name = self.get_shared_name(first_chain)
        second_chain_name = self.get_shared_name(second_chain)

        export_list.append([first_chain_name, "pair", second_chain_name])

        node_metadata_list.append([first_chain_name, first_chain_type] + self.get_formatted_node_metadata(first_chain))
        node_metadata_list.append([second_chain_name, second_chain_type] + self.get_formatted_node_metadata(second_chain))

        edge_metadata_list.append([f"{first_chain_name} (pair) {second_chain_name}"]
                                  + self.get_formatted_edge_metadata(first_chain, second_chain))

    full_df = pd.DataFrame(export_list, columns=[first_chain_type, "relationship", second_chain_type])
    node_meta_df = pd.DataFrame(node_metadata_list,
                                columns=["shared_name", "chain", "sequence", "v_subgroup", "v_gene", "j_subgroup", "j_gene"]
                                + self.additional_node_attributes)
    edge_meta_df = pd.DataFrame(edge_metadata_list, columns=["shared_name"] + self.additional_edge_attributes)

    # collapse identical nodes and keep how often each of them occurred
    node_cols = list(node_meta_df.columns)
    node_meta_df["n_duplicates"] = 1
    node_meta_df = node_meta_df.groupby(node_cols, as_index=False)["n_duplicates"].sum()

    edge_meta_df.drop_duplicates(inplace=True)

    node_meta_df.to_csv(result_path / "node_metadata.tsv", sep="\t", index=False, header=True)
    edge_meta_df.to_csv(result_path / "edge_metadata.tsv", sep="\t", index=False, header=True)

    if self.drop_duplicates:
        full_df.drop_duplicates(inplace=True)

    full_df.to_csv(result_path / "all_chains.sif", sep="\t", index=False, header=False)

    # The columns are named after self.chains, so they have to be addressed
    # through these names; hard-coded "alpha"/"beta" failed for any other
    # pair of chains.
    is_shared = full_df.duplicated([first_chain_type], keep=False) | full_df.duplicated([second_chain_type], keep=False)
    shared_df = full_df[is_shared]
    shared_df.to_csv(result_path / "shared_chains.sif", sep="\t", index=False, header=False)

    return [ReportOutput(path=result_path / "node_metadata.tsv"),
            ReportOutput(path=result_path / "edge_metadata.tsv"),
            ReportOutput(path=result_path / "all_chains.sif"),
            ReportOutput(path=result_path / "shared_chains.sif")]
def _generate(self) -> ReportResult:
    """Export the relevant sequences of the encoded dataset with renamed columns.

    The table is read from the path stored under "relevant_sequence_path" in
    the encoded data info, its columns are renamed according to
    ``self._compute_column_mapping`` and it is written to
    ``relevant_sequences.csv``.

    Returns:
        ReportResult with the exported table as its only output.
    """
    source_path = self.dataset.encoded_data.info["relevant_sequence_path"]
    sequences = pd.read_csv(source_path)
    sequences.rename(columns=self._compute_column_mapping(sequences), inplace=True)

    PathBuilder.build(self.result_path)
    filename = self.result_path / "relevant_sequences.csv"
    sequences.to_csv(filename, index=False)

    table_output = ReportOutput(filename, "relevant sequences")
    return ReportResult(self.name, output_tables=[table_output])
def _generate(self):
    """Export the coefficients table and plot the requested coefficient subsets.

    The plot data is sorted by absolute coefficient value (descending). One
    figure is drawn for each setting in ``self._coefs_to_plot``: all
    coefficients, the non-zero ones, those reaching each cutoff, and the n
    largest for each n. Subsets for which ``_plot`` returns None are left
    out of the result.

    Returns:
        ReportResult with the coefficients table and the produced figures.
    """
    PathBuilder.build(self.result_path)
    figure_outputs = []

    def add_plot(data, name):
        # collect every plot result; missing ones are filtered at the end
        figure_outputs.append(self._plot(plotting_data=data, output_name=name))

    self._set_plotting_parameters()

    plot_data = self._retrieve_plot_data()
    plot_data["abs_coefficients"] = abs(plot_data["coefficients"])
    plot_data.sort_values(by="abs_coefficients", inplace=True, ascending=False)

    result_table_path = self._write_results_table(plot_data[["features", "coefficients"]])
    self._write_settings()

    if CoefficientPlottingSetting.ALL in self._coefs_to_plot:
        add_plot(plot_data, "all_coefficients")

    if CoefficientPlottingSetting.NONZERO in self._coefs_to_plot:
        add_plot(plot_data[plot_data["coefficients"] != 0], "nonzero_coefficients")

    if CoefficientPlottingSetting.CUTOFF in self._coefs_to_plot:
        for cutoff_val in self._cutoff:
            add_plot(plot_data[plot_data["abs_coefficients"] >= cutoff_val],
                     f"cutoff_{cutoff_val}_coefficients")

    if CoefficientPlottingSetting.N_LARGEST in self._coefs_to_plot:
        for n_val in self._n_largest:
            add_plot(plot_data.nlargest(n=n_val, columns=["abs_coefficients"]),
                     f"largest_{n_val}_coefficients")

    return ReportResult(
        self.name,
        info=f"{self._y_axis_title}s of the trained {self.method.__class__.__name__} model",
        output_tables=[ReportOutput(result_table_path, "features and coefficients csv")],
        output_figures=[output for output in figure_outputs if output is not None])
def _generate(self) -> ReportResult:
    """Compute the contributions of inputs and kernels for the test dataset.

    The test metadata is converted to HDF5 by the method, a data loader over
    all test examples is created, and ``self.compute_contributions`` is run
    on the model of the method. The two figures are stored in
    ``self.result_path`` under ``self.filename_inputs`` and
    ``self.filename_kernels``.

    Returns:
        ReportResult with the inputs figure and the kernels figure.
    """
    PathBuilder.build(self.result_path)

    encoded_test_data = self.test_dataset.encoded_data
    hdf5_filepath = self.method._metadata_to_hdf5(
        metadata_filepath=encoded_test_data.info['metadata_filepath'],
        label_name=self.label.name)

    example_count = len(encoded_test_data.example_ids)
    all_indices = np.array(range(example_count))

    dataloader = self.method.make_data_loader(hdf5_filepath,
                                              pre_loaded_hdf5_file=None,
                                              indices=all_indices,
                                              label_name=self.label.name,
                                              eval_only=True,
                                              is_train=False)

    path_inputs = self.result_path / self.filename_inputs
    path_kernels = self.result_path / self.filename_kernels

    self.compute_contributions(intgrds_set_loader=dataloader,
                               deeprc_model=self.method.model,
                               n_steps=self.n_steps,
                               threshold=self.threshold,
                               path_inputs=path_inputs,
                               path_kernels=path_kernels)

    figure_outputs = [
        ReportOutput(path_inputs, "Integrated Gradients over the inputs to DeepRC"),
        ReportOutput(path_kernels, "Integrated Gradients over the kernels of DeepRC"),
    ]
    return ReportResult(
        self.name,
        info="Plots the contributions of (i) input sequences and (ii) kernels to trained `DeepRC` model with respect to the test dataset. Contributions are computed using integrated gradients.",
        output_figures=figure_outputs)
def plot_precision_recall(self, optimal_hp_items: list, label: Label, colors):
    """Draw the precision-recall curves of all datasets and export the curve data.

    Items without a test predictions file are skipped with a warning. For the
    others the predictions CSV is read and precision/recall values are
    stored in a CSV file per dataset.

    Args:
        optimal_hp_items: items providing ``test_predictions_path``; the i-th
            item belongs to ``self.instruction_states[i]``.
        label: label whose ``name`` and ``positive_class`` select the columns
            of the predictions file.
        colors: colors indexed by item position.

    Returns:
        Tuple of the figure output and the list of per-dataset table outputs.
    """
    import logging

    report_data_outputs = []
    figure = go.Figure()

    for index, item in enumerate(optimal_hp_items):
        name = self.instruction_states[index].dataset.name

        # Same guard as in the ROC plot: without predictions there is nothing
        # to read, so skip the dataset instead of failing in read_csv.
        if item.test_predictions_path is None:
            logging.warning(f'{type(self).__name__}: there are no test predictions for dataset '
                            f'{name}, skipping this dataset when generating performance overview...')
            continue

        df = pd.read_csv(item.test_predictions_path)
        true_class = df[f"{label.name}_true_class"].values
        predicted_proba = df[f"{label.name}_{label.positive_class}_proba"].values

        # Passed positionally: the keyword of the second argument was renamed
        # between scikit-learn versions (probas_pred -> y_score).
        precision, recall, _ = precision_recall_curve(true_class, predicted_proba)

        figure.add_trace(go.Scatter(x=recall, y=precision, mode='lines', name=name,
                                    marker=dict(color=colors[index], line=dict(width=3)),
                                    hoverinfo="skip"))

        data_path = self.result_path / f"precision_recall_data_{name}.csv"
        pd.DataFrame({"precision": precision, "recall": recall}).to_csv(data_path, index=False)
        report_data_outputs.append(ReportOutput(data_path, f'precision-recall curve data for dataset {name}'))

    figure_path = self.result_path / "precision_recall_curve.html"
    figure.update_layout(template='plotly_white', xaxis_title="recall", yaxis_title="precision")
    figure.write_html(str(figure_path))

    return ReportOutput(figure_path, 'precision-recall curve'), report_data_outputs
def _write_match_table(self):
    """Write example ids, labels and match counts of all examples to one CSV file.

    The columns are "repertoire_id", one column per label and one column per
    feature name of the encoded data.

    Returns:
        Output pointing to ``complete_match_count_table.csv``.
    """
    encoded_data = self.dataset.encoded_data

    ids = pd.DataFrame({"repertoire_id": encoded_data.example_ids})
    labels = pd.DataFrame(encoded_data.labels)
    matches = pd.DataFrame(encoded_data.examples, columns=encoded_data.feature_names)

    result_path = self.result_path / "complete_match_count_table.csv"
    full_table = ids.join(labels).join(matches)
    full_table.to_csv(result_path, index=False)

    return ReportOutput(result_path, "All matches")
def _export_details(self) -> ReportOutput:
    """Dump feature names, encoding and example ids of the encoded data as YAML.

    Returns:
        Output pointing to ``encoding_details.yaml``.
    """
    file_path = self.result_path / "encoding_details.yaml"

    with file_path.open("w") as yaml_file:
        encoded_data = self.dataset.encoded_data
        details = dict(feature_names=encoded_data.feature_names,
                       encoding=encoded_data.encoding,
                       example_ids=list(encoded_data.example_ids))
        yaml.dump(details, yaml_file)

    return ReportOutput(file_path, "encoding details")