def _generate(self) -> ReportResult:
    """Plot the ROC curve of the method on the test dataset and export its points.

    An HTML figure and a CSV table named after the report are written to
    ``self.result_path``. The CSV has a ``fpr,tpr`` header line followed by one
    comma-separated row per point of the curve.

    Returns:
        ReportResult referencing the HTML figure and the CSV table.
    """
    encoded_data = self.test_dataset.encoded_data
    y_score = self.method.predict_proba(encoded_data, self.label)[self.label]
    # NOTE(review): column 0 of the predicted probabilities is used as the score;
    # confirm that this column corresponds to the positive class of the label.
    fpr, tpr, _ = roc_curve(encoded_data.labels[self.label], y_score[:, 0])
    roc_auc = auc(fpr, tpr)

    roc_trace = go.Scatter(x=fpr, y=tpr, mode='lines', line=dict(color='darkorange', width=2),
                           name=f"ROC curve (area = {roc_auc})")
    # Diagonal reference line
    diagonal_trace = go.Scatter(x=[0, 1], y=[0, 1], mode='lines', line=dict(color='navy', width=2, dash='dash'),
                                showlegend=False)
    layout = go.Layout(title='Receiver operating characteristic example',
                       xaxis=dict(title='False Positive Rate'),
                       yaxis=dict(title='True Positive Rate'))
    fig = go.Figure(data=[roc_trace, diagonal_trace], layout=layout)

    PathBuilder.build(self.result_path)
    path_htm = self.result_path / f"{self.name}.html"
    path_csv = self.result_path / f"{self.name}.csv"

    fig.write_html(str(path_htm))

    # Store the curve as two columns so the data matches the "fpr,tpr" header.
    # Previously the values were saved as two space-separated rows below a
    # commented-out header, which is not a valid two-column CSV file.
    csv_result = np.column_stack((fpr, tpr))
    np.savetxt(str(path_csv), csv_result, delimiter=",", header="fpr,tpr", comments="")

    return ReportResult(self.name,
                        output_figures=[ReportOutput(path_htm)],
                        output_tables=[ReportOutput(path_csv)])
def _generate(self) -> ReportResult:
    """Run ``compute_contributions`` for the model of the label on all test examples.

    The test metadata file is converted to HDF5 by the method, a data loader
    covering every test example is created, and the contributions are computed
    with the configured number of steps and threshold.

    Returns:
        ReportResult whose figures point to ``self.filename_inputs`` and
        ``self.filename_kernels``.
    """
    PathBuilder.build(self.result_path)

    encoded_data = self.test_dataset.encoded_data
    hdf5_filepath = self.method._metadata_to_hdf5(encoded_data.info['metadata_filepath'], [self.label])

    # Use every example of the test dataset
    example_count = len(encoded_data.example_ids)
    all_indices = np.array(range(example_count))

    dataloader = self.method.make_data_loader(hdf5_filepath,
                                              pre_loaded_hdf5_file=None,
                                              indices=all_indices,
                                              label=self.label,
                                              eval_only=True,
                                              is_train=False)
    model = self.method.get_model(self.label)[self.label]

    compute_contributions(intgrds_set_loader=dataloader,
                          deeprc_model=model,
                          n_steps=self.n_steps,
                          threshold=self.threshold,
                          resdir=self.result_path,
                          filename_inputs=self.filename_inputs,
                          filename_kernels=self.filename_kernels)

    figure_outputs = [ReportOutput(path) for path in (self.filename_inputs, self.filename_kernels)]
    return ReportResult(self.name, output_figures=figure_outputs)
def _generate(self) -> ReportResult:
    """Write a text description of the dataset to ``dataset_description.txt``.

    The generic dataset text is always included; a type-specific section is
    appended for repertoire, receptor and sequence datasets.

    Returns:
        ReportResult with a single text output pointing to the written file.
    """
    PathBuilder.build(self.result_path)
    text_path = self.result_path / "dataset_description.txt"

    # Fall back to the identifier when the dataset has no name
    dataset_name = self.dataset.name
    if dataset_name is None:
        dataset_name = self.dataset.identifier

    sections = [self._get_generic_dataset_text()]
    if isinstance(self.dataset, RepertoireDataset):
        sections.append(self._get_repertoire_dataset_text())
    elif isinstance(self.dataset, ReceptorDataset):
        sections.append(self._get_receptor_dataset_text())
    elif isinstance(self.dataset, SequenceDataset):
        sections.append(self._get_sequence_dataset_text())

    text_path.write_text("".join(sections))

    description_output = ReportOutput(text_path, f"Description of dataset {dataset_name}")
    return ReportResult(
        name=self.name,
        info="A simple text-based overview of the properties of any dataset, including the dataset name, size, and metadata labels.",
        output_text=[description_output])
def _generate(self) -> ReportResult:
    """Compute and export the overlap matrix of the optimal items across instruction states.

    The result path is extended with the report name, the optimal
    hyperparameter item for the label is collected from every instruction
    state, and the resulting overlap matrix is exported as a figure and a table.

    Returns:
        ReportResult with one figure and one table output.
    """
    self.result_path = PathBuilder.build(self.result_path / self.name)
    self._extract_label()

    hp_items, labels = [], []
    for state in self.instruction_states:
        hp_items.append(state.optimal_hp_items[self.label.name])
    for state in self.instruction_states:
        labels.append(state.dataset.name)

    overlap_matrix = SequenceAnalysisHelper.compute_overlap_matrix(hp_items)

    figure_output = ReportOutput(self._make_figure(overlap_matrix, labels), 'sequence overlap across datasets')
    table_output = ReportOutput(self._export_matrix(overlap_matrix, labels), 'sequence overlap across datasets (csv)')

    return ReportResult(
        name=self.name,
        info="A heatmap showing the overlap of disease-associated sequences produced by SequenceAbundance encoders between multiple datasets of different sizes.",
        output_figures=[figure_output],
        output_tables=[table_output])
def _generate(self) -> ReportResult:
    """Compute overlaps with the reference list for the optimal model and each assessment split.

    An overlap is computed only for encoders accepted by
    ``ReferenceSequenceOverlap._check_encoder_class``.

    Returns:
        ReportResult with the collected figures and tables.
    """
    PathBuilder.build(self.result_path)
    label_name = self.label.name
    figures, tables = [], []

    # Optimal model selected for the label
    optimal_encoder = self.state.optimal_hp_items[label_name].encoder
    if ReferenceSequenceOverlap._check_encoder_class(optimal_encoder):
        figure, data = self._compute_optimal_model_overlap()
        figures.append(figure)
        tables.append(data)

    # Optimal model of every assessment split
    for assessment_state in self.state.assessment_states:
        encoder = assessment_state.label_states[label_name].optimal_assessment_item.encoder
        if not ReferenceSequenceOverlap._check_encoder_class(encoder):
            continue

        split_number = assessment_state.split_index + 1
        figure_filename = self.result_path / f"assessment_split_{split_number}_model_vs_reference_overlap_{label_name}.pdf"
        df_filename = self.result_path / f"assessment_split_{split_number}_overlap_sequences_{label_name}"
        description = f"overlap sequences between the model for assessment split {split_number} and reference list"

        figure, data = self._compute_model_overlap(figure_filename, df_filename, encoder, description)
        figures.append(figure)
        tables.append(data)

    return ReportResult(
        self.name,
        info="A Venn diagram between the list of disease-associated sequences produced by the SequenceAbundance encoder and a list of reference receptor sequences, and a file containing the overlapping sequences.",
        output_figures=figures,
        output_tables=tables)
def _generate(self) -> ReportResult:
    """Create one figure per assessment split for every label with two classes and a positive class.

    Labels that do not have exactly two values, or that have no
    ``positive_class`` set, are skipped with a logged warning.

    Returns:
        ReportResult holding the figures created for the accepted labels.
    """
    report_result = ReportResult(
        name=self.name,
        info="Plots ROC curves for all trained ML settings ([preprocessing], encoding, ML model) in the outer loop of cross-validation in the TrainMLModel instruction")

    PathBuilder.build(self.result_path)

    for label in self.state.label_configuration.get_label_objects():
        class_count = len(label.values)

        if class_count != 2:
            logging.warning(f"{ROCCurveSummary.__name__}: report {self.name} is skipping label {label.name} as it has {class_count} "
                            f"classes, while this report expects 2 classes.")
            continue

        if label.positive_class is None:
            logging.warning(f"{ROCCurveSummary.__name__}: report {self.name} is skipping label {label.name} because 'positive_class' parameter "
                            f"is not set.")
            continue

        for split_index in range(self.state.assessment.split_count):
            report_result.output_figures.append(self._create_figure_for_assessment_split(split_index, label))

    return report_result
def _generate(self) -> ReportResult:
    """Reshape the dataset, write it as a results table and plot it.

    Returns:
        ReportResult with the results table and, if a figure was produced,
        the figure; otherwise the figures argument is ``None``.
    """
    PathBuilder.build(self.result_path)

    data_long_format = DataReshaper.reshape(self.dataset)
    table_result = self._write_results_table(data_long_format)

    figure = self._safe_plot(data_long_format=data_long_format)
    if figure is None:
        output_figures = None
    else:
        output_figures = [figure]

    return ReportResult(self.name, output_figures, [table_result])
def _generate(self) -> ReportResult:
    """Plot ROC and precision-recall curves for the optimal models of all instruction states.

    Before plotting, the method asserts that all instruction states share the
    same labels and label values, that exactly one label is used, and that
    none of the optimal models was refitted.

    Returns:
        ReportResult with the two figures and the concatenated tables.

    Raises:
        AssertionError: if any of the checks above fails.
    """
    self.result_path = PathBuilder.build(self.result_path / self.name)

    first_config = self.instruction_states[0].label_configuration

    def has_same_labels_as_first(state):
        # Compare label names first, then the values of the first label
        config = state.label_configuration
        if not (first_config.get_labels_by_name() == config.get_labels_by_name()):
            return False
        return first_config.get_label_values(first_config.get_labels_by_name()[0]) \
            == config.get_label_values(config.get_labels_by_name()[0])

    assert all(has_same_labels_as_first(state) for state in self.instruction_states), \
        "PerformanceOverview: there is a difference in labels between instructions, the plots cannot be created."

    assert len(first_config.get_labels_by_name()) == 1, \
        'PerformanceOverview: multiple labels were provided, but only one can be used in this report.'

    assert all(state.refit_optimal_model is False for state in self.instruction_states), \
        f"{PerformanceOverview.__name__}: no test datasets were available to assess the performance of optimal models as they were refitted on " \
        f"the full datasets. No reports will be generated."

    label = first_config.get_label_objects()[0]

    # Only one label is allowed, so each state contributes its single optimal item
    optimal_hp_items = []
    for state in self.instruction_states:
        optimal_hp_items.append(list(state.optimal_hp_items.values())[0])

    colors = px.colors.sequential.Viridis[::2][::-1]

    figure_auc, table_aucs = self.plot_roc(optimal_hp_items, label, colors)
    figure_pr, table_pr = self.plot_precision_recall(optimal_hp_items, label, colors)

    return ReportResult(output_figures=[figure_auc, figure_pr], output_tables=table_aucs + table_pr)
def _generate(self) -> ReportResult:
    """Compute overlaps with the reference list for the optimal model and each assessment split.

    An overlap is computed only for encoders accepted by
    ``ReferenceSequenceOverlap._check_encoder_class``.

    Returns:
        ReportResult with the collected figures and tables.
    """
    PathBuilder.build(self.result_path)
    figures, tables = [], []

    # Optimal model selected for the label
    optimal_encoder = self.state.optimal_hp_items[self.label].encoder
    if ReferenceSequenceOverlap._check_encoder_class(optimal_encoder):
        figure, data = self._compute_optimal_model_overlap()
        figures.append(figure)
        tables.append(data)

    # Optimal model of every assessment split
    for assessment_state in self.state.assessment_states:
        encoder = assessment_state.label_states[self.label].optimal_assessment_item.encoder
        if not ReferenceSequenceOverlap._check_encoder_class(encoder):
            continue

        split_number = assessment_state.split_index + 1
        figure_filename = self.result_path / f"assessment_split_{split_number}_model_vs_reference_overlap_{self.label}.pdf"
        df_filename = self.result_path / f"assessment_split_{split_number}_overlap_sequences_{self.label}"
        description = f"overlap sequences between the model for assessment split {split_number} and reference list"

        figure, data = self._compute_model_overlap(figure_filename, df_filename, encoder, description)
        figures.append(figure)
        tables.append(data)

    return ReportResult(self.name, output_figures=figures, output_tables=tables)
def _generate(self) -> ReportResult:
    """Cluster the positive examples with TCRdist and discover motifs in the clusters.

    Motif discovery runs for every cluster with at least
    ``self.min_cluster_size`` neighbors. A summary of all clusters is written
    to ``tcrdist_summary.csv`` and added to the tables.

    Returns:
        ReportResult with the figures and tables of the processed clusters
        plus the summary table.
    """
    # Imported locally, as in the original implementation
    from immuneML.util.TCRdistHelper import TCRdistHelper
    from tcrdist.rep_diff import hcluster_diff
    from tcrdist.summarize import member_summ

    PathBuilder.build(self.result_path)

    subsampled_dataset = self._extract_positive_example_dataset()
    reference_sequences = self._extract_reference_sequences()

    tcr_rep = TCRdistHelper.compute_tcr_dist(subsampled_dataset, [self.label.name], self.cores)
    tcr_rep.hcluster_df, tcr_rep.Z = hcluster_diff(clone_df=tcr_rep.clone_df,
                                                   pwmat=tcr_rep.pw_alpha + tcr_rep.pw_beta,
                                                   x_cols=["epitope"],
                                                   count_col='count')

    figures, tables = [], []

    logging.info(f'{TCRdistMotifDiscovery.__name__}: created {tcr_rep.hcluster_df.shape[0]} clusters, now discovering motifs in clusters.')

    for index, row in tcr_rep.hcluster_df.iterrows():
        if len(row['neighbors_i']) < self.min_cluster_size:
            continue  # cluster too small
        cluster_figures, cluster_tables = self._discover_motif_in_cluster(tcr_rep, index, row, reference_sequences)
        figures.extend(cluster_figures)
        tables.extend(cluster_tables)

    summary_path = self.result_path / "tcrdist_summary.csv"
    member_summ(res_df=tcr_rep.hcluster_df, clone_df=tcr_rep.clone_df, addl_cols=['epitope']).to_csv(summary_path)
    tables.append(ReportOutput(path=summary_path, name="TCRdist summary (csv)"))

    return ReportResult(name=self.name, info="TCRdist motif discovery", output_figures=figures, output_tables=tables)
def _generate(self) -> ReportResult:
    """Export the receptors of the dataset to a tab-separated file in GLIPH2 format.

    One row is written per receptor to ``exported_data.tsv`` with the beta and
    alpha chain sequences, the beta chain V and J genes, a
    ``subject:condition`` value and the beta chain count (1 when missing).

    Returns:
        ReportResult with the exported table.
    """
    PathBuilder.build(self.result_path)

    # Column order matches the exported file
    columns = {"CDR3b": [], "TRBV": [], "TRBJ": [], "CDR3a": [], "subject:condition": [], "count": []}

    for index, receptor in enumerate(self.dataset.get_data()):
        alpha_chain = receptor.get_chain("alpha")
        beta_chain = receptor.get_chain("beta")

        columns["CDR3a"].append(alpha_chain.amino_acid_sequence)
        columns["CDR3b"].append(beta_chain.amino_acid_sequence)
        columns["TRBV"].append(beta_chain.metadata.v_gene)
        columns["TRBJ"].append(beta_chain.metadata.j_gene)

        # NOTE(review): subject_id is read as an attribute while the condition is
        # read by key; if metadata is a plain dict the index is always used - confirm.
        subject = getattr(receptor.metadata, 'subject_id', str(index))
        columns["subject:condition"].append(f"{subject}:{receptor.metadata[self.condition]}")

        beta_metadata = beta_chain.metadata
        if beta_metadata is not None and beta_metadata.count is not None:
            clone_count = beta_metadata.count
        else:
            clone_count = 1
        columns["count"].append(clone_count)

    file_path = self.result_path / "exported_data.tsv"
    pd.DataFrame(columns).to_csv(file_path, sep="\t", index=False)

    return ReportResult(self.name, output_tables=[ReportOutput(file_path, "exported data in GLIPH2 format")])
def _generate(self) -> ReportResult:
    """Plot FP and FN values per metadata label and export them as tables.

    For each metadata label a subplot row is created with one bar chart per
    metric ("FP" and "FN"). The figure is written to ``plots.html`` and the
    per-label data is passed to ``_write_results_table``.

    Returns:
        ReportResult with the figure and the first path returned by
        ``_write_results_table``.
    """
    PathBuilder.build(self.result_path)

    # make predictions
    predictions = self.method.predict(self.test_dataset.encoded_data, self.label)[self.label]  # label = disease
    true_labels = self.test_dataset.get_metadata(self.metadata_labels + [self.label])

    metrics = ["FP", "FN"]
    plot = make_subplots(rows=len(self.metadata_labels), cols=2)
    tables_per_label = []

    for row_number, meta_label in enumerate(self.metadata_labels, start=1):
        csv_data = {}

        for col_number, metric in enumerate(metrics, start=1):
            plotting_data = self._metrics(metric=metric, label=self.label, meta_label=meta_label,
                                          predictions=predictions, true_labels=true_labels)
            csv_data[metric] = plotting_data[metric]

            plot.add_trace(go.Bar(x=plotting_data[meta_label], y=plotting_data[metric]), row=row_number, col=col_number)
            plot.update_xaxes(title_text=f"{meta_label}", row=row_number, col=col_number, type='category')
            plot.update_yaxes(title_text=f"{metric}", row=row_number, col=col_number, rangemode="nonnegative", tick0=0, dtick=1)

        # The metadata column is taken from the data of the last metric
        csv_data[f"{meta_label}"] = plotting_data[f"{meta_label}"]
        tables_per_label.append(pd.DataFrame(csv_data))

    plot.update_traces(marker_color=px.colors.sequential.Teal[3], showlegend=False)

    filename = self.result_path / "plots.html"
    plot.write_html(str(filename))
    figure_outputs = [ReportOutput(filename)]

    result_table_path = self._write_results_table(tables_per_label, self.metadata_labels)

    return ReportResult(name=self.name, output_figures=figure_outputs, output_tables=[ReportOutput(result_table_path[0])])
def generate_report(self) -> ReportResult:
    """
    Generates the report if its prerequisites are satisfied.

    Any exception raised while checking the prerequisites or generating the report is caught and logged,
    so that a failing report does not interrupt the rest of the execution (e.g., time-expensive instructions).

    Returns:

        ReportResult object which encapsulates all outputs (figure, table, and text files) so that they can be
        conveniently linked to in the final output of instructions. If the report fails, a ReportResult marked
        as failed is returned; if the prerequisites are not satisfied, None is returned.

    """
    try:
        prerequisites_met = self.check_prerequisites()
        result = self._generate() if prerequisites_met else None
    except Exception as e:
        logging.exception(f"An exception occurred while generating report {self.name}. See the details below:")
        logging.warning(f"Report {self.name} encountered an error and could not be generated: {e}.")
        result = ReportResult(name=f"{self.name} (failed)",
                              info="This report failed, see the log file for more information")

    return result
def _generate(self) -> ReportResult:
    """Plot the sequence lengths obtained from ``_get_sequence_lengths``.

    Returns:
        ReportResult with the figure in a list, or with ``None`` as figures
        when ``_plot`` returned no figure.
    """
    figure = self._plot(sequence_lengths=self._get_sequence_lengths())

    if figure is None:
        output_figures = None
    else:
        output_figures = [figure]

    return ReportResult(name=self.name,
                        info="A histogram of the lengths of the sequences in a RepertoireDataset.",
                        output_figures=output_figures)
def _generate(self) -> ReportResult:
    """Plot the sequence lengths obtained from ``_get_sequence_lengths``.

    The result is named after the class of the report.

    Returns:
        ReportResult with the figure in a list, or with ``None`` as figures
        when ``_plot`` returned no figure.
    """
    figure = self._plot(sequence_lengths=self._get_sequence_lengths())

    if figure is None:
        output_figures = None
    else:
        output_figures = [figure]

    return ReportResult(type(self).__name__, output_figures=output_figures)
def _generate(self) -> ReportResult:
    """Export the matrix, details and labels.

    Returns:
        ReportResult with the matrix as a table output and the details and
        labels as text outputs.
    """
    PathBuilder.build(self.result_path)

    # Exports run in this order: matrix, details, labels
    tables = [self._export_matrix()]
    texts = [self._export_details()]
    texts.append(self._export_labels())

    return ReportResult(self.name, output_tables=tables, output_text=texts)
def _generate(self) -> ReportResult:
    """Export the matrix, details and labels.

    Returns:
        ReportResult with the matrix and labels as table outputs and the
        details as a text output.
    """
    PathBuilder.build(self.result_path)

    # Exports run in this order: matrix, details, labels
    tables = [self._export_matrix()]
    texts = [self._export_details()]
    tables.append(self._export_labels())

    return ReportResult(self.name,
                        info="The design matrix and related information of a given encoded Dataset",
                        output_tables=tables,
                        output_text=texts)
def _generate(self):
    """Export the coefficients table and plot the requested coefficient selections.

    The plot data is sorted by absolute coefficient in descending order. A
    figure is created for each setting in ``self._coefs_to_plot``: all
    coefficients, nonzero coefficients, coefficients at or above each cutoff,
    and the n largest coefficients for each n.

    Returns:
        ReportResult with the features/coefficients table and all figures
        that are not ``None``.
    """
    PathBuilder.build(self.result_path)
    self._set_plotting_parameters()

    plot_data = self._retrieve_plot_data()
    plot_data["abs_coefficients"] = abs(plot_data["coefficients"])
    plot_data.sort_values(by="abs_coefficients", inplace=True, ascending=False)

    result_table_path = self._write_results_table(plot_data[["features", "coefficients"]])
    self._write_settings()

    figures = []

    def plot_and_collect(data, output_name):
        # Failed plots (None) are left out of the report outputs
        figure = self._plot(plotting_data=data, output_name=output_name)
        if figure is not None:
            figures.append(figure)

    if CoefficientPlottingSetting.ALL in self._coefs_to_plot:
        plot_and_collect(plot_data, "all_coefficients")

    if CoefficientPlottingSetting.NONZERO in self._coefs_to_plot:
        plot_and_collect(plot_data[plot_data["coefficients"] != 0], "nonzero_coefficients")

    if CoefficientPlottingSetting.CUTOFF in self._coefs_to_plot:
        for cutoff_val in self._cutoff:
            plot_and_collect(plot_data[plot_data["abs_coefficients"] >= cutoff_val], f"cutoff_{cutoff_val}_coefficients")

    if CoefficientPlottingSetting.N_LARGEST in self._coefs_to_plot:
        for n_val in self._n_largest:
            plot_and_collect(plot_data.nlargest(n=n_val, columns=["abs_coefficients"]), f"largest_{n_val}_coefficients")

    return ReportResult(self.name,
                        info=f"{self._y_axis_title}s of the trained {self.method.__class__.__name__} model",
                        output_tables=[ReportOutput(result_table_path, "features and coefficients csv")],
                        output_figures=figures)
def _generate(self) -> ReportResult:
    """Copy the relevant sequences to ``relevant_sequences.csv`` with renamed columns.

    The input file is read from the ``relevant_sequence_path`` entry of the
    encoded data info; columns are renamed using ``_compute_column_mapping``.

    Returns:
        ReportResult with the exported table.
    """
    source_path = self.dataset.encoded_data.info["relevant_sequence_path"]
    sequences = pd.read_csv(source_path)
    sequences = sequences.rename(columns=self._compute_column_mapping(sequences))

    PathBuilder.build(self.result_path)
    filename = self.result_path / "relevant_sequences.csv"
    sequences.to_csv(filename, index=False)

    table_output = ReportOutput(filename, "relevant sequences")
    return ReportResult(self.name, output_tables=[table_output])
def _generate(self):
    """Write the plot data as a results table and plot it as ``motif_seed_recovery``.

    Returns:
        ReportResult with one table and one figure output.
    """
    PathBuilder.build(self.result_path)
    self._set_plotting_parameters()

    plot_df = self._retrieve_plot_data()

    # The table is written before the figure is created
    tables = [self._write_results_table(plot_df)]
    figures = [self._plot(plot_df, "motif_seed_recovery")]

    return ReportResult(self.name, output_tables=tables, output_figures=figures)
def _generate(self) -> ReportResult:
    """Write the plotting data as a results table and plot it.

    Returns:
        ReportResult with the results table and the figure; the figure list
        is empty when ``_safe_plot`` returned ``None``.
    """
    PathBuilder.build(self.result_path)

    plotting_data = self._retrieve_plotting_data()
    result_table = self._write_results_table(plotting_data)

    output_figures = []
    figure = self._safe_plot(plotting_data=plotting_data)
    if figure is not None:
        output_figures.append(figure)

    return ReportResult(self.name, output_tables=[result_table], output_figures=output_figures)
def _generate(self) -> ReportResult:
    """Plot between the limits returned by ``get_distribution_limits``.

    Sets ``self.result_name`` to ``beta_distribution`` before plotting.

    Returns:
        ReportResult with the figure; the figure list is empty when
        ``_plot`` returned ``None``.
    """
    PathBuilder.build(self.result_path)

    limits = self.get_distribution_limits()
    lower_limit, upper_limit = limits
    self.result_name = "beta_distribution"

    output_figures = []
    figure = self._plot(upper_limit=upper_limit, lower_limit=lower_limit)
    if figure is not None:
        output_figures.append(figure)

    return ReportResult(name="Beta distribution priors - probability that a sequence is disease-associated",
                        output_figures=output_figures)
def _generate(self) -> ReportResult:
    """Compute and export the overlap matrix of the optimal items across instruction states.

    The result path is extended with the report name, the optimal
    hyperparameter item for the label is collected from every instruction
    state, and the resulting overlap matrix is exported as a figure and a table.

    Returns:
        ReportResult with one figure and one table output.
    """
    self.result_path = PathBuilder.build(self.result_path / self.name)
    self._extract_label()

    hp_items, labels = [], []
    for state in self.instruction_states:
        hp_items.append(state.optimal_hp_items[self.label])
    for state in self.instruction_states:
        labels.append(state.dataset.name)

    overlap_matrix = SequenceAnalysisHelper.compute_overlap_matrix(hp_items)

    figure_output = ReportOutput(self._make_figure(overlap_matrix, labels), 'sequence overlap across datasets')
    table_output = ReportOutput(self._export_matrix(overlap_matrix, labels), 'sequence overlap across datasets (csv)')

    return ReportResult(output_figures=[figure_output], output_tables=[table_output])
def _generate(self):
    """Write the plot data as a results table and plot it as ``motif_seed_recovery``.

    Returns:
        ReportResult with an info text, one table and one figure output.
    """
    PathBuilder.build(self.result_path)
    self._set_plotting_parameters()

    plot_df = self._retrieve_plot_data()

    # The table is written before the figure is created
    tables = [self._write_results_table(plot_df)]
    figures = [self._plot(plot_df, "motif_seed_recovery")]

    return ReportResult(
        self.name,
        info="This report shows how well implanted ('ground truth') motifs are recovered by ML models using the k-mer encoding. The x axis (box grouping) represents the maximum number of overlapping positions between a 'ground truth' motif seed and a k-mer feature. The y axis values represent the learned coefficients. ",
        output_tables=tables,
        output_figures=figures)
def _generate(self) -> ReportResult:
    """Write the plotting data as a results table and plot it.

    Returns:
        ReportResult with an info text, the results table and the figure;
        the figure list is empty when ``_safe_plot`` returned ``None``.
    """
    PathBuilder.build(self.result_path)

    plotting_data = self._retrieve_plotting_data()
    result_table = self._write_results_table(plotting_data)

    output_figures = []
    figure = self._safe_plot(plotting_data=plotting_data)
    if figure is not None:
        output_figures.append(figure)

    return ReportResult(
        self.name,
        info="The performance on the test set in the assessment (outer cross-validation) loop for each of the setting combinations as defined under 'settings'.",
        output_tables=[result_table],
        output_figures=output_figures)
def _generate(self) -> ReportResult:
    """Copy the relevant sequences to ``relevant_sequences.csv`` with renamed columns.

    The input file is read from the ``relevant_sequence_path`` entry of the
    encoded data info; columns are renamed using ``_compute_column_mapping``.

    Returns:
        ReportResult with an info text naming the encoding and the exported table.
    """
    source_path = self.dataset.encoded_data.info["relevant_sequence_path"]
    sequences = pd.read_csv(source_path)
    sequences = sequences.rename(columns=self._compute_column_mapping(sequences))

    PathBuilder.build(self.result_path)
    filename = self.result_path / "relevant_sequences.csv"
    sequences.to_csv(filename, index=False)

    return ReportResult(
        self.name,
        info=f"Exports the sequences that are extracted as label-associated using the {self.dataset.encoded_data.encoding} in AIRR-compliant format.",
        output_tables=[ReportOutput(filename, "relevant sequences")])
def _generate(self) -> ReportResult:
    """Plot every kernel of both convolution chains and the fully-connected layer.

    Returns:
        ReportResult with the figures and tables of all kernels followed by
        those of the fully-connected layer.
    """
    PathBuilder.build(self.result_path)
    report_result = ReportResult()

    sequence_alphabet = EnvironmentSettings.get_sequence_alphabet(self.method.sequence_type)

    # Kernels of the first chain are plotted before those of the second
    kernel_names = self.method.CNN.conv_chain_1 + self.method.CNN.conv_chain_2
    for kernel_name in kernel_names:
        kernel_figures, kernel_tables = self._plot_kernels(kernel_name, sequence_alphabet)
        report_result.output_figures.extend(kernel_figures)
        report_result.output_tables.extend(kernel_tables)

    fc_figure, fc_table = self._plot_fc_layer()
    report_result.output_figures.append(fc_figure)
    report_result.output_tables.append(fc_table)

    return report_result
def _generate(self) -> ReportResult:
    """Generate tables and figures per label for the assessment and/or selection comparison.

    Which comparisons run is controlled by ``self.compare_in_assessment`` and
    ``self.compare_in_selection``.

    Returns:
        ReportResult with all figures and tables that are not ``None``.
    """
    PathBuilder.build(self.result_path)

    figures, tables = [], []

    for label in self.state.label_configuration.get_labels_by_name():
        if self.compare_in_assessment:
            assessment_table, assessment_figure = self._generate_for_assessment(label)
            tables.append(assessment_table)
            figures.append(assessment_figure)

        if self.compare_in_selection:
            selection_tables, selection_figures = self._generate_for_selection(label)
            tables.extend(selection_tables)
            figures.extend(selection_figures)

    # Drop outputs that could not be generated
    valid_figures = [figure for figure in figures if figure is not None]
    valid_tables = [table for table in tables if table is not None]

    return ReportResult(self.name, valid_figures, valid_tables)
def _generate(self) -> ReportResult:
    """Store and plot the training and test dataframes.

    Sets ``self.result_name`` to ``<feature>_performance`` before the
    dataframes are created.

    Returns:
        ReportResult with the stored tables and the figure in a list, or
        with ``None`` as figures when ``_plot`` returned no figure.
    """
    PathBuilder.build(self.result_path)
    self.result_name = f"{self.feature}_performance"

    training_dataframe, test_dataframe = self._make_plot_dataframes()
    table_results = self._store_dataframes(training_dataframe, test_dataframe)

    figure = self._plot(training_dataframe=training_dataframe, test_dataframe=test_dataframe)
    if figure is None:
        output_figures = None
    else:
        output_figures = [figure]

    return ReportResult(output_tables=table_results, output_figures=output_figures)
def _generate(self):
    """Export the receptors of the dataset with ``export_receptorlist``.

    For a RepertoireDataset, the receptors of every repertoire are exported
    to a subfolder named after the repertoire identifier and the tables of
    all repertoires are reported. For a ReceptorDataset, all receptors are
    exported to a subfolder named after the dataset identifier. For any other
    dataset type nothing is exported.

    Returns:
        ReportResult with the exported tables (empty if nothing was exported).
    """
    report_output_tables = []

    if isinstance(self.dataset, RepertoireDataset):
        for repertoire in self.dataset.get_data():
            result_path = self.result_path / repertoire.identifier
            PathBuilder.build(result_path)
            # Accumulate the outputs: assigning here would keep only the tables of
            # the last repertoire and drop those of all earlier repertoires.
            report_output_tables.extend(self.export_receptorlist(repertoire.receptors, result_path))
    elif isinstance(self.dataset, ReceptorDataset):
        receptors = self.dataset.get_data()
        result_path = self.result_path / self.dataset.identifier
        PathBuilder.build(result_path)
        report_output_tables = self.export_receptorlist(receptors, result_path=result_path)

    return ReportResult(name=self.name,
                        info="This report exports the Receptor sequences to .sif format, such that they can directly be imported as a network in Cytoscape, to visualize chain sharing between the different receptors in a dataset (for example, for TCRs: how often one alpha chain is shared with multiple beta chains, and vice versa).",
                        output_tables=report_output_tables)