Example 1
    def _generate(self) -> ReportResult:
        x = self.test_dataset.encoded_data
        y_score = self.method.predict_proba(x, self.label)[self.label]
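        # compute ROC curve points and the area under the curve from the true labels and predicted probabilities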
        fpr, tpr, _ = roc_curve(x.labels[self.label], y_score[:, 0])
        roc_auc = auc(fpr, tpr)

        trace1 = go.Scatter(x=fpr,
                            y=tpr,
                            mode='lines',
                            line=dict(color='darkorange', width=2),
                            name=f"ROC curve (area = {roc_auc})")
        trace2 = go.Scatter(x=[0, 1],
                            y=[0, 1],
                            mode='lines',
                            line=dict(color='navy', width=2, dash='dash'),
                            showlegend=False)
        layout = go.Layout(title='Receiver operating characteristic example',
                           xaxis=dict(title='False Positive Rate'),
                           yaxis=dict(title='True Positive Rate'))

        fig = go.Figure(data=[trace1, trace2], layout=layout)

        PathBuilder.build(self.result_path)
        path_htm = self.result_path / f"{self.name}.html"
        path_csv = self.result_path / f"{self.name}.csv"
        # store fpr and tpr as two comma-separated columns so they match the "fpr,tpr" header
        csv_result = np.vstack((fpr, tpr)).T
        fig.write_html(str(path_htm))
        np.savetxt(str(path_csv), csv_result, delimiter=",", header="fpr,tpr", comments='')
        return ReportResult(self.name,
                            output_figures=[ReportOutput(path_htm)],
                            output_tables=[ReportOutput(path_csv)])
Example 2
    def _generate(self) -> ReportResult:
        self.result_path = PathBuilder.build(self.result_path / self.name)
        self._extract_label()

        hp_items = [
            state.optimal_hp_items[self.label.name]
            for state in self.instruction_states
        ]
        overlap_matrix = SequenceAnalysisHelper.compute_overlap_matrix(
            hp_items)

        labels = [state.dataset.name for state in self.instruction_states]
        figure_path = self._make_figure(overlap_matrix, labels)
        data_path = self._export_matrix(overlap_matrix, labels)

        return ReportResult(
            name=self.name,
            info="A heatmap showing the overlap of disease-associated sequences produced by SequenceAbundance encoders between multiple datasets of different sizes.",
            output_figures=[
                ReportOutput(figure_path, 'sequence overlap across datasets')
            ],
            output_tables=[
                ReportOutput(data_path,
                             'sequence overlap across datasets (csv)')
            ])
Example 3
    def _discover_motif_in_cluster(self, tcr_rep, index, row, negative_examples=None) -> Tuple[List[ReportOutput], List[ReportOutput]]:
        from tcrdist.adpt_funcs import get_centroid_seq
        from tcrdist.summarize import _select

        from palmotif import compute_pal_motif
        from palmotif import svg_logo

        dfnode = tcr_rep.clone_df.iloc[row['neighbors_i'],]
        figure_outputs, table_outputs = [], []

        logging.info(f"{TCRdistMotifDiscovery.__name__}: in cluster {index+1}, there are {dfnode.shape[0]} neighbors.")

        for chain in ['a', 'b']:

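            # use the cluster centroid sequence when there are more than two neighbors, otherwise fall back to the first CDR3 of this chain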
            if dfnode.shape[0] > 2:
                centroid, *_ = get_centroid_seq(df=dfnode)
            else:
                centroid = dfnode[f'cdr3_{chain}_aa'].to_list()[0]

            motif, stat = compute_pal_motif(seqs=_select(df=tcr_rep.clone_df, iloc_rows=row['neighbors_i'], col=f'cdr3_{chain}_aa'),
                                            centroid=centroid, refs=negative_examples[chain] if self.use_reference_sequences else None)

            figure_path = self.result_path / f"motif_{chain}_{index + 1}.svg"
            svg_logo(motif, filename=figure_path)

            motif_data_path = self.result_path / f"motif_{chain}_{index + 1}.csv"
            motif.to_csv(motif_data_path)

            figure_outputs.append(ReportOutput(figure_path, f'Motif {index + 1} ({Chain.get_chain(chain.upper()).name.lower()} chain)'))
            table_outputs.append(ReportOutput(motif_data_path, f'motif {index + 1} ({Chain.get_chain(chain.upper()).name.lower()} chain) csv data'))

        return figure_outputs, table_outputs
Example 4
    def _generate(self) -> ReportResult:
        PathBuilder.build(self.result_path)
        paths = []

        # make predictions
        predictions = self.method.predict(
            self.test_dataset.encoded_data,
            self.label)[self.label]  # label = disease

        true_labels = self.test_dataset.get_metadata(self.metadata_labels +
                                                     [self.label])
        metrics = ["FP", "FN"]

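        # build a grid of bar charts: one row per metadata label, one column per metric (FP, FN)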
        plot = make_subplots(rows=len(self.metadata_labels), cols=2)
        csv_data_per_label = []

        for label_index, meta_label in enumerate(self.metadata_labels):
            csv_data = {}
            for metric_index, metric in enumerate(metrics):
                plotting_data = self._metrics(metric=metric,
                                              label=self.label,
                                              meta_label=meta_label,
                                              predictions=predictions,
                                              true_labels=true_labels)

                csv_data[f"{metric}"] = plotting_data[f"{metric}"]

                plot.add_trace(go.Bar(x=plotting_data[meta_label],
                                      y=plotting_data[metric]),
                               row=label_index + 1,
                               col=metric_index + 1)
                plot.update_xaxes(title_text=f"{meta_label}",
                                  row=label_index + 1,
                                  col=metric_index + 1,
                                  type='category')
                plot.update_yaxes(title_text=f"{metric}",
                                  row=label_index + 1,
                                  col=metric_index + 1,
                                  rangemode="nonnegative",
                                  tick0=0,
                                  dtick=1)

            csv_data[f"{meta_label}"] = plotting_data[f"{meta_label}"]

            csv_data = pd.DataFrame(csv_data)

            csv_data_per_label.append(csv_data)

        plot.update_traces(marker_color=px.colors.sequential.Teal[3],
                           showlegend=False)
        filename = self.result_path / "plots.html"
        plot.write_html(str(filename))
        report_output_fig = ReportOutput(filename)
        paths.append(report_output_fig)

        result_table_path = self._write_results_table(csv_data_per_label, self.metadata_labels)
        return ReportResult(name=self.name,
                            output_figures=paths,
                            output_tables=[ReportOutput(result_table_path[0])])
Example 5
    def _generate(self) -> ReportResult:
        PathBuilder.build(self.result_path)

        test_metadata_filepath = self.test_dataset.encoded_data.info[
            'metadata_filepath']
        label_names = [self.label]
        hdf5_filepath = self.method._metadata_to_hdf5(test_metadata_filepath,
                                                      label_names)

        n_examples_test = len(self.test_dataset.encoded_data.example_ids)
        indices = np.array(range(n_examples_test))

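        # wrap all test examples in a DeepRC data loader used for evaluation only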
        dataloader = self.method.make_data_loader(hdf5_filepath,
                                                  pre_loaded_hdf5_file=None,
                                                  indices=indices,
                                                  label=self.label,
                                                  eval_only=True,
                                                  is_train=False)

        model = self.method.get_model(self.label)[self.label]

        compute_contributions(intgrds_set_loader=dataloader,
                              deeprc_model=model,
                              n_steps=self.n_steps,
                              threshold=self.threshold,
                              resdir=self.result_path,
                              filename_inputs=self.filename_inputs,
                              filename_kernels=self.filename_kernels)

        # point the report outputs at the files written under result_path
        return ReportResult(self.name,
                            output_figures=[
                                ReportOutput(self.result_path / self.filename_inputs),
                                ReportOutput(self.result_path / self.filename_kernels)
                            ])
Example 6
    def _store_sequence_distribution_data(self, fig, dfs, chains):
        fig.write_html(str(self.result_path / "sequence_length_distribution.html"))
        image_output = ReportOutput(self.result_path / "sequence_length_distribution.html", name="sequence length distribution per chain")
        table_outputs = [ReportOutput(self.result_path / f"sequence_length_distribution_chain_{chains[index]}.csv") for index in range(len(chains))]
        for index, df in enumerate(dfs):
            df.to_csv(table_outputs[index].path, index=False)

        return image_output, table_outputs
Example 7
    def plot_roc(self, optimal_hp_items, label: Label,
                 colors) -> Tuple[ReportOutput, List[ReportOutput]]:
        report_data_outputs = []
        figure = go.Figure()

        figure.add_trace(
            go.Scatter(x=[0, 1],
                       y=[0, 1],
                       mode='lines',
                       name='baseline',
                       line=dict(color=PerformanceOverview.PLOTLY_BLACK,
                                 dash='dash'),
                       hoverinfo="skip"))

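        # add one ROC trace per dataset; datasets without stored test predictions are skipped with a warning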
        for index, item in enumerate(optimal_hp_items):
            if item.test_predictions_path is None:
                logging.warning(
                    f'{PerformanceOverview.__name__}: there are no test predictions for dataset '
                    f'{self.instruction_states[index].dataset.name}, skipping this dataset when generating performance overview...'
                )
            else:

                df = pd.read_csv(item.test_predictions_path)
                true_class = df[f"{label.name}_true_class"].values
                predicted_class = df[
                    f"{label.name}_{label.positive_class}_proba"].values
                fpr, tpr, _ = metrics.roc_curve(y_true=true_class,
                                                y_score=predicted_class)
                auc = metrics.roc_auc_score(true_class, predicted_class)
                name = self.instruction_states[
                    index].dataset.name + f' (AUC = {round(auc, 2)})'
                figure.add_trace(
                    go.Scatter(x=fpr,
                               y=tpr,
                               mode='lines',
                               name=name,
                               marker=dict(color=colors[index],
                                           line=dict(width=3)),
                               hoverinfo="skip"))

                data_path = self.result_path / f"roc_curve_data_{name}.csv"
                pd.DataFrame({
                    "FPR": fpr,
                    "TPR": tpr
                }).to_csv(data_path, index=False)
                report_data_outputs.append(
                    ReportOutput(data_path,
                                 f'ROC curve data for dataset {name} (csv)'))

        figure_path = self.result_path / "roc_curve.html"
        figure.update_layout(template='plotly_white',
                             xaxis_title='false positive rate',
                             yaxis_title='true positive rate')
        figure.write_html(str(figure_path))

        return ReportOutput(figure_path, 'ROC curve'), report_data_outputs
Example 8
    def _generate(self) -> ReportResult:
        self.result_path = PathBuilder.build(self.result_path / self.name)
        self._extract_label()

        hp_items = [state.optimal_hp_items[self.label] for state in self.instruction_states]
        overlap_matrix = SequenceAnalysisHelper.compute_overlap_matrix(hp_items)

        labels = [state.dataset.name for state in self.instruction_states]
        figure_path = self._make_figure(overlap_matrix, labels)
        data_path = self._export_matrix(overlap_matrix, labels)

        return ReportResult(output_figures=[ReportOutput(figure_path, 'sequence overlap across datasets')],
                            output_tables=[ReportOutput(data_path, 'sequence overlap across datasets (csv)')])
Example 9
    def _store_dataframes(self, training_dataframe: pd.DataFrame,
                          test_dataframe: pd.DataFrame) -> List[ReportOutput]:
        train_path = self.result_path / "training_performance.csv"
        test_path = self.result_path / "test_performance.csv"
        training_dataframe.to_csv(train_path, index=False)
        test_dataframe.to_csv(test_path, index=False)

        return [
            ReportOutput(
                path=train_path,
                name=f"Training performance w.r.t. {self.feature} values"),
            ReportOutput(path=test_path,
                         name=f"Test performance w.r.t. {self.feature} values")
        ]
Example 10
    def _generate_heatmap(self,
                          x,
                          y,
                          z,
                          metric,
                          output,
                          xlabel='Prediction',
                          ylabel='Ground Truth',
                          zlabel='Count'):
        path_csv = self.result_path / f"{self.name}_{metric.lower()}.csv"
        path_html = self.result_path / f"{self.name}_{metric.lower()}.html"

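        # flip the matrix vertically before plotting, since plotly draws heatmap rows from bottom to top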
        z_flip = np.flipud(z)

        hovertext = []
        for yi, yy in enumerate(y):
            hovertext.append(list())
            for xi, xx in enumerate(x):
                hovertext[-1].append(
                    f"{xlabel}: {xx}<br />{ylabel}: {yy}<br />{zlabel}: {z_flip[yi][xi]}"
                )

        layout = go.Layout(title=f'Evaluation: {metric} ({self.label})',
                           xaxis=dict(title=xlabel),
                           yaxis=dict(title=ylabel))
        trace = go.Heatmap(z=z_flip,
                           x=x,
                           y=y,
                           hoverongaps=False,
                           colorscale='burgyl',
                           hoverinfo='text',
                           text=hovertext)
        fig = go.Figure(data=[trace], layout=layout)
        fig.write_html(str(path_html))

        z_df = pd.DataFrame(z)
        z_df.columns = f'{xlabel} (' + pd.Index(map(str, x)) + ')'
        z_df.index = f'{ylabel} (' + pd.Index(map(str, y)) + ')'
        z_df.to_csv(path_csv)

        output['tables'].append(
            ReportOutput(path_csv,
                         f"TrainingPerformance table ({metric.lower()})"))
        output['figures'].append(
            ReportOutput(path_html,
                         f"TrainingPerformance html ({metric.lower()})"))

        return
Example 11
    def _plot(self, plotting_data, output_name):
        if plotting_data.empty:
            logging.warning(
                f"Coefficients: empty data subset specified, skipping {output_name} plot..."
            )
        else:

            filename = self.result_path / f"{output_name}.html"

            import plotly.express as px
            figure = px.box(
                plotting_data,
                x="max_seed_overlap",
                y="coefficients",
                labels={
                    "max_seed_overlap": self._x_axis_title,
                    "coefficients": self._y_axis_title
                },
                template='plotly_white',
                color_discrete_sequence=px.colors.diverging.Tealrose)
            # figure.update_layout(title={"text":self.title, "x":0.5, "font": {"size":14}})

            figure.write_html(str(filename))

            return ReportOutput(
                filename,
                f"Overlap between implanted motif seeds and features versus {self._y_axis_title.lower()}"
            )
Example 12
    def _write_paired_matches(self,
                              paired_matches_path: Path) -> List[ReportOutput]:
        PathBuilder.build(paired_matches_path)

        report_outputs = []
        # todo: don't mention the subject in the file name twice
        for i in range(len(self.dataset.encoded_data.example_ids)):
            file_name = "example_{}_".format(
                self.dataset.encoded_data.example_ids[i])
            file_name += "_".join([
                "{label}_{value}".format(label=label, value=values[i])
                for label, values in self.dataset.encoded_data.labels.items()
            ])
            file_name += ".csv"
            file_path = paired_matches_path / file_name

            if self.dataset.encoded_data.encoding == "MatchedReceptorsEncoder":
                self._write_paired_receptor_matches_for_repertoire(
                    self.dataset.encoded_data.examples[i], file_path)
            elif self.dataset.encoded_data.encoding == "MatchedRegexEncoder":
                self._write_paired_regex_matches_for_repertoire(
                    self.dataset.encoded_data.examples[i], file_path)

            report_outputs.append(
                ReportOutput(
                    file_path,
                    f"Example {self.dataset.encoded_data.example_ids[i]} paired matches"
                ))

        return report_outputs
Example 13
    def _write_repertoire_sizes(self):
        """
        Writes the repertoire sizes (# clones & # reads) per subject, per chain.
        """
        all_subjects = self.dataset.encoded_data.example_ids
        all_chains = sorted(
            set(self.dataset.encoded_data.feature_annotations["chain"]))

        results_df = pd.DataFrame(list(
            itertools.product(all_subjects, all_chains)),
                                  columns=["subject_id", "chain"])
        results_df["n_reads"] = 0
        results_df["n_clones"] = 0

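        # accumulate read and clone counts per subject and chain across all repertoires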
        for repertoire in self.dataset.repertoires:
            rep_counts = repertoire.get_counts()
            rep_chains = repertoire.get_chains()

            for chain in all_chains:
                indices = rep_chains == Chain.get_chain(chain.upper())
                row_mask = (results_df.subject_id == repertoire.metadata["subject_id"]) & (results_df.chain == chain)
                results_df.loc[row_mask, 'n_reads'] += np.sum(rep_counts[indices])
                results_df.loc[row_mask, 'n_clones'] += len(rep_counts[indices])

        results_path = self.result_path / "repertoire_sizes.csv"
        results_df.to_csv(results_path, index=False)

        return ReportOutput(results_path, "Repertoire sizes")
Example 14
    def _plot(self, data_long_format) -> ReportOutput:
        groupby_cols = [self.x, self.color, self.facet_row, self.facet_column]
        groupby_cols = [i for i in groupby_cols if i]
        groupby_cols = list(set(groupby_cols))
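        # aggregate the mean and the configured spread measure (self.std) of the feature values within each group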
        plotting_data = data_long_format.groupby(
            groupby_cols, as_index=False).agg({"value": ['mean', self.std]})

        plotting_data.columns = plotting_data.columns.map(''.join)

        error_y = "valuestd" if self.show_error_bar else None

        figure = px.bar(plotting_data,
                        x=self.x,
                        y="valuemean",
                        color=self.color,
                        barmode="relative",
                        facet_row=self.facet_row,
                        facet_col=self.facet_column,
                        error_y=error_y,
                        labels={
                            "valuemean": self.y_title,
                            self.x: self.x_title,
                        },
                        template='plotly_white',
                        color_discrete_sequence=px.colors.diverging.Tealrose)

        file_path = self.result_path / f"{self.result_name}.html"

        figure.write_html(str(file_path))

        return ReportOutput(path=file_path, name="Average feature values")
Example 15
    def _export_matrix(self) -> ReportOutput:
        """Create a file for the design matrix in the desired format."""
        
        data = self._get_data()
        file_path = self.result_path / "design_matrix"
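        # self.file_format may include a trailing ".zip" (e.g. "csv.zip"); splitext strips it to obtain the base extension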
        ext = os.path.splitext(self.file_format)[0]
        file_path = file_path.with_suffix('.' + ext)

        # Use h5py to create a hdf5 file.
        if ext == "hdf5": 
            with h5py.File(str(file_path), 'w') as hf_object:
                hf_object.create_dataset(str(file_path), data=data)
        # Use numpy to create a csv or npy file.
        elif len(data.shape) <= 2 and ext == "csv":
            feature_names = self.dataset.encoded_data.feature_names
            header = ",".join(str(name) for name in feature_names) if feature_names is not None else ""
            np.savetxt(fname=str(file_path), X=data, delimiter=",", comments='',
                       header=header)
        else:
            if ext != "npy":
                logging.info('The selected Report format is not compatible, .npy is used instead')
                file_path = file_path.with_suffix(".npy")
                ext = "npy"
            np.save(str(file_path), data)
        
        # If requested, compress the file into a .zip.
        if self.file_format.endswith(".zip"):
            file_path_zip = file_path.with_suffix('.' + ext + '.zip')
            with zipfile.ZipFile(str(file_path_zip), 'w') as zipped_file:
                zipped_file.write(str(file_path), compress_type=zipfile.ZIP_DEFLATED)
            os.remove(str(file_path)) 
            file_path = file_path_zip
        return ReportOutput(file_path, "design matrix")
Example 16
    def _plot_fc_figure(self, df, bias):
        fig = make_subplots(rows=1,
                            cols=2,
                            column_widths=[0.8, 0.2],
                            specs=[[{
                                "type": "bar"
                            }, {
                                'type': "table"
                            }]])
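        # left panel: bar chart of the weights per feature; right panel: a small table holding the bias term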
        fig.add_trace(go.Bar(
            x=df["names"],
            y=df["weights"],
            name="weights",
            hovertemplate='Weight for %{x}: %{y:.4f}<extra></extra>',
            hoverlabel={"font_color": "white"},
            marker_color=px.colors.diverging.Tealrose[0]),
                      row=1,
                      col=1)
        table = go.Table(header={"values": ["bias"]}, cells={"values": bias})
        table.cells.format = [[None], ['.3f']]
        fig.add_trace(table, row=1, col=2)
        fig.update_layout(template="plotly_white")
        fig.write_html(
            str(self.result_path / "fully_connected_layer_weights.html"))

        return ReportOutput(
            self.result_path / "fully_connected_layer_weights.html",
            "fully-connected layer weights")
Example 17
    def _plot_sparse(self, data_long_format) -> ReportOutput:
        columns_to_filter = [self.x, "value"]
        for optional_column in [self.color, self.facet_row, self.facet_column]:
            if optional_column is not None:
                columns_to_filter.append(optional_column)

        data_long_format_filtered = data_long_format.loc[data_long_format.value != 0, columns_to_filter]
        columns_to_filter.remove("value")
        total_counts = data_long_format_filtered.groupby(columns_to_filter, as_index=False).agg(
            {"value": 'sum'})
        data_long_format_filtered = data_long_format_filtered.merge(total_counts,
                                                                    on=self.x,
                                                                    how="left",
                                                                    suffixes=('', '_sum')) \
            .fillna(0) \
            .sort_values(by=self.x) \
            .reset_index(drop=True)

        figure = px.box(data_long_format_filtered, x=self.x, y="value", color=self.color,
                        facet_row=self.facet_row, facet_col=self.facet_column,
                        labels={
                            "value": self.y_title,
                            self.x: self.x_title,
                        }, template='plotly_white',
                        color_discrete_sequence=px.colors.diverging.Tealrose)

        file_path = self.result_path / f"{self.result_name}.html"

        figure.write_html(str(file_path))

        return ReportOutput(path=file_path, name="feature boxplots")
Example 18
    def _generate(self) -> ReportResult:
        PathBuilder.build(self.result_path)
        alpha_chains, beta_chains, trbv, trbj, subject_condition, count = [], [], [], [], [], []
        for index, receptor in enumerate(self.dataset.get_data()):
            alpha_chains.append(
                receptor.get_chain("alpha").amino_acid_sequence)
            beta_chains.append(receptor.get_chain("beta").amino_acid_sequence)
            trbv.append(receptor.get_chain("beta").metadata.v_gene)
            trbj.append(receptor.get_chain("beta").metadata.j_gene)
            subject_condition.append(
                f"{getattr(receptor.metadata, 'subject_id', str(index))}:{receptor.metadata[self.condition]}"
            )
            beta_metadata = receptor.get_chain("beta").metadata
            count.append(beta_metadata.count if beta_metadata is not None and beta_metadata.count is not None else 1)

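        # assemble the table in the column layout used for the GLIPH2 export (CDR3b, TRBV, TRBJ, CDR3a, subject:condition, count)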
        df = pd.DataFrame({
            "CDR3b": beta_chains,
            "TRBV": trbv,
            "TRBJ": trbj,
            "CDR3a": alpha_chains,
            "subject:condition": subject_condition,
            "count": count
        })
        file_path = self.result_path / "exported_data.tsv"
        df.to_csv(file_path, sep="\t", index=False)

        return ReportResult(self.name,
                            output_tables=[
                                ReportOutput(file_path,
                                             "exported data in GLIPH2 format")
                            ])
Example 19
    def _generate(self) -> ReportResult:
        PathBuilder.build(self.result_path)

        text_path = self.result_path / "dataset_description.txt"

        dataset_name = self.dataset.name if self.dataset.name is not None else self.dataset.identifier

        output_text = self._get_generic_dataset_text()

        if isinstance(self.dataset, RepertoireDataset):
            output_text += self._get_repertoire_dataset_text()
        elif isinstance(self.dataset, ReceptorDataset):
            output_text += self._get_receptor_dataset_text()
        elif isinstance(self.dataset, SequenceDataset):
            output_text += self._get_sequence_dataset_text()

        text_path.write_text(output_text)

        return ReportResult(
            name=self.name,
            info="A simple text-based overview of the properties of any dataset, including the dataset name, size, and metadata labels.",
            output_text=[
                ReportOutput(text_path,
                             f"Description of dataset {dataset_name}")
            ])
Example 20
    def _generate(self) -> ReportResult:
        from immuneML.util.TCRdistHelper import TCRdistHelper
        from tcrdist.rep_diff import hcluster_diff
        from tcrdist.summarize import member_summ

        PathBuilder.build(self.result_path)

        subsampled_dataset = self._extract_positive_example_dataset()
        reference_sequences = self._extract_reference_sequences()
        tcr_rep = TCRdistHelper.compute_tcr_dist(subsampled_dataset, [self.label.name], self.cores)
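        # hierarchically cluster clones using the combined alpha + beta TCRdist distance matrices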
        tcr_rep.hcluster_df, tcr_rep.Z = hcluster_diff(clone_df=tcr_rep.clone_df, pwmat=tcr_rep.pw_alpha + tcr_rep.pw_beta, x_cols=["epitope"],
                                                       count_col='count')

        figures, tables = [], []

        logging.info(f'{TCRdistMotifDiscovery.__name__}: created {tcr_rep.hcluster_df.shape[0]} clusters, now discovering motifs in clusters.')

        for index, row in tcr_rep.hcluster_df.iterrows():
            if len(row['neighbors_i']) >= self.min_cluster_size:
                figure_outputs, table_outputs = self._discover_motif_in_cluster(tcr_rep, index, row, reference_sequences)
                figures.extend(figure_outputs)
                tables.extend(table_outputs)

        res_summary = member_summ(res_df=tcr_rep.hcluster_df, clone_df=tcr_rep.clone_df, addl_cols=['epitope'])
        res_summary.to_csv(self.result_path / "tcrdist_summary.csv")

        tables.append(ReportOutput(path=self.result_path / "tcrdist_summary.csv", name="TCRdist summary (csv)"))

        return ReportResult(name=self.name, info="TCRdist motif discovery", output_figures=figures, output_tables=tables)
Example 21
    def _store_fc_table(self, df, bias):
        # DataFrame.append returned a new frame whose result was discarded; keep the bias row via concat instead
        df = pd.concat([df, pd.DataFrame([{"weights": bias, "names": "bias"}])], ignore_index=True)
        df.to_csv(self.result_path / "fully_connected_layer_weights.csv",
                  index=False)

        return ReportOutput(
            self.result_path / "fully_connected_layer_weights.csv",
            "fully-connected layer weights")
Example 22
    def _export_matrix(self, overlap_matrix, filename,
                       row_col_names) -> ReportOutput:
        data_path = self.result_path / f"{filename}.csv"
        pd.DataFrame(overlap_matrix,
                     columns=row_col_names,
                     index=row_col_names).to_csv(data_path)
        return ReportOutput(data_path,
                            " ".join(filename.split('_') + ['data']))
Example 23
    def _generate_barplot(self, df, output):
        import plotly.express as px

        path_csv = self.result_path / f"{self.name}.csv"
        path_html = self.result_path / f"{self.name}.html"

        df.to_csv(path_csv)

        figure = px.bar(df, x=df.index, y=self.label.name, labels={'index': "metrics"},
                        template='plotly_white', color_discrete_sequence=px.colors.diverging.Tealrose,
                        title=f"Evaluation metrics ({self.label})")

        figure.write_html(str(path_html))

        output['tables'].append(ReportOutput(path_csv, "training performance in csv"))
        output['figures'].append(ReportOutput(path_html, "training performance on selected metrics"))

        return
Example 24
    def export_receptorlist(self, receptors, result_path: Path):
        export_list = []
        node_metadata_list = []
        edge_metadata_list = []

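        # one SIF row per receptor (first chain, "pair", second chain), plus per-node and per-edge metadata rows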
        for receptor in receptors:
            first_chain = receptor.get_chain(self.chains[0])
            second_chain = receptor.get_chain(self.chains[1])
            first_chain_name = self.get_shared_name(first_chain)
            second_chain_name = self.get_shared_name(second_chain)

            export_list.append([first_chain_name, "pair", second_chain_name])

            node_metadata_list.append([first_chain_name, self.chains[0]] + self.get_formatted_node_metadata(first_chain))
            node_metadata_list.append([second_chain_name, self.chains[1]] + self.get_formatted_node_metadata(second_chain))

            edge_metadata_list.append(
                [f"{first_chain_name} (pair) {second_chain_name}"] + self.get_formatted_edge_metadata(first_chain, second_chain))

        full_df = pd.DataFrame(export_list, columns=[self.chains[0], "relationship", self.chains[1]])
        node_meta_df = pd.DataFrame(node_metadata_list, columns=["shared_name", "chain", "sequence", "v_subgroup", "v_gene", "j_subgroup",
                                                                 "j_gene"] + self.additional_node_attributes)
        edge_meta_df = pd.DataFrame(edge_metadata_list, columns=["shared_name"] + self.additional_edge_attributes)

        node_cols = list(node_meta_df.columns)
        node_meta_df["n_duplicates"] = 1
        node_meta_df = node_meta_df.groupby(node_cols, as_index=False)["n_duplicates"].sum()

        edge_meta_df.drop_duplicates(inplace=True)
        node_meta_df.to_csv(result_path / "node_metadata.tsv", sep="\t", index=False, header=True)
        edge_meta_df.to_csv(result_path / "edge_metadata.tsv", sep="\t", index=False, header=True)

        if self.drop_duplicates:
            full_df.drop_duplicates(inplace=True)

        full_df.to_csv(result_path / "all_chains.sif", sep="\t", index=False, header=False)

        shared_df = full_df[(full_df.duplicated(["alpha"], keep=False)) | (full_df.duplicated(["beta"], keep=False))]
        shared_df.to_csv(result_path / "shared_chains.sif", sep="\t", index=False, header=False)

        return [ReportOutput(path=result_path / "node_metadata.tsv"),
                ReportOutput(path=result_path / "edge_metadata.tsv"),
                ReportOutput(path=result_path / "all_chains.sif"),
                ReportOutput(path=result_path / "shared_chains.sif")]
Example 25
    def _generate(self) -> ReportResult:

        df = pd.read_csv(self.dataset.encoded_data.info["relevant_sequence_path"])
        column_mapping = self._compute_column_mapping(df)
        df.rename(columns=column_mapping, inplace=True)

        PathBuilder.build(self.result_path)
        filename = self.result_path / "relevant_sequences.csv"
        df.to_csv(filename, index=False)

        return ReportResult(self.name, output_tables=[ReportOutput(filename, "relevant sequences")])
Example 26
    def _generate(self):
        PathBuilder.build(self.result_path)
        paths = []

        self._set_plotting_parameters()

        plot_data = self._retrieve_plot_data()
        plot_data["abs_coefficients"] = abs(plot_data["coefficients"])
        plot_data.sort_values(by="abs_coefficients",
                              inplace=True,
                              ascending=False)

        result_table_path = self._write_results_table(
            plot_data[["features", "coefficients"]])
        self._write_settings()

        if CoefficientPlottingSetting.ALL in self._coefs_to_plot:
            report_output_fig = self._plot(plotting_data=plot_data,
                                           output_name="all_coefficients")
            paths.append(report_output_fig)

        if CoefficientPlottingSetting.NONZERO in self._coefs_to_plot:
            nonzero_data = plot_data[plot_data["coefficients"] != 0]
            report_output_fig = self._plot(plotting_data=nonzero_data,
                                           output_name="nonzero_coefficients")
            paths.append(report_output_fig)

        if CoefficientPlottingSetting.CUTOFF in self._coefs_to_plot:
            for cutoff_val in self._cutoff:
                cutoff_data = plot_data[
                    plot_data["abs_coefficients"] >= cutoff_val]
                report_output_fig = self._plot(
                    plotting_data=cutoff_data,
                    output_name="cutoff_{}_coefficients".format(cutoff_val))
                paths.append(report_output_fig)

        if CoefficientPlottingSetting.N_LARGEST in self._coefs_to_plot:
            for n_val in self._n_largest:
                n_largest_data = plot_data.nlargest(
                    n=n_val, columns=["abs_coefficients"])
                report_output_fig = self._plot(
                    plotting_data=n_largest_data,
                    output_name="largest_{}_coefficients".format(n_val))
                paths.append(report_output_fig)

        return ReportResult(
            self.name,
            info=f"{self._y_axis_title}s of the trained {self.method.__class__.__name__} model",
            output_tables=[
                ReportOutput(result_table_path,
                             "features and coefficients csv")
            ],
            output_figures=[p for p in paths if p is not None])
Example 27
    def _generate(self) -> ReportResult:
        PathBuilder.build(self.result_path)

        test_metadata_filepath = self.test_dataset.encoded_data.info[
            'metadata_filepath']
        hdf5_filepath = self.method._metadata_to_hdf5(
            metadata_filepath=test_metadata_filepath,
            label_name=self.label.name)

        n_examples_test = len(self.test_dataset.encoded_data.example_ids)
        indices = np.array(range(n_examples_test))

        dataloader = self.method.make_data_loader(hdf5_filepath,
                                                  pre_loaded_hdf5_file=None,
                                                  indices=indices,
                                                  label_name=self.label.name,
                                                  eval_only=True,
                                                  is_train=False)

        path_inputs = self.result_path / self.filename_inputs
        path_kernels = self.result_path / self.filename_kernels

        self.compute_contributions(intgrds_set_loader=dataloader,
                                   deeprc_model=self.method.model,
                                   n_steps=self.n_steps,
                                   threshold=self.threshold,
                                   path_inputs=path_inputs,
                                   path_kernels=path_kernels)

        return ReportResult(
            self.name,
            info="Plots the contributions of (i) input sequences and (ii) kernels to the trained `DeepRC` model with respect to the test dataset. Contributions are computed using integrated gradients.",
            output_figures=[
                ReportOutput(path_inputs,
                             "Integrated Gradients over the inputs to DeepRC"),
                ReportOutput(
                    path_kernels,
                    "Integrated Gradients over the kernels of DeepRC")
            ])
Example 28
    def plot_precision_recall(self, optimal_hp_items: list, label: Label,
                              colors):
        report_data_outputs = []
        figure = go.Figure()

        for index, item in enumerate(optimal_hp_items):
            df = pd.read_csv(item.test_predictions_path)

            true_class = df[f"{label.name}_true_class"].values
            predicted_proba = df[
                f"{label.name}_{label.positive_class}_proba"].values
            precision, recall, _ = precision_recall_curve(
                y_true=true_class, probas_pred=predicted_proba)
            name = self.instruction_states[index].dataset.name
            figure.add_trace(
                go.Scatter(x=recall,
                           y=precision,
                           mode='lines',
                           name=name,
                           marker=dict(color=colors[index],
                                       line=dict(width=3)),
                           hoverinfo="skip"))

            data_path = self.result_path / f"precision_recall_data_{name}.csv"
            pd.DataFrame({
                "precision": precision,
                "recall": recall
            }).to_csv(data_path, index=False)
            report_data_outputs.append(
                ReportOutput(
                    data_path,
                    f'precision-recall curve data for dataset {name}'))

        figure_path = self.result_path / "precision_recall_curve.html"
        figure.update_layout(template='plotly_white',
                             xaxis_title="recall",
                             yaxis_title="precision")
        figure.write_html(str(figure_path))

        return ReportOutput(figure_path,
                            'precision-recall curve'), report_data_outputs
Example 29
    def _write_match_table(self):
        id_df = pd.DataFrame(
            {"repertoire_id": self.dataset.encoded_data.example_ids})
        label_df = pd.DataFrame(self.dataset.encoded_data.labels)
        matches_df = pd.DataFrame(
            self.dataset.encoded_data.examples,
            columns=self.dataset.encoded_data.feature_names)

        result_path = self.result_path / "complete_match_count_table.csv"
        id_df.join(label_df).join(matches_df).to_csv(result_path, index=False)

        return ReportOutput(result_path, "All matches")
Example 30
    def _export_details(self) -> ReportOutput:
        file_path = self.result_path / "encoding_details.yaml"
        with file_path.open("w") as file:
            details = {
                "feature_names": self.dataset.encoded_data.feature_names,
                "encoding": self.dataset.encoded_data.encoding,
                "example_ids": list(self.dataset.encoded_data.example_ids)
            }

            yaml.dump(details, file)

        return ReportOutput(file_path, "encoding details")