Ejemplo n.º 1
0
    def _encode_sequence_count(self, dataset: RepertoireDataset, comparison_data: ComparisonData, label: str, params: EncoderParams) -> EncodedData:
        """
        Encode the repertoires of the dataset using the sequences selected for the given label.

        The selection is delegated to SequenceFilterHelper.get_relevant_sequences, which receives this
        encoder's p-value threshold, comparison attributes and the currently stored indices path. The paths
        it returns are stored on the encoder only where no path has been set yet.

        Args:
            dataset: repertoire dataset to encode
            comparison_data: source of the item names and input to the count matrix construction
            label: name of the label passed to the sequence filter and used to fetch label metadata
            params: encoder parameters; `encode_labels` decides whether label metadata is attached

        Returns:
            EncodedData with the count matrix, the optional labels, the repertoire ids and the feature names
        """
        relevant_indices, new_indices_path, new_sequences_path = SequenceFilterHelper.get_relevant_sequences(
            dataset, params, comparison_data, label, self.p_value_threshold, self.comparison_attributes,
            self.relevant_indices_path)

        # paths already present on the encoder win; only the missing ones are filled in
        if self.relevant_indices_path is None:
            self.relevant_indices_path = new_indices_path
        if self.relevant_sequence_csv_path is None:
            self.relevant_sequence_csv_path = new_sequences_path

        examples = self._build_count_matrix(comparison_data, dataset.get_repertoire_ids(), relevant_indices)
        selected_item_names = comparison_data.get_item_names()[relevant_indices]

        if params.encode_labels:
            labels = dataset.get_metadata([label])
        else:
            labels = None

        return EncodedData(examples, labels, dataset.get_repertoire_ids(), selected_item_names,
                           encoding=SequenceCountEncoder.__name__,
                           info={'relevant_sequence_path': self.relevant_sequence_csv_path})
    def create_comparison_data(self, dataset: RepertoireDataset) -> ComparisonData:
        """
        Create a ComparisonData object for the dataset and let it process the dataset.

        The object is configured with the dataset's repertoire ids and this instance's matching columns,
        sequence batch size and path.

        Args:
            dataset: repertoire dataset to process

        Returns:
            the ComparisonData object after process_dataset(dataset) has been called on it
        """
        repertoire_ids = dataset.get_repertoire_ids()
        result = ComparisonData(repertoire_ids, self.matching_columns, self.sequence_batch_size, self.path)
        result.process_dataset(dataset)
        return result
Ejemplo n.º 3
0
    def build_comparison_data(dataset: RepertoireDataset, params: EncoderParams,
                              comparison_attributes, sequence_batch_size):
        """
        Create a ComparisonData object for the dataset and let it process the dataset.

        NOTE(review): the function takes no `self`; presumably it is decorated as a static method outside
        this excerpt - confirm against the enclosing class.

        Args:
            dataset: repertoire dataset to process
            params: encoder parameters; only `result_path` is read here
            comparison_attributes: attributes handed to ComparisonData
            sequence_batch_size: batch size handed to ComparisonData

        Returns:
            the ComparisonData object after process_dataset(dataset) has been called on it
        """
        repertoire_ids = dataset.get_repertoire_ids()
        comparison_data = ComparisonData(repertoire_ids, comparison_attributes, sequence_batch_size,
                                         params.result_path)
        comparison_data.process_dataset(dataset)
        return comparison_data
Ejemplo n.º 4
0
    def build_labels(self, dataset: RepertoireDataset,
                     params: EncoderParams) -> dict:
        """
        Collect the label values of the dataset's repertoires, ordered like dataset.get_repertoire_ids().

        Args:
            dataset: repertoire dataset providing the metadata table and the repertoire ids
            params: encoder parameters; the label names are read from params.label_config

        Returns:
            dict mapping each label name to the list of its values, one entry per repertoire id

        Raises:
            ValueError: if a repertoire id of the dataset has no matching row in the metadata
        """
        id_column = "repertoire_identifier"
        columns = [id_column]
        columns.extend(params.label_config.get_labels_by_name())

        metadata = dataset.get_metadata(columns, return_df=True)
        repertoire_ids = dataset.get_repertoire_ids()

        # row position of every repertoire id within the metadata table; get_indexer marks unknown ids with -1
        positions = pd.Index(metadata[id_column]).get_indexer(repertoire_ids)

        # iloc would interpret -1 as "last row" and silently hand back another repertoire's labels,
        # so an id without metadata is reported instead of being encoded with wrong labels
        missing_ids = [rep_id for rep_id, position in zip(repertoire_ids, positions) if position < 0]
        if missing_ids:
            raise ValueError("No metadata found for repertoire identifier(s): {}".format(missing_ids))

        labels = metadata.iloc[positions].to_dict("list")
        # the identifier column was only needed for the reordering above
        del labels[id_column]

        return labels
Ejemplo n.º 5
0
    def _calculate_sequence_abundance(self, dataset: RepertoireDataset, comparison_data: ComparisonData, label: str, params: EncoderParams):
        """
        Build the abundance matrix of the dataset from the sequences selected for the given label.

        The selection is delegated to SequenceFilterHelper.get_relevant_sequences, which receives this
        encoder's p-value threshold, comparison attributes and the currently stored indices path. The paths
        it returns are stored on the encoder only where no path has been set yet.

        Args:
            dataset: repertoire dataset to encode
            comparison_data: comparison data handed to the filter and to the abundance matrix construction
            label: name of the label passed to the sequence filter
            params: encoder parameters passed to the sequence filter

        Returns:
            the result of self._build_abundance_matrix for the dataset's repertoire ids and the selected indices
        """
        relevant_indices, new_indices_path, new_csv_path = SequenceFilterHelper.get_relevant_sequences(
            dataset=dataset,
            params=params,
            comparison_data=comparison_data,
            label=label,
            p_value_threshold=self.p_value_threshold,
            comparison_attributes=self.comparison_attributes,
            sequence_indices_path=self.relevant_indices_path)

        # paths already present on the encoder win; only the missing ones are filled in
        if self.relevant_indices_path is None:
            self.relevant_indices_path = new_indices_path
        if self.relevant_sequence_csv_path is None:
            self.relevant_sequence_csv_path = new_csv_path

        return self._build_abundance_matrix(comparison_data, dataset.get_repertoire_ids(), relevant_indices)
Ejemplo n.º 6
0
    def build_distance_matrix(self, dataset: RepertoireDataset,
                              params: EncoderParams,
                              train_repertoire_ids: list):
        """
        Compute pairwise repertoire distances and select the part relevant for the given dataset.

        A new PairwiseRepertoireComparison is created and stored in self.comparison. The comparison runs on
        the dataset stored under the "dataset" key of self.context when there is one, otherwise on `dataset`.

        Args:
            dataset: repertoire dataset whose repertoire ids select the rows of the result
            params: encoder parameters; only `result_path` is read here
            train_repertoire_ids: ids selecting the columns of the result

        Returns:
            the comparison result restricted via `.loc` to rows = dataset repertoire ids and
            columns = train_repertoire_ids (assumes compare() returns a DataFrame labelled by
            repertoire ids - TODO confirm)
        """
        self.comparison = PairwiseRepertoireComparison(
            self.attributes_to_match, self.attributes_to_match, params.result_path,
            sequence_batch_size=self.sequence_batch_size)

        # a dataset stored in the context takes precedence over the one passed in
        if self.context is not None and "dataset" in self.context:
            dataset_to_compare = self.context["dataset"]
        else:
            dataset_to_compare = dataset

        all_distances = self.comparison.compare(dataset_to_compare, self.distance_fn, self.distance_metric.value)

        row_ids = dataset.get_repertoire_ids()
        return all_distances.loc[row_ids, train_repertoire_ids]
    def compare_repertoires(self, dataset: RepertoireDataset, comparison_fn):
        """
        Apply comparison_fn to every pair of repertoires and return the results as a square table.

        The comparison data is obtained through self.memo_by_params(dataset) and stored in
        self.comparison_data. Only pairs with column >= row are computed; each value is copied to the
        mirrored cell, so comparison_fn is treated as symmetric.

        Args:
            dataset: repertoire dataset supplying the example count and the repertoire ids
            comparison_fn: callable taking two repertoire vectors and returning the value for that pair

        Returns:
            pandas DataFrame whose index and columns are both the dataset's repertoire ids
        """
        self.comparison_data = self.memo_by_params(dataset)
        size = dataset.get_example_count()
        scores = np.zeros((size, size))
        ids = dataset.get_repertoire_ids()

        for row in range(size):
            row_vector = self.comparison_data.get_repertoire_vector(ids[row])
            for col in range(row, size):
                col_vector = self.comparison_data.get_repertoire_vector(ids[col])
                # compute the pair once and mirror it across the diagonal
                scores[row, col] = scores[col, row] = comparison_fn(row_vector, col_vector)

        return pd.DataFrame(scores, index=ids, columns=ids)
Ejemplo n.º 8
0
    def _encode_data(self, dataset: RepertoireDataset, params: EncoderParams):
        """
        Encode the dataset with the sequence abundance of its repertoires for a single label.

        Args:
            dataset: repertoire dataset to encode
            params: encoder parameters; must configure exactly one label, and `encode_labels` decides
                whether label metadata is attached to the encoded data

        Returns:
            a new RepertoireDataset built from the original dataset's params and repertoires, carrying
            the EncodedData produced here

        Raises:
            AssertionError: if the label configuration does not contain exactly one label
        """
        labels = params.label_config.get_labels_by_name()

        # Explicit raise instead of an `assert` statement so that the check is not stripped under
        # `python -O` (where extra labels would be silently ignored); AssertionError is kept so that
        # existing callers observe the same exception type as before.
        if len(labels) != 1:
            raise AssertionError("SequenceAbundanceEncoder: this encoding works only for single label.")

        label = labels[0]
        examples = self._calculate_sequence_abundance(dataset, self.comparison_data, label, params)

        if params.encode_labels:
            encoded_labels = dataset.get_metadata([label])
        else:
            encoded_labels = None

        feature_names = [SequenceAbundanceEncoder.RELEVANT_SEQUENCE_ABUNDANCE,
                         SequenceAbundanceEncoder.TOTAL_SEQUENCE_ABUNDANCE]

        encoded_data = EncodedData(examples, encoded_labels, dataset.get_repertoire_ids(), feature_names,
                                   encoding=SequenceAbundanceEncoder.__name__,
                                   info={'relevant_sequence_path': self.relevant_sequence_csv_path})

        return RepertoireDataset(params=dataset.params, encoded_data=encoded_data, repertoires=dataset.repertoires)