def encode(self, dataset, params: EncoderParams):
        """Validate the label setup, build the comparison data and encode ``dataset``.

        The label objects from ``params.label_config`` are handed to
        ``EncoderHelper.check_positive_class_label`` together with the encoder
        name. The comparison data is then built by ``SequenceFilterHelper``,
        stored on ``self.comparison_data``, and the actual encoding is delegated
        to ``self._encode_data``.

        Args:
            dataset: the dataset to encode.
            params: encoder parameters; ``label_config`` is read here and the
                object is forwarded to the helpers.

        Returns:
            The value returned by ``self._encode_data(dataset, params)``.
        """
        encoder_name = SequenceAbundanceEncoder.__name__
        label_objects = params.label_config.get_label_objects()
        EncoderHelper.check_positive_class_label(encoder_name, label_objects)

        comparison_data = SequenceFilterHelper.build_comparison_data(
            dataset, self.context, self.comparison_attributes, params,
            self.sequence_batch_size)
        self.comparison_data = comparison_data

        return self._encode_data(dataset, params)
    def _get_relevant_sequence_indices(self, params, label_str,
                                       sequence_p_values):
        """Return the boolean selection of sequences considered relevant.

        When ``params.learn_model`` is truthy, the selection is computed as
        ``sequence_p_values < self.p_value_threshold``, pickled to
        ``self.relevant_indices_path`` and the selected entries of
        ``self.full_sequence_set`` are passed to
        ``self._write_relevant_sequence_csv``. Otherwise the previously pickled
        selection is loaded from ``self.relevant_indices_path``.

        Args:
            params: object providing ``result_path`` and ``learn_model``.
            label_str: label name, forwarded to the label check when learning.
            sequence_p_values: p-values, one per sequence; only read when
                learning.

        Returns:
            The freshly computed boolean array, or the unpickled stored object.
        """
        # Default storage location, derived from the result path on first use.
        if self.relevant_indices_path is None:
            self.relevant_indices_path = params.result_path / 'relevant_sequence_indices.pickle'

        if not params.learn_model:
            # Reuse the selection stored by an earlier learning run.
            # NOTE(review): pickle.load trusts the file content - presumably
            # this file is only ever written by this method; confirm.
            with self.relevant_indices_path.open("rb") as stored:
                return pickle.load(stored)

        SequenceFilterHelper._check_label_object(params, label_str)

        is_relevant = np.array(sequence_p_values) < self.p_value_threshold

        with self.relevant_indices_path.open("wb") as sink:
            pickle.dump(is_relevant, sink)

        selected_sequences = self.full_sequence_set[is_relevant]
        self._write_relevant_sequence_csv(selected_sequences, params.result_path)

        return is_relevant
Exemple #3
0
    def _encode_sequence_count(self, dataset: RepertoireDataset, comparison_data: ComparisonData, label: str, params: EncoderParams) -> EncodedData:
        """Encode ``dataset`` as a count matrix restricted to the relevant sequences.

        ``SequenceFilterHelper.get_relevant_sequences`` supplies the selection of
        relevant sequences plus the two paths it reports; the paths are stored on
        the encoder if they were not set before. The selection is used both for
        building the count matrix and for picking the feature names.

        Args:
            dataset: repertoire dataset to encode.
            comparison_data: comparison data the counts and item names come from.
            label: name of the label used for the filtering and the metadata.
            params: encoder parameters; ``encode_labels`` decides whether label
                metadata is attached.

        Returns:
            An ``EncodedData`` object holding the count matrix, optional labels,
            repertoire ids, feature names and the relevant-sequence csv path.
        """
        relevant = SequenceFilterHelper.get_relevant_sequences(
            dataset, params, comparison_data, label, self.p_value_threshold,
            self.comparison_attributes, self.relevant_indices_path)
        selected_indices, reported_indices_path, reported_csv_path = relevant

        # Paths already set on the encoder take precedence over the reported ones.
        if self.relevant_indices_path is None:
            self.relevant_indices_path = reported_indices_path
        if self.relevant_sequence_csv_path is None:
            self.relevant_sequence_csv_path = reported_csv_path

        counts = self._build_count_matrix(comparison_data, dataset.get_repertoire_ids(), selected_indices)
        selected_names = comparison_data.get_item_names()[selected_indices]

        if params.encode_labels:
            labels = dataset.get_metadata([label])
        else:
            labels = None

        return EncodedData(counts, labels, dataset.get_repertoire_ids(), selected_names,
                           encoding=SequenceCountEncoder.__name__,
                           info={'relevant_sequence_path': self.relevant_sequence_csv_path})
    def _calculate_sequence_abundance(self, dataset: RepertoireDataset,
                                      comparison_data: ComparisonData,
                                      label: str, params: EncoderParams):
        """Build the abundance matrix for ``dataset`` from the relevant sequences.

        The relevant-sequence selection and the two paths reported by
        ``SequenceFilterHelper.get_relevant_sequences`` are obtained first; the
        paths are stored on the encoder when they were not set before.

        Args:
            dataset: repertoire dataset to process.
            comparison_data: comparison data passed to the filter helper and to
                the matrix builder.
            label: name of the label used for the filtering.
            params: encoder parameters forwarded to the filter helper.

        Returns:
            The value returned by ``self._build_abundance_matrix``.
        """
        filter_arguments = {
            "dataset": dataset,
            "params": params,
            "comparison_data": comparison_data,
            "label": label,
            "p_value_threshold": self.p_value_threshold,
            "comparison_attributes": self.comparison_attributes,
            "sequence_indices_path": self.relevant_indices_path,
        }
        relevant_indices, reported_indices_path, reported_csv_path = \
            SequenceFilterHelper.get_relevant_sequences(**filter_arguments)

        # Paths already set on the encoder take precedence over the reported ones.
        if self.relevant_indices_path is None:
            self.relevant_indices_path = reported_indices_path

        if self.relevant_sequence_csv_path is None:
            self.relevant_sequence_csv_path = reported_csv_path

        repertoire_ids = dataset.get_repertoire_ids()
        return self._build_abundance_matrix(comparison_data, repertoire_ids, relevant_indices)
Exemple #5
0
 def encode(self, dataset, params: EncoderParams):
     """Build the comparison data for ``dataset`` and return its encoding.

     The comparison data produced by ``SequenceFilterHelper`` is stored on
     ``self.comparison_data`` before the work is delegated to
     ``self._encode_data``.

     Args:
         dataset: the dataset to encode.
         params: encoder parameters, forwarded to the helpers.

     Returns:
         The value returned by ``self._encode_data(dataset, params)``.
     """
     comparison_data = SequenceFilterHelper.build_comparison_data(
         dataset, self.context, self.comparison_attributes, params, self.sequence_batch_size)
     self.comparison_data = comparison_data

     return self._encode_data(dataset, params)
Exemple #6
0
    def test_find_label_associated_sequence_p_values(self):
        """Check the p-values computed for a hand-built ``ComparisonData`` object.

        Four repertoires are created with label ``l1`` set to True, True, False
        and False. Nine sequences spread over five batches each get one matrix
        row with four columns (the column index of every repertoire comes from
        ``repertoire_index_mapping``). The p-values returned by
        ``SequenceFilterHelper.find_label_associated_sequence_p_values`` are
        then compared with the expected values.
        """
        path = EnvironmentSettings.tmp_test_path / "comparison_data_find_label_assocseqpvalues/"
        PathBuilder.build(path)
        # Registered right away so the temporary directory is removed even when
        # an assertion (or any call below) fails; previously it leaked then.
        self.addCleanup(shutil.rmtree, path)

        label_values = [True, True, False, False]
        subject_ids = ["rep_0", "rep_1", "rep_2", "rep_3"]
        repertoires = [
            Repertoire.build_from_sequence_objects(
                [ReceptorSequence()], path,
                {"l1": val, "subject_id": subject_id})
            for val, subject_id in zip(label_values, subject_ids)
        ]

        col_name_index = {
            repertoire.identifier: index
            for index, repertoire in enumerate(repertoires)
        }

        comparison_data = ComparisonData(
            repertoire_ids=[repertoire.identifier for repertoire in repertoires],
            comparison_attributes=["sequence_aas"],
            sequence_batch_size=4,
            path=path)

        # One entry per batch: (matrix rows, items); the batch identifier is the
        # position of the entry in this list.
        batch_contents = [
            ([[1., 0., 0., 0.], [1., 1., 0., 0.]], [('GGG', ), ('III', )]),
            ([[1., 1., 0., 1.], [1., 1., 1., 1.]], [('LLL', ), ('MMM', )]),
            ([[0., 1., 0., 0.], [0., 1., 0., 1.]], [('DDD', ), ('EEE', )]),
            ([[0., 1., 1., 1.], [0., 0., 1., 1.]], [('FFF', ), ('CCC', )]),
            ([[0., 0., 0., 1.]], [('AAA', )]),
        ]
        comparison_data.batches = [
            ComparisonDataBatch(matrix=np.array(rows),
                                items=items,
                                repertoire_index_mapping=col_name_index,
                                path=path,
                                identifier=batch_id)
            for batch_id, (rows, items) in enumerate(batch_contents)
        ]

        p_values = SequenceFilterHelper.find_label_associated_sequence_p_values(
            comparison_data, repertoires,
            Label('l1', [True, False], positive_class=True))

        invalid = SequenceFilterHelper.INVALID_P_VALUE
        expected_p_values = [
            invalid, 0.1666666666666667, 0.5000000000000001, 1., invalid,
            0.8333333333333331, 1., 1., 2
        ]

        # The computed values are reported in the failure message instead of
        # being printed unconditionally on every run.
        self.assertTrue(
            np.allclose(expected_p_values, p_values, equal_nan=True),
            msg="unexpected p-values: {}".format(p_values))