def _encode_data(self, dataset: RepertoireDataset, params: EncoderParams):
        """Encode the dataset as a sequence-abundance matrix for a single label.

        The label names are read from ``params.label_config``; exactly one
        label must be configured (checked with an assertion). The abundance
        matrix is computed from ``self.comparison_data`` and wrapped in an
        ``EncodedData`` object, which is attached to a new
        ``RepertoireDataset`` that reuses the labels and repertoires of the
        input dataset.

        Label metadata is only fetched when ``params.encode_labels`` is truthy;
        otherwise ``None`` is passed in its place.
        """
        label_names = params.label_config.get_labels_by_name()

        assert len(label_names) == 1, \
            "SequenceAbundanceEncoder: this encoding works only for single label."

        label_name = label_names[0]
        abundances = self._calculate_sequence_abundance(
            dataset, self.comparison_data, label_name, params)

        if params.encode_labels:
            label_values = dataset.get_metadata([label_name])
        else:
            label_values = None

        example_ids = dataset.get_repertoire_ids()
        feature_names = [
            SequenceAbundanceEncoder.RELEVANT_SEQUENCE_ABUNDANCE,
            SequenceAbundanceEncoder.TOTAL_SEQUENCE_ABUNDANCE,
        ]

        # The csv path is read only after the abundance computation above,
        # which is the same point at which the original code read it.
        encoded = EncodedData(
            abundances,
            label_values,
            example_ids,
            feature_names,
            encoding=SequenceAbundanceEncoder.__name__,
            info={'relevant_sequence_path': self.relevant_sequence_csv_path})

        return RepertoireDataset(labels=dataset.labels,
                                 encoded_data=encoded,
                                 repertoires=dataset.repertoires)
Example #2
0
    def create_comparison_data(self,
                               dataset: RepertoireDataset) -> ComparisonData:
        """Create a ``ComparisonData`` object for the dataset and fill it.

        The object is constructed from the dataset's repertoire ids together
        with ``self.matching_columns``, ``self.sequence_batch_size`` and
        ``self.path``; ``process_dataset`` is then called on it with the
        dataset before it is returned.
        """
        repertoire_ids = dataset.get_repertoire_ids()
        result = ComparisonData(repertoire_ids, self.matching_columns,
                                self.sequence_batch_size, self.path)
        result.process_dataset(dataset)
        return result
Example #3
0
    def build_comparison_data(dataset: RepertoireDataset,
                              params: EncoderParams, comparison_attributes,
                              sequence_batch_size):
        """Create and populate a ``ComparisonData`` object for the dataset.

        The object is constructed from the dataset's repertoire ids, the given
        comparison attributes and batch size, and ``params.result_path``;
        ``process_dataset`` is called on it with the dataset before returning.

        NOTE(review): there is no ``self`` parameter, so this is presumably a
        static method whose decorator is outside the visible source - confirm.
        """
        repertoire_ids = dataset.get_repertoire_ids()
        result = ComparisonData(repertoire_ids, comparison_attributes,
                                sequence_batch_size, params.result_path)
        result.process_dataset(dataset)
        return result
Example #4
0
    def _encode_sequence_count(self, dataset: RepertoireDataset, comparison_data: ComparisonData, label: str, params: EncoderParams) -> EncodedData:
        """Build the count-matrix encoding restricted to the relevant sequences.

        ``SequenceFilterHelper.get_relevant_sequences`` supplies the indices of
        the relevant sequences plus two paths. Each path is stored on the
        encoder only if the corresponding attribute is still ``None``. The
        count matrix is built for those indices, and the matching item names
        from ``comparison_data`` are used as feature names.

        Label metadata is only fetched when ``params.encode_labels`` is truthy;
        otherwise ``None`` is passed in its place.
        """
        filter_result = SequenceFilterHelper.get_relevant_sequences(
            dataset, params, comparison_data, label,
            self.p_value_threshold,
            self.comparison_attributes,
            self.relevant_indices_path)
        relevant_indices, new_indices_path, new_csv_path = filter_result

        # Paths that were already set on the encoder are never overwritten.
        if self.relevant_indices_path is None:
            self.relevant_indices_path = new_indices_path
        if self.relevant_sequence_csv_path is None:
            self.relevant_sequence_csv_path = new_csv_path

        counts = self._build_count_matrix(comparison_data, dataset.get_repertoire_ids(), relevant_indices)
        all_item_names = comparison_data.get_item_names()
        selected_names = all_item_names[relevant_indices]

        if params.encode_labels:
            label_values = dataset.get_metadata([label])
        else:
            label_values = None

        return EncodedData(counts,
                           label_values,
                           dataset.get_repertoire_ids(),
                           selected_names,
                           encoding=SequenceCountEncoder.__name__,
                           info={'relevant_sequence_path': self.relevant_sequence_csv_path})
Example #5
0
    def build_distance_matrix(self, dataset: RepertoireDataset, params: EncoderParams, train_repertoire_ids: list):
        """Compute pairwise repertoire distances and slice them for this dataset.

        A ``PairwiseRepertoireComparison`` is created and stored on
        ``self.comparison``. The comparison runs on ``self.context["dataset"]``
        when a context containing that key is available, and on ``dataset``
        otherwise. The result is then restricted with ``.loc`` to rows for the
        repertoire ids of ``dataset`` and columns for ``train_repertoire_ids``.
        """
        self.comparison = PairwiseRepertoireComparison(
            self.attributes_to_match,
            self.attributes_to_match,
            params.result_path,
            sequence_batch_size=self.sequence_batch_size)

        if self.context is not None and "dataset" in self.context:
            dataset_to_compare = self.context["dataset"]
        else:
            dataset_to_compare = dataset

        full_matrix = self.comparison.compare(dataset_to_compare, self.distance_fn, self.distance_metric.value)

        row_ids = dataset.get_repertoire_ids()
        return full_matrix.loc[row_ids, train_repertoire_ids]
    def _calculate_sequence_abundance(self, dataset: RepertoireDataset,
                                      sequence_presence_matrix,
                                      matrix_repertoire_ids, label_str: str,
                                      params: EncoderParams):
        """Build the abundance matrix from a sequence presence matrix.

        Runs three helper steps in order: compute the label-associated p-values
        per sequence, derive the relevant sequence indices from them, and build
        the abundance matrix for the repertoire ids of ``dataset`` using only
        those sequences.
        """
        p_values = self._find_label_associated_sequence_p_values(
            sequence_presence_matrix, matrix_repertoire_ids, dataset, params,
            label_str)

        relevant_indices = self._get_relevant_sequence_indices(
            params, label_str, p_values)

        return self._build_abundance_matrix(
            sequence_presence_matrix, matrix_repertoire_ids,
            dataset.get_repertoire_ids(), relevant_indices)
Example #7
0
    def build_labels(self, dataset: RepertoireDataset,
                     params: EncoderParams) -> dict:
        """Return the label values as a dict of lists, ordered like the dataset.

        The metadata is fetched as a data frame containing the
        ``repertoire_identifier`` column and every configured label. Rows are
        reordered to follow ``dataset.get_repertoire_ids()``, the frame is
        converted to a ``{column: list}`` dict, and the identifier column is
        dropped from the result.
        """
        id_column = "repertoire_identifier"
        columns = [id_column, *params.label_config.get_labels_by_name()]

        metadata = dataset.get_metadata(columns, return_df=True)

        # Position of each dataset repertoire id within the metadata rows.
        positions = pd.Index(metadata[id_column]).get_indexer(
            dataset.get_repertoire_ids())
        ordered = metadata.iloc[positions]

        label_values = ordered.to_dict("list")
        label_values.pop(id_column)

        return label_values
    def _encode_repertoires(self, dataset: RepertoireDataset, params: EncoderParams):
        """Encode each repertoire as a row of matches against the reference receptors.

        Returns a tuple ``(matrix, labels, repertoire_ids)``. ``matrix`` is an
        integer array with one row per example and two columns per entry of
        ``self.reference_receptors``. ``labels`` maps each configured label
        name to the list of values read from the repertoires' metadata, or is
        ``None`` when ``params.encode_labels`` is falsy.
        """
        # Rows = repertoires, columns = reference chains (two per reference receptor)
        row_count = dataset.get_example_count()
        column_count = len(self.reference_receptors) * 2
        match_matrix = np.zeros((row_count, column_count), dtype=int)

        if params.encode_labels:
            labels = {name: [] for name in params.label_config.get_labels_by_name()}
        else:
            labels = None

        for row, repertoire in enumerate(dataset.get_data()):
            match_matrix[row] = self._match_repertoire_to_receptors(repertoire)

            if labels is None:
                continue

            for name in params.label_config.get_labels_by_name():
                labels[name].append(repertoire.metadata[name])

        return match_matrix, labels, dataset.get_repertoire_ids()
    def build_distance_matrix(self, dataset: RepertoireDataset,
                              params: EncoderParams,
                              train_repertoire_ids: list):
        """Compute a distance matrix from the CompAIRR overlap and slice it.

        The overlap is computed on ``self.context["dataset"]`` when a context
        containing that key is available, and on ``dataset`` otherwise. The
        overlap output is passed to ``self._morisita_horn``, and the resulting
        matrix is restricted with ``.loc`` to rows for the repertoire ids of
        ``dataset`` and columns for ``train_repertoire_ids``.
        """
        if self.context is not None and "dataset" in self.context:
            dataset_to_compare = self.context["dataset"]
        else:
            dataset_to_compare = dataset

        overlap = self._compute_overlap_with_compairr(dataset_to_compare,
                                                      params)
        raw_matrix, sizes, indices = overlap

        full_matrix = self._morisita_horn(raw_matrix, sizes, indices)

        row_ids = dataset.get_repertoire_ids()
        return full_matrix.loc[row_ids, train_repertoire_ids]
Example #10
0
    def compare_repertoires(self, dataset: RepertoireDataset, comparison_fn):
        """Apply ``comparison_fn`` to every pair of repertoires in the dataset.

        The comparison data is obtained through ``self.memo_by_params`` and
        stored on ``self.comparison_data``. ``comparison_fn`` is evaluated only
        for pairs with ``column >= row`` (diagonal included), and each result
        is copied to the mirrored cell, so the returned matrix is symmetric.

        Returns a ``pandas.DataFrame`` whose index and columns are both the
        repertoire ids of the dataset.
        """
        self.comparison_data = self.memo_by_params(dataset)
        size = dataset.get_example_count()
        scores = np.zeros([size, size])
        ids = dataset.get_repertoire_ids()

        for row in range(size):
            row_vector = self.comparison_data.get_repertoire_vector(ids[row])

            for col in range(row, size):
                col_vector = self.comparison_data.get_repertoire_vector(
                    ids[col])
                scores[row, col] = comparison_fn(row_vector, col_vector)
                # Mirror the stored value into the lower triangle.
                scores[col, row] = scores[row, col]

        return pd.DataFrame(scores, columns=ids, index=ids)
    def _encode_data(self, dataset: RepertoireDataset, params: EncoderParams):
        """Encode the dataset as a sequence-abundance matrix for the first label.

        Only the first name returned by
        ``params.label_config.get_labels_by_name()`` is used. The abundance
        matrix is computed from ``self.sequence_presence_matrix`` and
        ``self.matrix_repertoire_ids`` and wrapped in an ``EncodedData``
        object, which is attached to a new ``RepertoireDataset`` that reuses
        the labels and repertoires of the input dataset.

        Label metadata is only fetched when ``params.encode_labels`` is truthy;
        otherwise ``None`` is passed in its place.
        """
        first_label = params.label_config.get_labels_by_name()[0]

        abundances = self._calculate_sequence_abundance(
            dataset, self.sequence_presence_matrix, self.matrix_repertoire_ids,
            first_label, params)

        if params.encode_labels:
            label_values = dataset.get_metadata([first_label])
        else:
            label_values = None

        example_ids = dataset.get_repertoire_ids()
        feature_names = [
            CompAIRRSequenceAbundanceEncoder.RELEVANT_SEQUENCE_ABUNDANCE,
            CompAIRRSequenceAbundanceEncoder.TOTAL_SEQUENCE_ABUNDANCE,
        ]

        encoded = EncodedData(
            abundances,
            label_values,
            example_ids,
            feature_names,
            encoding=CompAIRRSequenceAbundanceEncoder.__name__,
            info={'relevant_sequence_path': self.relevant_sequence_csv_path})

        return RepertoireDataset(labels=dataset.labels,
                                 encoded_data=encoded,
                                 repertoires=dataset.repertoires)
    def _calculate_sequence_abundance(self, dataset: RepertoireDataset,
                                      comparison_data: ComparisonData,
                                      label: str, params: EncoderParams):
        """Build the abundance matrix restricted to the relevant sequences.

        ``SequenceFilterHelper.get_relevant_sequences`` supplies the indices of
        the relevant sequences plus two paths. Each path is stored on the
        encoder only if the corresponding attribute is still ``None``. The
        abundance matrix is then built from ``comparison_data`` for the
        repertoire ids of ``dataset`` and those indices.
        """
        filter_result = SequenceFilterHelper.get_relevant_sequences(
            dataset=dataset,
            params=params,
            comparison_data=comparison_data,
            label=label,
            p_value_threshold=self.p_value_threshold,
            comparison_attributes=self.comparison_attributes,
            sequence_indices_path=self.relevant_indices_path)
        relevant_indices, new_indices_path, new_csv_path = filter_result

        # Paths that were already set on the encoder are never overwritten.
        if self.relevant_indices_path is None:
            self.relevant_indices_path = new_indices_path

        if self.relevant_sequence_csv_path is None:
            self.relevant_sequence_csv_path = new_csv_path

        return self._build_abundance_matrix(
            comparison_data, dataset.get_repertoire_ids(), relevant_indices)