def _encode_data(self, dataset: RepertoireDataset, params: EncoderParams):
        labels = params.label_config.get_labels_by_name()

        assert len(labels) == 1, \
            "SequenceAbundanceEncoder: this encoding works only for single label."

        examples = self._calculate_sequence_abundance(dataset,
                                                      self.comparison_data,
                                                      labels[0], params)

        encoded_data = EncodedData(
            examples,
            dataset.get_metadata([labels[0]])
            if params.encode_labels else None,
            dataset.get_repertoire_ids(), [
                SequenceAbundanceEncoder.RELEVANT_SEQUENCE_ABUNDANCE,
                SequenceAbundanceEncoder.TOTAL_SEQUENCE_ABUNDANCE
            ],
            encoding=SequenceAbundanceEncoder.__name__,
            info={'relevant_sequence_path': self.relevant_sequence_csv_path})

        encoded_dataset = RepertoireDataset(labels=dataset.labels,
                                            encoded_data=encoded_data,
                                            repertoires=dataset.repertoires)

        return encoded_dataset
Exemple #2
0
    def build_labels(self, dataset: RepertoireDataset, params: EncoderParams) -> dict:

        lbl = ["repertoire_identifier"]
        lbl.extend(params.label_config.get_labels_by_name())

        tmp_labels = dataset.get_metadata(lbl, return_df=True)
        tmp_labels = tmp_labels.iloc[pd.Index(tmp_labels['repertoire_identifier']).get_indexer(dataset.get_repertoire_ids())]
        tmp_labels = tmp_labels.to_dict("list")
        del tmp_labels["repertoire_identifier"]

        return tmp_labels
Exemple #3
0
    def _encode_sequence_count(self, dataset: RepertoireDataset, comparison_data: ComparisonData, label: str, params: EncoderParams) -> EncodedData:
        sequence_p_values_indices, indices_path, relevant_sequences_path = SequenceFilterHelper.get_relevant_sequences(dataset, params,
                                                                                                                       comparison_data, label,
                                                                                                                       self.p_value_threshold,
                                                                                                                       self.comparison_attributes,
                                                                                                                       self.relevant_indices_path)
        if self.relevant_indices_path is None:
            self.relevant_indices_path = indices_path
        if self.relevant_sequence_csv_path is None:
            self.relevant_sequence_csv_path = relevant_sequences_path

        count_matrix = self._build_count_matrix(comparison_data, dataset.get_repertoire_ids(), sequence_p_values_indices)
        feature_names = comparison_data.get_item_names()[sequence_p_values_indices]

        encoded_data = EncodedData(count_matrix, dataset.get_metadata([label]) if params.encode_labels else None,
                                   dataset.get_repertoire_ids(),
                                   feature_names,
                                   encoding=SequenceCountEncoder.__name__, info={'relevant_sequence_path': self.relevant_sequence_csv_path})

        return encoded_data
    def _encode_examples(self, dataset: RepertoireDataset,
                         params: EncoderParams) -> Tuple[list, set, dict]:

        keys = set()
        example_count = dataset.get_example_count()

        arguments = [(repertoire, index, example_count)
                     for index, repertoire in enumerate(dataset.repertoires)]

        with Pool(params.pool_size) as pool:
            chunksize = math.floor(
                dataset.get_example_count() / params.pool_size) + 1
            examples = pool.starmap(self._process_repertoire_cached,
                                    arguments,
                                    chunksize=chunksize)

        for example in examples:
            keys.update(list(example.keys()))

        labels = dataset.get_metadata(params.label_config.get_labels_by_name()
                                      ) if params.encode_labels else None

        return examples, keys, labels
    def _encode_data(self, dataset: RepertoireDataset, params: EncoderParams):
        label_name = params.label_config.get_labels_by_name()[0]

        examples = self._calculate_sequence_abundance(
            dataset, self.sequence_presence_matrix, self.matrix_repertoire_ids,
            label_name, params)

        encoded_data = EncodedData(
            examples,
            dataset.get_metadata([label_name])
            if params.encode_labels else None,
            dataset.get_repertoire_ids(), [
                CompAIRRSequenceAbundanceEncoder.RELEVANT_SEQUENCE_ABUNDANCE,
                CompAIRRSequenceAbundanceEncoder.TOTAL_SEQUENCE_ABUNDANCE
            ],
            encoding=CompAIRRSequenceAbundanceEncoder.__name__,
            info={'relevant_sequence_path': self.relevant_sequence_csv_path})

        encoded_dataset = RepertoireDataset(labels=dataset.labels,
                                            encoded_data=encoded_data,
                                            repertoires=dataset.repertoires)

        return encoded_dataset
 def get_matching_indices(dataset: RepertoireDataset, criteria):
     metadata = pd.DataFrame(dataset.get_metadata(None))
     matches = CriteriaMatcher().match(criteria, metadata)
     indices = np.where(matches)[0]
     return indices
 def build_labels(self, dataset: RepertoireDataset,
                  params: EncoderParams) -> dict:
     lbl = params.label_config.get_labels_by_name()
     return dataset.get_metadata(lbl, return_df=False)