Esempio n. 1
0
    def _encode_by_model(self, dataset, params: EncoderParams, vectors):
        """Encode dataset examples via the word2vec vectors (cached), scale them, and build the encoded dataset."""
        repertoire_cache_key = self._prepare_caching_params(
            dataset, params, vectors, Word2VecEncoder.DESCRIPTION_REPERTOIRES)
        examples = CacheHandler.memo_by_params(
            repertoire_cache_key,
            lambda: self._encode_examples(dataset, vectors, params))

        labels = None
        if params.encode_labels:
            label_cache_key = self._prepare_caching_params(
                dataset, params, vectors, Word2VecEncoder.DESCRIPTION_LABELS)
            labels = CacheHandler.memo_by_params(
                label_cache_key,
                lambda: self._encode_labels(dataset, params))

        if params.learn_model:
            # learning phase: fit a fresh scaler and keep it on self for later use
            self.scaler = StandardScaler(with_std=True, with_mean=True)
            scaled_examples = FeatureScaler.standard_scale_fit(self.scaler, examples)
        else:
            # application phase: reuse the previously fit scaler (transform only)
            scaled_examples = FeatureScaler.standard_scale(self.scaler, examples)

        return self._build_encoded_dataset(dataset, scaled_examples, labels, params)
Esempio n. 2
0
    def _encode_data(self, dataset, params: EncoderParams) -> EncodedData:
        """Run the encode -> vectorize -> normalize (-> optional scale) pipeline, caching each step."""
        step_encoded = CacheHandler.memo_by_params(
            self._prepare_caching_params(dataset, params, KmerFrequencyEncoder.STEP_ENCODED),
            lambda: self._encode_examples(dataset, params))
        encoded_example_list, example_ids, encoded_labels, feature_annotation_names = step_encoded

        vectorized_examples, feature_names = CacheHandler.memo_by_params(
            self._prepare_caching_params(dataset, params, KmerFrequencyEncoder.STEP_VECTORIZED),
            lambda: self._vectorize_encoded(examples=encoded_example_list, params=params))

        normalized_examples = CacheHandler.memo_by_params(
            self._prepare_caching_params(dataset, params, KmerFrequencyEncoder.STEP_NORMALIZED),
            lambda: FeatureScaler.normalize(vectorized_examples, self.normalization_type))

        # optional unit-variance scaling as a final step
        if self.scale_to_unit_variance:
            examples = self.scale_normalized(params, dataset, normalized_examples)
        else:
            examples = normalized_examples

        feature_annotations = self._get_feature_annotations(feature_names,
                                                            feature_annotation_names)

        return EncodedData(examples=examples,
                           labels=encoded_labels,
                           feature_names=feature_names,
                           example_ids=example_ids,
                           feature_annotations=feature_annotations,
                           encoding=KmerFrequencyEncoder.__name__)
Esempio n. 3
0
    def encode(self, dataset, params: EncoderParams):
        """Encode the dataset, reusing a cached result when the caching params match."""
        caching_params = self._prepare_caching_params(dataset, params)
        return CacheHandler.memo_by_params(
            caching_params,
            lambda: self._encode_new_dataset(dataset, params))
Esempio n. 4
0
 def compute_tcr_dist(dataset: ReceptorDataset,
                      label_names: list,
                      cores: int = 1):
     """Compute (or fetch cached) TCRdist distances for the given receptor dataset."""
     cache_key = (('dataset_identifier', dataset.identifier), ("type", "TCRrep"))
     return CacheHandler.memo_by_params(
         cache_key,
         lambda: TCRdistHelper._compute_tcr_dist(dataset, label_names, cores))
Esempio n. 5
0
    def get_encoded_repertoire(self, repertoire, params: EncoderParams):
        """Encode one repertoire, memoized per encoder model, labels, and repertoire id."""
        params.model = vars(self)

        cache_key = (("encoding_model", params.model),
                     ("type", "kmer_encoding"),
                     ("labels", params.label_config.get_labels_by_name()),
                     ("repertoire_id", repertoire.identifier))
        return CacheHandler.memo_by_params(
            cache_key,
            lambda: self.encode_repertoire(repertoire, params),
            CacheObjectType.ENCODING_STEP)
Esempio n. 6
0
    def _encode_by_model(self, dataset, params: EncoderParams, vectors):
        """Encode examples with the word2vec vectors (cached), scale via a pickled scaler, build the dataset."""
        repertoire_cache_key = self._prepare_caching_params(
            dataset, params, vectors, Word2VecEncoder.DESCRIPTION_REPERTOIRES)
        examples = CacheHandler.memo_by_params(
            repertoire_cache_key,
            lambda: self._encode_examples(dataset, vectors, params))

        labels = None
        if params.encode_labels:
            label_cache_key = self._prepare_caching_params(
                dataset, params, vectors, Word2VecEncoder.DESCRIPTION_LABELS)
            labels = CacheHandler.memo_by_params(
                label_cache_key,
                lambda: self._encode_labels(dataset, params))

        # scaler state is persisted as a pickle under the result path
        scaler_filename = params.result_path / FilenameHandler.get_filename("standard_scaling", "pkl")
        scaled_examples = FeatureScaler.standard_scale(scaler_filename, examples)

        return self._build_encoded_dataset(dataset, scaled_examples, labels, params)
Esempio n. 7
0
 def fit(self,
         encoded_data: EncodedData,
         label_name: str,
         cores_for_training: int = 2):
     """Fit the models for the given label, memoizing the result on the caching params."""
     self.feature_names = encoded_data.feature_names
     caching_params = self._prepare_caching_params(encoded_data, "fit", label_name)
     self.models = CacheHandler.memo_by_params(
         caching_params,
         lambda: self._fit(encoded_data, label_name, cores_for_training))
Esempio n. 8
0
    def scale_normalized(self, params, dataset, normalized_examples):
        """Standard-scale the normalized examples using the pickled scaler; the result is cached."""
        if self.scaler_path is None:
            # default the scaler location into the result path on first use
            self.scaler_path = params.result_path / 'scaler.pickle'

        return CacheHandler.memo_by_params(
            self._prepare_caching_params(dataset, params, step=KmerFrequencyEncoder.STEP_SCALED),
            lambda: FeatureScaler.standard_scale(self.scaler_path, normalized_examples,
                                                 with_mean=self.scale_to_zero_mean))
    def _get_encoded_repertoire(self, repertoire, params: EncoderParams):
        """Encode one repertoire; the cache key includes a sha256 digest of its sequence data."""
        params.model = vars(self)

        data_digest = hashlib.sha256(
            np.ascontiguousarray(repertoire.get_attribute(self.sequence_type.value))).hexdigest()
        cache_key = (("encoding_model", params.model),
                     ("labels", params.label_config.get_labels_by_name()),
                     ("repertoire_id", repertoire.identifier),
                     ("repertoire_data", data_digest))
        return CacheHandler.memo_by_params(
            cache_key,
            lambda: self._encode_repertoire(repertoire, params),
            CacheObjectType.ENCODING)
    def _get_sequence_presence(self, full_dataset, full_sequence_set, params):
        """Compute (or fetch cached) the sequence presence matrix via CompAIRR."""
        cache_key = self._build_sequence_presence_params(full_dataset, self.compairr_params)
        sequence_presence_matrix, matrix_repertoire_ids = CacheHandler.memo_by_params(
            cache_key,
            lambda: self._compute_sequence_presence_with_compairr(full_dataset,
                                                                  full_sequence_set,
                                                                  params))
        return sequence_presence_matrix, matrix_repertoire_ids
Esempio n. 11
0
 def _process_repertoire_cached(self, repertoire, index, example_count):
     """Process one repertoire, memoized on repertoire id and the encoder's k-mer settings."""
     cache_key = (('repertoire', repertoire.identifier),
                  ('encoder', AtchleyKmerEncoder.__name__),
                  (self.abundance, self.skip_last_n_aa, self.skip_first_n_aa, self.k))
     return CacheHandler.memo_by_params(
         cache_key,
         lambda: self._process_repertoire(repertoire, index, example_count),
         CacheObjectType.ENCODING_STEP)
Esempio n. 12
0
 def fit(self,
         encoded_data: EncodedData,
         label_name: str,
         cores_for_training: int = 2):
     """Fit DeepRC on DeepRC-encoded data for one label; the fit itself is cached."""
     assert encoded_data.encoding == "DeepRCEncoder", f"DeepRC: ML method DeepRC is only compatible with the DeepRC encoder, found {encoded_data.encoding.replace('Encoder','')} encoder"
     self.feature_names = encoded_data.feature_names
     self._set_label_classes({label_name: encoded_data.labels[label_name]})
     caching_params = self._prepare_caching_params(encoded_data, "fit", label_name)
     self.model = CacheHandler.memo_by_params(
         caching_params,
         lambda: self._fit(encoded_data, label_name, cores_for_training))
Esempio n. 13
0
    def fit(self, encoded_data: EncodedData, label_name: str, cores_for_training: int = 2):
        """Fit the model for one label: build the class mapping, remap y, and cache the fit."""
        label_values = encoded_data.labels[label_name]
        self.class_mapping = Util.make_class_mapping(label_values)
        self.feature_names = encoded_data.feature_names
        self.label_name = label_name

        mapped_y = Util.map_to_new_class_values(label_values, self.class_mapping)

        caching_params = self._prepare_caching_params(encoded_data, label_values, self.FIT, label_name)
        self.model = CacheHandler.memo_by_params(
            caching_params,
            lambda: self._fit(encoded_data.examples, mapped_y, cores_for_training))
Esempio n. 14
0
    def encode(self, dataset, params: EncoderParams):
        """Encode the dataset (cached) and sync model_path/scaler state with the cache.

        The sync ensures that on a cache hit — when _encode_new_dataset is skipped —
        the encoder's model_path and scaler attributes are still restored.
        """
        cache_params = self._prepare_caching_params(dataset, params)
        encoded_dataset = CacheHandler.memo_by_params(
            cache_params,
            lambda: self._encode_new_dataset(dataset, params))

        EncoderHelper.sync_encoder_with_cache(
            cache_params,
            lambda: {'model_path': self.model_path, 'scaler': self.scaler},
            self,
            ['model_path', 'scaler'])

        return encoded_dataset
Esempio n. 15
0
    def fit_by_cross_validation(self, encoded_data: EncodedData, number_of_splits: int = 5, label_name: str = None, cores_for_training: int = -1,
                                optimization_metric='balanced_accuracy'):
        """Fit via cross-validation for one label: map classes, remap y, and cache the CV fit."""
        label_values = encoded_data.labels[label_name]
        self.class_mapping = Util.make_class_mapping(label_values)
        self.feature_names = encoded_data.feature_names
        self.label_name = label_name
        mapped_y = Util.map_to_new_class_values(label_values, self.class_mapping)

        caching_params = self._prepare_caching_params(encoded_data, mapped_y, self.FIT_CV,
                                                      label_name, number_of_splits)
        self.model = CacheHandler.memo_by_params(
            caching_params,
            lambda: self._fit_by_cross_validation(encoded_data.examples, mapped_y,
                                                  number_of_splits, label_name,
                                                  cores_for_training, optimization_metric))
Esempio n. 16
0
    def scale_normalized(self, params, dataset, normalized_examples):
        """Standard-scale the normalized examples, caching the scaled result.

        When learning (params.learn_model), a fresh StandardScaler is fit on
        the data and stored on self for later transform-only use; otherwise
        the previously fit self.scaler is applied without refitting.

        Defect fixed: both branches constructed the identical STEP_SCALED
        caching params; the construction is now hoisted and done once.
        """
        caching_params = self._prepare_caching_params(
            dataset, params, step=KmerFrequencyEncoder.STEP_SCALED)

        if params.learn_model:
            self.scaler = StandardScaler(with_mean=self.scale_to_zero_mean)
            examples = CacheHandler.memo_by_params(
                caching_params,
                lambda: FeatureScaler.standard_scale_fit(
                    self.scaler, normalized_examples,
                    with_mean=self.scale_to_zero_mean))
        else:
            examples = CacheHandler.memo_by_params(
                caching_params,
                lambda: FeatureScaler.standard_scale(
                    self.scaler, normalized_examples,
                    with_mean=self.scale_to_zero_mean))

        return examples
Esempio n. 17
0
    def build_comparison_data(dataset: RepertoireDataset, context: dict,
                              comparison_attributes: list,
                              params: EncoderParams, sequence_batch_size: int):
        """Build (or fetch cached) comparison data over the current dataset from the context."""
        current_dataset = EncoderHelper.get_current_dataset(dataset, context)
        cache_key = EncoderHelper.build_comparison_params(current_dataset,
                                                          comparison_attributes)
        return CacheHandler.memo_by_params(
            cache_key,
            lambda: EncoderHelper.build_comparison_data(current_dataset, params,
                                                        comparison_attributes,
                                                        sequence_batch_size))
Esempio n. 18
0
    def sync_encoder_with_cache(cache_params: tuple, encoder_memo_func,
                                encoder, param_names):
        """Restore the named encoder attributes from cache, storing them via encoder_memo_func on a miss."""
        # drop the 'learn_model' entry so train and apply phases share one cache key
        filtered_params = tuple(item for item in dict(cache_params).items()
                                if item[0] != 'learn_model')
        cached_state = CacheHandler.memo_by_params((filtered_params, "encoder"),
                                                   encoder_memo_func)

        # deep-copy so the live encoder never aliases cached objects
        for name in param_names:
            setattr(encoder, name, copy.deepcopy(cached_state[name]))

        return encoder
    def _encode_data(self, dataset, params: EncoderParams) -> EncodedData:
        """Encode and vectorize the examples (both steps cached), then package them as EncodedData."""
        encoded_example_list, example_ids, encoded_labels = CacheHandler.memo_by_params(
            self._prepare_caching_params(dataset, params, EvennessProfileEncoder.STEP_ENCODED),
            lambda: self._encode_examples(dataset, params))

        vectorized_examples = CacheHandler.memo_by_params(
            self._prepare_caching_params(dataset, params, EvennessProfileEncoder.STEP_VECTORIZED),
            lambda: self._vectorize_encoded(examples=encoded_example_list))

        # one feature per evenness dimension, annotated by its index
        feature_names = list(range(self.dimension))
        feature_annotations = pd.DataFrame({"feature": feature_names})

        return EncodedData(examples=vectorized_examples,
                           labels=encoded_labels,
                           feature_names=feature_names,
                           example_ids=example_ids,
                           feature_annotations=feature_annotations,
                           encoding=EvennessProfileEncoder.__name__)
Esempio n. 20
0
 def memo_by_params(self, dataset: RepertoireDataset):
     """Create (or fetch cached) comparison data for the given repertoire dataset."""
     return CacheHandler.memo_by_params(
         self.prepare_caching_params(dataset),
         lambda: self.create_comparison_data(dataset))
Esempio n. 21
0
 def compare(self, dataset: RepertoireDataset, comparison_fn,
             comparison_fn_name):
     """Run (or fetch cached) pairwise repertoire comparison with the given function."""
     cache_key = (("dataset_identifier", dataset.identifier),
                  "pairwise_comparison",
                  ("comparison_fn", comparison_fn_name))
     return CacheHandler.memo_by_params(
         cache_key,
         lambda: self.compare_repertoires(dataset, comparison_fn))
    def _get_full_sequence_set(self, full_dataset):
        """Return the (cached) full sequence set for the given dataset."""
        return CacheHandler.memo_by_params(
            self._build_dataset_params(full_dataset),
            lambda: self.get_sequence_set(full_dataset))