def _encode_by_model(self, dataset, params: EncoderParams, vectors):
    examples = CacheHandler.memo_by_params(
        self._prepare_caching_params(dataset, params, vectors, Word2VecEncoder.DESCRIPTION_REPERTOIRES),
        lambda: self._encode_examples(dataset, vectors, params))

    if params.encode_labels:
        labels = CacheHandler.memo_by_params(
            self._prepare_caching_params(dataset, params, vectors, Word2VecEncoder.DESCRIPTION_LABELS),
            lambda: self._encode_labels(dataset, params))
    else:
        labels = None

    if params.learn_model:
        self.scaler = StandardScaler(with_std=True, with_mean=True)
        scaled_examples = FeatureScaler.standard_scale_fit(self.scaler, examples)
    else:
        scaled_examples = FeatureScaler.standard_scale(self.scaler, examples)

    encoded_dataset = self._build_encoded_dataset(dataset, scaled_examples, labels, params)
    return encoded_dataset
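# The learn_model branch above fits a fresh scaler on training data and reuses
# the stored one at application time. A minimal sketch of what the assumed
# FeatureScaler.standard_scale_fit / standard_scale wrappers could look like,
# built directly on scikit-learn's StandardScaler (names here are illustrative,
# not the library's actual implementation):
from sklearn.preprocessing import StandardScaler

def standard_scale_fit_sketch(scaler: StandardScaler, examples):
    # Training time: learn mean/variance from the data, then transform it.
    return scaler.fit_transform(examples)

def standard_scale_sketch(scaler: StandardScaler, examples):
    # Application time: reuse the previously fitted statistics unchanged.
    return scaler.transform(examples)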
def _encode_data(self, dataset, params: EncoderParams) -> EncodedData:
    encoded_example_list, example_ids, encoded_labels, feature_annotation_names = CacheHandler.memo_by_params(
        self._prepare_caching_params(dataset, params, KmerFrequencyEncoder.STEP_ENCODED),
        lambda: self._encode_examples(dataset, params))

    vectorized_examples, feature_names = CacheHandler.memo_by_params(
        self._prepare_caching_params(dataset, params, KmerFrequencyEncoder.STEP_VECTORIZED),
        lambda: self._vectorize_encoded(examples=encoded_example_list, params=params))

    normalized_examples = CacheHandler.memo_by_params(
        self._prepare_caching_params(dataset, params, KmerFrequencyEncoder.STEP_NORMALIZED),
        lambda: FeatureScaler.normalize(vectorized_examples, self.normalization_type))

    if self.scale_to_unit_variance:
        examples = self.scale_normalized(params, dataset, normalized_examples)
    else:
        examples = normalized_examples

    feature_annotations = self._get_feature_annotations(feature_names, feature_annotation_names)

    encoded_data = EncodedData(examples=examples, labels=encoded_labels, feature_names=feature_names,
                               example_ids=example_ids, feature_annotations=feature_annotations,
                               encoding=KmerFrequencyEncoder.__name__)

    return encoded_data
def encode(self, dataset, params: EncoderParams):
    encoded_dataset = CacheHandler.memo_by_params(
        self._prepare_caching_params(dataset, params),
        lambda: self._encode_new_dataset(dataset, params))
    return encoded_dataset
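# All the snippets in this section funnel through CacheHandler.memo_by_params.
# A minimal sketch of what such a params-keyed memoization helper might do,
# assuming the real implementation hashes the params tuple and pickles results
# to a cache directory; CACHE_DIR and memo_by_params_sketch are illustrative
# names, not the actual API.
import hashlib
import pickle
from pathlib import Path

CACHE_DIR = Path("cache")  # hypothetical cache location

def memo_by_params_sketch(params: tuple, fn):
    # Derive a stable key from the textual form of the params tuple.
    key = hashlib.sha256(repr(params).encode()).hexdigest()
    cache_file = CACHE_DIR / f"{key}.pickle"
    if cache_file.is_file():
        with cache_file.open("rb") as f:
            return pickle.load(f)
    result = fn()  # the zero-argument lambda defers the expensive computation
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    with cache_file.open("wb") as f:
        pickle.dump(result, f)
    return result

# Usage: the computation runs once; a second call with the same params tuple
# is served from disk without invoking the lambda.
value = memo_by_params_sketch((("dataset_identifier", "d1"), ("step", "encoded")),
                              lambda: [x ** 2 for x in range(10)])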
def compute_tcr_dist(dataset: ReceptorDataset, label_names: list, cores: int = 1):
    return CacheHandler.memo_by_params(
        (('dataset_identifier', dataset.identifier), ("type", "TCRrep")),
        lambda: TCRdistHelper._compute_tcr_dist(dataset, label_names, cores))
def get_encoded_repertoire(self, repertoire, params: EncoderParams):
    params.model = vars(self)

    return CacheHandler.memo_by_params(
        (("encoding_model", params.model), ("type", "kmer_encoding"),
         ("labels", params.label_config.get_labels_by_name()),
         ("repertoire_id", repertoire.identifier)),
        lambda: self.encode_repertoire(repertoire, params), CacheObjectType.ENCODING_STEP)
def _encode_by_model(self, dataset, params: EncoderParams, vectors):
    examples = CacheHandler.memo_by_params(
        self._prepare_caching_params(dataset, params, vectors, Word2VecEncoder.DESCRIPTION_REPERTOIRES),
        lambda: self._encode_examples(dataset, vectors, params))

    if params.encode_labels:
        labels = CacheHandler.memo_by_params(
            self._prepare_caching_params(dataset, params, vectors, Word2VecEncoder.DESCRIPTION_LABELS),
            lambda: self._encode_labels(dataset, params))
    else:
        labels = None

    scaler_filename = params.result_path / FilenameHandler.get_filename("standard_scaling", "pkl")
    scaled_examples = FeatureScaler.standard_scale(scaler_filename, examples)

    encoded_dataset = self._build_encoded_dataset(dataset, scaled_examples, labels, params)
    return encoded_dataset
def fit(self, encoded_data: EncodedData, label_name: str, cores_for_training: int = 2):
    self.feature_names = encoded_data.feature_names
    self.models = CacheHandler.memo_by_params(
        self._prepare_caching_params(encoded_data, "fit", label_name),
        lambda: self._fit(encoded_data, label_name, cores_for_training))
def scale_normalized(self, params, dataset, normalized_examples):
    self.scaler_path = params.result_path / 'scaler.pickle' if self.scaler_path is None else self.scaler_path

    examples = CacheHandler.memo_by_params(
        self._prepare_caching_params(dataset, params, step=KmerFrequencyEncoder.STEP_SCALED),
        lambda: FeatureScaler.standard_scale(self.scaler_path, normalized_examples,
                                             with_mean=self.scale_to_zero_mean))
    return examples
def _get_encoded_repertoire(self, repertoire, params: EncoderParams):
    params.model = vars(self)

    return CacheHandler.memo_by_params(
        (("encoding_model", params.model),
         ("labels", params.label_config.get_labels_by_name()),
         ("repertoire_id", repertoire.identifier),
         ("repertoire_data",
          hashlib.sha256(np.ascontiguousarray(repertoire.get_attribute(self.sequence_type.value))).hexdigest())),
        lambda: self._encode_repertoire(repertoire, params), CacheObjectType.ENCODING)
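# The cache key above fingerprints the repertoire content itself, not just its
# identifier, so a changed repertoire invalidates stale cache entries. The same
# hashlib/numpy idiom on a toy array of sequences (this works for numpy dtypes
# that expose the buffer protocol, as fixed-width string arrays do):
import hashlib
import numpy as np

sequences = np.array(["CASSLGTDTQYF", "CASSIRSSYEQYF"])
fingerprint = hashlib.sha256(np.ascontiguousarray(sequences)).hexdigest()
# Equal content yields equal digests; editing any sequence changes the key.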
def _get_sequence_presence(self, full_dataset, full_sequence_set, params):
    sequence_presence_matrix, matrix_repertoire_ids = CacheHandler.memo_by_params(
        self._build_sequence_presence_params(full_dataset, self.compairr_params),
        lambda: self._compute_sequence_presence_with_compairr(full_dataset, full_sequence_set, params))

    return sequence_presence_matrix, matrix_repertoire_ids
def _process_repertoire_cached(self, repertoire, index, example_count):
    return CacheHandler.memo_by_params(
        (('repertoire', repertoire.identifier), ('encoder', AtchleyKmerEncoder.__name__),
         (self.abundance, self.skip_last_n_aa, self.skip_first_n_aa, self.k)),
        lambda: self._process_repertoire(repertoire, index, example_count),
        CacheObjectType.ENCODING_STEP)
def fit(self, encoded_data: EncodedData, label_name: str, cores_for_training: int = 2):
    assert encoded_data.encoding == "DeepRCEncoder", \
        f"DeepRC: ML method DeepRC is only compatible with the DeepRC encoder, " \
        f"found {encoded_data.encoding.replace('Encoder', '')} encoder"

    self.feature_names = encoded_data.feature_names
    self._set_label_classes({label_name: encoded_data.labels[label_name]})

    self.model = CacheHandler.memo_by_params(
        self._prepare_caching_params(encoded_data, "fit", label_name),
        lambda: self._fit(encoded_data, label_name, cores_for_training))
def fit(self, encoded_data: EncodedData, label_name: str, cores_for_training: int = 2):
    self.class_mapping = Util.make_class_mapping(encoded_data.labels[label_name])
    self.feature_names = encoded_data.feature_names
    self.label_name = label_name
    mapped_y = Util.map_to_new_class_values(encoded_data.labels[label_name], self.class_mapping)

    self.model = CacheHandler.memo_by_params(
        self._prepare_caching_params(encoded_data, encoded_data.labels[label_name], self.FIT, label_name),
        lambda: self._fit(encoded_data.examples, mapped_y, cores_for_training))
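# Util.make_class_mapping / map_to_new_class_values above normalize arbitrary
# label values into the internal classes the model trains on. A toy sketch of
# that idea, assuming a simple index <-> value mapping (the real Util helpers
# may differ in detail):
import numpy as np

def make_class_mapping_sketch(y):
    # Map each internal class index to one of the distinct observed label values.
    return {i: value for i, value in enumerate(sorted(set(y)))}

def map_to_new_class_values_sketch(y, class_mapping):
    inverse = {value: i for i, value in class_mapping.items()}
    return np.array([inverse[value] for value in y])

mapping = make_class_mapping_sketch(["sick", "healthy", "sick"])     # {0: 'healthy', 1: 'sick'}
mapped_y = map_to_new_class_values_sketch(["sick", "healthy", "sick"], mapping)  # [1, 0, 1]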
def encode(self, dataset, params: EncoderParams):
    cache_params = self._prepare_caching_params(dataset, params)
    encoded_dataset = CacheHandler.memo_by_params(
        cache_params, lambda: self._encode_new_dataset(dataset, params))

    EncoderHelper.sync_encoder_with_cache(
        cache_params, lambda: {'model_path': self.model_path, 'scaler': self.scaler},
        self, ['model_path', 'scaler'])

    return encoded_dataset
def fit_by_cross_validation(self, encoded_data: EncodedData, number_of_splits: int = 5, label_name: str = None,
                            cores_for_training: int = -1, optimization_metric='balanced_accuracy'):
    self.class_mapping = Util.make_class_mapping(encoded_data.labels[label_name])
    self.feature_names = encoded_data.feature_names
    self.label_name = label_name
    mapped_y = Util.map_to_new_class_values(encoded_data.labels[label_name], self.class_mapping)

    self.model = CacheHandler.memo_by_params(
        self._prepare_caching_params(encoded_data, mapped_y, self.FIT_CV, label_name, number_of_splits),
        lambda: self._fit_by_cross_validation(encoded_data.examples, mapped_y, number_of_splits,
                                              label_name, cores_for_training, optimization_metric))
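# _fit_by_cross_validation above is cached under a key that includes the number
# of splits and the optimization metric. A hedged sketch of the kind of search
# such a method plausibly wraps, using scikit-learn directly; the estimator and
# parameter grid are placeholders, not the method's actual configuration:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

X = np.random.rand(40, 5)
y = np.random.randint(0, 2, 40)

# Cross-validated grid search scored on balanced accuracy, as in the signature.
search = GridSearchCV(LogisticRegression(max_iter=1000), {"C": [0.1, 1.0, 10.0]},
                      cv=5, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X, y)
best_model = search.best_estimator_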
def scale_normalized(self, params, dataset, normalized_examples):
    if params.learn_model:
        self.scaler = StandardScaler(with_mean=self.scale_to_zero_mean)
        examples = CacheHandler.memo_by_params(
            self._prepare_caching_params(dataset, params, step=KmerFrequencyEncoder.STEP_SCALED),
            lambda: FeatureScaler.standard_scale_fit(self.scaler, normalized_examples,
                                                     with_mean=self.scale_to_zero_mean))
    else:
        examples = CacheHandler.memo_by_params(
            self._prepare_caching_params(dataset, params, step=KmerFrequencyEncoder.STEP_SCALED),
            lambda: FeatureScaler.standard_scale(self.scaler, normalized_examples,
                                                 with_mean=self.scale_to_zero_mean))
    return examples
def build_comparison_data(dataset: RepertoireDataset, context: dict, comparison_attributes: list,
                          params: EncoderParams, sequence_batch_size: int):
    current_dataset = EncoderHelper.get_current_dataset(dataset, context)

    comparison_data = CacheHandler.memo_by_params(
        EncoderHelper.build_comparison_params(current_dataset, comparison_attributes),
        lambda: EncoderHelper.build_comparison_data(current_dataset, params,
                                                    comparison_attributes, sequence_batch_size))

    return comparison_data
def sync_encoder_with_cache(cache_params: tuple, encoder_memo_func, encoder, param_names):
    encoder_cache_params = tuple((key, val) for key, val in dict(cache_params).items()
                                 if key != 'learn_model')
    encoder_cache_params = (encoder_cache_params, "encoder")

    encoder_from_cache = CacheHandler.memo_by_params(encoder_cache_params, encoder_memo_func)
    for param in param_names:
        setattr(encoder, param, copy.deepcopy(encoder_from_cache[param]))

    return encoder
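# The helper above recomputes the cache key without 'learn_model' so that a
# training run and an application run resolve to the same cached encoder state.
# A toy demonstration of just the key-stripping step, on an illustrative tuple:
cache_params = (("dataset_identifier", "d1"), ("learn_model", True), ("step", "encoded"))
encoder_cache_params = tuple((key, val) for key, val in dict(cache_params).items()
                             if key != 'learn_model')
# -> (("dataset_identifier", "d1"), ("step", "encoded")), identical for both runs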
def _encode_data(self, dataset, params: EncoderParams) -> EncodedData:
    encoded_example_list, example_ids, encoded_labels = CacheHandler.memo_by_params(
        self._prepare_caching_params(dataset, params, EvennessProfileEncoder.STEP_ENCODED),
        lambda: self._encode_examples(dataset, params))

    vectorized_examples = CacheHandler.memo_by_params(
        self._prepare_caching_params(dataset, params, EvennessProfileEncoder.STEP_VECTORIZED),
        lambda: self._vectorize_encoded(examples=encoded_example_list))

    feature_names = list(range(self.dimension))
    feature_annotations = pd.DataFrame({"feature": feature_names})

    encoded_data = EncodedData(examples=vectorized_examples, labels=encoded_labels,
                               feature_names=feature_names, example_ids=example_ids,
                               feature_annotations=feature_annotations,
                               encoding=EvennessProfileEncoder.__name__)

    return encoded_data
def memo_by_params(self, dataset: RepertoireDataset):
    comparison_data = CacheHandler.memo_by_params(
        self.prepare_caching_params(dataset),
        lambda: self.create_comparison_data(dataset))
    return comparison_data
def compare(self, dataset: RepertoireDataset, comparison_fn, comparison_fn_name):
    return CacheHandler.memo_by_params(
        (("dataset_identifier", dataset.identifier), "pairwise_comparison",
         ("comparison_fn", comparison_fn_name)),
        lambda: self.compare_repertoires(dataset, comparison_fn))
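# Note that the key above stores comparison_fn_name rather than the function
# object itself: a function's repr embeds its memory address, which changes
# between processes and would break key stability. A quick illustration:
def comparison_fn(a, b):
    return a == b

print(repr(comparison_fn))  # e.g. '<function comparison_fn at 0x7f...>', varies per process
print("morisita_horn")      # a plain string name keys the cache deterministically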
def _get_full_sequence_set(self, full_dataset):
    full_sequence_set = CacheHandler.memo_by_params(
        self._build_dataset_params(full_dataset),
        lambda: self.get_sequence_set(full_dataset))
    return full_sequence_set