def _encode_data(self, dataset: RepertoireDataset, params: EncoderParams):
    """Encode `dataset` with the two abundance features of this encoder.

    Exactly one label must be configured in `params.label_config`.

    Args:
        dataset: repertoire dataset to encode.
        params: encoder parameters; `label_config` supplies the label name and
            `encode_labels` decides whether label values are attached.

    Returns:
        A new RepertoireDataset built from the labels and repertoires of
        `dataset` with the freshly created EncodedData attached.

    Raises:
        AssertionError: if the number of configured labels is not one.
    """
    label_names = params.label_config.get_labels_by_name()
    assert len(label_names) == 1, \
        "SequenceAbundanceEncoder: this encoding works only for single label."
    label_name = label_names[0]

    abundances = self._calculate_sequence_abundance(
        dataset, self.comparison_data, label_name, params)

    # Label values are only looked up when the caller asked for them.
    if params.encode_labels:
        label_values = dataset.get_metadata([label_name])
    else:
        label_values = None

    feature_names = [
        SequenceAbundanceEncoder.RELEVANT_SEQUENCE_ABUNDANCE,
        SequenceAbundanceEncoder.TOTAL_SEQUENCE_ABUNDANCE,
    ]

    # self.relevant_sequence_csv_path is read only here, after the abundance
    # calculation has run (NOTE(review): that step appears to be what
    # populates it - confirm against _calculate_sequence_abundance).
    encoded = EncodedData(
        abundances,
        label_values,
        dataset.get_repertoire_ids(),
        feature_names,
        encoding=SequenceAbundanceEncoder.__name__,
        info={'relevant_sequence_path': self.relevant_sequence_csv_path},
    )

    return RepertoireDataset(labels=dataset.labels,
                             encoded_data=encoded,
                             repertoires=dataset.repertoires)
def create_comparison_data(self, dataset: RepertoireDataset) -> ComparisonData:
    """Create a ComparisonData object for `dataset` and fill it.

    The object is constructed from the dataset's repertoire ids together with
    this instance's `matching_columns`, `sequence_batch_size` and `path`, and
    is then populated through `process_dataset(dataset)`.

    Args:
        dataset: repertoire dataset to process.

    Returns:
        The processed ComparisonData instance.
    """
    repertoire_ids = dataset.get_repertoire_ids()
    result = ComparisonData(repertoire_ids,
                            self.matching_columns,
                            self.sequence_batch_size,
                            self.path)
    result.process_dataset(dataset)
    return result
def build_comparison_data(dataset: RepertoireDataset, params: EncoderParams, comparison_attributes, sequence_batch_size):
    """Build and populate a ComparisonData object for `dataset`.

    Args:
        dataset: repertoire dataset to process.
        params: encoder parameters; only `result_path` is used here.
        comparison_attributes: passed through to ComparisonData as its second
            argument.
        sequence_batch_size: passed through to ComparisonData as its third
            argument.

    Returns:
        The ComparisonData instance after `process_dataset(dataset)` has run.
    """
    repertoire_ids = dataset.get_repertoire_ids()
    comparison_data = ComparisonData(repertoire_ids,
                                     comparison_attributes,
                                     sequence_batch_size,
                                     params.result_path)
    comparison_data.process_dataset(dataset)
    return comparison_data
def _encode_sequence_count(self, dataset: RepertoireDataset, comparison_data: ComparisonData, label: str, params: EncoderParams) -> EncodedData:
    """Encode `dataset` as a count matrix restricted to relevant sequences.

    The relevant sequence indices come from
    `SequenceFilterHelper.get_relevant_sequences`; the two paths it returns
    are stored on the instance only if the corresponding attribute is still
    None.

    Args:
        dataset: repertoire dataset to encode.
        comparison_data: comparison data used to build the count matrix and
            to look up feature names.
        label: name of the label passed to the sequence filter and used for
            the metadata lookup.
        params: encoder parameters; `encode_labels` decides whether label
            values are attached.

    Returns:
        EncodedData holding the count matrix, optional labels, repertoire ids
        and the names of the selected items.
    """
    selected_indices, new_indices_path, new_sequences_path = \
        SequenceFilterHelper.get_relevant_sequences(
            dataset, params, comparison_data, label,
            self.p_value_threshold, self.comparison_attributes,
            self.relevant_indices_path)

    # Keep paths that were already set; only fill in the missing ones.
    if self.relevant_indices_path is None:
        self.relevant_indices_path = new_indices_path
    if self.relevant_sequence_csv_path is None:
        self.relevant_sequence_csv_path = new_sequences_path

    counts = self._build_count_matrix(
        comparison_data, dataset.get_repertoire_ids(), selected_indices)
    selected_names = comparison_data.get_item_names()[selected_indices]

    if params.encode_labels:
        label_values = dataset.get_metadata([label])
    else:
        label_values = None

    return EncodedData(
        counts,
        label_values,
        dataset.get_repertoire_ids(),
        selected_names,
        encoding=SequenceCountEncoder.__name__,
        info={'relevant_sequence_path': self.relevant_sequence_csv_path},
    )
def build_distance_matrix(self, dataset: RepertoireDataset, params: EncoderParams, train_repertoire_ids: list):
    """Compute pairwise repertoire distances and select the requested part.

    A new PairwiseRepertoireComparison is stored on `self.comparison` and run
    on the dataset found under `self.context["dataset"]` when such an entry
    exists, otherwise on `dataset` itself.

    Args:
        dataset: dataset whose repertoire ids select the rows of the result.
        params: encoder parameters; only `result_path` is used here.
        train_repertoire_ids: ids selecting the columns of the result.

    Returns:
        The comparison result indexed with `.loc[rows, columns]`
        (presumably a pandas DataFrame - confirm against
        PairwiseRepertoireComparison.compare).
    """
    self.comparison = PairwiseRepertoireComparison(
        self.attributes_to_match,
        self.attributes_to_match,
        params.result_path,
        sequence_batch_size=self.sequence_batch_size,
    )

    # Prefer the dataset supplied through the context, if there is one.
    if self.context is not None and "dataset" in self.context:
        dataset_to_compare = self.context["dataset"]
    else:
        dataset_to_compare = dataset

    all_distances = self.comparison.compare(
        dataset_to_compare, self.distance_fn, self.distance_metric.value)

    row_ids = dataset.get_repertoire_ids()
    selected = all_distances.loc[row_ids, train_repertoire_ids]
    return selected
def _calculate_sequence_abundance(self, dataset: RepertoireDataset, sequence_presence_matrix, matrix_repertoire_ids, label_str: str, params: EncoderParams):
    """Build the abundance matrix for `dataset` from a sequence presence matrix.

    Runs three steps in order: compute label-associated p-values for the
    sequences, derive the indices of the relevant sequences from them, and
    build the abundance matrix for the repertoires of `dataset`.

    Args:
        dataset: repertoire dataset being encoded.
        sequence_presence_matrix: presence matrix handed to the p-value and
            abundance helpers.
        matrix_repertoire_ids: repertoire ids accompanying the presence matrix.
        label_str: name of the label used for the association step.
        params: encoder parameters forwarded to the helpers.

    Returns:
        Whatever `_build_abundance_matrix` returns for the selected sequences.
    """
    p_values = self._find_label_associated_sequence_p_values(
        sequence_presence_matrix, matrix_repertoire_ids, dataset, params,
        label_str)

    relevant_indices = self._get_relevant_sequence_indices(
        params, label_str, p_values)

    return self._build_abundance_matrix(
        sequence_presence_matrix,
        matrix_repertoire_ids,
        dataset.get_repertoire_ids(),
        relevant_indices,
    )
def build_labels(self, dataset: RepertoireDataset, params: EncoderParams) -> dict:
    """Collect label values for `dataset`, ordered like its repertoire ids.

    The metadata is fetched as a data frame containing the
    "repertoire_identifier" column plus every configured label, reordered so
    that row i belongs to the i-th id of `dataset.get_repertoire_ids()`, and
    converted to a dict of lists without the identifier column.

    Args:
        dataset: repertoire dataset providing metadata and repertoire ids.
        params: encoder parameters; `label_config` supplies the label names.

    Returns:
        dict mapping each label name to the list of its values.

    Raises:
        KeyError: if a repertoire id of `dataset` has no row in the metadata.
    """
    columns = ["repertoire_identifier"]
    columns.extend(params.label_config.get_labels_by_name())

    metadata = dataset.get_metadata(columns, return_df=True)
    repertoire_ids = list(dataset.get_repertoire_ids())

    positions = pd.Index(metadata['repertoire_identifier']).get_indexer(repertoire_ids)

    # get_indexer marks ids it cannot find with -1, and .iloc would read -1 as
    # "last row", silently attaching another repertoire's labels. Fail loudly.
    missing = [rep_id for rep_id, position in zip(repertoire_ids, positions)
               if position == -1]
    if missing:
        raise KeyError(
            f"build_labels: repertoire identifiers missing from the metadata: {missing}")

    label_values = metadata.iloc[positions].to_dict("list")
    del label_values["repertoire_identifier"]
    return label_values
def _encode_repertoires(self, dataset: RepertoireDataset, params: EncoderParams):
    """Match every repertoire of `dataset` against the reference receptors.

    Args:
        dataset: repertoire dataset to encode.
        params: encoder parameters; `encode_labels` decides whether label
            values are collected and `label_config` supplies their names.

    Returns:
        A tuple `(encoded_repertoires, labels, repertoire_ids)` where
        `encoded_repertoires` is an integer array with one row per repertoire
        and two columns per reference receptor, `labels` is a dict mapping
        label name to the list of values read from each repertoire's metadata
        (or None when labels are not encoded), and `repertoire_ids` is the
        result of `dataset.get_repertoire_ids()`.
    """
    # Rows = repertoires, Columns = reference chains (two per sequence receptor)
    encoded_repertoires = np.zeros(
        (dataset.get_example_count(), len(self.reference_receptors) * 2),
        dtype=int)

    # The label names do not depend on the repertoire, so fetch them once
    # instead of once per loop iteration.
    if params.encode_labels:
        label_names = list(params.label_config.get_labels_by_name())
        labels = {label_name: [] for label_name in label_names}
    else:
        label_names = []
        labels = None

    for row, repertoire in enumerate(dataset.get_data()):
        encoded_repertoires[row] = self._match_repertoire_to_receptors(repertoire)

        for label_name in label_names:
            labels[label_name].append(repertoire.metadata[label_name])

    return encoded_repertoires, labels, dataset.get_repertoire_ids()
def build_distance_matrix(self, dataset: RepertoireDataset, params: EncoderParams, train_repertoire_ids: list):
    """Compute a distance matrix via `_compute_overlap_with_compairr` and `_morisita_horn`.

    The overlap is computed on the dataset found under
    `self.context["dataset"]` when such an entry exists, otherwise on
    `dataset` itself.

    Args:
        dataset: dataset whose repertoire ids select the rows of the result.
        params: encoder parameters forwarded to the overlap computation.
        train_repertoire_ids: ids selecting the columns of the result.

    Returns:
        The `_morisita_horn` result indexed with `.loc[rows, columns]`
        (presumably a pandas DataFrame - confirm against `_morisita_horn`).
    """
    # Prefer the dataset supplied through the context, if there is one.
    if self.context is not None and "dataset" in self.context:
        dataset_to_compare = self.context["dataset"]
    else:
        dataset_to_compare = dataset

    overlap, sizes, indices = self._compute_overlap_with_compairr(
        dataset_to_compare, params)
    all_distances = self._morisita_horn(overlap, sizes, indices)

    row_ids = dataset.get_repertoire_ids()
    selected = all_distances.loc[row_ids, train_repertoire_ids]
    return selected
def compare_repertoires(self, dataset: RepertoireDataset, comparison_fn):
    """Apply `comparison_fn` to every pair of repertoires in `dataset`.

    The comparison data is obtained through `self.memo_by_params(dataset)` and
    stored on `self.comparison_data`. Only pairs with row <= column are
    computed; each value is mirrored to the transposed cell.

    Args:
        dataset: repertoire dataset whose repertoires are compared.
        comparison_fn: callable taking two repertoire vectors and returning
            the value stored for that pair.

    Returns:
        A pandas DataFrame with the repertoire ids as both index and columns.
    """
    self.comparison_data = self.memo_by_params(dataset)

    n_repertoires = dataset.get_example_count()
    result = np.zeros((n_repertoires, n_repertoires))
    ids = dataset.get_repertoire_ids()

    for row in range(n_repertoires):
        row_vector = self.comparison_data.get_repertoire_vector(ids[row])

        # Start at `row`: the diagonal is included and the lower triangle is
        # filled by mirroring.
        for col in range(row, n_repertoires):
            col_vector = self.comparison_data.get_repertoire_vector(ids[col])
            result[row, col] = result[col, row] = comparison_fn(row_vector, col_vector)

    return pd.DataFrame(result, index=ids, columns=ids)
def _encode_data(self, dataset: RepertoireDataset, params: EncoderParams):
    """Encode `dataset` with the two abundance features of this encoder.

    The first label name returned by `params.label_config` is used.

    Args:
        dataset: repertoire dataset to encode.
        params: encoder parameters; `label_config` supplies the label name and
            `encode_labels` decides whether label values are attached.

    Returns:
        A new RepertoireDataset built from the labels and repertoires of
        `dataset` with the freshly created EncodedData attached.
    """
    label = params.label_config.get_labels_by_name()[0]

    abundances = self._calculate_sequence_abundance(
        dataset,
        self.sequence_presence_matrix,
        self.matrix_repertoire_ids,
        label,
        params,
    )

    # Label values are only looked up when the caller asked for them.
    if params.encode_labels:
        label_values = dataset.get_metadata([label])
    else:
        label_values = None

    feature_names = [
        CompAIRRSequenceAbundanceEncoder.RELEVANT_SEQUENCE_ABUNDANCE,
        CompAIRRSequenceAbundanceEncoder.TOTAL_SEQUENCE_ABUNDANCE,
    ]

    # self.relevant_sequence_csv_path is read only after the abundance
    # calculation has run, as in the original statement order.
    encoded = EncodedData(
        abundances,
        label_values,
        dataset.get_repertoire_ids(),
        feature_names,
        encoding=CompAIRRSequenceAbundanceEncoder.__name__,
        info={'relevant_sequence_path': self.relevant_sequence_csv_path},
    )

    return RepertoireDataset(labels=dataset.labels,
                             encoded_data=encoded,
                             repertoires=dataset.repertoires)
def _calculate_sequence_abundance(self, dataset: RepertoireDataset, comparison_data: ComparisonData, label: str, params: EncoderParams):
    """Build the abundance matrix for `dataset` from the relevant sequences.

    The relevant sequence indices come from
    `SequenceFilterHelper.get_relevant_sequences`; the two paths it returns
    are stored on the instance only if the corresponding attribute is still
    None.

    Args:
        dataset: repertoire dataset being encoded.
        comparison_data: comparison data passed to the sequence filter and to
            the abundance matrix builder.
        label: name of the label passed to the sequence filter.
        params: encoder parameters forwarded to the sequence filter.

    Returns:
        Whatever `_build_abundance_matrix` returns for the selected sequences.
    """
    relevant_indices, new_indices_path, new_csv_path = \
        SequenceFilterHelper.get_relevant_sequences(
            dataset=dataset,
            params=params,
            comparison_data=comparison_data,
            label=label,
            p_value_threshold=self.p_value_threshold,
            comparison_attributes=self.comparison_attributes,
            sequence_indices_path=self.relevant_indices_path,
        )

    # Keep paths that were already set; only fill in the missing ones.
    if self.relevant_indices_path is None:
        self.relevant_indices_path = new_indices_path
    if self.relevant_sequence_csv_path is None:
        self.relevant_sequence_csv_path = new_csv_path

    return self._build_abundance_matrix(
        comparison_data, dataset.get_repertoire_ids(), relevant_indices)