Example #1
    def annotate_features(dataset: RepertoireDataset, criteria: dict, name: str = "annotation"):
        """
        Takes an encoded dataset and adds a new column to the feature_annotations with boolean values showing whether a
        feature matched the specified criteria or not.
        """
        dataset = copy.deepcopy(dataset)

        feature_annotations = dataset.encoded_data.feature_annotations

        matcher = CriteriaMatcher()
        results = matcher.match(criteria=criteria, data=feature_annotations)

        feature_annotations[name] = results

        encoded = EncodedData(
            examples=dataset.encoded_data.examples,
            labels=dataset.encoded_data.labels,
            example_ids=dataset.encoded_data.example_ids,
            feature_names=dataset.encoded_data.feature_names,
            feature_annotations=feature_annotations
        )

        result = RepertoireDataset(
            params=dataset.params,
            encoded_data=encoded,
            repertoires=dataset.get_data(),
            identifier=dataset.identifier,
            metadata_file=dataset.metadata_file
        )

        return result
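A minimal usage sketch. The criteria dict below is hypothetical (the real schema is defined by CriteriaMatcher, which is not shown here), and the method is assumed to live on DataSummarizer, as Examples #9 and #11 suggest:

    # Hypothetical criteria: flag features whose p_value annotation is below 0.05.
    criteria = {"type": "less_than", "column": "p_value", "threshold": 0.05}
    annotated = DataSummarizer.annotate_features(encoded_dataset, criteria, name="significant")
    # The new boolean column sits alongside the existing feature annotations:
    print(annotated.encoded_data.feature_annotations["significant"].head())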
Example #2
    def match(self, dataset: RepertoireDataset, reference_sequences: list, max_distance: int, summary_type: SequenceMatchingSummaryType) -> dict:

        matched = {"repertoires": []}

        for index, repertoire in enumerate(dataset.get_data()):
            matched["repertoires"].append(self.match_repertoire(repertoire, index,
                                                                reference_sequences, max_distance, summary_type))

        return matched
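A sketch of how this matcher might be invoked. The class name SequenceMatcher and the COUNT enum member are assumptions inferred from the method's signature, not shown in the source:

    matcher = SequenceMatcher()  # assumed name for the class defining match()
    result = matcher.match(dataset=dataset,
                           reference_sequences=reference_sequences,
                           max_distance=1,
                           summary_type=SequenceMatchingSummaryType.COUNT)  # assumed member
    # result["repertoires"] holds one per-repertoire summary, in dataset order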
Example #3
    def process_dataset(self, dataset: RepertoireDataset):
        extract_fn = self.build_matching_fn()
        repertoire_count = dataset.get_example_count()
        for index, repertoire in enumerate(dataset.get_data()):
            self.process_repertoire(repertoire, str(repertoire.identifier), extract_fn)
            logging.info("Repertoire {} ({}/{}) processed.".format(repertoire.identifier, index + 1, repertoire_count))
            logging.info(f"Currently, there are {self.item_count} items in the comparison data matrix.")
        self.merge_tmp_batches_to_matrix()
Example #4
    def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
        processed_dataset = dataset.clone()
        repertoires = []
        indices = []
        for index, repertoire in enumerate(dataset.get_data()):
            # keep a repertoire only if it satisfies every limit that was specified
            if ("lower_limit" not in params or len(repertoire.sequences) >= params["lower_limit"]) and \
                    ("upper_limit" not in params or len(repertoire.sequences) <= params["upper_limit"]):
                repertoires.append(dataset.repertoires[index])
                indices.append(index)
        processed_dataset.repertoires = repertoires
        processed_dataset.metadata_file = ClonesPerRepertoireFilter.build_new_metadata(dataset, indices, params["result_path"])

        Filter.check_dataset_not_empty(processed_dataset, "ClonesPerRepertoireFilter")

        return processed_dataset
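A usage sketch with illustrative bounds; `dataset` and `path` stand in for any repertoire dataset and writable directory:

    # Keep only repertoires with between 100 and 10000 clones (illustrative values).
    filtered = ClonesPerRepertoireFilter.process(dataset, {
        "lower_limit": 100,
        "upper_limit": 10000,
        "result_path": path + "filtered/"
    })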
Example #5
    def _encode_repertoires(self, dataset: RepertoireDataset, params):
        # Rows = repertoires, Columns = reference sequences
        encoded_repertoires = np.zeros((dataset.get_example_count(),
                                        len(self.reference_sequences)),
                                       dtype=int)

        labels = {label: [] for label in params.label_config.get_labels_by_name()} if params.encode_labels else None

        for i, repertoire in enumerate(dataset.get_data()):
            encoded_repertoires[i] = self._match_repertoire_to_reference(repertoire)

            if labels is not None:
                for label in params.label_config.get_labels_by_name():
                    labels[label].append(repertoire.metadata[label])

        return encoded_repertoires, labels
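The return value unpacks into the matrix and the optional label dict; a sketch, with `encoder` standing in for the enclosing encoder instance:

    examples, labels = encoder._encode_repertoires(dataset, params)
    # examples has shape (n_repertoires, n_reference_sequences)
    # labels maps each label name to a per-repertoire list of metadata values,
    # or is None when params.encode_labels is False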
Example #6
    def test_process(self):

        path = EnvironmentSettings.root_path + "test/tmp/chain_filter/"
        PathBuilder.build(path)

        rep1 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("AAA", metadata=SequenceMetadata(chain="A"), identifier="1")],
            path=path, metadata={})
        rep2 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("AAC", metadata=SequenceMetadata(chain="B"), identifier="2")],
            path=path, metadata={})

        metadata = pd.DataFrame({"CD": [1, 0]})
        metadata.to_csv(path + "metadata.csv")

        dataset = RepertoireDataset(repertoires=[rep1, rep2],
                                    metadata_file=path + "metadata.csv")

        dataset2 = ChainRepertoireFilter.process(
            dataset, {
                "keep_chain": "ALPHA",
                "result_path": path + "results/"
            })

        self.assertEqual(1, len(dataset2.get_data()))
        self.assertEqual(2, len(dataset.get_data()))

        metadata_dict = dataset2.get_metadata(["CD"])
        self.assertEqual(1, len(metadata_dict["CD"]))
        self.assertEqual(1, metadata_dict["CD"][0])

        for rep in dataset2.get_data():
            self.assertEqual("AAA", rep.sequences[0].get_sequence())

        self.assertRaises(AssertionError, ChainRepertoireFilter.process,
                          dataset, {
                              "keep_chain": "GAMMA",
                              "result_path": path + "results/"
                          })

        shutil.rmtree(path)
Example #7
    def create_model(self, dataset: RepertoireDataset, k: int,
                     vector_size: int, batch_size: int, model_path: str):
        model = Word2Vec(size=vector_size, min_count=1,
                         window=5)  # empty model; gensim < 4.0 API (4.x renamed size to vector_size)
        all_kmers = KmerHelper.create_all_kmers(
            k=k, alphabet=EnvironmentSettings.get_sequence_alphabet())
        all_kmers = [[kmer] for kmer in all_kmers]
        model.build_vocab(all_kmers)

        for repertoire in dataset.get_data(batch_size=batch_size):
            sentences = KmerHelper.create_sentences_from_repertoire(
                repertoire=repertoire, k=k)
            model.train(sentences=sentences,
                        total_words=len(all_kmers),
                        epochs=15)

        model.save(model_path)

        return model
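An illustrative call, with `encoder` standing in for the object defining create_model and `path` for a writable directory:

    # Illustrative parameters: 3-mers embedded into 16-dimensional vectors.
    model = encoder.create_model(dataset=dataset, k=3, vector_size=16,
                                 batch_size=1000, model_path=path + "w2v.model")
    vector = model.wv["AAA"]  # look up the learned embedding of one k-mer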
Example #8
    def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
        processed_dataset = dataset.clone()
        PathBuilder.build(params["result_path"])
        repertoires = []
        indices = []
        for index, repertoire in enumerate(dataset.get_data()):
            if all(sequence.metadata.chain == Chain.get_chain(
                    params["keep_chain"])
                   for sequence in repertoire.sequences):
                repertoires.append(repertoire)
                indices.append(index)

        processed_dataset.repertoires = repertoires
        processed_dataset.metadata_file = ChainRepertoireFilter.build_new_metadata(
            processed_dataset, indices, params["result_path"])

        Filter.check_dataset_not_empty(processed_dataset,
                                       "ChainRepertoireFilter")

        return processed_dataset
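The test in Example #6 above exercises this filter end to end, including the AssertionError raised when no repertoire matches the requested chain.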
Example #9
    def group_features(dataset: RepertoireDataset, group_columns, group_summarization_type: GroupSummarizationType):
        """
        Takes an encoded dataset and groups together features by either adding or averaging the feature values for
        all features with the same values (or combination of values) for the specified group_columns.
        """
        dataset = copy.deepcopy(dataset)

        if group_summarization_type == GroupSummarizationType.NONZERO:
            dataset.encoded_data.examples.data[:] = 1

        feature_annotations = dataset.encoded_data.feature_annotations

        concatenated = DataSummarizer.concatenate_columns(feature_annotations, group_columns)

        group_mask = DataSummarizer.create_group_mask(concatenated.values, group_summarization_type)
        groups = group_mask["groups"]
        mask = group_mask["mask"]

        repertoires = dataset.encoded_data.examples.dot(mask)
        feature_annotations = DataSummarizer.split_values(groups, group_columns)

        encoded = EncodedData(
            examples=repertoires,
            labels=dataset.encoded_data.labels,
            example_ids=dataset.encoded_data.example_ids,
            feature_names=groups,
            feature_annotations=feature_annotations
        )

        result = RepertoireDataset(
            params=dataset.params,
            encoded_data=encoded,
            repertoires=dataset.get_data(),
            identifier=dataset.identifier,
            metadata_file=dataset.metadata_file
        )

        return result
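A usage sketch; `encoded_dataset` is a stand-in, the `v_gene` column is hypothetical, and the SUM member is assumed from the docstring's "adding or averaging" alongside the NONZERO member used above:

    # Collapse all features that share a V gene into a single summed feature.
    grouped = DataSummarizer.group_features(encoded_dataset,
                                            group_columns=["v_gene"],
                                            group_summarization_type=GroupSummarizationType.SUM)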
Example #10
    def _encode_repertoires(self, dataset: RepertoireDataset,
                            params: EncoderParams):
        # Rows = repertoires, Columns = regex matches (one chain per column)
        encoded_repertoires = np.zeros(
            (dataset.get_example_count(), self.feature_count), dtype=int)
        labels = {
            label: []
            for label in params.label_config.get_labels_by_name()
        } if params.encode_labels else None

        n_repertoires = dataset.get_example_count()

        for i, repertoire in enumerate(dataset.get_data()):
            print(
                f"{datetime.datetime.now()}: Encoding repertoire {i+1}/{n_repertoires}"
            )
            encoded_repertoires[i] = self._match_repertoire_to_regexes(
                repertoire)

            if labels is not None:
                for label in params.label_config.get_labels_by_name():
                    labels[label].append(repertoire.metadata[label])

        return encoded_repertoires, labels
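As with the reference-sequence encoder in Example #5, the result unpacks into a matrix and an optional label dict (`encoder` is a stand-in for the enclosing instance):

    examples, labels = encoder._encode_repertoires(dataset, params)
    # examples has shape (n_repertoires, encoder.feature_count), one column per regex/chain
    # labels is None when params.encode_labels is False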
Example #11
    def group_repertoires(dataset: RepertoireDataset, group_columns, group_summarization_type: GroupSummarizationType):
        """
        Takes an encoded dataset and groups together repertoires by either adding or averaging the feature values for
        all repertoires with the same values (or combination of values) for the specified group_columns.
        """
        dataset = copy.deepcopy(dataset)

        repertoire_annotations = pd.DataFrame(dataset.encoded_data.labels)

        concatenated = DataSummarizer.concatenate_columns(repertoire_annotations, group_columns)

        group_mask = DataSummarizer.create_group_mask(concatenated.values, group_summarization_type)
        groups = group_mask["groups"]
        mask = group_mask["mask"]

        repertoires = mask.T.dot(dataset.encoded_data.examples)
        labels = DataSummarizer.split_values(groups, group_columns)
        metadata_file = DataSummarizer.build_metadata_from_labels(dataset.metadata_file, labels)

        encoded = EncodedData(
            examples=repertoires,
            labels=labels.to_dict("list"),
            example_ids=groups,
            feature_names=dataset.encoded_data.feature_names,
            feature_annotations=dataset.encoded_data.feature_annotations
        )

        result = RepertoireDataset(
            params=dataset.params,
            encoded_data=encoded,
            repertoires=dataset.get_data(),
            identifier=dataset.identifier,
            metadata_file=metadata_file
        )

        return result
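A usage sketch mirroring group_features; the `patient_id` label and the AVERAGE member are assumptions, again following the docstring's "adding or averaging":

    # Merge all repertoires from the same patient by averaging their feature values.
    grouped = DataSummarizer.group_repertoires(encoded_dataset,
                                               group_columns=["patient_id"],
                                               group_summarization_type=GroupSummarizationType.AVERAGE)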