Example #1
    def annotate_features(dataset: RepertoireDataset, criteria: dict, name: str = "annotation"):
        """
        Takes an encoded dataset and adds a new column to the feature_annotations with boolean values showing whether a
        feature matched the specified criteria or not.
        """
        dataset = copy.deepcopy(dataset)

        feature_annotations = dataset.encoded_data.feature_annotations

        matcher = CriteriaMatcher()
        results = matcher.match(criteria=criteria, data=feature_annotations)

        feature_annotations[name] = results

        encoded = EncodedData(
            examples=dataset.encoded_data.examples,
            labels=dataset.encoded_data.labels,
            example_ids=dataset.encoded_data.example_ids,
            feature_names=dataset.encoded_data.feature_names,
            feature_annotations=feature_annotations
        )

        result = RepertoireDataset(
            params=dataset.params,
            encoded_data=encoded,
            repertoires=dataset.get_data(),
            identifier=dataset.identifier,
            metadata_file=dataset.metadata_file
        )

        return result
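A minimal usage sketch for the helper above. The criteria dict is hypothetical; the exact schema is defined by CriteriaMatcher and is not shown in this example:

    # Hypothetical call: flag every feature whose "p_value" annotation is below 0.05.
    # The criteria keys below are illustrative only; consult CriteriaMatcher for the real schema.
    annotated = annotate_features(encoded_dataset, criteria={"column": "p_value", "operation": "less_than", "threshold": 0.05}, name="significant")
    print(annotated.encoded_data.feature_annotations["significant"].value_counts())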
Example #2
    def test_run(self):
        method = LogisticRegression()
        dataset = RepertoireDataset()
        dataset.encoded_data = EncodedData(examples=np.array([[1, 2, 3],
                                                              [2, 3, 4],
                                                              [1, 2, 3],
                                                              [2, 3, 4],
                                                              [1, 2, 3],
                                                              [2, 3, 4]]),
                                           labels={
                                               "l1": [1, 0, 1, 0, 1, 0],
                                               "l2": [0, 1, 0, 1, 0, 1]
                                           },
                                           feature_names=["f1", "f2", "f3"])

        path = EnvironmentSettings.root_path + "test/tmp/mlmethodtrainer/"

        method = MLMethodTrainer.run(
            MLMethodTrainerParams(
                result_path=path,
                dataset=dataset,
                label="l1",
                method=method,
                model_selection_n_folds=2,
                model_selection_cv=True,
                cores_for_training=1,
                train_predictions_path=f"{path}predictions.csv",
                ml_details_path=f"{path}details.yaml",
                optimization_metric="balanced_accuracy"))

        method.predict(EncodedData(np.array([1, 2, 3]).reshape(1, -1)), "l1")
        self.assertTrue(os.path.isfile(f"{path}predictions.csv"))
        self.assertTrue(os.path.isfile(f"{path}details.yaml"))

        shutil.rmtree(path)
Example #3
    def process_dataset(self, dataset: RepertoireDataset):
        extract_fn = self.build_matching_fn()
        repertoire_count = dataset.get_example_count()
        for index, repertoire in enumerate(dataset.get_data()):
            self.process_repertoire(repertoire, str(repertoire.identifier), extract_fn)
            logging.info("Repertoire {} ({}/{}) processed.".format(repertoire.identifier, index + 1, repertoire_count))
            logging.info(f"Currently, there are {self.item_count} items in the comparison data matrix.")
        self.merge_tmp_batches_to_matrix()
Example #4
    def test_get_metadata_fields(self):

        path = EnvironmentSettings.tmp_test_path + "repertoire_dataset/"
        PathBuilder.build(path)

        repertoires, metadata = RepertoireBuilder.build([["AA"], ["BB"]], path, {"l1": [1, 2], "hla": ["A", "B"]}, subject_ids=["d1", "d2"])
        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)

        self.assertTrue("l1" in dataset.get_metadata_fields())
        self.assertTrue("hla" in dataset.get_metadata_fields())
        self.assertTrue("subject_id" in dataset.get_metadata_fields())

        shutil.rmtree(path)
Example #5
    def create_datasets(self, path: str):
        repertoires, metadata = RepertoireBuilder.build(
            [["A", "B"], ["B", "C"], ["D"], ["E", "F"]], path, {
                "l1": [1, 0, 1, 0],
                "l2": [2, 3, 2, 3]
            })

        main_dataset = RepertoireDataset(repertoires=repertoires,
                                         metadata_file=metadata)
        sub_dataset = main_dataset.make_subset([0, 1],
                                               path=path,
                                               dataset_type="subset")
        return main_dataset, sub_dataset
Example #6
    def build_labels(self, dataset: RepertoireDataset,
                     params: EncoderParams) -> dict:

        lbl = ["repertoire_identifier"]
        lbl.extend(params.label_config.get_labels_by_name())

        tmp_labels = dataset.get_metadata(lbl, return_df=True)
        tmp_labels = tmp_labels.iloc[pd.Index(
            tmp_labels['repertoire_identifier']).get_indexer(
                dataset.get_repertoire_ids())]
        tmp_labels = tmp_labels.to_dict("list")
        del tmp_labels["repertoire_identifier"]

        return tmp_labels
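The iloc/get_indexer pair above reorders the metadata rows so they match the order of the repertoire ids. A self-contained sketch of the same idiom with toy data:

    # Toy data (not from immuneML): metadata rows arrive in arbitrary order
    import pandas as pd

    df = pd.DataFrame({"repertoire_identifier": ["r2", "r1"], "l1": [0, 1]})
    # positions of the wanted ids within the current row order -> [1, 0]
    order = pd.Index(df["repertoire_identifier"]).get_indexer(["r1", "r2"])
    print(df.iloc[order].to_dict("list"))  # {'repertoire_identifier': ['r1', 'r2'], 'l1': [1, 0]}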
Example #7
    def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
        processed_dataset = dataset.clone()
        repertoires = []
        indices = []
        for index, repertoire in enumerate(dataset.get_data()):
            if "lower_limit" in params.keys() and len(repertoire.sequences) >= params["lower_limit"] or \
                 "upper_limit" in params.keys() and len(repertoire.sequences) <= params["upper_limit"]:
                repertoires.append(dataset.repertoires[index])
                indices.append(index)
        processed_dataset.repertoires = repertoires
        processed_dataset.metadata_file = ClonesPerRepertoireFilter.build_new_metadata(dataset, indices, params["result_path"])

        Filter.check_dataset_not_empty(processed_dataset, "ClonesPerRepertoireFilter")

        return processed_dataset
Example #8
    def _encode_data(self, dataset: RepertoireDataset, params: EncoderParams):
        labels = params.label_config.get_labels_by_name()

        assert len(labels) == 1, \
            "SequenceAbundanceEncoder: this encoding works only for single label."

        examples = self._calculate_sequence_abundance(dataset, self.comparison_data, labels[0], params)

        encoded_data = EncodedData(examples, dataset.get_metadata([labels[0]]) if params.encode_labels else None, dataset.get_repertoire_ids(),
                                   [SequenceAbundanceEncoder.RELEVANT_SEQUENCE_ABUNDANCE, SequenceAbundanceEncoder.TOTAL_SEQUENCE_ABUNDANCE],
                                   encoding=SequenceAbundanceEncoder.__name__, info={'relevant_sequence_path': self.relevant_sequence_csv_path})

        encoded_dataset = RepertoireDataset(params=dataset.params, encoded_data=encoded_data, repertoires=dataset.repertoires)

        return encoded_dataset
Example #9
    def _encode_repertoires(self, dataset: RepertoireDataset, params):
        # Rows = repertoires, Columns = reference sequences
        encoded_repertoires = np.zeros((dataset.get_example_count(),
                                        len(self.reference_sequences)),
                                       dtype=int)

        labels = {label: [] for label in params.label_config.get_labels_by_name()} if params.encode_labels else None

        for i, repertoire in enumerate(dataset.get_data()):
            encoded_repertoires[i] = self._match_repertoire_to_reference(repertoire)

            for label in params.label_config.get_labels_by_name():
                labels[label].append(repertoire.metadata[label])

        return encoded_repertoires, labels
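The fill loop above writes one row per repertoire into a preallocated matrix. The same pattern in miniature, with a toy matcher standing in for _match_repertoire_to_reference:

    # Toy version: one row per repertoire, one column per reference sequence
    import numpy as np

    references = ["AAA", "CCC"]
    repertoires = [["AAA", "AAA"], ["CCC"]]
    matrix = np.zeros((len(repertoires), len(references)), dtype=int)
    for i, rep in enumerate(repertoires):
        matrix[i] = [rep.count(ref) for ref in references]  # stand-in for the real matcher
    print(matrix)  # [[2 0] [0 1]]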
Example #10
    def test_repertoire_export(self):
        path = EnvironmentSettings.tmp_test_path + "airr_exporter_repertoire/"
        PathBuilder.build(path)

        repertoire, metadata_path = self.create_dummy_repertoire(path)
        dataset = RepertoireDataset(repertoires=[repertoire],
                                    metadata_file=metadata_path)

        path_exported = f"{path}exported/"
        AIRRExporter.export(dataset, path_exported)

        resulting_data = pd.read_csv(
            path_exported + f"repertoires/{repertoire.identifier}.tsv",
            sep="\t")

        self.assertListEqual(list(resulting_data["sequence_id"]),
                             ["receptor_1", "receptor_2"])
        self.assertListEqual(list(resulting_data["cdr3"]),
                             ["GCTGCTGCT", "GGTGGTGGT"])
        self.assertListEqual(list(resulting_data["cdr3_aa"]), ["AAA", "GGG"])
        self.assertListEqual(list(resulting_data["v_call"]),
                             ["TRBV1", "TRAV2*01"])
        self.assertListEqual(list(resulting_data["j_call"]),
                             ["TRBJ1", "TRAJ2"])
        self.assertListEqual(list(resulting_data["d_call"]),
                             ["TRBD1", "TRAD2"])
        self.assertListEqual(list(resulting_data["locus"]), ["TRB", "TRA"])
        self.assertListEqual(list(resulting_data["duplicate_count"]), [5, 15])
        self.assertListEqual(list(resulting_data["custom_test"]),
                             ["cust1", "cust2"])
        self.assertListEqual(list(resulting_data["productive"]), ['T', nan])
        self.assertListEqual(list(resulting_data["stop_codon"]), ['F', nan])

        shutil.rmtree(path)
Example #11
    def test_encode(self):
        path = EnvironmentSettings.tmp_test_path + "abundance_encoder/"
        PathBuilder.build(path)

        repertoires, metadata = RepertoireBuilder.build([["GGG", "III", "LLL", "MMM"],
                                                         ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
                                                         ["CCC", "FFF", "MMM"],
                                                         ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"]],
                                                        labels={"l1": [True, True, False, False]}, path=path)

        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, identifier="1")

        encoder = SequenceAbundanceEncoder.build_object(dataset, **{
            "comparison_attributes": ["sequence_aas"],
            "p_value_threshold": 0.4, "sequence_batch_size": 4, "repertoire_batch_size": 8
        })

        label_config = LabelConfiguration([Label("l1", [True, False], positive_class=True)])

        encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config))

        self.assertTrue(np.array_equal(np.array([[1, 4], [1, 6], [0, 3], [0, 6]]), encoded_dataset.encoded_data.examples))

        encoder.p_value_threshold = 0.05

        encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config))

        self.assertTrue(np.array_equal(np.array([[0, 4], [0, 6], [0, 3], [0, 6]]), encoded_dataset.encoded_data.examples))

        shutil.rmtree(path)
Example #12
    def _create_dummy_data(self, path, dataset_type):
        PathBuilder.build(path)
        dataset = None

        test_repertoire = Repertoire.build(
            sequence_aas=[
                "DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"
            ],
            v_genes=["V1-1" for i in range(5)],
            j_genes=["J1-1" for i in range(5)],
            chains=[
                Chain.ALPHA, Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.BETA
            ],
            custom_lists={
                "custom_1": [f"CUST-{i}" for i in range(5)],
                "custom_2": ["CUST-A" for i in range(3)] + ["CUST-B" for i in range(2)]
            },
            cell_ids=[1, 1, 1, 2, 2],
            path=path)

        if dataset_type == "receptor":
            receptordataset_filename = f"{path}/receptors.pkl"
            with open(receptordataset_filename, "wb") as file:
                pickle.dump(test_repertoire.receptors, file)

            dataset = ReceptorDataset(filenames=[receptordataset_filename],
                                      identifier="receptor_dataset")

        elif dataset_type == "repertoire":
            test_repertoire.identifier = "repertoire_dataset"
            dataset = RepertoireDataset(repertoires=[test_repertoire])

        return dataset
Example #13
    def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
        rep_map = {}
        repertoires = []
        indices_to_keep = []

        processed_dataset = dataset.clone()
        PathBuilder.build(params["result_path"])

        for index, repertoire in enumerate(processed_dataset.get_data()):
            if repertoire.metadata["subject_id"] in rep_map.keys():
                sequences = np.append(
                    repertoire.sequences,
                    rep_map[repertoire.metadata["subject_id"]].sequences)
                del rep_map[repertoire.metadata["subject_id"]]
                repertoires.append(
                    SubjectRepertoireCollector.store_repertoire(
                        params["result_path"], repertoire, sequences))
            else:
                rep_map[repertoire.metadata["subject_id"]] = repertoire
                indices_to_keep.append(index)

        for key in rep_map.keys():
            repertoires.append(
                SubjectRepertoireCollector.store_repertoire(
                    params["result_path"], rep_map[key],
                    rep_map[key].sequences))

        processed_dataset.repertoires = repertoires
        processed_dataset.metadata_file = SubjectRepertoireCollector.build_new_metadata(
            dataset, indices_to_keep, params["result_path"])

        return processed_dataset
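The control flow above merges repertoires pairwise by subject_id and passes unpaired ones through unchanged. A stripped-down sketch of the same bookkeeping, with plain lists instead of immuneML objects:

    # Toy sketch: merge items pairwise by key, keep unpaired items as-is
    items = [("s1", ["A"]), ("s2", ["B"]), ("s1", ["C"])]
    pending, merged = {}, []
    for key, seqs in items:
        if key in pending:
            merged.append(pending.pop(key) + seqs)  # second occurrence: merge with the first
        else:
            pending[key] = seqs
    merged.extend(pending.values())  # keys seen only once pass through unchanged
    print(merged)  # [['A', 'C'], ['B']]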
Example #14
    def test_create_model(self):
        test_path = EnvironmentSettings.root_path + "test/tmp/w2v_test_tmp/"

        PathBuilder.build(test_path)

        sequence1 = ReceptorSequence("CASSVFA")
        sequence2 = ReceptorSequence("CASSCCC")

        metadata1 = {"T1D": "T1D", "subject_id": "1"}
        rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2],
                                                      test_path, metadata1)

        metadata2 = {"T1D": "CTL", "subject_id": "2"}
        rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path,
                                                      metadata2)

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        model_creator = KmerPairModelCreator()
        model = model_creator.create_model(dataset=dataset,
                                           k=2,
                                           vector_size=16,
                                           batch_size=1,
                                           model_path=test_path +
                                           "model.model")

        self.assertTrue(isinstance(model, Word2Vec))
        self.assertTrue("CA" in model.wv.vocab)
        self.assertEqual(400, len(model.wv.vocab))

        shutil.rmtree(test_path)
Example #15
    def create_dummy_dataset(self, path):
        repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]],
                                                        path,
                                                        labels={
                                                            "label1":
                                                            ["val1", "val2"],
                                                            "label2":
                                                            ["val1", "val2"]
                                                        })

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata)
        dataset.name = "my_dataset"
        PickleExporter.export(dataset, path)

        return f"{dataset.name}.iml_dataset"
Example #16
    def import_repertoire_dataset(import_class, params: DatasetImportParams, dataset_name: str) -> RepertoireDataset:
        """
        Creates a dataset from the metadata file and a list of repertoire files, and exports the dataset as a pickle file

        Arguments:
            import_class: class to use for import
            params: instance of DatasetImportParams class which includes information on path, columns, result path etc.
            dataset_name: user-defined name of the dataset

        Returns:
            RepertoireDataset object that was created
        """
        metadata = pd.read_csv(params.metadata_file, sep=",")

        ParameterValidator.assert_keys_present(metadata.columns.tolist(), ["filename"], ImportHelper.__name__,
                                               f'{dataset_name}: params: metadata_file')

        PathBuilder.build(params.result_path + "repertoires/")

        arguments = [(import_class, row, params) for index, row in metadata.iterrows()]
        with Pool(params.number_of_processes) as pool:
            repertoires = pool.starmap(ImportHelper.load_repertoire_as_object, arguments)

        new_metadata_file = ImportHelper.make_new_metadata_file(repertoires, metadata, params.result_path, dataset_name)

        potential_labels = list(set(metadata.columns.tolist()) - {"filename"})
        dataset = RepertoireDataset(params={key: list(set(metadata[key].values.tolist())) for key in potential_labels},
                                    repertoires=repertoires, metadata_file=new_metadata_file, name=dataset_name)

        PickleExporter.export(dataset, params.result_path)

        return dataset
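A hypothetical call, assuming a DatasetImportParams instance that carries the fields accessed above (metadata_file, result_path, number_of_processes) and an import class such as AIRRImport; the exact construction depends on the concrete data format:

    # Illustrative only: field names mirror the attribute accesses in the function above
    params = DatasetImportParams(metadata_file="metadata.csv", result_path="result/", number_of_processes=4)
    dataset = import_repertoire_dataset(AIRRImport, params, dataset_name="my_dataset")
    print(dataset.get_example_count())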
Example #17
    def test_process(self):
        path = EnvironmentSettings.root_path + "test/tmp/clones_per_repertoire_filter/"
        PathBuilder.build(path)
        dataset = RepertoireDataset(repertoires=RepertoireBuilder.build(
            [["ACF", "ACF", "ACF"], ["ACF", "ACF"],
             ["ACF", "ACF", "ACF", "ACF"]], path)[0])

        dataset1 = ClonesPerRepertoireFilter.process(dataset, {
            "lower_limit": 3,
            "result_path": path
        })
        self.assertEqual(2, dataset1.get_example_count())

        dataset2 = ClonesPerRepertoireFilter.process(dataset, {
            "upper_limit": 2,
            "result_path": path
        })
        self.assertEqual(1, dataset2.get_example_count())

        self.assertRaises(AssertionError, ClonesPerRepertoireFilter.process,
                          dataset, {
                              "lower_limit": 10,
                              "result_path": path
                          })

        shutil.rmtree(path)
Example #18
    def _create_dummy_encoded_data(self, path):
        n_subjects = 8
        n_features = 300
        n_timepoints = 2
        n_examples = n_subjects * n_timepoints
        diseased_subjects = range(0, 4)

        subjects = [subject for subject in range(n_subjects) for timepoint in range(n_timepoints)]
        timepoints = [timepoint for subject in range(n_subjects) for timepoint in range(n_timepoints)]
        disease_statuses = [subject in diseased_subjects for subject in subjects]

        kmers = [''.join(random.choices(string.ascii_uppercase, k=3)) for i in range(n_features)]

        encoded_data = {
            'examples': sparse.csr_matrix(
                np.random.normal(50, 10, n_examples * n_features).reshape((n_examples, n_features))),
            'example_ids': [i for i in range(n_examples)],
            'labels': {
                "subject_id": np.array([f"subject {i}" for i in subjects]),
                "disease_status": np.array([f"disease: {i}" for i in disease_statuses]),
                "timepoint": np.array([f"timepoint {i}" for i in timepoints])
            },
            'feature_names': kmers,
            'feature_annotations': pd.DataFrame({
                "sequence": kmers
            }),
            'encoding': "random"
        }

        dataset = RepertoireDataset(encoded_data=EncodedData(**encoded_data))

        return dataset
Example #19
    def annotate_repertoires(dataset: RepertoireDataset, criteria: dict, name: str = "annotation"):
        """
        Takes an encoded dataset and adds a new label to the encoded_dataset with boolean values showing whether a
        repertoire matched the specified criteria or not.
        """
        dataset = copy.deepcopy(dataset)

        data = pd.DataFrame(dataset.encoded_data.labels)

        matcher = CriteriaMatcher()
        results = matcher.match(criteria=criteria, data=data)

        labels = dataset.encoded_data.labels
        labels[name] = np.array(results)

        encoded = EncodedData(
            examples=dataset.encoded_data.examples,
            labels=labels,
            example_ids=dataset.encoded_data.example_ids,
            feature_names=dataset.encoded_data.feature_names,
            feature_annotations=dataset.encoded_data.feature_annotations
        )

        result = RepertoireDataset(
            params=dataset.params,
            encoded_data=encoded,
            repertoires=dataset.repertoires,
            identifier=dataset.identifier,
            metadata_file=dataset.metadata_file
        )

        return result
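Usage mirrors annotate_features from Example #1, except the criteria are evaluated against the label table. The criteria dict is again hypothetical:

    # Hypothetical call: mark repertoires whose "age" label exceeds 50
    annotated = annotate_repertoires(encoded_dataset, criteria={"column": "age", "operation": "greater_than", "threshold": 50}, name="older_than_50")
    print(annotated.encoded_data.labels["older_than_50"])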
Example #20
    def filter_features(dataset: RepertoireDataset, criteria: dict):
        """
        Takes an encoded dataset and filters features based on a given set of criteria. Only features meeting
        these criteria will be retained in the new dataset object.
        """
        dataset = copy.deepcopy(dataset)

        feature_annotations = dataset.encoded_data.feature_annotations

        matcher = CriteriaMatcher()
        results = matcher.match(criteria=criteria, data=feature_annotations)
        indices = np.where(np.array(results))[0]

        feature_annotations = feature_annotations.iloc[indices, :]
        examples = dataset.encoded_data.examples[:, indices]
        repertoires = dataset.repertoires
        feature_names = [dataset.encoded_data.feature_names[i] for i in indices]

        encoded = EncodedData(
            examples=examples,
            labels=dataset.encoded_data.labels,
            example_ids=dataset.encoded_data.example_ids,
            feature_names=feature_names,
            feature_annotations=feature_annotations
        )

        result = RepertoireDataset(
            params=dataset.params,
            encoded_data=encoded,
            repertoires=repertoires,
            identifier=dataset.identifier,
            metadata_file=dataset.metadata_file
        )

        return result
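A sketch of the expected effect, with a hypothetical criteria dict; the point is that the example matrix columns, feature names, and feature annotations shrink together:

    # Hypothetical call: keep only the features previously flagged as significant
    filtered = filter_features(encoded_dataset, criteria={"column": "significant", "operation": "equals", "value": True})
    print(filtered.encoded_data.examples.shape, len(filtered.encoded_data.feature_names))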
Example #21
    def test_process(self):

        path = EnvironmentSettings.root_path + "test/tmp/chain_filter/"
        PathBuilder.build(path)

        rep1 = Repertoire.build_from_sequence_objects([
            ReceptorSequence(
                "AAA", metadata=SequenceMetadata(chain="A"), identifier="1")
        ],
                                                      path=path,
                                                      metadata={})
        rep2 = Repertoire.build_from_sequence_objects([
            ReceptorSequence(
                "AAC", metadata=SequenceMetadata(chain="B"), identifier="2")
        ],
                                                      path=path,
                                                      metadata={})

        metadata = pd.DataFrame({"CD": [1, 0]})
        metadata.to_csv(path + "metadata.csv")

        dataset = RepertoireDataset(repertoires=[rep1, rep2],
                                    metadata_file=path + "metadata.csv")

        dataset2 = ChainRepertoireFilter.process(
            dataset, {
                "keep_chain": "ALPHA",
                "result_path": path + "results/"
            })

        self.assertEqual(1, len(dataset2.get_data()))
        self.assertEqual(2, len(dataset.get_data()))

        metadata_dict = dataset2.get_metadata(["CD"])
        self.assertEqual(1, len(metadata_dict["CD"]))
        self.assertEqual(1, metadata_dict["CD"][0])

        for rep in dataset2.get_data():
            self.assertEqual("AAA", rep.sequences[0].get_sequence())

        self.assertRaises(AssertionError, ChainRepertoireFilter.process,
                          dataset, {
                              "keep_chain": "GAMMA",
                              "result_path": path + "results/"
                          })

        shutil.rmtree(path)
Example #22
    def _encode_sequence_count(self, dataset: RepertoireDataset, comparison_data: ComparisonData, label: str, params: EncoderParams) -> EncodedData:
        sequence_p_values_indices, indices_path, relevant_sequences_path = SequenceFilterHelper.get_relevant_sequences(dataset, params, comparison_data, label, self.p_value_threshold,
                                                                                self.comparison_attributes, self.relevant_indices_path)
        if self.relevant_indices_path is None:
            self.relevant_indices_path = indices_path
        if self.relevant_sequence_csv_path is None:
            self.relevant_sequence_csv_path = relevant_sequences_path

        count_matrix = self._build_count_matrix(comparison_data, dataset.get_repertoire_ids(), sequence_p_values_indices)
        feature_names = comparison_data.get_item_names()[sequence_p_values_indices]

        encoded_data = EncodedData(count_matrix, dataset.get_metadata([label]) if params.encode_labels else None,
                                   dataset.get_repertoire_ids(),
                                   feature_names,
                                   encoding=SequenceCountEncoder.__name__, info={'relevant_sequence_path': self.relevant_sequence_csv_path})

        return encoded_data
Example #23
    def _encode_examples(self, dataset: RepertoireDataset, params: EncoderParams) -> Tuple[list, set, dict]:

        keys = set()
        example_count = dataset.get_example_count()

        arguments = [(repertoire, index, example_count) for index, repertoire in enumerate(dataset.repertoires)]

        with Pool(params.pool_size) as pool:
            chunksize = math.floor(dataset.get_example_count() / params.pool_size) + 1
            examples = pool.starmap(self._process_repertoire_cached, arguments, chunksize=chunksize)

        for example in examples:
            keys.update(list(example.keys()))

        labels = dataset.get_metadata(params.label_config.get_labels_by_name()) if params.encode_labels else None

        return examples, keys, labels
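The chunksize computed above spreads the repertoires roughly evenly over the worker pool, with at most one extra task per chunk. The arithmetic in isolation, with toy numbers:

    # Toy numbers mirroring the computation above: 10 repertoires, 4 workers
    import math

    example_count, pool_size = 10, 4
    chunksize = math.floor(example_count / pool_size) + 1
    print(chunksize)  # 3 -> starmap hands each worker batches of 3 tasks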
Example #24
    def _encode_new_dataset(self, dataset, params: EncoderParams):
        encoded_dataset = RepertoireDataset(repertoires=dataset.repertoires, params=dataset.params,
                                            metadata_file=dataset.metadata_file)
        encoded_repertoires, labels = self._encode_repertoires(dataset, params)

        feature_annotations = self._get_feature_info()

        encoded_dataset.add_encoded_data(EncodedData(
            examples=encoded_repertoires,
            labels=labels,
            feature_names=list(feature_annotations["sequence_id"]),
            feature_annotations=feature_annotations,
            example_ids=[repertoire.identifier for repertoire in dataset.get_data()],
            encoding=MatchedSequencesEncoder.__name__
        ))

        return encoded_dataset
Example #25
    def build_comparison_data(dataset: RepertoireDataset, params: EncoderParams,
                              comparison_attributes, sequence_batch_size):

        comp_data = ComparisonData(dataset.get_repertoire_ids(), comparison_attributes,
                                   sequence_batch_size, params.result_path)

        comp_data.process_dataset(dataset)

        return comp_data
Example #26
    def _implant_signals_in_repertoires(simulation_state: SimulationState = None) -> Dataset:

        PathBuilder.build(simulation_state.result_path + "repertoires/")
        processed_repertoires = SignalImplanter._implant_signals(simulation_state, SignalImplanter._process_repertoire)
        processed_dataset = RepertoireDataset(repertoires=processed_repertoires, params={**(simulation_state.dataset.params if simulation_state.dataset.params is not None else {}),
                                                                                         **{signal.id: [True, False] for signal in simulation_state.signals}},
                                              name=simulation_state.dataset.name,
                                              metadata_file=SignalImplanter._create_metadata_file(processed_repertoires, simulation_state))
        return processed_dataset
Example #27
    def create_comparison_data(self,
                               dataset: RepertoireDataset) -> ComparisonData:

        comparison_data = ComparisonData(dataset.get_repertoire_ids(),
                                         self.matching_columns,
                                         self.sequence_batch_size, self.path)
        comparison_data.process_dataset(dataset)

        return comparison_data
Example #28
    def test_match(self):
        path = EnvironmentSettings.root_path + "test/tmp/seqmatch/"
        PathBuilder.build(path)

        repertoire = Repertoire.build_from_sequence_objects(
            sequence_objects=[
                ReceptorSequence(amino_acid_sequence="AAAAAA",
                                 metadata=SequenceMetadata(chain="A",
                                                           v_gene="V1",
                                                           j_gene="J2"),
                                 identifier="3"),
                ReceptorSequence(amino_acid_sequence="CCCCCC",
                                 metadata=SequenceMetadata(chain="A",
                                                           v_gene="V1",
                                                           j_gene="J2"),
                                 identifier="4"),
                ReceptorSequence(amino_acid_sequence="AAAACC",
                                 metadata=SequenceMetadata(chain="A",
                                                           v_gene="V1",
                                                           j_gene="J2"),
                                 identifier="5"),
                ReceptorSequence(amino_acid_sequence="TADQVF",
                                 metadata=SequenceMetadata(chain="A",
                                                           v_gene="V1",
                                                           j_gene="J3"),
                                 identifier="6")
            ],
            metadata={"CD": True},
            path=path)

        dataset = RepertoireDataset(repertoires=[repertoire])
        sequences = [
            ReceptorSequence("AAAACA",
                             metadata=SequenceMetadata(chain="A",
                                                       v_gene="V1",
                                                       j_gene="J2"),
                             identifier="1"),
            ReceptorSequence("TADQV",
                             metadata=SequenceMetadata(chain="A",
                                                       v_gene="V1",
                                                       j_gene="J3"),
                             identifier="2")
        ]

        matcher = SequenceMatcher()
        result = matcher.match(dataset, sequences, 2,
                               SequenceMatchingSummaryType.PERCENTAGE)

        self.assertTrue("repertoires" in result)
        self.assertEqual(
            1,
            len(result["repertoires"][0]["sequences"][3]
                ["matching_sequences"]))
        self.assertTrue(result["repertoires"][0]["metadata"]["CD"])
        self.assertEqual(1, len(result["repertoires"]))

        shutil.rmtree(path)
Example #29
    def match(self, dataset: RepertoireDataset, reference_sequences: list, max_distance: int, summary_type: SequenceMatchingSummaryType) -> dict:

        matched = {"repertoires": []}

        for index, repertoire in enumerate(dataset.get_data()):
            matched["repertoires"].append(self.match_repertoire(repertoire, index,
                                                                reference_sequences, max_distance, summary_type))

        return matched
Example #30
    def create_dataset(self, path: str) -> RepertoireDataset:
        repertoires, metadata = RepertoireBuilder.build(
            [["A", "B"], ["B", "C"], ["D"], ["E", "F"], ["A", "B"], ["B", "C"],
             ["D"], ["E", "F"]], path, {
                "l1": [1, 0, 1, 0, 1, 0, 1, 0],
                "l2": [2, 3, 2, 3, 2, 3, 3, 3]
            })
        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
        return dataset