def test_create_model(self):
        """Train a k-mer pair Word2Vec model on a two-repertoire dataset.

        Checks that the creator returns a ``Word2Vec`` instance whose
        vocabulary contains the 2-mer "CA" and has exactly 400 entries.
        """
        test_path = EnvironmentSettings.root_path + "test/tmp/w2v_test_tmp/"
        PathBuilder.build(test_path)
        # Registered immediately so the temporary directory is removed even
        # when one of the assertions below fails.
        self.addCleanup(shutil.rmtree, test_path)

        sequence1 = ReceptorSequence("CASSVFA")
        sequence2 = ReceptorSequence("CASSCCC")

        metadata1 = {"T1D": "T1D", "subject_id": "1"}
        rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2],
                                                      test_path, metadata1)

        metadata2 = {"T1D": "CTL", "subject_id": "2"}
        rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path,
                                                      metadata2)

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        model_creator = KmerPairModelCreator()
        model = model_creator.create_model(dataset=dataset,
                                           k=2,
                                           vector_size=16,
                                           batch_size=1,
                                           model_path=test_path +
                                           "model.model")

        # Dedicated assertion methods report the offending value on failure.
        self.assertIsInstance(model, Word2Vec)
        self.assertIn("CA", model.wv.vocab)
        # NOTE(review): 400 presumably equals 20 amino acids squared for k=2;
        # confirm against KmerPairModelCreator.
        self.assertEqual(400, len(model.wv.vocab))
Example #2
0
    def _construct_test_repertoiredataset(self, path, positional):
        """Build a two-repertoire dataset together with a label configuration.

        Args:
            path: location passed to ``Repertoire.build_from_sequence_objects``.
            positional: when truthy, the repertoires hold long homopolymer
                sequences ("A" x 17 twice, "T" x 13 once); otherwise they hold
                the short sequences AAAA/ATA/ATA and ATA/TAA.

        Returns:
            A ``(RepertoireDataset, LabelConfiguration)`` tuple.
        """
        receptors1 = ReceptorSequenceList()
        receptors2 = ReceptorSequenceList()

        if positional:
            sequences1 = [
                ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1"),
                ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1")
            ]
            sequences2 = [ReceptorSequence("TTTTTTTTTTTTT", identifier="1")]
        else:
            sequences1 = [
                ReceptorSequence("AAAA", identifier="1"),
                ReceptorSequence("ATA", identifier="2"),
                ReceptorSequence("ATA", identifier='3')
            ]
            sequences2 = [
                ReceptorSequence("ATA", identifier="1"),
                ReceptorSequence("TAA", identifier="2")
            ]

        # Plain loops: the appends are side effects, so a list comprehension
        # would only build and discard a list of None values.
        for seq in sequences1:
            receptors1.append(seq)
        for seq in sequences2:
            receptors2.append(seq)

        rep1 = Repertoire.build_from_sequence_objects(receptors1,
                                                      metadata={
                                                          "l1": 1,
                                                          "l2": 2,
                                                          "subject_id": "1"
                                                      },
                                                      path=path)

        rep2 = Repertoire.build_from_sequence_objects(receptors2,
                                                      metadata={
                                                          "l1": 0,
                                                          "l2": 3,
                                                          "subject_id": "2"
                                                      },
                                                      path=path)

        # NOTE(review): the repertoires carry l1 in {1, 0} and l2 in {2, 3},
        # while the labels are declared as l1: [1, 2] and l2: [0, 3] --
        # confirm this mismatch is intended.
        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        return dataset, lc
    def _create_dummy_data(self, path, dataset_type):
        """Create a small dataset of the requested type under ``path``.

        A single five-sequence repertoire is always built first. For
        ``dataset_type == "receptor"`` its ``receptors`` are pickled to
        ``{path}/receptors.pkl`` and wrapped in a ``ReceptorDataset``; for
        ``dataset_type == "repertoire"`` the repertoire itself is wrapped in a
        ``RepertoireDataset``.

        Args:
            path: directory that is created and used for all generated files.
            dataset_type: ``"receptor"`` or ``"repertoire"``.

        Returns:
            The created dataset, or ``None`` for any other ``dataset_type``.
        """
        PathBuilder.build(path)

        test_repertoire = Repertoire.build(
            sequence_aas=[
                "DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"
            ],
            v_genes=["V1-1"] * 5,
            j_genes=["J1-1"] * 5,
            chains=[
                Chain.ALPHA, Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.BETA
            ],
            custom_lists={
                "custom_1": [f"CUST-{i}" for i in range(5)],
                # Constant strings: no f-string or loop index needed.
                "custom_2": ["CUST-A"] * 3 + ["CUST-B"] * 2
            },
            cell_ids=[1, 1, 1, 2, 2],
            path=path)

        if dataset_type == "receptor":
            receptordataset_filename = f"{path}/receptors.pkl"
            with open(receptordataset_filename, "wb") as file:
                pickle.dump(test_repertoire.receptors, file)

            return ReceptorDataset(filenames=[receptordataset_filename],
                                   identifier="receptor_dataset")

        if dataset_type == "repertoire":
            test_repertoire.identifier = "repertoire_dataset"
            return RepertoireDataset(repertoires=[test_repertoire])

        # Unknown dataset types yield no dataset, as before.
        return None
Example #4
0
    def test_implant_in_repertoire(self):
        """Implant the motif "CCC" into a two-sequence repertoire.

        With a repertoire implanting rate of 0.5, the test expects at least
        one original sequence to remain and at least one sequence to contain
        the implanted motif.
        """
        path = EnvironmentSettings.tmp_test_path + "healthysequenceimplanting/"
        PathBuilder.build(path)
        # Registered immediately so the directory is removed even when an
        # assertion below fails.
        self.addCleanup(shutil.rmtree, path)

        repertoire = Repertoire.build_from_sequence_objects(
            [
                ReceptorSequence(amino_acid_sequence="ACDFQ", identifier="1"),
                ReceptorSequence(amino_acid_sequence="TGCDF", identifier="2")
            ],
            path=path,
            metadata={"subject_id": "1"})
        implanting = HealthySequenceImplanting(
            GappedMotifImplanting(),
            implanting_computation=ImplantingComputation.ROUND)
        signal = Signal("1", [Motif("m1", GappedKmerInstantiation(), "CCC")],
                        implanting)

        repertoire2 = implanting.implant_in_repertoire(repertoire, 0.5, signal,
                                                       path)

        new_sequences = [
            sequence.get_sequence() for sequence in repertoire2.sequences
        ]
        self.assertTrue("ACDFQ" in new_sequences or "TGCDF" in new_sequences)
        # Generator expression: no need to materialize a list for any().
        self.assertTrue(any("CCC" in sequence for sequence in new_sequences))
    def test_process(self):
        """Keep only ALPHA-chain repertoires and verify dataset and metadata.

        Also checks that the input dataset is left with both repertoires and
        that an unsupported chain ("GAMMA") raises ``AssertionError``.
        """
        path = EnvironmentSettings.root_path + "test/tmp/chain_filter/"
        PathBuilder.build(path)
        # Registered immediately so the directory is removed even when an
        # assertion below fails.
        self.addCleanup(shutil.rmtree, path)

        rep1 = Repertoire.build_from_sequence_objects(
            [
                ReceptorSequence(
                    "AAA", metadata=SequenceMetadata(chain="A"), identifier="1")
            ],
            path=path,
            metadata={})
        rep2 = Repertoire.build_from_sequence_objects(
            [
                ReceptorSequence(
                    "AAC", metadata=SequenceMetadata(chain="B"), identifier="2")
            ],
            path=path,
            metadata={})

        metadata = pd.DataFrame({"CD": [1, 0]})
        metadata.to_csv(path + "metadata.csv")

        dataset = RepertoireDataset(repertoires=[rep1, rep2],
                                    metadata_file=path + "metadata.csv")

        dataset2 = ChainRepertoireFilter.process(
            dataset, {
                "keep_chain": "ALPHA",
                "result_path": path + "results/"
            })

        self.assertEqual(1, len(dataset2.get_data()))
        # The original dataset must still hold both repertoires.
        self.assertEqual(2, len(dataset.get_data()))

        metadata_dict = dataset2.get_metadata(["CD"])
        self.assertEqual(1, len(metadata_dict["CD"]))
        self.assertEqual(1, metadata_dict["CD"][0])

        for rep in dataset2.get_data():
            self.assertEqual("AAA", rep.sequences[0].get_sequence())

        with self.assertRaises(AssertionError):
            ChainRepertoireFilter.process(
                dataset, {
                    "keep_chain": "GAMMA",
                    "result_path": path + "results/"
                })
Example #6
0
    def test_run(self):
        """Encode a two-repertoire dataset through ``DataEncoder.run``.

        Uses a Word2Vec encoder (k=3, vector_size=6) and expects a
        ``RepertoireDataset`` back whose encoded examples have two rows.
        """
        path = EnvironmentSettings.root_path + "test/tmp/dataencoder/"
        PathBuilder.build(path)
        # Registered immediately so the directory is removed even when an
        # assertion below fails.
        self.addCleanup(shutil.rmtree, path)

        rep1 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("AAA", identifier="1")],
            metadata={
                "l1": 1,
                "l2": 2
            },
            path=path)

        rep2 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("ATA", identifier="2")],
            metadata={
                "l1": 0,
                "l2": 3
            },
            path=path)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])
        encoder = Word2VecEncoder.build_object(
            dataset, **{
                "k": 3,
                "model_type": ModelType.SEQUENCE.name,
                "vector_size": 6
            })

        res = DataEncoder.run(
            DataEncoderParams(dataset=dataset,
                              encoder=encoder,
                              encoder_params=EncoderParams(
                                  model={},
                                  pool_size=2,
                                  label_config=lc,
                                  result_path=path,
                                  filename="dataset.csv"),
                              store_encoded_data=False))

        # Dedicated assertion methods report the actual value on failure.
        self.assertIsInstance(res, RepertoireDataset)
        self.assertEqual(2, res.encoded_data.examples.shape[0])
Example #7
0
    def test_encode(self):
        """Encode two repertoires with a sequence-level Word2Vec encoder.

        Expects a 2 x 16 example matrix, two "T1D" labels with "T1D" first,
        and an encoder object of type ``W2VRepertoireEncoder``.
        """
        test_path = EnvironmentSettings.root_path + "test/tmp/w2v/"
        PathBuilder.build(test_path)
        # Registered immediately so the directory is removed even when an
        # assertion below fails.
        self.addCleanup(shutil.rmtree, test_path)

        sequence1 = ReceptorSequence("CASSVFA", identifier="1")
        sequence2 = ReceptorSequence("CASSCCC", identifier="2")

        metadata1 = {"T1D": "T1D", "subject_id": "1"}
        rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2],
                                                      test_path, metadata1)

        metadata2 = {"T1D": "CTL", "subject_id": "2"}
        rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path,
                                                      metadata2)

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        label_configuration = LabelConfiguration()
        label_configuration.add_label("T1D", ["T1D", "CTL"])

        config_params = EncoderParams(model={},
                                      learn_model=True,
                                      result_path=test_path,
                                      label_config=label_configuration,
                                      filename="dataset.pkl")

        encoder = Word2VecEncoder.build_object(
            dataset, **{
                "k": 3,
                "model_type": "sequence",
                "vector_size": 16
            })

        encoded_dataset = encoder.encode(dataset=dataset, params=config_params)
        encoded_data = encoded_dataset.encoded_data

        self.assertIsNotNone(encoded_data)
        # assertEqual reports both values on failure, unlike assertTrue(a == b).
        self.assertEqual(2, encoded_data.examples.shape[0])
        self.assertEqual(16, encoded_data.examples.shape[1])
        self.assertEqual(2, len(encoded_data.labels["T1D"]))
        self.assertEqual("T1D", encoded_data.labels["T1D"][0])
        self.assertIsInstance(encoder, W2VRepertoireEncoder)
Example #8
0
    def process_repertoire(repertoire: Repertoire, params: dict) -> Repertoire:
        """Group the rows of a repertoire and build a new repertoire from the result.

        The repertoire data is loaded into a DataFrame, grouped by the fields
        returned by ``DuplicateSequenceFilter._prepare_group_by_field`` and
        aggregated with the dict from ``_prepare_agg_dict``. Columns missing
        from the aggregated frame are passed on as ``None``.

        Args:
            repertoire: the repertoire whose data is loaded and grouped.
            params: forwarded to the two helper methods; ``params["result_path"]``
                is used as the path of the new repertoire.

        Returns:
            The repertoire built from the aggregated rows, carrying a deep copy
            of the input repertoire's metadata.
        """
        data = pd.DataFrame(repertoire.load_data())

        groupby_fields = DuplicateSequenceFilter._prepare_group_by_field(params, data.columns)
        # Every column that is not a standard repertoire field is a custom list.
        custom_lists = list(set(data.columns) - set(Repertoire.FIELDS))
        agg_dict = DuplicateSequenceFilter._prepare_agg_dict(params, data.columns, custom_lists)

        # Chain objects can not be aggregated, so they are stored by value.
        if "chains" not in data.columns:
            data["chains"] = None
        else:
            data["chains"] = [entry.value if isinstance(entry, Chain) else entry for entry in data["chains"]]

        no_duplicates = data.groupby(groupby_fields).agg(agg_dict).reset_index()

        def column_or_none(name):
            # Optional fields are handed over as None when the column is absent.
            if name in no_duplicates.columns:
                return list(no_duplicates[name])
            return None

        def chains_or_none():
            # Turn the stored values back into Chain objects.
            if "chains" in no_duplicates.columns:
                return [Chain(key) for key in list(no_duplicates["chains"])]
            return None

        return Repertoire.build(
            sequence_aas=column_or_none("sequence_aas"),
            sequences=column_or_none("sequences"),
            v_genes=column_or_none("v_genes"),
            j_genes=column_or_none("j_genes"),
            chains=chains_or_none(),
            counts=column_or_none("counts"),
            region_types=column_or_none("region_types"),
            custom_lists={name: list(no_duplicates[name]) for name in custom_lists},
            sequence_identifiers=list(no_duplicates["sequence_identifiers"]),
            metadata=copy.deepcopy(repertoire.metadata),
            path=params["result_path"])
Example #9
0
    def test_match(self):
        """Match two reference sequences against a one-repertoire dataset.

        Runs ``SequenceMatcher.match`` with 2 as the third argument and the
        PERCENTAGE summary type, then inspects the per-repertoire result.
        """
        path = EnvironmentSettings.root_path + "test/tmp/seqmatch/"
        PathBuilder.build(path)
        # Registered immediately so the directory is removed even when an
        # assertion below fails.
        self.addCleanup(shutil.rmtree, path)

        repertoire = Repertoire.build_from_sequence_objects(
            sequence_objects=[
                ReceptorSequence(amino_acid_sequence="AAAAAA",
                                 metadata=SequenceMetadata(chain="A",
                                                           v_gene="V1",
                                                           j_gene="J2"),
                                 identifier="3"),
                ReceptorSequence(amino_acid_sequence="CCCCCC",
                                 metadata=SequenceMetadata(chain="A",
                                                           v_gene="V1",
                                                           j_gene="J2"),
                                 identifier="4"),
                ReceptorSequence(amino_acid_sequence="AAAACC",
                                 metadata=SequenceMetadata(chain="A",
                                                           v_gene="V1",
                                                           j_gene="J2"),
                                 identifier="5"),
                ReceptorSequence(amino_acid_sequence="TADQVF",
                                 metadata=SequenceMetadata(chain="A",
                                                           v_gene="V1",
                                                           j_gene="J3"),
                                 identifier="6")
            ],
            metadata={"CD": True},
            path=path)

        dataset = RepertoireDataset(repertoires=[repertoire])
        sequences = [
            ReceptorSequence("AAAACA",
                             metadata=SequenceMetadata(chain="A",
                                                       v_gene="V1",
                                                       j_gene="J2"),
                             identifier="1"),
            ReceptorSequence("TADQV",
                             metadata=SequenceMetadata(chain="A",
                                                       v_gene="V1",
                                                       j_gene="J3"),
                             identifier="2")
        ]

        matcher = SequenceMatcher()
        result = matcher.match(dataset, sequences, 2,
                               SequenceMatchingSummaryType.PERCENTAGE)

        self.assertIn("repertoires", result)
        self.assertEqual(1, len(result["repertoires"]))

        first_repertoire = result["repertoires"][0]
        # The fourth repertoire sequence ("TADQVF") should have one match.
        self.assertEqual(
            1, len(first_repertoire["sequences"][3]["matching_sequences"]))
        self.assertTrue(first_repertoire["metadata"]["CD"])
Example #10
0
    def test_match_repertoire(self):
        """Match reference sequences against a single repertoire.

        First checks the structure and per-sequence match counts for the COUNT
        summary type, then the clonal percentage for CLONAL_PERCENTAGE.
        """
        path = EnvironmentSettings.root_path + "test/tmp/seqmatchrep/"
        PathBuilder.build(path)
        # Registered immediately so the directory is removed even when an
        # assertion below fails.
        self.addCleanup(shutil.rmtree, path)

        repertoire = Repertoire.build_from_sequence_objects(
            sequence_objects=[
                ReceptorSequence(amino_acid_sequence="AAAAAA",
                                 identifier="1",
                                 metadata=SequenceMetadata(chain="A", count=3)),
                ReceptorSequence(amino_acid_sequence="CCCCCC",
                                 identifier="2",
                                 metadata=SequenceMetadata(chain="A", count=2)),
                ReceptorSequence(amino_acid_sequence="AAAACC",
                                 identifier="3",
                                 metadata=SequenceMetadata(chain="A", count=1)),
                ReceptorSequence(amino_acid_sequence="TADQVF",
                                 identifier="4",
                                 metadata=SequenceMetadata(chain="A", count=4))
            ],
            metadata={"CD": True},
            path=path)

        sequences = [
            ReceptorSequence("AAAACA", metadata=SequenceMetadata(chain="A")),
            ReceptorSequence("TADQV", metadata=SequenceMetadata(chain="A"))
        ]

        matcher = SequenceMatcher()
        result = matcher.match_repertoire(repertoire, 0, sequences, 2,
                                          SequenceMatchingSummaryType.COUNT)

        self.assertIn("sequences", result)
        self.assertIn("repertoire", result)
        self.assertIn("repertoire_index", result)

        self.assertEqual(4, len(result["sequences"]))
        self.assertEqual(1, len(result["sequences"][0]["matching_sequences"]))
        self.assertEqual(0, len(result["sequences"][1]["matching_sequences"]))
        self.assertEqual(1, len(result["sequences"][2]["matching_sequences"]))
        self.assertEqual(1, len(result["sequences"][3]["matching_sequences"]))

        matched = sum(1 for r in result["sequences"]
                      if len(r["matching_sequences"]) > 0)
        self.assertEqual(3, matched)
        self.assertTrue(result["metadata"]["CD"])

        result = matcher.match_repertoire(
            repertoire, 0, sequences, 2,
            SequenceMatchingSummaryType.CLONAL_PERCENTAGE)
        # Compared approximately because the value is a float ratio.
        # NOTE(review): 0.8 is consistent with matched counts 3 + 1 + 4 out
        # of a total of 10 -- confirm against SequenceMatcher.
        self.assertAlmostEqual(0.8, result["clonal_percentage"])
Example #11
0
    def _process_repertoire(index, repertoire, current_implanting, simulation_state) -> Repertoire:
        """Return the implanted repertoire, or a signal-free rebuild of it.

        When ``current_implanting`` is given, the work is delegated to
        ``SignalImplanter._implant_in_repertoire``. Otherwise the repertoire is
        rebuilt under ``<result_path>repertoires/`` and every signal of the
        simulation state is recorded as ``False`` in its metadata.
        """
        if current_implanting is not None:
            return SignalImplanter._implant_in_repertoire(index, repertoire, current_implanting, simulation_state)

        # No implanting for this repertoire: rebuild it and mark all signals as absent.
        sequence_objects = repertoire.sequences
        target_path = simulation_state.result_path + "repertoires/"
        rebuilt = Repertoire.build_from_sequence_objects(sequence_objects, target_path, repertoire.metadata)

        for signal in simulation_state.signals:
            signal_key = f"signal_{signal.id}"
            rebuilt.metadata[signal_key] = False

        return rebuilt
Example #12
0
    def test_implant_in_repertoire(self):
        """Implant a full-sequence signal into a repertoire of three "CCCC" sequences.

        With an implanting rate of 0.33, the test expects the repertoire size
        to be unchanged, with one sequence equal to "AAAA" and two still
        equal to "CCCC".
        """
        path = PathBuilder.build(f"{EnvironmentSettings.tmp_test_path}full_seq_implanting/")
        # Registered immediately so the directory is removed even when an
        # assertion below fails.
        self.addCleanup(shutil.rmtree, path)

        signal = Signal("sig1", [Motif("motif1", GappedKmerInstantiation(max_gap=0), "AAAA")], FullSequenceImplanting())

        repertoire = Repertoire.build(["CCCC", "CCCC", "CCCC"], path=path)

        new_repertoire = signal.implant_to_repertoire(repertoire, 0.33, path)

        self.assertEqual(len(repertoire.sequences), len(new_repertoire.sequences))

        implanted_aas = [seq.amino_acid_sequence for seq in new_repertoire.sequences]
        self.assertEqual(1, implanted_aas.count("AAAA"))
        self.assertEqual(2, implanted_aas.count("CCCC"))
Example #13
0
    def implant_in_repertoire(self, repertoire: Repertoire, repertoire_implanting_rate: float, signal, path):
        """Build a new repertoire in which some sequences come from ``_create_new_sequences``.

        Args:
            repertoire: source repertoire; its sequences and metadata are read.
            repertoire_implanting_rate: fraction of sequences to produce; the
                count is ``ceil(len(sequences) * rate)``.
            signal: object with ``motifs`` and ``id``; no motif seed may contain "/".
            path: location passed on to ``Repertoire.build_from_sequence_objects``.

        Returns:
            The new repertoire, with ``signal_<id>`` set to ``True`` in a deep
            copy of the source metadata.

        Raises:
            AssertionError: if a motif seed contains "/" or the computed
                sequence count is not positive.
        """
        assert all("/" not in motif.seed for motif in signal.motifs), (
            f'FullSequenceImplanting: motifs cannot include gaps. Check motifs {[motif.identifier for motif in signal.motifs]}.'
        )

        original_sequences = repertoire.sequences
        implant_count = math.ceil(len(original_sequences) * repertoire_implanting_rate)
        assert implant_count > 0, (
            f"FullSequenceImplanting: there are too few sequences ({len(original_sequences)}) in the repertoire with identifier {repertoire.identifier} "
            f"to have the given repertoire implanting rate ({repertoire_implanting_rate}). Please consider increasing the repertoire implanting rate."
        )

        implanted_sequences = self._create_new_sequences(original_sequences, implant_count, signal)

        # Copy so that marking the signal does not touch the source repertoire's metadata.
        updated_metadata = copy.deepcopy(repertoire.metadata)
        updated_metadata[f"signal_{signal.id}"] = True

        return Repertoire.build_from_sequence_objects(implanted_sequences, path, updated_metadata)
Example #14
0
    def test_create_sentences_from_repertoire(self):
        """Split a three-sequence repertoire into sentences of 3-mers.

        Expects one sentence per sequence, and the first sentence ("AACT") to
        consist of the two k-mers "AAC" and "ACT".
        """
        path = EnvironmentSettings.tmp_test_path + "kmer/"
        PathBuilder.build(path)
        # Registered immediately so the directory is removed even when an
        # assertion below fails.
        self.addCleanup(shutil.rmtree, path)

        rep = Repertoire.build_from_sequence_objects([ReceptorSequence(amino_acid_sequence="AACT"),
                                                      ReceptorSequence(amino_acid_sequence="ACCT"),
                                                      ReceptorSequence(amino_acid_sequence="AACT")], path, {})

        sentences = KmerHelper.create_sentences_from_repertoire(rep, 3)

        self.assertEqual(3, len(sentences))
        # Separate assertions so a failure names the condition that broke.
        self.assertEqual(2, len(sentences[0]))
        self.assertIn("AAC", sentences[0])
        self.assertIn("ACT", sentences[0])
    def _build_new_repertoire(self, sequences, repertoire_metadata, signal,
                              path) -> Repertoire:
        """Create a repertoire from ``sequences`` whose metadata flags the signal.

        The given metadata is deep-copied (an empty dict is used when it is
        ``None``) and ``signal_<id>`` is set to ``True`` in the copy.
        """
        metadata = {} if repertoire_metadata is None else copy.deepcopy(repertoire_metadata)

        # Only the signal id is stored at repertoire level; the specific motif
        # and motif instance are available in each receptor_sequence.
        signal_key = f"signal_{signal.id}"
        metadata[signal_key] = True

        return Repertoire.build_from_sequence_objects(sequences, path, metadata)
Example #16
0
    def _repertoire_to_dataframe(repertoire: Repertoire, region_type):
        """Load a repertoire into a DataFrame and rename its columns for AIRR export.

        Gene and allele columns that are missing are added as ``None`` before
        ``AIRRExporter.update_gene_columns`` is called. The sequence and
        amino-acid column names depend on ``region_type``.
        """
        # load_data supplies all fields, including custom ones
        df = pd.DataFrame(repertoire.load_data())

        required_columns = ('v_alleles', 'j_alleles', 'v_genes', 'j_genes')
        absent_columns = [name for name in required_columns if name not in df.columns]
        for name in absent_columns:
            df.loc[:, name] = None

        AIRRExporter.update_gene_columns(df, 'alleles', 'genes')

        # mandatory fields get their AIRR-compliant names
        airr_names = {
            "sequence_identifiers": "sequence_id",
            "v_alleles": "v_call",
            "j_alleles": "j_call",
            "chains": "locus",
            "counts": "duplicate_count",
            "sequences": AIRRExporter.get_sequence_field(region_type),
            "sequence_aas": AIRRExporter.get_sequence_aa_field(region_type),
        }

        return df.rename(mapper=airr_names, axis="columns")
Example #17
0
    def load_repertoire_as_object(import_class, metadata_row, params: DatasetImportParams):
        """Load one repertoire file named in a metadata row and build a ``Repertoire``.

        Columns listed in ``Repertoire.FIELDS`` are passed to
        ``Repertoire.build`` by name; all remaining columns go into
        ``custom_lists``.

        Args:
            import_class: provides ``preprocess_dataframe`` and, optionally,
                an ``alternative_load_func`` attribute.
            metadata_row: row whose ``'filename'`` entry names the file and
                whose ``to_dict()`` becomes the repertoire metadata.
            params: import parameters; ``path`` and ``result_path`` are read here.

        Raises:
            RuntimeError: wraps any exception raised during import and names
                the offending file.
        """
        try:
            alternative_load_func = getattr(import_class, "alternative_load_func", None)

            source_file = f"{params.path}{metadata_row['filename']}"
            dataframe = ImportHelper.load_sequence_dataframe(source_file, params, alternative_load_func)
            dataframe = import_class.preprocess_dataframe(dataframe, params)

            sequence_lists = {}
            for field in Repertoire.FIELDS:
                if field in dataframe.columns:
                    sequence_lists[field] = dataframe[field].values.tolist()

            custom_fields = list(set(dataframe.columns) - set(Repertoire.FIELDS))
            sequence_lists["custom_lists"] = {field: dataframe[field].values.tolist() for field in custom_fields}

            repertoire_inputs = {"metadata": metadata_row.to_dict(), "path": params.result_path + "repertoires/"}
            repertoire_inputs.update(sequence_lists)

            return Repertoire.build(**repertoire_inputs)
        except Exception as exception:
            raise RuntimeError(f"{ImportHelper.__name__}: error when importing file {metadata_row['filename']}.") from exception
Example #18
0
    def build(sequences: list, path: str, labels: dict = None, seq_metadata: list = None, subject_ids: list = None):
        """Create repertoires from lists of amino acid sequences and write a metadata file.

        Args:
            sequences: one list of amino acid sequence strings per repertoire.
            path: output directory; repertoires are built under ``path + "repertoires/"``.
            labels: optional mapping of label name to a per-repertoire list of values.
            seq_metadata: optional per-repertoire lists of keyword dicts for
                ``SequenceMetadata``; a fixed default is used when omitted.
            subject_ids: optional per-repertoire subject ids; ``rep_<index>``
                is generated when omitted.

        Returns:
            Tuple of the list of repertoires and the path to ``metadata.csv``.

        Raises:
            AssertionError: if ``subject_ids`` or ``seq_metadata`` do not
                match ``sequences`` in length.
        """
        if subject_ids is not None:
            assert len(subject_ids) == len(sequences)

        if seq_metadata is not None:
            assert len(sequences) == len(seq_metadata)
            for sequence_list, metadata_list in zip(sequences, seq_metadata):
                assert len(sequence_list) == len(metadata_list)

        PathBuilder.build(path)
        rep_path = PathBuilder.build(path + "repertoires/")

        if subject_ids is None:
            # One generated id per repertoire.
            subject_ids = ["rep_" + str(rep_index) for rep_index in range(len(sequences))]

        repertoires = []
        for rep_index, sequence_list in enumerate(sequences):
            rep_sequences = ReceptorSequenceList()

            for seq_index, sequence in enumerate(sequence_list):
                if seq_metadata is not None:
                    sequence_metadata = SequenceMetadata(**seq_metadata[rep_index][seq_index])
                else:
                    sequence_metadata = SequenceMetadata(v_subgroup="TRBV1", v_gene="TRBV1-1", v_allele="TRBV1-1*01", j_subgroup="TRBJ1",
                                                         j_gene="TRBJ1-1", j_allele="TRBJ1-1*01", count=1, chain="TRB",
                                                         region_type="IMGT_CDR3")

                rep_sequences.append(ReceptorSequence(amino_acid_sequence=sequence, metadata=sequence_metadata, identifier=str(seq_index)))

            if labels is None:
                rep_metadata = {}
            else:
                rep_metadata = {key: labels[key][rep_index] for key in labels.keys()}
            # The subject id always takes precedence over a same-named label.
            rep_metadata["subject_id"] = subject_ids[rep_index]

            repertoires.append(Repertoire.build_from_sequence_objects(rep_sequences, rep_path, rep_metadata))

        metadata_columns = {
            "filename": [f"{repertoire.identifier}_data.npy" for repertoire in repertoires],
            "subject_id": subject_ids,
            "repertoire_identifier": [repertoire.identifier for repertoire in repertoires],
        }
        if labels is not None:
            metadata_columns.update(labels)

        metadata_file = path + "metadata.csv"
        pd.DataFrame(metadata_columns).to_csv(metadata_file, index=False)

        return repertoires, metadata_file
Example #19
0
    def create_dummy_repertoire(self, path):
        """Build a two-sequence repertoire and write a one-row metadata file.

        Args:
            path: location of the repertoire files and of ``metadata.csv``.

        Returns:
            Tuple of the repertoire and the path to the metadata file.
        """
        beta_metadata = SequenceMetadata(v_gene="TRBV1",
                                         j_gene="TRBJ1",
                                         chain=Chain.BETA,
                                         count=5,
                                         region_type="IMGT_CDR3",
                                         frame_type="IN",
                                         custom_params={"d_call": "TRBD1", "custom_test": "cust1"})
        beta_sequence = ReceptorSequence(amino_acid_sequence="AAA",
                                         nucleotide_sequence="GCTGCTGCT",
                                         identifier="receptor_1",
                                         metadata=beta_metadata)

        alpha_metadata = SequenceMetadata(v_gene="TRAV2",
                                          v_allele="TRAV2*01",
                                          j_gene="TRAJ2",
                                          chain=Chain.ALPHA,
                                          count=15,
                                          frame_type=None,
                                          region_type="IMGT_CDR3",
                                          custom_params={"d_call": "TRAD2", "custom_test": "cust2"})
        alpha_sequence = ReceptorSequence(amino_acid_sequence="GGG",
                                          nucleotide_sequence="GGTGGTGGT",
                                          identifier="receptor_2",
                                          metadata=alpha_metadata)

        repertoire = Repertoire.build_from_sequence_objects(
            sequence_objects=[beta_sequence, alpha_sequence],
            path=path,
            metadata={"subject_id": "REP1"})

        # NOTE(review): the repertoire metadata uses subject_id "REP1" while
        # the metadata file records "1" -- confirm this difference is intended.
        metadata_rows = {
            "filename": [f"{repertoire.identifier}_data.npy"],
            "subject_id": ["1"],
            "repertoire_identifier": [repertoire.identifier]
        }
        metadata_file = path + "metadata.csv"
        pd.DataFrame(metadata_rows).to_csv(metadata_file, index=False)

        return repertoire, metadata_file
Example #20
0
    def test_run(self):
        """Run ``SignalImplanter`` on ten identical repertoires with two implantings.

        Implanting "i1" carries signals s1 and s2, implanting "i2" only s2;
        both use a dataset implanting rate of 0.2. The test expects every
        repertoire to have both signal keys in its metadata, s2 to be present
        in four repertoires and s1 in two.
        """
        path = EnvironmentSettings.root_path + "test/tmp/signalImplanter/"

        os.makedirs(path, exist_ok=True)
        # Registered immediately so the directory is removed even when an
        # assertion below fails.
        self.addCleanup(shutil.rmtree, path)

        sequences = [ReceptorSequence("ACDEFG", identifier="1"), ReceptorSequence("ACDEFG", identifier="2"),
                     ReceptorSequence("ACDEFG", identifier="3"), ReceptorSequence("ACDEFG", identifier="4")]

        r = [Repertoire.build_from_sequence_objects(sequence_objects=sequences, path=path, metadata={})
             for _ in range(10)]

        dataset = RepertoireDataset(repertoires=r)

        m1 = Motif(identifier="m1", instantiation=GappedKmerInstantiation(), seed="CAS")
        m2 = Motif(identifier="m2", instantiation=GappedKmerInstantiation(), seed="CCC")
        s1 = Signal(identifier="s1", motifs=[m1],
                    implanting_strategy=HealthySequenceImplanting(GappedMotifImplanting(), implanting_computation=ImplantingComputation.ROUND))
        s2 = Signal(identifier="s2", motifs=[m1, m2],
                    implanting_strategy=HealthySequenceImplanting(GappedMotifImplanting(), implanting_computation=ImplantingComputation.ROUND))

        simulation = Simulation([Implanting(dataset_implanting_rate=0.2, repertoire_implanting_rate=0.5, signals=[s1, s2], name="i1"),
                                 Implanting(dataset_implanting_rate=0.2, repertoire_implanting_rate=0.5, signals=[s2], name="i2")])

        input_params = SimulationState(dataset=dataset, result_path=path, simulation=simulation, signals=[s1, s2])

        new_dataset = SignalImplanter.run(input_params)
        reps_with_s2 = sum(rep.metadata[f"signal_{s2.id}"] is True for rep in new_dataset.get_data(batch_size=10))
        reps_with_s1 = sum(rep.metadata[f"signal_{s1.id}"] is True for rep in new_dataset.get_data(batch_size=10))

        self.assertEqual(10, len(new_dataset.get_example_ids()))
        self.assertTrue(all(f"signal_{s1.id}" in rep.metadata.keys() for rep in new_dataset.get_data(batch_size=10)))
        self.assertTrue(all(f"signal_{s2.id}" in rep.metadata.keys() for rep in new_dataset.get_data(batch_size=10)))
        # assertEqual reports the actual count on failure.
        self.assertEqual(4, reps_with_s2)
        self.assertEqual(2, reps_with_s1)

        metadata_filenames = new_dataset.get_metadata(["filename"])["filename"]
        self.assertTrue(all(repertoire.data_filename in metadata_filenames for repertoire in new_dataset.repertoires))
Example #21
0
    def test_find_label_associated_sequence_p_values(self):
        """Compare SequenceFilterHelper.find_label_associated_sequence_p_values with fixed expected values.

        Four repertoires are built, two with label ``l1`` set to True and two
        with False. A ComparisonData object is then filled by hand with five
        batches (nine items in total) whose matrix columns are mapped to the
        repertoires through ``col_name_index``.
        """
        path = EnvironmentSettings.tmp_test_path + "comparison_data_find_label_assocseqpvalues/"
        PathBuilder.build(path)
        # Registered right after creation so the directory is removed even when an assertion fails.
        self.addCleanup(shutil.rmtree, path)

        label_values = [True, True, False, False]
        subject_ids = ["rep_0", "rep_1", "rep_2", "rep_3"]
        repertoires = [Repertoire.build_from_sequence_objects([ReceptorSequence()], path, {"l1": val, "subject_id": subject_id})
                       for val, subject_id in zip(label_values, subject_ids)]

        # Maps each repertoire identifier to its column in the batch matrices.
        col_name_index = {repertoire.identifier: index for index, repertoire in enumerate(repertoires)}

        comparison_data = ComparisonData(repertoire_ids=[repertoire.identifier for repertoire in repertoires],
                                         comparison_attributes=["sequence_aas"], sequence_batch_size=4, path=path)

        # One (matrix rows, items) pair per batch; the position in this list is the batch identifier.
        batch_contents = [
            ([[1., 0., 0., 0.],
              [1., 1., 0., 0.]], [('GGG',), ('III',)]),
            ([[1., 1., 0., 1.],
              [1., 1., 1., 1.]], [('LLL',), ('MMM',)]),
            ([[0., 1., 0., 0.],
              [0., 1., 0., 1.]], [('DDD',), ('EEE',)]),
            ([[0., 1., 1., 1.],
              [0., 0., 1., 1.]], [('FFF',), ('CCC',)]),
            ([[0., 0., 0., 1.]], [('AAA',)]),
        ]
        comparison_data.batches = [ComparisonDataBatch(matrix=np.array(rows), items=items, repertoire_index_mapping=col_name_index,
                                                       path=path, identifier=identifier)
                                   for identifier, (rows, items) in enumerate(batch_contents)]

        p_values = SequenceFilterHelper.find_label_associated_sequence_p_values(comparison_data, repertoires,
                                                                                Label('l1', [True, False], positive_class=True))

        # One expected value per item, in batch order.
        expected_p_values = [SequenceFilterHelper.INVALID_P_VALUE, 0.1666666666666667, 0.5000000000000001, 1.,
                             SequenceFilterHelper.INVALID_P_VALUE, 0.8333333333333331, 1., 1., 2]

        self.assertTrue(np.allclose(expected_p_values, p_values, equal_nan=True))
Example #22
0
    def test_receptor(self):
        """Build a repertoire from seven chains and check how many receptors and cells it exposes.

        The chains are spread over two cell ids ("1" and "2"); the test expects
        ``obj.receptors`` to contain 6 elements and ``obj.cells`` to contain 2.
        """
        path = EnvironmentSettings.tmp_test_path + "receptortestingpathrepertoire/"
        PathBuilder.build(path)
        # Registered right after creation so the directory is removed even when an assertion fails.
        self.addCleanup(shutil.rmtree, path)

        # (amino acid sequence, identifier, gene keyword, gene, cell id, chain, cmv, coeliac)
        # Each chain sets exactly one of v_gene / j_gene, hence the keyword column.
        chain_specs = [
            ("AAA", "1", "v_gene", "V1", "1", Chain.ALPHA, "no", False),
            ("CCC", "2", "j_gene", "J1", "1", Chain.BETA, "yes", True),
            ("FFF", "3", "v_gene", "V1", "1", Chain.ALPHA, "no", False),
            ("EEE", "4", "j_gene", "J1", "1", Chain.BETA, "yes", True),
            ("FFF", "5", "v_gene", "V1", "2", Chain.GAMMA, "no", False),
            ("EEE", "6", "j_gene", "J1", "2", Chain.DELTA, "yes", True),
            ("EEE", "7", "j_gene", "J2", "2", Chain.DELTA, "yes", True),
        ]

        sequences = [
            ReceptorSequence(amino_acid_sequence=amino_acids,
                             identifier=identifier,
                             metadata=SequenceMetadata(cell_id=cell_id,
                                                       chain=chain,
                                                       custom_params={"cmv": cmv, "coeliac": coeliac},
                                                       **{gene_keyword: gene}))
            for amino_acids, identifier, gene_keyword, gene, cell_id, chain, cmv, coeliac in chain_specs
        ]

        obj = Repertoire.build_from_sequence_objects(sequences, path, {"cmv": "yes", 'subject_id': "1"})

        receptors = obj.receptors
        self.assertEqual(6, len(receptors))

        cells = obj.cells
        self.assertEqual(2, len(cells))
Example #23
0
class TestDataSummarizer(TestCase):
    """Tests for DataSummarizer's filter_* and annotate_* operations on encoded repertoire datasets.

    Fixture 1: 5 features, 3 repertoires. Each repertoire has 3 labels. Each feature has 2 annotations.
    Fixture 2: 5 features, 5 repertoires, 3 labels per repertoire, 3 annotations per feature.
    """

    encoded_data_1 = {
        'examples': sparse.csr_matrix(np.array([[1, 2, 3, 4, 5],
                                                [0, 0, 0, 1, 1],
                                                [1, 1, 0, 0, 0]])),
        'example_ids': ['rep1', 'rep2', 'rep3'],
        'labels': {
            "diabetes": ['diabetes pos', 'diabetes neg', 'diabetes neg'],
            "celiac": ['celiac pos', 'celiac pos', 'celiac pos'],
            "cmv": ['cmv pos', 'cmv neg', 'cmv pos']
        },
        'feature_names': ['a', 'b', 'c', 'd', 'e'],
        'feature_annotations': pd.DataFrame({
            "specificity": ["cmv", "ebv", "cmv", "gluten", "gluten"],
            "p_val": [0.01, 0.00001, 0.1, 0, 0.0000001]
        })
    }

    dataset_1 = RepertoireDataset(encoded_data=EncodedData(**encoded_data_1),
                                  repertoires=[Repertoire("1.npy", None, "1"),
                                               Repertoire("2.npy", None, "2"),
                                               Repertoire("3.npy", None, "3")])

    encoded_data_2 = {
        'examples': sparse.csr_matrix(np.array([[1, 2, 3, 4, 5],
                                                [0, 0, 0, 1, 1],
                                                [1, 1, 0, 0, 0],
                                                [90, 10, 1, 3, 4],
                                                [0, 1, 1, 100, 200]])),
        'example_ids': ['rep1', 'rep2', 'rep3', 'rep4', 'rep5'],
        'labels': {
            "diabetes": ['diabetes pos', 'diabetes neg', 'diabetes neg', 'diabetes pos', 'diabetes pos'],
            "celiac": ['celiac pos', 'celiac pos', 'celiac pos', 'celiac neg', 'celiac pos'],
            "cmv": ['cmv pos', 'cmv neg', 'cmv pos', 'cmv pos', 'cmv neg']
        },
        'feature_names': ['a', 'b', 'c', 'd', 'e'],
        'feature_annotations': pd.DataFrame({
            "specificity": ["cmv", "ebv", "cmv", "gluten", "gluten"],
            "something": ["a", "b", "b", "a", "a"],
            "p_val": [0.01, 0.00001, 0.1, 0, 0.0000001]
        })
    }

    dataset_2 = RepertoireDataset(encoded_data=EncodedData(**encoded_data_2))

    def setUp(self) -> None:
        # Every test in this class runs with the TEST cache type selected through the environment.
        os.environ[Constants.CACHE_TYPE] = CacheType.TEST.name

    @staticmethod
    def _column_in(column_name, allowed_values):
        """Return an IN criterion that matches ``column_name`` against ``allowed_values``."""
        return {
            "type": OperationType.IN,
            "allowed_values": allowed_values,
            "value": {
                "type": DataType.COLUMN,
                "name": column_name
            }
        }

    @staticmethod
    def _celiac_and_cmv_positive():
        """Return the criteria: celiac in ["celiac pos"] AND cmv in ["cmv pos"]."""
        return {
            "type": BooleanType.AND,
            "operands": [TestDataSummarizer._column_in("celiac", ["celiac pos"]),
                         TestDataSummarizer._column_in("cmv", ["cmv pos"])]
        }

    @staticmethod
    def _gluten_or_low_p_value():
        """Return the criteria: specificity in ["gluten"] OR p_val less than 0.0001."""
        low_p_value = {
            "type": OperationType.LESS_THAN,
            "threshold": 0.0001,
            "value": {
                "type": DataType.COLUMN,
                "name": "p_val"
            }
        }
        return {
            "type": BooleanType.OR,
            "operands": [TestDataSummarizer._column_in("specificity", ["gluten"]), low_p_value]
        }

    def test_filter_repertoires(self):
        source = TestDataSummarizer.dataset_1

        filtered = DataSummarizer.filter_repertoires(source, self._celiac_and_cmv_positive())

        # 2 of the 3 repertoires are expected to remain; the 5 feature columns stay.
        self.assertEqual(2, filtered.get_example_count())
        self.assertEqual(2, filtered.encoded_data.examples.shape[0])
        self.assertEqual(5, filtered.encoded_data.examples.shape[1])

    def test_filter_features(self):
        source = TestDataSummarizer.dataset_1

        filtered = DataSummarizer.filter_features(source, self._gluten_or_low_p_value())

        # All 3 repertoires are expected to remain; 3 of the 5 features are kept.
        self.assertEqual(3, filtered.get_example_count())
        self.assertEqual(3, filtered.encoded_data.examples.shape[0])
        self.assertEqual(3, filtered.encoded_data.examples.shape[1])

    def test_annotate_repertoires(self):
        source = TestDataSummarizer.dataset_1

        annotated = DataSummarizer.annotate_repertoires(source, self._celiac_and_cmv_positive(), "annotate")

        # The shape of the examples matrix is expected to be unchanged.
        self.assertEqual(3, annotated.encoded_data.examples.shape[0])
        self.assertEqual(5, annotated.encoded_data.examples.shape[1])

    def test_annotate_features(self):
        source = TestDataSummarizer.dataset_1

        annotated = DataSummarizer.annotate_features(source, self._gluten_or_low_p_value(), "annotate")

        # The shape of the examples matrix is expected to be unchanged.
        self.assertEqual(3, annotated.encoded_data.examples.shape[0])
        self.assertEqual(5, annotated.encoded_data.examples.shape[1])

    def test_annotate_features_2(self):
        source = TestDataSummarizer.dataset_1

        # A single operation, not wrapped in a boolean AND/OR node.
        criteria = self._column_in("specificity", ["gluten"])

        annotated = DataSummarizer.annotate_features(source, criteria, "annotate")

        self.assertEqual(3, annotated.encoded_data.examples.shape[0])
        self.assertEqual(5, annotated.encoded_data.examples.shape[1])
Example #24
0
    def test_encode(self):
        """Encode a two-repertoire dataset with three KmerFrequencyEncoder configurations.

        The configurations differ in sequence encoding (IDENTITY vs CONTINUOUS_KMER)
        and normalization (RELATIVE_FREQUENCY vs BINARY); all use unique reads and
        k=3. Two entries of the resulting example matrices are compared with fixed values.
        """
        path = EnvironmentSettings.root_path + "test/tmp/kmerfreqenc/"

        PathBuilder.build(path)
        # Registered right after creation so the directory is removed even when encoding or an assertion fails.
        self.addCleanup(shutil.rmtree, path)

        rep1 = Repertoire.build_from_sequence_objects([ReceptorSequence("AAA", identifier="1"),
                                                       ReceptorSequence("ATA", identifier="2"),
                                                       ReceptorSequence("ATA", identifier='3')],
                                                      metadata={"l1": 1, "l2": 2, "subject_id": "1"}, path=path)

        rep2 = Repertoire.build_from_sequence_objects([ReceptorSequence("ATA", identifier="1"),
                                                       ReceptorSequence("TAA", identifier="2"),
                                                       ReceptorSequence("AAC", identifier="3")],
                                                      metadata={"l1": 0, "l2": 3, "subject_id": "2"}, path=path)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        identity_encoder = KmerFrequencyEncoder.build_object(dataset,
                                                             normalization_type=NormalizationType.RELATIVE_FREQUENCY.name,
                                                             reads=ReadsType.UNIQUE.name,
                                                             sequence_encoding=SequenceEncodingType.IDENTITY.name,
                                                             k=3)

        d1 = identity_encoder.encode(dataset, EncoderParams(
            result_path=path + "1/",
            label_config=lc,
            learn_model=True,
            model={},
            filename="dataset.pkl"
        ))

        kmer_encoder = KmerFrequencyEncoder.build_object(dataset,
                                                         normalization_type=NormalizationType.RELATIVE_FREQUENCY.name,
                                                         reads=ReadsType.UNIQUE.name,
                                                         sequence_encoding=SequenceEncodingType.CONTINUOUS_KMER.name,
                                                         k=3)

        d2 = kmer_encoder.encode(dataset, EncoderParams(
            result_path=path + "2/",
            label_config=lc,
            pool_size=2,
            learn_model=True,
            model={},
            filename="dataset.csv"
        ))

        binary_encoder = KmerFrequencyEncoder.build_object(dataset,
                                                           normalization_type=NormalizationType.BINARY.name,
                                                           reads=ReadsType.UNIQUE.name,
                                                           sequence_encoding=SequenceEncodingType.CONTINUOUS_KMER.name,
                                                           k=3)

        d3 = binary_encoder.encode(dataset, EncoderParams(
            result_path=path + "3/",
            label_config=lc,
            learn_model=True,
            model={},
            filename="dataset.pkl"
        ))

        self.assertIsInstance(d1, RepertoireDataset)
        self.assertIsInstance(d2, RepertoireDataset)
        self.assertEqual(0.67, np.round(d2.encoded_data.examples[0, 2], 2))
        self.assertEqual(0.0, np.round(d3.encoded_data.examples[0, 1], 2))
        self.assertIsInstance(kmer_encoder, KmerFrequencyEncoder)
    def test_encode(self):
        """Encode two repertoires with EvennessProfileEncoder and check the first two profile values.

        The first repertoire holds 3000 sequences with counts 10, 100 and 1
        (1000 of each); the second holds 1000 sequences that all have count 10.
        """
        path = EnvironmentSettings.root_path + "test/tmp/evennessenc/"

        PathBuilder.build(path)
        # Registered right after creation so the directory is removed even when encoding or an assertion fails.
        self.addCleanup(shutil.rmtree, path)

        # Same order as three concatenated lists: 1000 x count 10, then 1000 x count 100, then 1000 x count 1.
        mixed_count_sequences = [ReceptorSequence("AAA", metadata=SequenceMetadata(count=count))
                                 for count in (10, 100, 1) for _ in range(1000)]
        rep1 = Repertoire.build_from_sequence_objects(sequence_objects=mixed_count_sequences,
                                                      metadata={"l1": "test_1", "l2": 2},
                                                      path=path)

        equal_count_sequences = [ReceptorSequence("AAA", metadata=SequenceMetadata(count=10)) for _ in range(1000)]
        rep2 = Repertoire.build_from_sequence_objects(sequence_objects=equal_count_sequences,
                                                      metadata={"l1": "test_2", "l2": 3},
                                                      path=path)

        lc = LabelConfiguration()
        lc.add_label("l1", ["test_1", "test_2"])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        encoder = EvennessProfileEncoder.build_object(dataset, min_alpha=0, max_alpha=10, dimension=51)

        d1 = encoder.encode(dataset, EncoderParams(result_path=path + "1/", label_config=lc))

        encoder = EvennessProfileEncoder.build_object(dataset, min_alpha=0, max_alpha=10, dimension=11)

        # The result of this second run (pool_size=2, dimension=11) is not inspected; the call only has to complete.
        encoder.encode(dataset, EncoderParams(result_path=path, label_config=lc, pool_size=2))

        self.assertAlmostEqual(d1.encoded_data.examples[0, 0], 1)
        self.assertAlmostEqual(d1.encoded_data.examples[0, 1], 0.786444)
        self.assertAlmostEqual(d1.encoded_data.examples[1, 0], 1)
        self.assertAlmostEqual(d1.encoded_data.examples[1, 1], 1)
Example #26
0
    def test_process(self):
        """Run DuplicateSequenceFilter on a 7-sequence repertoire in two configurations.

        First the filter is set to AMINO_ACID with SUM count aggregation, then to
        NUCLEOTIDE with MIN count aggregation; after each run the identifiers,
        sequences and counts of the reduced repertoire are compared with fixed
        expected lists.
        """
        path = EnvironmentSettings.root_path + "test/tmp/duplicatesequencefilter/"
        PathBuilder.build(path)
        # Registered right after creation so the directory is removed even when an assertion fails.
        self.addCleanup(shutil.rmtree, path)

        repertoire = Repertoire.build(
            sequence_aas=["AAA", "AAA", "CCC", "AAA", "CCC", "CCC", "CCC"],
            sequences=["ntAAA", "ntBBB", "ntCCC", "ntAAA", "ntCCC", "ntCCC", "ntDDD"],
            v_genes=["v1"] * 7,
            j_genes=["j1"] * 7,
            # Only the last sequence is on a different chain.
            chains=[Chain.ALPHA] * 6 + [Chain.BETA],
            # One count is deliberately missing (None).
            counts=[10, 20, 30, 5, 20, None, 40],
            region_types=["IMGT_CDR3"] * 7,
            custom_lists={
                "custom1": ["yes"] * 3 + ["no"] * 4,
                "custom2": ["yes"] * 3 + ["no"] * 4
            },
            sequence_identifiers=[1, 2, 3, 4, 5, 6, 7],
            path=path)
        dataset = RepertoireDataset(repertoires=[repertoire])

        # collapse by amino acids & use sum counts
        aa_filter = DuplicateSequenceFilter(filter_sequence_type=SequenceType.AMINO_ACID,
                                            count_agg=CountAggregationFunction.SUM,
                                            batch_size=4)

        reduced_repertoire = aa_filter.process_dataset(dataset=dataset, result_path=path).repertoires[0]

        attr = reduced_repertoire.get_attributes(["sequence_identifiers", "sequence_aas", "sequences", "counts", "chains"])

        self.assertEqual(3, len(reduced_repertoire.get_sequence_identifiers()))
        self.assertListEqual(["AAA", "CCC", "CCC"], list(attr["sequence_aas"]))
        self.assertListEqual(["ntAAA", "ntCCC", "ntDDD"], list(attr["sequences"]))
        self.assertListEqual([35, 50, 40], list(attr["counts"]))
        self.assertListEqual([1, 3, 7], list(attr["sequence_identifiers"]))
        self.assertListEqual([Chain.get_chain("A"), Chain.get_chain("A"), Chain.get_chain('B')],
                             list(attr["chains"]))

        # collapse by nucleotides & use min counts
        nt_filter = DuplicateSequenceFilter(filter_sequence_type=SequenceType.NUCLEOTIDE,
                                            count_agg=CountAggregationFunction.MIN,
                                            batch_size=4)

        reduced_repertoire = nt_filter.process_dataset(dataset=dataset, result_path=path).repertoires[0]

        attr = reduced_repertoire.get_attributes(["sequence_identifiers", "sequence_aas", "sequences", "counts"])

        self.assertEqual(4, len(reduced_repertoire.get_sequence_identifiers()))
        self.assertListEqual([1, 2, 3, 7], list(attr["sequence_identifiers"]))
        self.assertListEqual(["AAA", "AAA", "CCC", "CCC"], list(attr["sequence_aas"]))
        self.assertListEqual(["ntAAA", "ntBBB", "ntCCC", "ntDDD"], list(attr["sequences"]))
        self.assertListEqual([5, 20, 20, 40], list(attr["counts"]))
Example #27
0
    def test_run(self):
        """Run DataSplitter with the RANDOM, LOOCV and K_FOLD strategies on an 8-repertoire dataset.

        For each strategy the number of splits and the sizes of the first
        train/test pair are checked. The RANDOM split is run twice with the
        same parameters and must return the same repertoire ids for the first
        training set.
        """
        dataset = RepertoireDataset(repertoires=[Repertoire("0.npy", "", str(index)) for index in range(8)])

        base_path = EnvironmentSettings.root_path + "test/tmp/datasplitter/"

        def build_split_paths(count):
            # Create one directory per split and return the list of their paths.
            split_paths = [base_path + "split_{}".format(i) for i in range(count)]
            for split_path in split_paths:
                PathBuilder.build(split_path)
            return split_paths

        paths = build_split_paths(5)
        # Registered once the directory exists so it is removed even when an assertion fails.
        self.addCleanup(shutil.rmtree, base_path)

        metadata_path = base_path + "metadata.csv"
        df = pd.DataFrame(data={
            "key1": [0, 0, 1, 1, 1, 2, 2, 0],
            "filename": [0, 1, 2, 3, 4, 5, 6, 7]
        })
        df.to_csv(metadata_path)

        dataset.metadata_file = metadata_path

        training_percentage = 0.7

        # --- random split, 5 repetitions ---
        trains, tests = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               training_percentage=training_percentage,
                               split_strategy=SplitType.RANDOM,
                               split_count=5,
                               paths=paths))

        self.assertIsInstance(trains[0], RepertoireDataset)
        self.assertIsInstance(tests[0], RepertoireDataset)
        self.assertEqual(len(trains[0].get_data()), 5)
        self.assertEqual(len(tests[0].get_data()), 3)
        self.assertEqual(5, len(trains))
        self.assertEqual(5, len(tests))
        self.assertEqual(5, len(trains[0].repertoires))

        # Same parameters again: the first training set must contain the same repertoires.
        trains2, tests2 = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               training_percentage=training_percentage,
                               split_strategy=SplitType.RANDOM,
                               split_count=5,
                               paths=paths))

        self.assertEqual(trains[0].get_repertoire_ids(), trains2[0].get_repertoire_ids())

        # --- leave-one-out: one path per example ---
        paths = build_split_paths(dataset.get_example_count())

        trains, tests = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               split_strategy=SplitType.LOOCV,
                               split_count=-1,
                               training_percentage=-1,
                               paths=paths))

        self.assertIsInstance(trains[0], RepertoireDataset)
        self.assertIsInstance(tests[0], RepertoireDataset)
        self.assertEqual(len(trains[0].get_data()), 7)
        self.assertEqual(len(tests[0].get_data()), 1)
        self.assertEqual(8, len(trains))
        self.assertEqual(8, len(tests))

        # --- 5-fold split ---
        paths = build_split_paths(5)

        trains, tests = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               split_strategy=SplitType.K_FOLD,
                               split_count=5,
                               training_percentage=-1,
                               paths=paths))

        self.assertIsInstance(trains[0], RepertoireDataset)
        self.assertIsInstance(tests[0], RepertoireDataset)
        self.assertEqual(len(trains[0].get_data()), 6)
        self.assertEqual(len(tests[0].get_data()), 2)
        self.assertEqual(5, len(trains))
        self.assertEqual(5, len(tests))
Example #28
0
    def test_repertoire(self):
        """Build a Repertoire from two sequences and check its attributes, metadata and rebuilt sequences.

        The attribute getters are compared with the values the sequences were
        created with, ``obj.sequences`` must return ReceptorSequence objects
        carrying the same identifiers and custom parameters, and finally
        ``free_memory()`` is called on the repertoire.
        """
        path = EnvironmentSettings.tmp_test_path + "sequencerepertoire/"
        PathBuilder.build(path)
        # Registered right after creation so the directory is removed even when an assertion fails.
        self.addCleanup(shutil.rmtree, path)

        alpha_sequence = ReceptorSequence(amino_acid_sequence="AAA",
                                          identifier="1",
                                          metadata=SequenceMetadata(v_gene="V1",
                                                                    cell_id="1",
                                                                    chain=Chain.ALPHA,
                                                                    custom_params={"cmv": "no", "coeliac": False}))
        beta_sequence = ReceptorSequence(amino_acid_sequence="CCC",
                                         identifier="2",
                                         metadata=SequenceMetadata(j_gene="J1",
                                                                   cell_id="1",
                                                                   chain=Chain.BETA,
                                                                   custom_params={"cmv": "yes", "coeliac": True}))
        sequences = [alpha_sequence, beta_sequence]

        obj = Repertoire.build_from_sequence_objects(sequences, path, {"cmv": "yes", 'subject_id': "1"})

        self.assertTrue(os.path.isfile(obj.data_filename))
        self.assertIsInstance(obj, Repertoire)

        # Each sequence sets only one of v_gene / j_gene, so the other one is expected to be None.
        self.assertTrue(np.array_equal(np.array(["1", "2"]), obj.get_sequence_identifiers()))
        self.assertTrue(np.array_equal(np.array(["AAA", "CCC"]), obj.get_sequence_aas()))
        self.assertTrue(np.array_equal(np.array(["V1", None]), obj.get_v_genes()))
        self.assertTrue(np.array_equal(np.array([None, "J1"]), obj.get_j_genes()))
        self.assertTrue(np.array_equal(np.array(["no", "yes"]), obj.get_attribute("cmv")))
        self.assertTrue(np.array_equal(np.array([False, True]), obj.get_attribute("coeliac")))

        # Repertoire-level metadata, as opposed to the per-sequence custom params above.
        self.assertEqual("yes", obj.metadata["cmv"])
        self.assertEqual("1", obj.metadata["subject_id"])

        rebuilt_sequences = obj.sequences

        self.assertTrue(all(isinstance(seq, ReceptorSequence) for seq in rebuilt_sequences))
        self.assertEqual(2, len(rebuilt_sequences))
        self.assertEqual("1", rebuilt_sequences[0].identifier)
        self.assertEqual("2", rebuilt_sequences[1].identifier)
        self.assertEqual("AAA", rebuilt_sequences[0].amino_acid_sequence)
        self.assertEqual("yes", rebuilt_sequences[1].metadata.custom_params["cmv"])

        obj.free_memory()

        # NOTE(review): both calls below pass a generator expression to assertTrue. A generator
        # object is always truthy and is never iterated here, so these two checks cannot fail.
        # Wrapping them in all(...) looks like the intent, but whether that would pass depends on
        # what free_memory() leaves in obj.data (e.g. None vs. a dict of None values) - confirm
        # against Repertoire.free_memory before tightening.
        self.assertTrue(key in obj.data for key in Repertoire.FIELDS)
        self.assertTrue(obj.data[key] is None for key in Repertoire.FIELDS)
 def store_repertoire(path, repertoire, sequences):
     """Build and return a new Repertoire from ``sequences`` under ``path``, reusing ``repertoire.metadata``."""
     return Repertoire.build_from_sequence_objects(sequences, path, repertoire.metadata)