def _create_dummy_data(self, path, dataset_type):
        """Build a small dummy dataset of the requested kind.

        A five-sequence repertoire is always built with ``path`` as its
        location. Depending on ``dataset_type`` the result is:

        * ``"receptor"``   - a ReceptorDataset built from the repertoire's
          receptors, with identifier ``"receptor_dataset"``;
        * ``"repertoire"`` - a RepertoireDataset wrapping the repertoire,
          whose identifier is set to ``"repertoire_dataset"``;
        * anything else    - ``None``.
        """
        PathBuilder.build(path)

        sequence_count = 5
        test_repertoire = Repertoire.build(
            sequence_aas=["DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"],
            v_genes=["V1-1"] * sequence_count,
            j_genes=["J1-1"] * sequence_count,
            chains=[Chain.ALPHA, Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.BETA],
            custom_lists={
                "custom_1": [f"CUST-{position}" for position in range(sequence_count)],
                "custom_2": ["CUST-A"] * 3 + ["CUST-B"] * 2,
            },
            cell_ids=["1", "1", "1", "2", "2"],
            path=path)

        if dataset_type == "receptor":
            receptor_dataset = ReceptorDataset.build_from_objects(
                test_repertoire.receptors, 100, path, name="receptor_dataset")
            receptor_dataset.identifier = "receptor_dataset"
            return receptor_dataset

        if dataset_type == "repertoire":
            test_repertoire.identifier = "repertoire_dataset"
            return RepertoireDataset(repertoires=[test_repertoire])

        # Unknown dataset types yield no dataset.
        return None
Esempio n. 2
0
    def create_dummy_receptordataset(self, path):
        """Build a ReceptorDataset holding two fully annotated TCAB receptors.

        The dataset is created in the ``receptors`` subdirectory of ``path``
        (which is built first) with a file size of 2. Receptor "1" carries the
        custom parameter ``custom1`` on both chains, receptor "2" carries
        ``custom2``.
        """

        def make_sequence(amino_acids, sequence_id, chain, v_gene, j_gene, d_call, custom_key):
            # One chain with its gene annotation and per-sequence custom params.
            annotation = SequenceMetadata(v_gene=v_gene, j_gene=j_gene,
                                          chain=chain,
                                          frame_type="IN",
                                          custom_params={"d_call": d_call,
                                                         custom_key: "cust1"})
            return ReceptorSequence(amino_acid_sequence=amino_acids, identifier=sequence_id,
                                    metadata=annotation)

        def make_receptor(receptor_id, alpha_spec, beta_spec, custom_key):
            # alpha_spec / beta_spec are (amino acid sequence, sequence identifier) pairs.
            alpha_chain = make_sequence(alpha_spec[0], alpha_spec[1], Chain.ALPHA,
                                        "TRAV1", "TRAJ1", "TRAD1", custom_key)
            beta_chain = make_sequence(beta_spec[0], beta_spec[1], Chain.BETA,
                                       "TRBV1", "TRBJ1", "TRBD1", custom_key)
            return TCABReceptor(identifier=receptor_id, alpha=alpha_chain, beta=beta_chain)

        receptors = [
            make_receptor("1", ("AAATTT", "1a"), ("ATATAT", "1b"), "custom1"),
            make_receptor("2", ("AAAAAA", "2a"), ("AAAAAA", "2b"), "custom2"),
        ]

        receptors_path = path / "receptors"
        PathBuilder.build(receptors_path)
        return ReceptorDataset.build_from_objects(receptors, 2, receptors_path)
    def test(self):
        """Encode four paired receptors with 3-mer frequencies and check the result.

        Receptors "1" and "3" have identical chains, so their encoded rows
        must be equal. The temporary directory is removed at the end.
        """
        # (identifier, alpha amino acids, beta amino acids)
        receptor_specs = [
            ("1", "AAACCC", "AAACCC"),
            ("2", "AAA", "CCC"),
            ("3", "AAACCC", "AAACCC"),
            ("4", "AAA", "CCC"),
        ]
        receptors = [
            TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence=alpha_aa),
                         beta=ReceptorSequence(amino_acid_sequence=beta_aa),
                         identifier=receptor_id)
            for receptor_id, alpha_aa, beta_aa in receptor_specs
        ]

        path = EnvironmentSettings.tmp_test_path / "kmer_receptor_frequency/"
        PathBuilder.build(path / 'data')
        dataset = ReceptorDataset.build_from_objects(receptors, path=path, file_size=10)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        encoder = KmerFreqReceptorEncoder.build_object(
            dataset,
            normalization_type=NormalizationType.RELATIVE_FREQUENCY.name,
            reads=ReadsType.UNIQUE.name,
            sequence_encoding=SequenceEncodingType.CONTINUOUS_KMER.name,
            sequence_type=SequenceType.AMINO_ACID.name,
            k=3)

        encoder_params = EncoderParams(result_path=path / "2/",
                                       label_config=lc,
                                       pool_size=2,
                                       learn_model=True,
                                       model={},
                                       filename="dataset.csv",
                                       encode_labels=False)
        encoded_dataset = encoder.encode(dataset, encoder_params)

        # One encoded row per receptor.
        self.assertEqual(4, encoded_dataset.encoded_data.examples.shape[0])

        expected_ids = ['1', '2', '3', '4']
        self.assertTrue(
            all(identifier in encoded_dataset.encoded_data.example_ids
                for identifier in expected_ids))

        # Receptors "1" and "3" share both chains, hence identical encodings.
        first_row = encoded_dataset.encoded_data.examples[0].A
        third_row = encoded_dataset.encoded_data.examples[2].A
        self.assertTrue(numpy.array_equal(first_row, third_row))

        expected_features = ["alpha_AAA", "alpha_AAC", "beta_CCC"]
        self.assertTrue(
            all(feature_name in encoded_dataset.encoded_data.feature_names
                for feature_name in expected_features))

        shutil.rmtree(path)
Esempio n. 4
0
    def _implant_signals_in_receptors(
            simulation_state: SimulationState) -> Dataset:
        """Implant signals into receptors and wrap the result in a new ReceptorDataset.

        The new dataset reuses the source dataset's file size and name and is
        built at ``simulation_state.result_path``. Its labels are the source
        dataset's labels (if any) extended with one ``[True, False]`` label per
        signal, keyed by the signal id; signal labels win on key collisions.
        """
        source_dataset = simulation_state.dataset

        processed_receptors = SignalImplanter._implant_signals(
            simulation_state, SignalImplanter._process_receptor, None)

        processed_dataset = ReceptorDataset.build_from_objects(
            receptors=processed_receptors,
            file_size=source_dataset.file_size,
            name=source_dataset.name,
            path=simulation_state.result_path)

        # A source dataset without labels contributes nothing to the merge.
        inherited_labels = source_dataset.labels
        if inherited_labels is None:
            inherited_labels = {}
        signal_labels = {signal.id: [True, False] for signal in simulation_state.signals}

        processed_dataset.labels = {**inherited_labels, **signal_labels}
        return processed_dataset
Esempio n. 5
0
    def _construct_test_dataset(self, path, dataset_size: int = 50):
        """Return a two-receptor ReceptorDataset and a matching label configuration.

        Both receptors carry the label ``l1`` (values 1 and 2). The dataset is
        built under ``path`` with a file size of 2. ``dataset_size`` is
        accepted but not used by this method.
        """
        first_receptor = TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAAA"),
                                      beta=ReceptorSequence(amino_acid_sequence="ATA"),
                                      metadata={"l1": 1},
                                      identifier="1")
        second_receptor = TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="ATA"),
                                       beta=ReceptorSequence(amino_acid_sequence="ATT"),
                                       metadata={"l1": 2},
                                       identifier="2")

        PathBuilder.build(path)

        label_config = LabelConfiguration()
        label_config.add_label("l1", [1, 2])

        dataset = ReceptorDataset.build_from_objects([first_receptor, second_receptor], 2, path)
        return dataset, label_config
Esempio n. 6
0
    def construct_test_flatten_dataset(self, path):
        """Build a three-receptor ReceptorDataset in which receptor "2" appears twice.

        Every receptor has identified alpha and beta chains and an ``l1``
        label. The dataset is built at ``path`` with a file size of 10.
        """
        # (receptor id, label, (alpha aa, alpha id), (beta aa, beta id));
        # the last two entries are intentionally identical.
        receptor_specs = [
            ("1", 1, ("AAATTT", "1a"), ("ATATAT", "1b")),
            ("2", 2, ("AAAAAA", "2a"), ("AAAAAA", "2b")),
            ("2", 2, ("AAAAAA", "2a"), ("AAAAAA", "2b")),
        ]

        receptors = []
        for receptor_id, label, (alpha_aa, alpha_id), (beta_aa, beta_id) in receptor_specs:
            alpha_chain = ReceptorSequence(amino_acid_sequence=alpha_aa, identifier=alpha_id)
            beta_chain = ReceptorSequence(amino_acid_sequence=beta_aa, identifier=beta_id)
            receptors.append(TCABReceptor(alpha=alpha_chain,
                                          beta=beta_chain,
                                          metadata={"l1": label},
                                          identifier=receptor_id))

        return ReceptorDataset.build_from_objects(receptors, 10, path)
Esempio n. 7
0
    def test_split_dataset(self):
        """Check leave-one-out splitting of ten receptors over three subjects.

        Receptors are tagged with ``subject`` = 0, 1 or 2. Split ``i`` must
        hold only subject ``i`` in its test set and no subject ``i`` in its
        training set. The temporary directory is removed at the end.
        """
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path /
                                 "leave_one_out_splitter/")

        receptors = [
            TCABReceptor(ReceptorSequence(), ReceptorSequence(), {"subject": index % 3})
            for index in range(10)
        ]
        dataset = ReceptorDataset.build_from_objects(receptors, 100, path, 'd1')

        result_paths = [path / f"result_{split_no}/" for split_no in range(1, 4)]
        split_config = SplitConfig(
            SplitType.LEAVE_ONE_OUT_STRATIFICATION,
            split_count=3,
            leave_one_out_config=LeaveOneOutConfig("subject", 1))
        params = DataSplitterParams(dataset,
                                    SplitType.LEAVE_ONE_OUT_STRATIFICATION,
                                    3,
                                    paths=result_paths,
                                    split_config=split_config)

        train_datasets, test_datasets = LeaveOneOutSplitter.split_dataset(params)

        self.assertEqual(3, len(train_datasets))
        self.assertEqual(3, len(test_datasets))

        for subject in range(3):
            held_out = test_datasets[subject].get_data()
            self.assertTrue(
                all(receptor.metadata["subject"] == subject for receptor in held_out))

            remaining = train_datasets[subject].get_data()
            self.assertTrue(
                all(receptor.metadata["subject"] != subject for receptor in remaining))

        shutil.rmtree(path)
Esempio n. 8
0
    def _import_from_files(
            filenames: List[str],
            generic_params: DatasetImportParams) -> ReceptorDataset:
        """Import paired-chain receptors from delimited files into a ReceptorDataset.

        Each file is read with ``generic_params.separator`` and
        ``generic_params.columns_to_load``, cleaned, renamed through
        ``generic_params.column_mapping`` and turned into one receptor per row.

        Args:
            filenames: paths of the files to read.
            generic_params: import settings; this method reads ``separator``,
                ``columns_to_load``, ``column_mapping``, ``receptor_chains``,
                ``region_type``, ``sequence_file_size`` and ``result_path``.

        Returns:
            A ReceptorDataset built from all imported receptors, using
            ``generic_params.sequence_file_size`` and
            ``generic_params.result_path``.
        """
        # (column, n): n characters are trimmed from each end of the column.
        trimmed_columns = (
            ("alpha_amino_acid_sequence", 1),
            ("beta_amino_acid_sequence", 1),
            ("alpha_nucleotide_sequence", 3),
            ("beta_nucleotide_sequence", 3),
        )

        elements = []

        for file in filenames:
            df = pd.read_csv(file,
                             sep=generic_params.separator,
                             usecols=generic_params.columns_to_load)
            # dropna() / drop_duplicates() return new frames; the results were
            # previously discarded, so incomplete and duplicated rows were
            # never actually removed.
            df = df.dropna().drop_duplicates()
            df.rename(columns=generic_params.column_mapping, inplace=True)

            for column, trim in trimmed_columns:
                if column in df:
                    df[column] = df[column].str[trim:-trim]

            chain_vals = list(generic_params.receptor_chains.value)
            chain_names = [Chain.get_chain(ch).name.lower() for ch in chain_vals]

            for chain_name in chain_names:
                df = SingleLineReceptorImport.make_gene_columns(
                    df, ["v", "j"], chain_name)

            # Columns whose name contains any of these fragments are consumed
            # by the sequence objects and are kept out of receptor metadata.
            excluded_fragments = ["v_gene", 'j_gene', "count", "identifier"] + chain_names

            for _, row in df.iterrows():
                sequences = {}
                for chain_val, name in zip(chain_vals, chain_names):
                    aa_column = name + "_amino_acid_sequence"
                    nt_column = name + "_nucleotide_sequence"
                    sequences[chain_val] = ReceptorSequence(
                        amino_acid_sequence=row[aa_column] if aa_column in row else None,
                        nucleotide_sequence=row[nt_column] if nt_column in row else None,
                        metadata=SequenceMetadata(
                            v_gene=row[f"{name}_v_gene"],
                            v_allele=row[f"{name}_v_allele"],
                            v_subgroup=row[f'{name}_v_subgroup'],
                            j_gene=row[f"{name}_j_gene"],
                            j_allele=row[f"{name}_j_allele"],
                            j_subgroup=row[f'{name}_j_subgroup'],
                            chain=name,
                            count=row["count"],
                            region_type=generic_params.region_type.value))

                receptor_metadata = {
                    key: row[key]
                    for key in row.keys()
                    if all(fragment not in key for fragment in excluded_fragments)
                }

                elements.append(
                    ReceptorBuilder.build_object(
                        sequences, row["identifier"], receptor_metadata))

        return ReceptorDataset.build_from_objects(
            elements, generic_params.sequence_file_size,
            generic_params.result_path)