def create_dummy_sequencedataset(self, path):
        """
        Builds a small SequenceDataset fixture made of one alpha-chain and two beta-chain sequences.

        The two beta-chain sequences are identical apart from their identifiers ("1b" and "2b"). The
        sequences are passed to SequenceDataset.build together with the value 2 (presumably the number
        of sequences per file - confirm against SequenceDataset.build) and the location "<path>sequences".

        :param path: prefix that is prepended verbatim to "sequences"; no separator is inserted here
        :return: the object returned by SequenceDataset.build
        """
        # one row per sequence:
        # amino acids, identifier, chain, V gene, J gene, d_call value, name of the extra custom field
        sequence_specs = [
            ("AAATTT", "1a", Chain.ALPHA, "TRAV1", "TRAJ1", "TRAD1", "custom1"),
            ("ATATAT", "1b", Chain.BETA, "TRBV1", "TRBJ1", "TRBD1", "custom2"),
            ("ATATAT", "2b", Chain.BETA, "TRBV1", "TRBJ1", "TRBD1", "custom2"),
        ]

        sequences = []
        for amino_acids, seq_id, chain, v_gene, j_gene, d_call, custom_key in sequence_specs:
            seq_metadata = SequenceMetadata(v_gene=v_gene,
                                            j_gene=j_gene,
                                            chain=chain,
                                            frame_type="IN",
                                            custom_params={"d_call": d_call, custom_key: "cust1"})
            sequences.append(ReceptorSequence(amino_acid_sequence=amino_acids,
                                              identifier=seq_id,
                                              metadata=seq_metadata))

        target = "{}sequences".format(path)
        return SequenceDataset.build(sequences, 2, target)
Exemple #2
0
    def test_encode_sequence(self):
        """
        Checks that IdentitySequenceEncoder.encode_sequence returns ["AAA"] for the amino acid
        sequence "AAA" for each of the frame types "OUT", "STOP" and "IN".
        """
        for frame_type in ("OUT", "STOP", "IN"):
            sequence = ReceptorSequence(amino_acid_sequence="AAA",
                                        metadata=SequenceMetadata(frame_type=frame_type))

            # a fresh encoder and fresh encoder parameters are used for every frame type
            encoder = IdentitySequenceEncoder()
            params = EncoderParams(model={},
                                   label_config=LabelConfiguration(),
                                   result_path="")

            encoded = encoder.encode_sequence(sequence, params)
            self.assertEqual(["AAA"], encoded)
Exemple #3
0
    def create_dataset(self, path, dataset_size: int = 50):
        """
        Pickles `dataset_size` dummy sequences to "<path>sequences.pkl" and wraps the file in a SequenceDataset.

        Sequences at even positions are "AAACCC" with custom parameter l1 = 1, sequences at odd positions
        are "ACACAC" with l1 = 2; the identifier of each sequence is its position as a string.

        :param path: directory prefix; it is passed to PathBuilder.build and prepended verbatim to the file name
        :param dataset_size: number of sequences to generate
        :return: SequenceDataset with identifier "d1" referring to the single pickle file
        """
        sequences = [
            ReceptorSequence(amino_acid_sequence="AAACCC" if index % 2 == 0 else "ACACAC",
                             identifier=str(index),
                             metadata=SequenceMetadata(custom_params={"l1": 1 if index % 2 == 0 else 2}))
            for index in range(dataset_size)
        ]

        PathBuilder.build(path)
        filename = f"{path}sequences.pkl"
        with open(filename, "wb") as pickle_file:
            pickle.dump(sequences, pickle_file)

        # the label configuration is created here but is not passed to the dataset below
        label_config = LabelConfiguration()
        label_config.add_label("l1", [1, 2])

        return SequenceDataset(params={"l1": [1, 2]},
                               filenames=[filename],
                               identifier="d1")
    def create_dummy_receptordataset(self, path):
        """
        Builds a ReceptorDataset fixture holding two TCABReceptor objects (identifiers "1" and "2").

        Both chains of receptor "1" carry the custom field "custom1", both chains of receptor "2" carry
        "custom2". The receptors are passed to ReceptorDataset.build together with the value 2 (presumably
        the number of receptors per file - confirm against ReceptorDataset.build) and "<path>receptors".

        :param path: prefix that is prepended verbatim to "receptors"
        :return: the object returned by ReceptorDataset.build
        """

        def make_chain(amino_acids, seq_id, chain, v_gene, j_gene, d_call, custom_key):
            # every chain is in frame and has the value "cust1" stored under its custom key
            chain_metadata = SequenceMetadata(v_gene=v_gene,
                                              j_gene=j_gene,
                                              chain=chain,
                                              frame_type="IN",
                                              custom_params={"d_call": d_call, custom_key: "cust1"})
            return ReceptorSequence(amino_acid_sequence=amino_acids,
                                    identifier=seq_id,
                                    metadata=chain_metadata)

        # receptor identifier, (alpha amino acids, alpha id), (beta amino acids, beta id), custom field name
        receptor_specs = [
            ("1", ("AAATTT", "1a"), ("ATATAT", "1b"), "custom1"),
            ("2", ("AAAAAA", "2a"), ("AAAAAA", "2b"), "custom2"),
        ]

        receptors = []
        for receptor_id, (alpha_aa, alpha_id), (beta_aa, beta_id), custom_key in receptor_specs:
            alpha_chain = make_chain(alpha_aa, alpha_id, Chain.ALPHA, "TRAV1", "TRAJ1", "TRAD1", custom_key)
            beta_chain = make_chain(beta_aa, beta_id, Chain.BETA, "TRBV1", "TRBJ1", "TRBD1", custom_key)
            receptors.append(TCABReceptor(identifier=receptor_id, alpha=alpha_chain, beta=beta_chain))

        target = "{}receptors".format(path)
        return ReceptorDataset.build(receptors, 2, target)
    def test_process(self):
        """
        Checks ChainRepertoireFilter.process on a dataset of two single-sequence repertoires
        (chain "A" with sequence "AAA", chain "B" with sequence "AAC").

        Keeping chain "ALPHA" must leave one repertoire (the "AAA" one, with CD == 1) in the result while
        the input dataset still has both; asking for chain "GAMMA" must raise AssertionError.
        """
        path = EnvironmentSettings.root_path + "test/tmp/chain_filter/"
        PathBuilder.build(path)

        # (amino acid sequence, chain, identifier) of the only sequence in each repertoire
        repertoire_specs = (("AAA", "A", "1"), ("AAC", "B", "2"))
        repertoires = [
            Repertoire.build_from_sequence_objects(
                [ReceptorSequence(amino_acids, metadata=SequenceMetadata(chain=chain), identifier=identifier)],
                path=path,
                metadata={})
            for amino_acids, chain, identifier in repertoire_specs
        ]

        metadata_file = path + "metadata.csv"
        pd.DataFrame({"CD": [1, 0]}).to_csv(metadata_file)

        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata_file)

        filtered = ChainRepertoireFilter.process(dataset, {"keep_chain": "ALPHA",
                                                           "result_path": path + "results/"})

        # the filtered dataset shrinks, the input dataset is left with both repertoires
        self.assertEqual(1, len(filtered.get_data()))
        self.assertEqual(2, len(dataset.get_data()))

        cd_values = filtered.get_metadata(["CD"])
        self.assertEqual(1, len(cd_values["CD"]))
        self.assertEqual(1, cd_values["CD"][0])

        for kept_repertoire in filtered.get_data():
            self.assertEqual("AAA", kept_repertoire.sequences[0].get_sequence())

        gamma_params = {"keep_chain": "GAMMA", "result_path": path + "results/"}
        with self.assertRaises(AssertionError):
            ChainRepertoireFilter.process(dataset, gamma_params)

        shutil.rmtree(path)
Exemple #6
0
    def _construct_test_dataset(self, path):
        """
        Pickles three dummy sequences to "<path>sequences.pkl" and returns a dataset plus a label configuration.

        Each sequence carries the custom parameters "l1" and "l2". The directory is not created here, so
        `path` has to exist already.

        :param path: prefix that is prepended verbatim to "sequences.pkl"
        :return: tuple (SequenceDataset with identifier "d1", LabelConfiguration with labels "l1" and "l2")
        """
        # amino acids, identifier, value of l1, value of l2
        sequence_specs = (("AAAA", "1", 1, 1), ("ATA", "2", 2, 1), ("ATT", "3", 1, 2))
        sequences = [ReceptorSequence(amino_acid_sequence=amino_acids,
                                      identifier=seq_id,
                                      metadata=SequenceMetadata(custom_params={"l1": l1, "l2": l2}))
                     for amino_acids, seq_id, l1, l2 in sequence_specs]

        filename = f"{path}sequences.pkl"
        with open(filename, "wb") as pickle_file:
            pickle.dump(sequences, pickle_file)

        label_config = LabelConfiguration()
        for label_name in ("l1", "l2"):
            label_config.add_label(label_name, [1, 2])

        # only "l1" is listed in the dataset params, although the label configuration also holds "l2"
        dataset = SequenceDataset(params={"l1": [1, 2]}, filenames=[filename], identifier="d1")

        return dataset, label_config
    def build(sequences: list, path: str, labels: dict = None, seq_metadata: list = None, subject_ids: list = None):
        """
        Builds one repertoire per entry of `sequences` under "<path>repertoires/" and writes "<path>metadata.csv".

        :param sequences: list of lists; every inner list holds the amino acid sequences of one repertoire
        :param path: output location; it is used as a plain string prefix for "repertoires/" and "metadata.csv"
        :param labels: optional dict mapping a label name to a list with one value per repertoire; the values
            are stored in the repertoire metadata and as columns of metadata.csv
        :param seq_metadata: optional list of lists of dicts, parallel to `sequences`; each dict is expanded
            into the keyword arguments of SequenceMetadata. If omitted, fixed TRB placeholder metadata is used
        :param subject_ids: optional list with one subject id per repertoire; if omitted, ids of the form
            "rep_<index>" are generated
        :return: tuple (list of repertoires, path of the written metadata.csv)
        :raises AssertionError: if the lengths of subject_ids or seq_metadata do not match `sequences`
        """
        if subject_ids is not None:
            assert len(subject_ids) == len(sequences)

        if seq_metadata is not None:
            assert len(sequences) == len(seq_metadata)
            for rep_position, rep_sequence_list in enumerate(sequences):
                assert len(rep_sequence_list) == len(seq_metadata[rep_position])

        PathBuilder.build(path)
        rep_path = PathBuilder.build(path + "repertoires/")

        # metadata given to every sequence when the caller supplies no seq_metadata
        default_metadata_kwargs = dict(v_subgroup="TRBV1", v_gene="TRBV1-1", v_allele="TRBV1-1*01",
                                       j_subgroup="TRBJ1", j_gene="TRBJ1-1", j_allele="TRBJ1-1*01",
                                       count=1, chain="TRB", region_type="IMGT_CDR3")

        if subject_ids is None:
            subject_ids = []

        repertoires = []
        for rep_index, sequence_list in enumerate(sequences):
            # ids are appended one repertoire at a time for as long as the list is shorter than `sequences`
            if len(subject_ids) < len(sequences):
                subject_ids.append("rep_" + str(rep_index))

            rep_sequences = ReceptorSequenceList()
            for seq_index, sequence in enumerate(sequence_list):
                if seq_metadata is None:
                    metadata_kwargs = default_metadata_kwargs
                else:
                    metadata_kwargs = seq_metadata[rep_index][seq_index]

                sequence_metadata = SequenceMetadata(**metadata_kwargs)
                rep_sequences.append(ReceptorSequence(amino_acid_sequence=sequence,
                                                      metadata=sequence_metadata,
                                                      identifier=str(seq_index)))

            repertoire_metadata = {} if labels is None else {key: labels[key][rep_index] for key in labels.keys()}
            repertoire_metadata["subject_id"] = subject_ids[rep_index]

            repertoires.append(Repertoire.build_from_sequence_objects(rep_sequences, rep_path, repertoire_metadata))

        # label columns are added last, so a label with the same name as a fixed column replaces its values
        columns = {
            "filename": [f"{repertoire.identifier}_data.npy" for repertoire in repertoires],
            "subject_id": subject_ids,
            "repertoire_identifier": [repertoire.identifier for repertoire in repertoires],
        }
        if labels is not None:
            columns.update(labels)

        metadata_path = path + "metadata.csv"
        pd.DataFrame(columns).to_csv(metadata_path, index=False)

        return repertoires, metadata_path
    def create_dummy_repertoire(self, path):
        """
        Builds a repertoire of two sequences (one beta chain, one alpha chain) and a matching metadata.csv.

        Note that the repertoire itself is built with subject_id "REP1" while the subject_id column written
        to metadata.csv contains "1".

        :param path: location passed to Repertoire.build_from_sequence_objects; it is also used as a plain
            string prefix for "metadata.csv"
        :return: tuple (repertoire, path of the written metadata.csv)
        """
        beta_metadata = SequenceMetadata(v_gene="TRBV1",
                                         j_gene="TRBJ1",
                                         chain=Chain.BETA,
                                         count=5,
                                         region_type="IMGT_CDR3",
                                         frame_type="IN",
                                         custom_params={"d_call": "TRBD1", "custom_test": "cust1"})
        beta_sequence = ReceptorSequence(amino_acid_sequence="AAA",
                                         nucleotide_sequence="GCTGCTGCT",
                                         identifier="receptor_1",
                                         metadata=beta_metadata)

        # the alpha sequence additionally has a V allele and explicitly no frame type
        alpha_metadata = SequenceMetadata(v_gene="TRAV2",
                                          v_allele="TRAV2*01",
                                          j_gene="TRAJ2",
                                          chain=Chain.ALPHA,
                                          count=15,
                                          frame_type=None,
                                          region_type="IMGT_CDR3",
                                          custom_params={"d_call": "TRAD2", "custom_test": "cust2"})
        alpha_sequence = ReceptorSequence(amino_acid_sequence="GGG",
                                          nucleotide_sequence="GGTGGTGGT",
                                          identifier="receptor_2",
                                          metadata=alpha_metadata)

        repertoire = Repertoire.build_from_sequence_objects(sequence_objects=[beta_sequence, alpha_sequence],
                                                            path=path,
                                                            metadata={"subject_id": "REP1"})

        metadata_path = path + "metadata.csv"
        metadata_table = pd.DataFrame({"filename": [f"{repertoire.identifier}_data.npy"],
                                       "subject_id": ["1"],
                                       "repertoire_identifier": [repertoire.identifier]})
        metadata_table.to_csv(metadata_path, index=False)

        return repertoire, metadata_path
    def generate_receptor_dataset(receptor_count: int, chain_1_length_probabilities: dict, chain_2_length_probabilities: dict, labels: dict,
                                  path: str):
        """
        Generates receptor_count random TCABReceptor objects, pickles them to "<path>/batch01.pickle" and returns a
        ReceptorDataset referring to that file.

        The length of each chain is sampled independently for every sequence from the corresponding
        chain_n_length_probabilities distribution; chain 1 is used for the alpha chain and chain 2 for the beta chain.
        For every label, one class is sampled per receptor from the distribution given for that label, independently of
        the other labels. Labels are multi-class, so negative classes have to be listed explicitly in the specification.

        An example of input parameters is given below:

        receptor_count: 100 # generate 100 receptors
        chain_1_length_probabilities:
            14: 0.8 # a chain 1 sequence has length 14 with probability 0.8
            15: 0.2 # a chain 1 sequence has length 15 with probability 0.2
        chain_2_length_probabilities:
            14: 0.8 # a chain 2 sequence has length 14 with probability 0.8
            15: 0.2 # a chain 2 sequence has length 15 with probability 0.2
        labels:
            epitope1: # label name
                True: 0.5 # a receptor gets class True with probability 0.5
                False: 0.5 # a receptor gets class False with probability 0.5
            epitope2: # next label, assigned independently of the previous one
                1: 0.3 # a receptor gets class 1 with probability 0.3
                0: 0.7 # a receptor gets class 0 with probability 0.7
        """
        RandomDatasetGenerator._check_receptor_dataset_generation_params(receptor_count, chain_1_length_probabilities,
                                                                         chain_2_length_probabilities, labels, path)

        alphabet = EnvironmentSettings.get_sequence_alphabet()
        PathBuilder.build(path)

        def make_random_chain(length_probabilities, chain, cell_id):
            # the length is drawn before the residues, which fixes the order in which random numbers are consumed
            length = random.choices(list(length_probabilities.keys()), length_probabilities.values())[0]
            residues = "".join(random.choices(alphabet, k=length))
            chain_metadata = SequenceMetadata(count=1,
                                              v_subgroup=chain + "V1",
                                              v_gene=chain + "V1-1",
                                              v_allele=chain + "V1-1*01",
                                              j_subgroup=chain + "J1",
                                              j_gene=chain + "J1-1",
                                              j_allele=chain + "J1-1*01",
                                              chain=chain,
                                              cell_id=cell_id)
            return ReceptorSequence(residues, metadata=chain_metadata)

        receptors = []
        for receptor_index in range(receptor_count):
            # alpha chain, beta chain and labels are sampled in this order for every receptor
            alpha_chain = make_random_chain(chain_1_length_probabilities, "TRA", receptor_index)
            beta_chain = make_random_chain(chain_2_length_probabilities, "TRB", receptor_index)

            receptor_metadata = {label: random.choices(list(class_probabilities.keys()), class_probabilities.values(), k=1)[0]
                                 for label, class_probabilities in labels.items()}
            receptor_metadata["subject"] = f"subj_{receptor_index + 1}"

            receptors.append(TCABReceptor(alpha=alpha_chain, beta=beta_chain, metadata=receptor_metadata))

        directory = path if path[-1] == '/' else path + '/'
        filename = f"{directory}batch01.pickle"

        with open(filename, "wb") as pickle_file:
            pickle.dump(receptors, pickle_file)

        dataset_params = {label: list(class_probabilities.keys()) for label, class_probabilities in labels.items()}
        return ReceptorDataset(params=dataset_params, filenames=[filename], file_size=receptor_count)
Exemple #10
0
 def __init__(self,
              amino_acid_sequence: str = None,
              nucleotide_sequence: str = None,
              identifier: str = None,
              annotation: SequenceAnnotation = None,
              metadata: SequenceMetadata = None):
     """
     Stores the sequence data together with its optional annotation and metadata.

     :param amino_acid_sequence: amino acid sequence, stored as given
     :param nucleotide_sequence: nucleotide sequence, stored as given
     :param identifier: identifier of the sequence, stored as given
     :param annotation: optional SequenceAnnotation, stored as given
     :param metadata: SequenceMetadata of the sequence; when omitted (or None), a new empty
         SequenceMetadata object is created for this instance
     """
     self.identifier = identifier
     self.amino_acid_sequence = amino_acid_sequence
     self.nucleotide_sequence = nucleotide_sequence
     self.annotation = annotation
     # A default of ``SequenceMetadata()`` in the signature would be evaluated only once, when the
     # function is defined, so all objects created without metadata would share (and could mutate)
     # the very same instance. Creating it here gives every object its own metadata.
     self.metadata = metadata if metadata is not None else SequenceMetadata()
Exemple #11
0
    def _create_new_sequences(self, sequences, new_sequence_count, signal) -> List[ReceptorSequence]:
        """
        Replaces the last new_sequence_count sequences by newly created sequences carrying the signal.

        Each new sequence consists solely of an instance of a motif chosen at random from signal.motifs; it is
        annotated with an ImplantAnnotation at position 0 and gets fixed metadata (v_gene "TRBV6-1",
        j_gene "TRBJ2-7", count 1, chain "B").

        :param sequences: sliceable collection of sequences; it is not modified, a sliced copy is extended instead
        :param new_sequence_count: number of trailing sequences to drop and of new sequences to append; for a
            value of zero or less the sequences are returned unchanged (as a copy)
        :param signal: object providing `id` and a non-empty `motifs` collection
        :return: the retained sequences followed by the newly created ones
        """
        # ``sequences[:-0]`` is ``sequences[:0]``, i.e. empty, so a count of zero must not go through the
        # negative-index slice or every original sequence would be dropped
        if new_sequence_count > 0:
            new_sequences = sequences[:-new_sequence_count]
        else:
            new_sequences = sequences[:]

        for _ in range(new_sequence_count):

            motif = random.choice(signal.motifs)
            motif_instance = motif.instantiate_motif()
            annotation = SequenceAnnotation([ImplantAnnotation(signal_id=signal.id, motif_id=motif.identifier,
                                                               motif_instance=motif_instance.instance, position=0)])
            metadata = SequenceMetadata(v_gene="TRBV6-1", j_gene="TRBJ2-7", count=1, chain="B")

            new_sequences.append(ReceptorSequence(amino_acid_sequence=motif_instance.instance, annotation=annotation, metadata=metadata))

        return new_sequences
    def test_match_repertoire(self):
        """
        Checks SequenceMatcher.match_repertoire on a repertoire of four chain "A" sequences matched against
        the reference sequences "AAAACA" and "TADQV".

        With the COUNT summary the result must expose the keys "sequences", "repertoire" and
        "repertoire_index", report one match for the first, third and fourth repertoire sequence and none
        for the second, and carry the repertoire metadata. With the CLONAL_PERCENTAGE summary the reported
        clonal percentage must be 0.8. The arguments 0 and 2 are passed through unchanged (presumably the
        repertoire index and the maximum distance - confirm against SequenceMatcher.match_repertoire).
        """
        path = EnvironmentSettings.root_path + "test/tmp/seqmatchrep/"
        PathBuilder.build(path)

        # (amino acid sequence, identifier, count) of every sequence in the repertoire
        repertoire_specs = [("AAAAAA", "1", 3), ("CCCCCC", "2", 2), ("AAAACC", "3", 1), ("TADQVF", "4", 4)]
        repertoire = Repertoire.build_from_sequence_objects(
            sequence_objects=[ReceptorSequence(amino_acid_sequence=amino_acids,
                                               identifier=identifier,
                                               metadata=SequenceMetadata(chain="A", count=count))
                              for amino_acids, identifier, count in repertoire_specs],
            metadata={"CD": True},
            path=path)

        reference_sequences = [ReceptorSequence(amino_acids, metadata=SequenceMetadata(chain="A"))
                               for amino_acids in ("AAAACA", "TADQV")]

        matcher = SequenceMatcher()
        result = matcher.match_repertoire(repertoire, 0, reference_sequences, 2,
                                          SequenceMatchingSummaryType.COUNT)

        for expected_key in ("sequences", "repertoire", "repertoire_index"):
            self.assertTrue(expected_key in result)

        self.assertEqual(4, len(result["sequences"]))

        # expected number of matching reference sequences for each repertoire sequence, in order
        for position, expected_matches in enumerate((1, 0, 1, 1)):
            self.assertEqual(expected_matches, len(result["sequences"][position]["matching_sequences"]))

        matched_count = sum(1 for entry in result["sequences"] if len(entry["matching_sequences"]) > 0)
        self.assertEqual(3, matched_count)
        self.assertTrue(result["metadata"]["CD"])

        result = matcher.match_repertoire(repertoire, 0, reference_sequences, 2,
                                          SequenceMatchingSummaryType.CLONAL_PERCENTAGE)
        self.assertEqual(0.8, result["clonal_percentage"])

        shutil.rmtree(path)
    def test_match(self):
        """
        Checks SequenceMatcher.match on a dataset with a single repertoire of four chain "A" sequences
        matched against two reference sequences that also carry V and J genes.

        The result must contain exactly one entry under "repertoires", that entry must carry the repertoire
        metadata, and its fourth sequence ("TADQVF") must have exactly one matching reference sequence.
        The argument 2 is passed through unchanged (presumably the maximum distance - confirm against
        SequenceMatcher.match).
        """
        path = EnvironmentSettings.root_path + "test/tmp/seqmatch/"
        PathBuilder.build(path)

        def gene_metadata(j_gene):
            # all sequences in this test share chain "A" and V gene "V1"; only the J gene varies
            return SequenceMetadata(chain="A", v_gene="V1", j_gene=j_gene)

        # (amino acid sequence, J gene, identifier) of every sequence in the repertoire
        repertoire_specs = [("AAAAAA", "J2", "3"), ("CCCCCC", "J2", "4"), ("AAAACC", "J2", "5"), ("TADQVF", "J3", "6")]
        repertoire = Repertoire.build_from_sequence_objects(
            sequence_objects=[ReceptorSequence(amino_acid_sequence=amino_acids,
                                               metadata=gene_metadata(j_gene),
                                               identifier=identifier)
                              for amino_acids, j_gene, identifier in repertoire_specs],
            metadata={"CD": True},
            path=path)

        dataset = RepertoireDataset(repertoires=[repertoire])

        reference_specs = [("AAAACA", "J2", "1"), ("TADQV", "J3", "2")]
        reference_sequences = [ReceptorSequence(amino_acids, metadata=gene_metadata(j_gene), identifier=identifier)
                               for amino_acids, j_gene, identifier in reference_specs]

        matcher = SequenceMatcher()
        result = matcher.match(dataset, reference_sequences, 2, SequenceMatchingSummaryType.PERCENTAGE)

        self.assertTrue("repertoires" in result)

        fourth_sequence_matches = result["repertoires"][0]["sequences"][3]["matching_sequences"]
        self.assertEqual(1, len(fourth_sequence_matches))
        self.assertTrue(result["repertoires"][0]["metadata"]["CD"])
        self.assertEqual(1, len(result["repertoires"]))

        shutil.rmtree(path)
Exemple #14
0
    def _make_sequence_object(self, row):
        """
        Converts one row into a ReceptorSequence.

        The row is expected to expose its field names through `row.dtype.names` and to be indexable by field
        name (presumably a record of a numpy structured array - confirm against the caller). Fields that are
        absent yield None, except for the frame type, which falls back to "IN".

        Every field whose name contains "signal" and whose value is truthy is parsed with ast.literal_eval and
        expanded into the keyword arguments of an ImplantAnnotation. All names in self.fields that are not in
        Repertoire.FIELDS end up in custom_params.

        :param row: record holding the data of one sequence
        :return: ReceptorSequence with metadata and implant annotations filled in
        """
        fields = row.dtype.names

        def field_value(name, default=None):
            # value stored under `name`, or `default` if the row has no such field
            return row[name] if name in fields else default

        signal_values = (row[key] for key in row.dtype.names if "signal" in key)
        implants = [ImplantAnnotation(**ast.literal_eval(raw_value)) for raw_value in signal_values if raw_value]

        custom_params = {key: row[key] if key in fields else None
                         for key in set(self.fields) - set(Repertoire.FIELDS)}

        sequence_metadata = SequenceMetadata(v_gene=field_value("v_genes"),
                                             j_gene=field_value("j_genes"),
                                             v_subgroup=field_value("v_subgroups"),
                                             j_subgroup=field_value("j_subgroups"),
                                             v_allele=field_value("v_alleles"),
                                             j_allele=field_value("j_alleles"),
                                             chain=field_value("chains"),
                                             count=field_value("counts"),
                                             region_type=field_value("region_types"),
                                             frame_type=field_value("frame_types", "IN"),
                                             cell_id=field_value("cell_ids"),
                                             custom_params=custom_params)

        return ReceptorSequence(amino_acid_sequence=field_value("sequence_aas"),
                                nucleotide_sequence=field_value("sequences"),
                                identifier=field_value("sequence_identifiers"),
                                metadata=sequence_metadata,
                                annotation=SequenceAnnotation(implants=implants))
Exemple #15
0
    def import_sequence(row, metadata_columns=None) -> ReceptorSequence:
        """
        Builds a ReceptorSequence from a row that supports `in` and lookup by column name.

        A column that is missing from the row or holds None results in None for the corresponding attribute.
        Genes, alleles, sequences and the identifier are converted with str, the count with int; chain, region
        type and frame type are taken over unchanged.

        :param row: mapping-like object with columns such as "sequence_aas", "v_genes" or "counts"
        :param metadata_columns: names of additional columns to copy into custom_params; names that are not
            present in the row are skipped. None is treated as an empty list
        :return: the assembled ReceptorSequence
        """
        if metadata_columns is None:
            metadata_columns = []

        def column_value(name, convert=None):
            # None for an absent or None-valued column, otherwise the (optionally converted) value
            if name not in row or row[name] is None:
                return None
            return row[name] if convert is None else convert(row[name])

        custom_params = {custom_col: row[custom_col] for custom_col in metadata_columns if custom_col in row}

        metadata = SequenceMetadata(v_gene=column_value("v_genes", str),
                                    v_allele=column_value("v_alleles", str),
                                    j_gene=column_value("j_genes", str),
                                    j_allele=column_value("j_alleles", str),
                                    chain=column_value("chains"),
                                    region_type=column_value("region_types"),
                                    count=column_value("counts", int),
                                    frame_type=column_value("frame_types"),
                                    custom_params=custom_params)

        return ReceptorSequence(amino_acid_sequence=column_value("sequence_aas", str),
                                nucleotide_sequence=column_value("sequences", str),
                                identifier=column_value("sequence_identifiers", str),
                                metadata=metadata)
Exemple #16
0
    def process_iris_chain(row, chain, dual_chain_id, all_genes):
        """
        Creates the receptor sequences of one chain from a row, combining its V and J gene annotations.

        The V and J genes are read from the columns "TR<chain> - V gene (1)" and "TR<chain> - J gene (1)",
        where several genes are separated by " | ". The prefix "TR<chain>" and any remaining occurrence of
        `chain` are removed from every gene name. The amino acid sequence is read from the column
        "Chain: TR<chain> (<dual_chain_id>)".

        :param row: mapping-like object indexable by the column names above
        :param chain: chain name used to build the column names and stored in the metadata
        :param dual_chain_id: id used in the sequence column name and stored in custom_params
        :param all_genes: if truthy, one sequence is created for every V/J combination; otherwise a single
            sequence is created from one arbitrarily picked V gene and one arbitrarily picked J gene
        :return: ReceptorSequenceList with the created sequences
        """
        sequences = ReceptorSequenceList()
        chain_prefix = f"TR{chain}"

        def read_alleles(column):
            # unique gene names of the column with the chain prefix and chain name stripped
            return set(gene.replace(chain_prefix, "").replace(chain, "") for gene in row[column].split(" | "))

        def build_metadata(v_allele, j_allele):
            return SequenceMetadata(v_gene=v_allele.split(Constants.ALLELE_DELIMITER)[0], v_allele=v_allele,
                                    v_subgroup=v_allele.split("-")[0],
                                    j_gene=j_allele.split(Constants.ALLELE_DELIMITER)[0], j_allele=j_allele,
                                    j_subgroup=j_allele.split("-")[0], chain=chain,
                                    custom_params={"dual_chain_id": dual_chain_id})

        v_alleles = read_alleles(f"TR{chain} - V gene (1)")
        j_alleles = read_alleles(f"TR{chain} - J gene (1)")

        if all_genes:
            allele_pairs = [(v_allele, j_allele) for v_allele in v_alleles for j_allele in j_alleles]
        else:
            # set.pop() removes an arbitrary element, which serves as the "random" choice here
            chosen_v_allele = v_alleles.pop()
            chosen_j_allele = j_alleles.pop()
            allele_pairs = [(chosen_v_allele, chosen_j_allele)]

        for v_allele, j_allele in allele_pairs:
            metadata = build_metadata(v_allele, j_allele)
            amino_acids = row[f"Chain: TR{chain} ({dual_chain_id})"]
            sequences.append(ReceptorSequence(amino_acid_sequence=amino_acids, metadata=metadata))

        return sequences
    def test_encode(self):
        """
        Encode two repertoires with EvennessProfileEncoder and check the
        first two profile values of each repertoire.

        The first repertoire mixes three clone counts (10, 100 and 1, with
        1000 sequences each); the second one has 1000 sequences that all have
        count 10.
        """
        path = EnvironmentSettings.root_path + "test/tmp/evennessenc/"

        PathBuilder.build(path)

        def clones(count, size=1000):
            # `size` identical sequences sharing the given count
            return [
                ReceptorSequence("AAA", metadata=SequenceMetadata(count=count))
                for _ in range(size)
            ]

        rep1 = Repertoire.build_from_sequence_objects(
            sequence_objects=clones(10) + clones(100) + clones(1),
            metadata={"l1": "test_1", "l2": 2},
            path=path)

        rep2 = Repertoire.build_from_sequence_objects(
            sequence_objects=clones(10),
            metadata={"l1": "test_2", "l2": 3},
            path=path)

        label_config = LabelConfiguration()
        label_config.add_label("l1", ["test_1", "test_2"])
        # NOTE(review): rep1 carries l2=2, which is not among the values listed here - confirm intended
        label_config.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        fine_encoder = EvennessProfileEncoder.build_object(
            dataset, min_alpha=0, max_alpha=10, dimension=51)
        d1 = fine_encoder.encode(
            dataset,
            EncoderParams(result_path=path + "1/", label_config=label_config))

        # second run (coarser profile, pool of 2); its result is not inspected below
        coarse_encoder = EvennessProfileEncoder.build_object(
            dataset, min_alpha=0, max_alpha=10, dimension=11)
        coarse_encoder.encode(
            dataset,
            EncoderParams(result_path=path,
                          label_config=label_config,
                          pool_size=2))

        expected_values = {
            (0, 0): 1,
            (0, 1): 0.786444,
            (1, 0): 1,
            (1, 1): 1,
        }
        for position, expected in expected_values.items():
            self.assertAlmostEqual(d1.encoded_data.examples[position],
                                   expected)

        shutil.rmtree(path)
Exemple #18
0
    def test_repertoire(self):
        """
        Build a Repertoire from two sequences of the same cell and check the
        stored file, the per-field arrays, the repertoire metadata and the
        sequence objects rebuilt from the repertoire.
        """
        path = EnvironmentSettings.tmp_test_path + "sequencerepertoire/"
        PathBuilder.build(path)

        alpha_sequence = ReceptorSequence(
            amino_acid_sequence="AAA",
            identifier="1",
            metadata=SequenceMetadata(
                v_gene="V1",
                cell_id="1",
                chain=Chain.ALPHA,
                custom_params={"cmv": "no", "coeliac": False}))
        beta_sequence = ReceptorSequence(
            amino_acid_sequence="CCC",
            identifier="2",
            metadata=SequenceMetadata(
                j_gene="J1",
                cell_id="1",
                chain=Chain.BETA,
                custom_params={"cmv": "yes", "coeliac": True}))

        obj = Repertoire.build_from_sequence_objects(
            [alpha_sequence, beta_sequence], path,
            {"cmv": "yes", 'subject_id': "1"})

        self.assertTrue(os.path.isfile(obj.data_filename))
        self.assertIsInstance(obj, Repertoire)

        def assert_column(expected, actual):
            # compare one stored field against the expected values
            self.assertTrue(np.array_equal(np.array(expected), actual))

        assert_column(["1", "2"], obj.get_sequence_identifiers())
        assert_column(["AAA", "CCC"], obj.get_sequence_aas())
        assert_column(["V1", None], obj.get_v_genes())
        assert_column([None, "J1"], obj.get_j_genes())
        assert_column(["no", "yes"], obj.get_attribute("cmv"))
        assert_column([False, True], obj.get_attribute("coeliac"))

        self.assertEqual("yes", obj.metadata["cmv"])
        self.assertEqual("1", obj.metadata["subject_id"])

        rebuilt = obj.sequences

        self.assertTrue(
            all(isinstance(item, ReceptorSequence) for item in rebuilt))
        self.assertEqual(2, len(rebuilt))
        self.assertEqual("1", rebuilt[0].identifier)
        self.assertEqual("2", rebuilt[1].identifier)
        self.assertEqual("AAA", rebuilt[0].amino_acid_sequence)
        self.assertEqual("yes", rebuilt[1].metadata.custom_params["cmv"])

        obj.free_memory()

        # NOTE(review): both calls below hand a generator object to assertTrue;
        # a generator is always truthy, so these two checks can never fail and
        # the generators are never evaluated. Wrap them in all(...) (or assert
        # on obj.data directly) once it is confirmed what free_memory() leaves
        # in obj.data.
        self.assertTrue(key in obj.data for key in Repertoire.FIELDS)
        self.assertTrue(obj.data[key] is None for key in Repertoire.FIELDS)

        shutil.rmtree(path)
Exemple #19
0
    def test_receptor(self):
        """
        Build a Repertoire from seven chains spread over two cells and check
        how many receptors and cells it reports.

        Cell "1" holds two alpha and two beta chains, cell "2" one gamma and
        two delta chains; the test expects 6 receptors and 2 cells
        (presumably every chain pairing within a cell: 2*2 + 1*2 - confirm
        against Repertoire.receptors).
        """
        path = EnvironmentSettings.tmp_test_path + "receptortestingpathrepertoire/"
        PathBuilder.build(path)

        # (amino acids, identifier, gene keyword, cell id, chain, coeliac flag)
        chain_specs = [
            ("AAA", "1", {"v_gene": "V1"}, "1", Chain.ALPHA, False),
            ("CCC", "2", {"j_gene": "J1"}, "1", Chain.BETA, True),
            ("FFF", "3", {"v_gene": "V1"}, "1", Chain.ALPHA, False),
            ("EEE", "4", {"j_gene": "J1"}, "1", Chain.BETA, True),
            ("FFF", "5", {"v_gene": "V1"}, "2", Chain.GAMMA, False),
            ("EEE", "6", {"j_gene": "J1"}, "2", Chain.DELTA, True),
            ("EEE", "7", {"j_gene": "J2"}, "2", Chain.DELTA, True),
        ]

        sequences = []
        for amino_acids, identifier, gene, cell_id, chain, coeliac in chain_specs:
            # in this fixture cmv is "yes" exactly for the coeliac-positive chains
            custom_params = {
                "cmv": "yes" if coeliac else "no",
                "coeliac": coeliac
            }
            metadata = SequenceMetadata(cell_id=cell_id,
                                        chain=chain,
                                        custom_params=custom_params,
                                        **gene)
            sequences.append(
                ReceptorSequence(amino_acid_sequence=amino_acids,
                                 identifier=identifier,
                                 metadata=metadata))

        obj = Repertoire.build_from_sequence_objects(
            sequences, path, {"cmv": "yes", 'subject_id': "1"})

        receptors = obj.receptors
        self.assertEqual(6, len(receptors))

        cells = obj.cells
        self.assertEqual(2, len(cells))

        shutil.rmtree(path)
Exemple #20
0
    def test(self):
        """
        Encode a pickled SequenceDataset of nine sequences with
        KmerFreqSequenceEncoder (continuous 3-mers, relative frequency,
        unique reads) and check the number of examples, their identifiers and
        that examples 0 and 3 get the same encoding.
        """
        # identifiers "1".."9"; the amino acid sequence cycles over three
        # motifs and the label l1 alternates 1, 2, 1, ...
        motifs = ("AAACCC", "ACACAC", "CCCAAA")
        sequences = [
            ReceptorSequence(
                amino_acid_sequence=motifs[position % 3],
                identifier=str(position + 1),
                metadata=SequenceMetadata(
                    custom_params={"l1": position % 2 + 1}))
            for position in range(9)
        ]

        path = EnvironmentSettings.tmp_test_path + "kmrefreqseqfacencoder/"
        PathBuilder.build(path)
        filename = f"{path}sequences.pkl"
        with open(filename, "wb") as file:
            pickle.dump(sequences, file)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        dataset = SequenceDataset(params={"l1": [1, 2]},
                                  filenames=[filename],
                                  identifier="d1")

        encoder = KmerFreqSequenceEncoder.build_object(
            dataset,
            normalization_type=NormalizationType.RELATIVE_FREQUENCY.name,
            reads=ReadsType.UNIQUE.name,
            sequence_encoding=SequenceEncodingType.CONTINUOUS_KMER.name,
            k=3)

        encoder_params = EncoderParams(result_path=path + "2/",
                                       label_config=lc,
                                       pool_size=2,
                                       learn_model=True,
                                       model={},
                                       filename="dataset.csv")
        encoded_dataset = encoder.encode(dataset, encoder_params)

        self.assertEqual(9, encoded_dataset.encoded_data.examples.shape[0])

        expected_ids = [str(number) for number in range(1, 10)]
        self.assertTrue(
            all(identifier in encoded_dataset.encoded_data.example_ids
                for identifier in expected_ids))

        # sequences "1" and "4" share the same amino acids; assumes examples
        # keep the input order - TODO confirm
        first_row = encoded_dataset.encoded_data.examples[0].A
        fourth_row = encoded_dataset.encoded_data.examples[3].A
        self.assertTrue(numpy.array_equal(first_row, fourth_row))

        shutil.rmtree(path)
    def _import_from_files(
            filenames: List[str],
            generic_params: DatasetImportParams) -> ReceptorDataset:
        """
        Read single-line paired-chain receptor files and build a ReceptorDataset.

        Every file is loaded with pandas. Rows with a missing value in any
        loaded column and duplicate rows are dropped, columns are renamed
        through generic_params.column_mapping, and each remaining row becomes
        one receptor holding one ReceptorSequence per chain listed in
        generic_params.receptor_chains.

        Args:
            filenames: paths of the delimited text files to import.
            generic_params: import settings; this function reads separator,
                columns_to_load, column_mapping, receptor_chains, region_type,
                sequence_file_size and result_path.

        Returns:
            The dataset returned by ReceptorDataset.build for the receptors of
            all files.
        """
        elements = []

        # number of characters removed from BOTH ends of each sequence column
        # (presumably turns a junction into the CDR3: one residue / one codon
        # per side - confirm against the expected input format)
        trim_widths = {
            "alpha_amino_acid_sequence": 1,
            "beta_amino_acid_sequence": 1,
            "alpha_nucleotide_sequence": 3,
            "beta_nucleotide_sequence": 3,
        }

        for file in filenames:
            df = pd.read_csv(file,
                             sep=generic_params.separator,
                             usecols=generic_params.columns_to_load)
            # dropna()/drop_duplicates() return a new frame unless inplace=True;
            # without it the calls had no effect on df
            df.dropna(inplace=True)
            df.drop_duplicates(inplace=True)
            df.rename(columns=generic_params.column_mapping, inplace=True)

            for column, width in trim_widths.items():
                if column in df:
                    df[column] = df[column].str[width:-width]

            chain_vals = list(generic_params.receptor_chains.value)
            chain_names = [
                Chain.get_chain(ch).name.lower() for ch in chain_vals
            ]

            for chain_name in chain_names:
                df = SingleLineReceptorImport.make_gene_columns(
                    df, ["v", "j"], chain_name)

            # Columns that describe neither a chain, a gene, the count nor the
            # identifier are passed on as receptor metadata. The column set is
            # the same for every row, so it is computed once per file.
            excluded_parts = ["v_gene", 'j_gene', "count", "identifier"
                              ] + chain_names
            metadata_keys = [
                key for key in df.columns
                if all(part not in key for part in excluded_parts)
            ]

            for _, row in df.iterrows():
                sequences = {}
                for chain_val, chain_name in zip(chain_vals, chain_names):
                    aa_column = chain_name + "_amino_acid_sequence"
                    nt_column = chain_name + "_nucleotide_sequence"
                    sequences[chain_val] = ReceptorSequence(
                        # sequence columns are optional; gene columns and "count" are required
                        amino_acid_sequence=row[aa_column]
                        if aa_column in row else None,
                        nucleotide_sequence=row[nt_column]
                        if nt_column in row else None,
                        metadata=SequenceMetadata(
                            v_gene=row[f"{chain_name}_v_gene"],
                            v_allele=row[f"{chain_name}_v_allele"],
                            v_subgroup=row[f'{chain_name}_v_subgroup'],
                            j_gene=row[f"{chain_name}_j_gene"],
                            j_allele=row[f"{chain_name}_j_allele"],
                            j_subgroup=row[f'{chain_name}_j_subgroup'],
                            chain=chain_name,
                            count=row["count"],
                            region_type=generic_params.region_type.value))

                elements.append(
                    ReceptorBuilder.build_object(
                        sequences, row["identifier"],
                        {key: row[key] for key in metadata_keys}))

        return ReceptorDataset.build(elements,
                                     generic_params.sequence_file_size,
                                     generic_params.result_path)