Beispiel #1
0
    def create_dummy_receptordataset(self, path):
        """Build a ReceptorDataset holding two hand-written TCABReceptor objects.

        Each receptor pairs an alpha and a beta ReceptorSequence whose metadata
        carries fixed V/J genes, an "IN" frame type and two custom parameters.
        The dataset is built under ``path / "receptors"``.

        :param path: directory under which the "receptors" folder is created
        :return: the value returned by ReceptorDataset.build_from_objects
        """
        first_alpha = ReceptorSequence(
            amino_acid_sequence="AAATTT", identifier="1a",
            metadata=SequenceMetadata(v_gene="TRAV1", j_gene="TRAJ1", chain=Chain.ALPHA, frame_type="IN",
                                      custom_params={"d_call": "TRAD1", "custom1": "cust1"}))
        first_beta = ReceptorSequence(
            amino_acid_sequence="ATATAT", identifier="1b",
            metadata=SequenceMetadata(v_gene="TRBV1", j_gene="TRBJ1", chain=Chain.BETA, frame_type="IN",
                                      custom_params={"d_call": "TRBD1", "custom1": "cust1"}))
        first_receptor = TCABReceptor(identifier="1", alpha=first_alpha, beta=first_beta)

        # The second receptor uses the "custom2" key instead of "custom1".
        second_alpha = ReceptorSequence(
            amino_acid_sequence="AAAAAA", identifier="2a",
            metadata=SequenceMetadata(v_gene="TRAV1", j_gene="TRAJ1", chain=Chain.ALPHA, frame_type="IN",
                                      custom_params={"d_call": "TRAD1", "custom2": "cust1"}))
        second_beta = ReceptorSequence(
            amino_acid_sequence="AAAAAA", identifier="2b",
            metadata=SequenceMetadata(v_gene="TRBV1", j_gene="TRBJ1", chain=Chain.BETA, frame_type="IN",
                                      custom_params={"d_call": "TRBD1", "custom2": "cust1"}))
        second_receptor = TCABReceptor(identifier="2", alpha=second_alpha, beta=second_beta)

        target_dir = path / "receptors"
        PathBuilder.build(target_dir)
        # The literal 2 is passed positionally; presumably a per-file element count -- confirm against
        # ReceptorDataset.build_from_objects.
        return ReceptorDataset.build_from_objects([first_receptor, second_receptor], 2, target_dir)
    def create_dataset(self, path, dataset_size: int = 50):
        """Pickle ``dataset_size`` alternating sequences and wrap the file in a SequenceDataset.

        Even indices produce "AAACCC" with custom parameter l1=1, odd indices
        produce "ACACAC" with l1=2. The list is pickled to
        ``path / "sequences.pkl"`` and that single file backs the dataset.

        :param path: directory that is created and receives the pickle file
        :param dataset_size: number of sequences to generate
        :return: SequenceDataset with identifier "d1" and label l1 in {1, 2}
        """
        # Indexed by parity of the sequence index: (amino acid sequence, l1 value).
        variants = (("AAACCC", 1), ("ACACAC", 2))

        sequences = [
            ReceptorSequence(amino_acid_sequence=variants[index % 2][0],
                             identifier=str(index),
                             metadata=SequenceMetadata(custom_params={"l1": variants[index % 2][1]}))
            for index in range(dataset_size)
        ]

        PathBuilder.build(path)
        pickle_path = path / "sequences.pkl"
        with open(pickle_path, "wb") as handle:
            pickle.dump(sequences, handle)

        # NOTE(review): this label configuration is built but neither used nor returned here.
        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        return SequenceDataset(labels={"l1": [1, 2]},
                               filenames=[pickle_path],
                               identifier="d1")
Beispiel #3
0
    def test_encode_sequence(self):
        """IdentitySequenceEncoder is expected to return ["AAA"] for frame types OUT, STOP and IN."""

        def encode_with_frame(frame_type):
            # Every case gets its own sequence, encoder and parameter objects.
            receptor_sequence = ReceptorSequence(
                amino_acid_sequence="AAA",
                metadata=SequenceMetadata(frame_type=frame_type))
            encoder = IdentitySequenceEncoder()
            params = EncoderParams(model={},
                                   label_config=LabelConfiguration(),
                                   result_path="")
            return encoder.encode_sequence(receptor_sequence, params)

        self.assertEqual(encode_with_frame("OUT"), ["AAA"])
        self.assertEqual(encode_with_frame("STOP"), ["AAA"])
        self.assertEqual(["AAA"], encode_with_frame("IN"))
Beispiel #4
0
    def create_dummy_repertoire(self, path):
        """Create a two-sequence Repertoire plus a one-row metadata CSV describing it.

        :param path: directory passed to Repertoire.build_from_sequence_objects and used for "metadata.csv"
        :return: tuple ``(repertoire, path / "metadata.csv")``
        """
        beta_sequence = ReceptorSequence(
            amino_acid_sequence="AAA",
            nucleotide_sequence="GCTGCTGCT",
            identifier="receptor_1",
            metadata=SequenceMetadata(v_gene="TRBV1", j_gene="TRBJ1", chain=Chain.BETA, count=5,
                                      region_type="IMGT_CDR3", frame_type="IN",
                                      custom_params={"d_call": "TRBD1", "custom_test": "cust1"}))
        # The alpha sequence additionally sets a V allele and leaves the frame type unset (None).
        alpha_sequence = ReceptorSequence(
            amino_acid_sequence="GGG",
            nucleotide_sequence="GGTGGTGGT",
            identifier="receptor_2",
            metadata=SequenceMetadata(v_gene="TRAV2", v_allele="TRAV2*01", j_gene="TRAJ2", chain=Chain.ALPHA,
                                      count=15, frame_type=None, region_type="IMGT_CDR3",
                                      custom_params={"d_call": "TRAD2", "custom_test": "cust2"}))

        repertoire = Repertoire.build_from_sequence_objects(sequence_objects=[beta_sequence, alpha_sequence],
                                                            path=path,
                                                            metadata={"subject_id": "REP1"})

        metadata_path = path / "metadata.csv"
        metadata_row = {"filename": [f"{repertoire.identifier}_data.npy"],
                        "subject_id": ["1"],
                        "repertoire_identifier": [repertoire.identifier]}
        pd.DataFrame(metadata_row).to_csv(metadata_path, index=False)

        return repertoire, metadata_path
Beispiel #5
0
    def _construct_test_dataset(self, path):
        """Build a three-sequence SequenceDataset and a matching two-label configuration.

        :param path: forwarded as ``path`` to SequenceDataset.build
        :return: tuple ``(dataset, label_configuration)`` with labels "l1" and "l2", each over [1, 2]
        """
        # (amino acid sequence, identifier, l1 value, l2 value)
        specs = [("AAAA", "1", 1, 1),
                 ("ATA", "2", 2, 1),
                 ("ATT", "3", 1, 2)]

        sequences = [
            ReceptorSequence(amino_acid_sequence=aa_sequence,
                             identifier=identifier,
                             metadata=SequenceMetadata(custom_params={"l1": l1_value, "l2": l2_value}))
            for aa_sequence, identifier, l1_value, l2_value in specs
        ]

        label_config = LabelConfiguration()
        for label_name in ("l1", "l2"):
            label_config.add_label(label_name, [1, 2])

        dataset = SequenceDataset.build(sequences=sequences, file_size=10, path=path)
        return dataset, label_config
Beispiel #6
0
 def create_dummy_sequencedataset(self, path):
     """Build a SequenceDataset of three hand-written sequences under ``path / "sequences"``.

     :param path: directory under which the "sequences" folder is created
     :return: the value returned by SequenceDataset.build_from_objects
     """
     alpha_sequence = ReceptorSequence(
         amino_acid_sequence="AAATTT", identifier="1a",
         metadata=SequenceMetadata(v_gene="TRAV1", j_gene="TRAJ1", chain=Chain.ALPHA, frame_type="IN",
                                   custom_params={"d_call": "TRAD1", "custom1": "cust1"}))
     first_beta = ReceptorSequence(
         amino_acid_sequence="ATATAT", identifier="1b",
         metadata=SequenceMetadata(v_gene="TRBV1", j_gene="TRBJ1", chain=Chain.BETA, frame_type="IN",
                                   custom_params={"d_call": "TRBD1", "custom2": "cust1"}))
     # Same content as the previous beta sequence, only the identifier differs.
     second_beta = ReceptorSequence(
         amino_acid_sequence="ATATAT", identifier="2b",
         metadata=SequenceMetadata(v_gene="TRBV1", j_gene="TRBJ1", chain=Chain.BETA, frame_type="IN",
                                   custom_params={"d_call": "TRBD1", "custom2": "cust1"}))

     target_dir = path / "sequences"
     PathBuilder.build(target_dir)
     return SequenceDataset.build_from_objects([alpha_sequence, first_beta, second_beta], 2, target_dir)
Beispiel #7
0
    def _create_new_sequences(self, sequences, new_sequence_count,
                              signal) -> List[ReceptorSequence]:
        new_sequences = sequences[:-new_sequence_count]

        for _ in range(new_sequence_count):

            motif = random.choice(signal.motifs)
            motif_instance = motif.instantiate_motif()
            annotation = SequenceAnnotation([
                ImplantAnnotation(signal_id=signal.id,
                                  motif_id=motif.identifier,
                                  motif_instance=motif_instance.instance,
                                  position=0)
            ])
            metadata = SequenceMetadata(v_gene="TRBV6-1",
                                        j_gene="TRBJ2-7",
                                        count=1,
                                        chain="B")

            new_sequences.append(
                ReceptorSequence(amino_acid_sequence=motif_instance.instance,
                                 annotation=annotation,
                                 metadata=metadata))

        return new_sequences
    def test_process(self):
        """ChainRepertoireFilter with keep_chain "ALPHA" should keep only the chain "A" repertoire.

        Also checks that the input dataset is left with both repertoires, that
        the "CD" metadata follows the kept repertoire, and that keep_chain
        "GAMMA" raises AssertionError.
        """
        path = EnvironmentSettings.root_path / "test/tmp/chain_filter/"
        PathBuilder.build(path)

        def single_sequence_repertoire(aa_sequence, chain, identifier):
            # One-sequence repertoire with empty metadata, stored under the test path.
            return Repertoire.build_from_sequence_objects(
                [ReceptorSequence(aa_sequence, metadata=SequenceMetadata(chain=chain), identifier=identifier)],
                path=path,
                metadata={})

        alpha_repertoire = single_sequence_repertoire("AAA", "A", "1")
        beta_repertoire = single_sequence_repertoire("AAC", "B", "2")

        metadata_path = path / "metadata.csv"
        pd.DataFrame({"CD": [1, 0]}).to_csv(metadata_path)

        dataset = RepertoireDataset(repertoires=[alpha_repertoire, beta_repertoire],
                                    metadata_file=metadata_path)

        filtered = ChainRepertoireFilter.process(
            dataset, {"keep_chain": "ALPHA", "result_path": path / "results"})

        self.assertEqual(1, len(filtered.get_data()))
        self.assertEqual(2, len(dataset.get_data()))

        cd_values = filtered.get_metadata(["CD"])["CD"]
        self.assertEqual(1, len(cd_values))
        self.assertEqual(1, cd_values[0])

        for kept_repertoire in filtered.get_data():
            self.assertEqual("AAA", kept_repertoire.sequences[0].get_sequence())

        with self.assertRaises(AssertionError):
            ChainRepertoireFilter.process(
                dataset, {"keep_chain": "GAMMA", "result_path": path / "results"})

        shutil.rmtree(path)
    def build(sequences: list, path: Path, labels: dict = None, seq_metadata: list = None, subject_ids: list = None):
        """Create one Repertoire per inner sequence list and write a metadata CSV describing them.

        :param sequences: list of lists of amino acid sequences, one inner list per repertoire
        :param path: output directory; repertoires go to ``path / "repertoires"``, the CSV to ``path / "metadata.csv"``
        :param labels: optional mapping label name -> list with one value per repertoire
        :param seq_metadata: optional list (per repertoire) of lists (per sequence) of SequenceMetadata keyword dicts
        :param subject_ids: optional list with one subject id per repertoire; "rep_<index>" is used when omitted
        :return: tuple ``(repertoires, path / "metadata.csv")``
        :raises AssertionError: when the lengths of subject_ids / seq_metadata do not match ``sequences``
        """
        if subject_ids is not None:
            assert len(subject_ids) == len(sequences)

        if seq_metadata is not None:
            assert len(sequences) == len(seq_metadata)
            for rep_position, rep_sequence_list in enumerate(sequences):
                assert len(rep_sequence_list) == len(seq_metadata[rep_position])

        PathBuilder.build(path)
        repertoire_dir = PathBuilder.build(path / "repertoires")

        if subject_ids is None:
            subject_ids = []
        built_repertoires = []

        for rep_index, sequence_list in enumerate(sequences):
            receptor_sequences = ReceptorSequenceList()

            # Fill in a generated subject id while the list is still shorter than the input.
            if len(subject_ids) < len(sequences):
                subject_ids.append("rep_" + str(rep_index))

            for seq_index, sequence in enumerate(sequence_list):
                if seq_metadata is not None:
                    sequence_metadata = SequenceMetadata(**seq_metadata[rep_index][seq_index])
                else:
                    # Fixed placeholder gene annotation used when no metadata is supplied.
                    sequence_metadata = SequenceMetadata(v_subgroup="TRBV1", v_gene="TRBV1-1", v_allele="TRBV1-1*01",
                                                         j_subgroup="TRBJ1", j_gene="TRBJ1-1", j_allele="TRBJ1-1*01",
                                                         count=1, chain="TRB", region_type="IMGT_CDR3")

                receptor_sequences.append(ReceptorSequence(amino_acid_sequence=sequence,
                                                           metadata=sequence_metadata,
                                                           identifier=str(seq_index)))

            if labels is None:
                repertoire_metadata = {}
            else:
                repertoire_metadata = {label: values[rep_index] for label, values in labels.items()}
            # The subject id always wins over a label that happens to be called "subject_id".
            repertoire_metadata["subject_id"] = subject_ids[rep_index]

            built_repertoires.append(
                Repertoire.build_from_sequence_objects(receptor_sequences, repertoire_dir, repertoire_metadata,
                                                       filename_base=f"rep_{rep_index}"))

        columns = {"filename": [rep.data_filename for rep in built_repertoires],
                   "subject_id": subject_ids,
                   "repertoire_identifier": [rep.identifier for rep in built_repertoires]}
        if labels is not None:
            columns.update(labels)

        metadata_path = path / "metadata.csv"
        pd.DataFrame(columns).to_csv(metadata_path, index=False)

        return built_repertoires, metadata_path
Beispiel #10
0
    def construct_test_flatten_dataset(self, path):
        """Build a two-sequence SequenceDataset labelled with custom parameter "l1".

        :param path: directory that is created and passed to SequenceDataset.build
        :return: the value returned by SequenceDataset.build
        """
        # (amino acid sequence, identifier, l1 value)
        specs = (("AAATTT", "1", 1), ("ATATAT", "2", 2))
        sequences = [
            ReceptorSequence(amino_acid_sequence=aa_sequence,
                             identifier=identifier,
                             metadata=SequenceMetadata(custom_params={"l1": l1_value}))
            for aa_sequence, identifier, l1_value in specs
        ]

        PathBuilder.build(path)

        return SequenceDataset.build(sequences=sequences, file_size=10, path=path)
Beispiel #11
0
 def __init__(self,
              amino_acid_sequence: str = None,
              nucleotide_sequence: str = None,
              identifier: str = None,
              annotation: SequenceAnnotation = None,
              metadata: SequenceMetadata = None):
     """Store the sequence strings, identifier, annotation and metadata on the instance.

     :param amino_acid_sequence: amino acid sequence, or None
     :param nucleotide_sequence: nucleotide sequence, or None
     :param identifier: sequence identifier, or None
     :param annotation: SequenceAnnotation, or None
     :param metadata: SequenceMetadata; when omitted (or None) a new empty
         SequenceMetadata is created for this instance
     """
     self.identifier = identifier
     self.amino_acid_sequence = amino_acid_sequence
     self.nucleotide_sequence = nucleotide_sequence
     self.annotation = annotation
     # A default written as SequenceMetadata() in the signature is evaluated once at
     # definition time and would be shared (and mutated) across all instances.
     self.metadata = SequenceMetadata() if metadata is None else metadata
    def test_match(self):
        """SequenceMatcher.match on a one-repertoire dataset, summarised as PERCENTAGE.

        Checks the shape of the result: one repertoire entry, its "CD" metadata,
        and exactly one reference match for the fourth repertoire sequence.
        """
        path = EnvironmentSettings.root_path / "test/tmp/seqmatch/"
        PathBuilder.build(path)

        # (amino acid sequence, j gene, identifier); chain and v gene are the same for all.
        repertoire_specs = [("AAAAAA", "J2", "3"),
                            ("CCCCCC", "J2", "4"),
                            ("AAAACC", "J2", "5"),
                            ("TADQVF", "J3", "6")]
        repertoire = Repertoire.build_from_sequence_objects(
            sequence_objects=[
                ReceptorSequence(amino_acid_sequence=aa_sequence,
                                 metadata=SequenceMetadata(chain="A", v_gene="V1", j_gene=j_gene),
                                 identifier=identifier)
                for aa_sequence, j_gene, identifier in repertoire_specs
            ],
            metadata={"CD": True},
            path=path)

        dataset = RepertoireDataset(repertoires=[repertoire])

        reference_specs = [("AAAACA", "J2", "1"), ("TADQV", "J3", "2")]
        sequences = [
            ReceptorSequence(aa_sequence,
                             metadata=SequenceMetadata(chain="A", v_gene="V1", j_gene=j_gene),
                             identifier=identifier)
            for aa_sequence, j_gene, identifier in reference_specs
        ]

        matcher = SequenceMatcher()
        result = matcher.match(dataset, sequences, 2, SequenceMatchingSummaryType.PERCENTAGE)

        self.assertTrue("repertoires" in result)
        fourth_sequence_matches = result["repertoires"][0]["sequences"][3]["matching_sequences"]
        self.assertEqual(1, len(fourth_sequence_matches))
        self.assertTrue(result["repertoires"][0]["metadata"]["CD"])
        self.assertEqual(1, len(result["repertoires"]))

        shutil.rmtree(path)
    def test_match_repertoire(self):
        """SequenceMatcher.match_repertoire on a four-sequence repertoire.

        With the COUNT summary the result must list all four sequences, three
        of them with exactly one match. With CLONAL_PERCENTAGE the reported
        value must be 0.8.
        """
        path = EnvironmentSettings.root_path / "test/tmp/seqmatchrep/"
        PathBuilder.build(path)

        # (amino acid sequence, identifier, count)
        repertoire_specs = [("AAAAAA", "1", 3),
                            ("CCCCCC", "2", 2),
                            ("AAAACC", "3", 1),
                            ("TADQVF", "4", 4)]
        repertoire = Repertoire.build_from_sequence_objects(
            sequence_objects=[
                ReceptorSequence(amino_acid_sequence=aa_sequence,
                                 identifier=identifier,
                                 metadata=SequenceMetadata(chain="A", count=count))
                for aa_sequence, identifier, count in repertoire_specs
            ],
            metadata={"CD": True},
            path=path)

        sequences = [ReceptorSequence(aa_sequence, metadata=SequenceMetadata(chain="A"))
                     for aa_sequence in ("AAAACA", "TADQV")]

        matcher = SequenceMatcher()
        result = matcher.match_repertoire(repertoire, 0, sequences, 2,
                                          SequenceMatchingSummaryType.COUNT)

        for expected_key in ("sequences", "repertoire", "repertoire_index"):
            self.assertTrue(expected_key in result)

        self.assertEqual(4, len(result["sequences"]))
        # Expected number of matches per repertoire sequence, in repertoire order.
        for position, expected_matches in enumerate((1, 0, 1, 1)):
            self.assertEqual(expected_matches,
                             len(result["sequences"][position]["matching_sequences"]))

        matched_total = sum(1 for entry in result["sequences"]
                            if len(entry["matching_sequences"]) > 0)
        self.assertEqual(3, matched_total)
        self.assertTrue(result["metadata"]["CD"])

        result = matcher.match_repertoire(repertoire, 0, sequences, 2,
                                          SequenceMatchingSummaryType.CLONAL_PERCENTAGE)
        self.assertEqual(0.8, result["clonal_percentage"])

        shutil.rmtree(path)
Beispiel #14
0
    def _make_sequence_object(self, row, load_implants: bool = False):
        """Convert one structured record of repertoire data into a ReceptorSequence.

        Columns missing from the record map to None, except the frame type,
        which falls back to "IN". A count that NumpyHelper reports as NaN or
        empty is also mapped to None.

        :param row: record exposing ``dtype.names`` and access by column name
        :param load_implants: when True, every non-standard column with a truthy
            value is parsed with ``ast.literal_eval`` and turned into an
            ImplantAnnotation; values that cannot be parsed are skipped
        :return: the assembled ReceptorSequence
        """
        fields = row.dtype.names

        def column(name, missing=None):
            return row[name] if name in fields else missing

        implants = []
        if load_implants:
            for key in fields:
                if key in Repertoire.FIELDS:
                    continue
                raw_value = row[key]
                if not raw_value:
                    continue
                try:
                    implants.append(ImplantAnnotation(**ast.literal_eval(raw_value)))
                except (SyntaxError, ValueError, TypeError):
                    # Best effort: not every extra column holds an implant description.
                    pass

        count = None
        if "counts" in fields and not NumpyHelper.is_nan_or_empty(row['counts']):
            count = row["counts"]

        # Custom parameters are the fields of this object that are not standard repertoire fields.
        custom_params = {key: column(key) for key in set(self.fields) - set(Repertoire.FIELDS)}

        metadata = SequenceMetadata(v_gene=column("v_genes"),
                                    j_gene=column("j_genes"),
                                    v_subgroup=column("v_subgroups"),
                                    j_subgroup=column("j_subgroups"),
                                    v_allele=column("v_alleles"),
                                    j_allele=column("j_alleles"),
                                    chain=column("chains"),
                                    count=count,
                                    region_type=column("region_types"),
                                    frame_type=column("frame_types", "IN"),
                                    cell_id=column("cell_ids"),
                                    custom_params=custom_params)

        return ReceptorSequence(amino_acid_sequence=column("sequence_aas"),
                                nucleotide_sequence=column("sequences"),
                                identifier=column("sequence_identifiers"),
                                metadata=metadata,
                                annotation=SequenceAnnotation(implants=implants))
Beispiel #15
0
 def create_from_record(cls, record: np.void):
     """Rebuild a ReceptorSequence from a stored record of the current version.

     String-typed entries of ReceptorSequence.FIELDS (except 'version') are
     copied from the record; 'metadata' and 'annotation' are decoded from JSON
     and passed as keyword arguments to SequenceMetadata / SequenceAnnotation.

     :param record: record exposing ``dtype.names`` and access by field name
     :return: the reconstructed ReceptorSequence
     :raises NotImplementedError: if the record has no 'version' field or its
         version differs from ``cls.version``
     """
     version_matches = 'version' in record.dtype.names and record['version'] == cls.version
     if not version_matches:
         raise NotImplementedError

     constructor_args = {
         name: record[name]
         for name, value_type in ReceptorSequence.FIELDS.items()
         if value_type == str and name != 'version'
     }
     constructor_args['metadata'] = SequenceMetadata(**json.loads(record['metadata']))
     constructor_args['annotation'] = SequenceAnnotation(**json.loads(record['annotation']))

     return ReceptorSequence(**constructor_args)
    def _import_from_files(filenames: List[str], generic_params: DatasetImportParams) -> ReceptorDataset:
        """Read paired-chain receptors from delimited files and build a ReceptorDataset.

        For every file: load the configured columns, drop rows with missing
        values and duplicate rows, rename columns via the column mapping, trim
        the alpha/beta sequence columns, derive gene columns per chain, and
        turn every remaining row into a receptor object.

        :param filenames: paths of the files to read
        :param generic_params: import settings; the attributes read here are separator,
            columns_to_load, column_mapping, receptor_chains, region_type,
            sequence_file_size and result_path
        :return: the value returned by ReceptorDataset.build
        """
        elements = []

        for file in filenames:
            df = pd.read_csv(file, sep=generic_params.separator, usecols=generic_params.columns_to_load)
            # dropna() and drop_duplicates() return new frames; without the assignment they
            # were no-ops. Reset the index so later steps see a contiguous one.
            df = df.dropna().drop_duplicates().reset_index(drop=True)
            df.rename(columns=generic_params.column_mapping, inplace=True)

            # NOTE(review): presumably removes flanking positions the target region type
            # excludes (1 residue / 3 nucleotides per side) -- confirm.
            if "alpha_amino_acid_sequence" in df:
                df["alpha_amino_acid_sequence"] = df["alpha_amino_acid_sequence"].str[1:-1]
            if "beta_amino_acid_sequence" in df:
                df["beta_amino_acid_sequence"] = df["beta_amino_acid_sequence"].str[1:-1]
            if "alpha_nucleotide_sequence" in df:
                df["alpha_nucleotide_sequence"] = df["alpha_nucleotide_sequence"].str[3:-3]
            if "beta_nucleotide_sequence" in df:
                df["beta_nucleotide_sequence"] = df["beta_nucleotide_sequence"].str[3:-3]

            chain_vals = list(generic_params.receptor_chains.value)
            chain_names = [Chain.get_chain(ch).name.lower() for ch in chain_vals]

            for chain_name in chain_names:
                df = SingleLineReceptorImport.make_gene_columns(df, ["v", "j"], chain_name)

            # Row keys containing any of these fragments are not passed on as receptor metadata.
            excluded_fragments = ["v_gene", 'j_gene', "count", "identifier"] + chain_names

            for _, row in df.iterrows():
                sequences = {
                    chain_val: ReceptorSequence(
                        amino_acid_sequence=row[chain_name + "_amino_acid_sequence"]
                        if chain_name + "_amino_acid_sequence" in row else None,
                        nucleotide_sequence=row[chain_name + "_nucleotide_sequence"]
                        if chain_name + "_nucleotide_sequence" in row else None,
                        metadata=SequenceMetadata(
                            v_gene=row[f"{chain_name}_v_gene"], v_allele=row[f"{chain_name}_v_allele"],
                            v_subgroup=row[f'{chain_name}_v_subgroup'],
                            j_gene=row[f"{chain_name}_j_gene"], j_allele=row[f"{chain_name}_j_allele"],
                            j_subgroup=row[f'{chain_name}_j_subgroup'],
                            chain=chain_name, count=row["count"], region_type=generic_params.region_type.value))
                    for chain_val, chain_name in zip(chain_vals, chain_names)}

                receptor_metadata = {key: row[key] for key in row.keys()
                                     if all(fragment not in key for fragment in excluded_fragments)}
                elements.append(ReceptorBuilder.build_object(sequences, row["identifier"], receptor_metadata))

        return ReceptorDataset.build(elements, generic_params.sequence_file_size, generic_params.result_path)
Beispiel #17
0
    def _make_sequence_object(self, row):
        """Convert one structured record of repertoire data into a ReceptorSequence.

        Every column whose name contains "signal" and whose value is truthy is
        parsed with ``ast.literal_eval`` and turned into an ImplantAnnotation.
        Columns missing from the record map to None, except the frame type,
        which falls back to "IN".

        :param row: record exposing ``dtype.names`` and access by column name
        :return: the assembled ReceptorSequence
        """
        fields = row.dtype.names

        def column(name, missing=None):
            return row[name] if name in fields else missing

        implants = []
        for key in fields:
            if "signal" not in key:
                continue
            raw_value = row[key]
            if raw_value:
                implants.append(ImplantAnnotation(**ast.literal_eval(raw_value)))

        # Custom parameters are the fields of this object that are not standard repertoire fields.
        custom_params = {key: column(key) for key in set(self.fields) - set(Repertoire.FIELDS)}

        metadata = SequenceMetadata(v_gene=column("v_genes"),
                                    j_gene=column("j_genes"),
                                    v_subgroup=column("v_subgroups"),
                                    j_subgroup=column("j_subgroups"),
                                    v_allele=column("v_alleles"),
                                    j_allele=column("j_alleles"),
                                    chain=column("chains"),
                                    count=column("counts"),
                                    region_type=column("region_types"),
                                    frame_type=column("frame_types", "IN"),
                                    cell_id=column("cell_ids"),
                                    custom_params=custom_params)

        return ReceptorSequence(amino_acid_sequence=column("sequence_aas"),
                                nucleotide_sequence=column("sequences"),
                                identifier=column("sequence_identifiers"),
                                metadata=metadata,
                                annotation=SequenceAnnotation(implants=implants))
Beispiel #18
0
    def import_sequence(row, metadata_columns=None) -> ReceptorSequence:
        """Build a ReceptorSequence from a row that supports ``in`` and access by column name.

        A column that is absent, or whose value is None, yields None. Gene,
        allele, sequence and identifier values are converted with ``str``;
        counts with ``int``; chain, region type and frame type are passed
        through unchanged.

        :param row: source row
        :param metadata_columns: names of extra columns to copy into custom_params
            when present in the row; None means no extra columns
        :return: the assembled ReceptorSequence
        """
        if metadata_columns is None:
            metadata_columns = []

        def field(column, convert=None):
            # None for a missing column or a None value, otherwise the (optionally converted) value.
            if column not in row or row[column] is None:
                return None
            value = row[column]
            return value if convert is None else convert(value)

        custom_params = {custom_col: row[custom_col] for custom_col in metadata_columns if custom_col in row}

        metadata = SequenceMetadata(v_gene=field("v_genes", str),
                                    v_allele=field("v_alleles", str),
                                    j_gene=field("j_genes", str),
                                    j_allele=field("j_alleles", str),
                                    chain=field("chains"),
                                    region_type=field("region_types"),
                                    count=field("counts", int),
                                    frame_type=field("frame_types"),
                                    custom_params=custom_params)

        return ReceptorSequence(amino_acid_sequence=field("sequence_aas", str),
                                nucleotide_sequence=field("sequences", str),
                                identifier=field("sequence_identifiers", str),
                                metadata=metadata)
Beispiel #19
0
    def process_iris_chain(row, chain, dual_chain_id, all_genes):
        """
        Create the receptor sequences for one chain of a row.

        V and J names are read from the "TR<chain> - V gene (1)" and "TR<chain> - J gene (1)"
        fields, split on " | ", and stripped of the "TR<chain>" prefix and of any remaining
        occurrence of ``chain``. The amino acid sequence is read from
        "Chain: TR<chain> (<dual_chain_id>)".

        Args:
            row: object indexable by the field names above
            chain: chain name used both in the field names and in the metadata
            dual_chain_id: identifier used in the sequence field name and stored in
                custom_params under "dual_chain_id"
            all_genes: if truthy, one sequence is created for every V/J combination;
                otherwise a single sequence with one V and one J is created

        Returns:
            ReceptorSequenceList with the created ReceptorSequence objects
        """
        sequences = ReceptorSequenceList()
        chain_prefix = "TR{}".format(chain)

        def extract_alleles(column):
            names = row[column].split(" | ")
            return set([name.replace(chain_prefix, "").replace(chain, "") for name in names])

        def build_sequence(v_allele, j_allele):
            # gene = part before the allele delimiter, subgroup = part before the first "-"
            metadata = SequenceMetadata(v_gene=v_allele.split(Constants.ALLELE_DELIMITER)[0],
                                        v_allele=v_allele,
                                        v_subgroup=v_allele.split("-")[0],
                                        j_gene=j_allele.split(Constants.ALLELE_DELIMITER)[0],
                                        j_allele=j_allele,
                                        j_subgroup=j_allele.split("-")[0],
                                        chain=chain,
                                        custom_params={"dual_chain_id": dual_chain_id})
            return ReceptorSequence(amino_acid_sequence=row[f"Chain: TR{chain} ({dual_chain_id})"],
                                    metadata=metadata)

        v_alleles = extract_alleles("TR{} - V gene (1)".format(chain))
        j_alleles = extract_alleles("TR{} - J gene (1)".format(chain))

        if all_genes:
            allele_pairs = [(v, j) for v in v_alleles for j in j_alleles]
        else:
            # take a single V and J; set.pop() removes an arbitrary element
            allele_pairs = [(v_alleles.pop(), j_alleles.pop())]

        for v, j in allele_pairs:
            sequences.append(build_sequence(v, j))

        return sequences
Beispiel #20
0
    def test_encode(self):
        """Encode two repertoires with EvennessProfileEncoder and check the first profile values."""
        path = EnvironmentSettings.root_path / "test/tmp/evennessenc/"
        PathBuilder.build(path)

        def sequences_with_count(count):
            # 1000 identical "AAA" sequences sharing the given count
            return [ReceptorSequence("AAA", metadata=SequenceMetadata(count=count)) for _ in range(1000)]

        # rep1 mixes three different counts, rep2 has a single count for all sequences
        rep1 = Repertoire.build_from_sequence_objects(
            sequence_objects=sequences_with_count(10) + sequences_with_count(100) + sequences_with_count(1),
            metadata={"l1": "test_1", "l2": 2},
            path=path)

        rep2 = Repertoire.build_from_sequence_objects(
            sequence_objects=sequences_with_count(10),
            metadata={"l1": "test_2", "l2": 3},
            path=path)

        lc = LabelConfiguration()
        lc.add_label("l1", ["test_1", "test_2"])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        encoder = EvennessProfileEncoder.build_object(dataset, min_alpha=0, max_alpha=10, dimension=51)
        first_params = EncoderParams(result_path=path / "1/", label_config=lc)
        d1 = encoder.encode(dataset, first_params)

        # second run with a different dimension and two workers; its result is not asserted on
        encoder = EvennessProfileEncoder.build_object(dataset, min_alpha=0, max_alpha=10, dimension=11)
        second_params = EncoderParams(result_path=path, label_config=lc, pool_size=2)
        d2 = encoder.encode(dataset, second_params)

        examples = d1.encoded_data.examples
        self.assertAlmostEqual(examples[0, 0], 1)
        self.assertAlmostEqual(examples[0, 1], 0.786444)
        self.assertAlmostEqual(examples[1, 0], 1)
        self.assertAlmostEqual(examples[1, 1], 1)

        shutil.rmtree(path)
    def test(self):
        """Encode nine sequences with KmerFreqSequenceEncoder and check ids, shape and equal rows."""
        # Three sequence patterns repeat in order; label "l1" alternates 1, 2, 1, ...
        patterns = ["AAACCC", "ACACAC", "CCCAAA"]
        sequences = [
            ReceptorSequence(amino_acid_sequence=patterns[idx % 3],
                             nucleotide_sequence=patterns[idx % 3],
                             identifier=str(idx + 1),
                             metadata=SequenceMetadata(custom_params={"l1": 1 if idx % 2 == 0 else 2}))
            for idx in range(9)
        ]

        path = EnvironmentSettings.tmp_test_path / "kmrefreqseqfacencoder/"
        PathBuilder.build(path)
        data_path = PathBuilder.build(path / 'data')
        dataset = SequenceDataset.build_from_objects(sequences, 100, data_path, 'd2')

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        encoder = KmerFreqSequenceEncoder.build_object(
            dataset,
            normalization_type=NormalizationType.RELATIVE_FREQUENCY.name,
            reads=ReadsType.UNIQUE.name,
            sequence_encoding=SequenceEncodingType.CONTINUOUS_KMER.name,
            sequence_type=SequenceType.NUCLEOTIDE.name,
            k=3)

        params = EncoderParams(result_path=path / "2/",
                               label_config=lc,
                               pool_size=2,
                               learn_model=True,
                               model={},
                               filename="dataset.csv")
        encoded_dataset = encoder.encode(dataset, params)

        self.assertEqual(9, encoded_dataset.encoded_data.examples.shape[0])

        expected_ids = [str(number) for number in range(1, 10)]
        self.assertTrue(all(identifier in encoded_dataset.encoded_data.example_ids
                            for identifier in expected_ids))

        # sequences "1" and "4" are identical, so their encoded rows must match
        first_row = encoded_dataset.encoded_data.examples[0].A
        fourth_row = encoded_dataset.encoded_data.examples[3].A
        self.assertTrue(numpy.array_equal(first_row, fourth_row))

        shutil.rmtree(path)
Beispiel #22
0
    def generate_sequence_dataset(sequence_count: int,
                                  length_probabilities: dict, labels: dict,
                                  path: Path):
        """
        Creates sequence_count receptor sequences (single chain) where the length of each sequence is sampled independently from the
        length_probabilities distribution. The labels are also randomly assigned to sequences from the distribution given in
        labels. In this case, labels are multi-class, so each sequence will get one class from each label. This means that negative
        classes for the labels should be included as well in the specification.

        The sequences are stored in a single file "batch01.npy" under path, and each sequence additionally gets a
        "subject" entry ("subj_1", "subj_2", ...) in its custom parameters.

        An example of input parameters is given below:

        sequence_count: 100 # generate 100 TRB ReceptorSequences
        length_probabilities:
            14: 0.8 # 80% of all generated sequences will have length 14
            15: 0.2 # 20% of all generated sequences will have length 15
        labels:
            epitope1: # label name
                True: 0.5 # 50% of the sequences will have class True
                False: 0.5 # 50% of the sequences will have class False
            epitope2: # next label with classes that will be assigned to sequences independently of the previous label or other parameters
                1: 0.3 # 30% of the generated sequences will have class 1
                0: 0.7 # 70% of the generated sequences will have class 0

        Returns:
            SequenceDataset referring to the stored file, with the classes of each label as its labels
        """
        RandomDatasetGenerator._check_sequence_dataset_generation_params(
            sequence_count, length_probabilities, labels, path)

        alphabet = EnvironmentSettings.get_sequence_alphabet()
        PathBuilder.build(path)

        chain = "TRB"

        # loop-invariant populations and weights, materialized once
        lengths = list(length_probabilities.keys())
        length_weights = list(length_probabilities.values())
        label_distributions = {label: (list(label_dict.keys()), list(label_dict.values()))
                               for label, label_dict in labels.items()}

        sequences = []
        for i in range(sequence_count):
            # sampling order (length, residues, labels) is kept so seeded runs stay reproducible
            length = random.choices(lengths, length_weights)[0]
            amino_acids = "".join(random.choices(alphabet, k=length))

            custom_params = {label: random.choices(classes, weights, k=1)[0]
                             for label, (classes, weights) in label_distributions.items()}
            custom_params["subject"] = f"subj_{i + 1}"

            sequences.append(ReceptorSequence(
                amino_acids,
                metadata=SequenceMetadata(count=1,
                                          v_subgroup=chain + "V1",
                                          v_gene=chain + "V1-1",
                                          v_allele=chain + "V1-1*01",
                                          j_subgroup=chain + "J1",
                                          j_gene=chain + "J1-1",
                                          j_allele=chain + "J1-1*01",
                                          chain=chain,
                                          custom_params=custom_params)))

        filename = path / "batch01.npy"

        # np.rec is the public home of fromrecords; np.core is a private namespace
        sequence_matrix = np.rec.fromrecords(
            [seq.get_record() for seq in sequences],
            names=ReceptorSequence.get_record_names())
        np.save(str(filename), sequence_matrix, allow_pickle=False)

        return SequenceDataset(labels={label: list(label_dict.keys())
                                       for label, label_dict in labels.items()},
                               filenames=[filename],
                               file_size=sequence_count)
Beispiel #23
0
    def generate_receptor_dataset(receptor_count: int,
                                  chain_1_length_probabilities: dict,
                                  chain_2_length_probabilities: dict,
                                  labels: dict, path: Path):
        """
        Creates receptor_count receptors where the length of sequences in each chain is sampled independently for each sequence from
        chain_n_length_probabilities distribution. The labels are also randomly assigned to receptors from the distribution given in
        labels. In this case, labels are multi-class, so each receptor will get one class from each label. This means that negative
        classes for the labels should be included as well in the specification. chain 1 and 2 in this case refer to alpha and beta
        chain of a T-cell receptor.

        The receptors are stored in a single file "batch01.npy" under path, and each receptor additionally gets a
        "subject" entry ("subj_1", "subj_2", ...) in its metadata.

        An example of input parameters is given below:

        receptor_count: 100 # generate 100 TRABReceptors
        chain_1_length_probabilities:
            14: 0.8 # 80% of all generated sequences for all receptors (for chain 1) will have length 14
            15: 0.2 # 20% of all generated sequences across all receptors (for chain 1) will have length 15
        chain_2_length_probabilities:
            14: 0.8 # 80% of all generated sequences for all receptors (for chain 2) will have length 14
            15: 0.2 # 20% of all generated sequences across all receptors (for chain 2) will have length 15
        labels:
            epitope1: # label name
                True: 0.5 # 50% of the receptors will have class True
                False: 0.5 # 50% of the receptors will have class False
            epitope2: # next label with classes that will be assigned to receptors independently of the previous label or other parameters
                1: 0.3 # 30% of the generated receptors will have class 1
                0: 0.7 # 70% of the generated receptors will have class 0

        Returns:
            ReceptorDataset referring to the stored file, with the classes of each label as its labels
        """
        RandomDatasetGenerator._check_receptor_dataset_generation_params(
            receptor_count, chain_1_length_probabilities,
            chain_2_length_probabilities, labels, path)

        alphabet = EnvironmentSettings.get_sequence_alphabet()
        PathBuilder.build(path)

        def make_chain_sequence(length_probabilities, chain, cell_id):
            # draw the length first, then the residues, so seeded runs stay reproducible
            length = random.choices(list(length_probabilities.keys()),
                                    list(length_probabilities.values()))[0]
            return ReceptorSequence(
                "".join(random.choices(alphabet, k=length)),
                metadata=SequenceMetadata(count=1,
                                          v_subgroup=chain + "V1",
                                          v_gene=chain + "V1-1",
                                          v_allele=chain + "V1-1*01",
                                          j_subgroup=chain + "J1",
                                          j_gene=chain + "J1-1",
                                          j_allele=chain + "J1-1*01",
                                          chain=chain,
                                          cell_id=cell_id))

        receptors = []
        for i in range(receptor_count):
            # alpha chain, beta chain and labels are sampled in this fixed order
            alpha = make_chain_sequence(chain_1_length_probabilities, "TRA", i)
            beta = make_chain_sequence(chain_2_length_probabilities, "TRB", i)

            metadata = {label: random.choices(list(label_dict.keys()), list(label_dict.values()), k=1)[0]
                        for label, label_dict in labels.items()}
            metadata["subject"] = f"subj_{i + 1}"

            receptors.append(TCABReceptor(alpha=alpha, beta=beta, metadata=metadata))

        filename = path / "batch01.npy"

        # np.rec is the public home of fromrecords; np.core is a private namespace
        receptor_matrix = np.rec.fromrecords(
            [receptor.get_record() for receptor in receptors],
            names=TCABReceptor.get_record_names())
        np.save(str(filename), receptor_matrix, allow_pickle=False)

        return ReceptorDataset(labels={label: list(label_dict.keys())
                                       for label, label_dict in labels.items()},
                               filenames=[filename],
                               file_size=receptor_count,
                               element_class_name=type(receptors[0]).__name__ if receptors else None)