def create_dummy_sequencedataset(self, path):
    """Build a small SequenceDataset fixture with one alpha and two beta sequences.

    The two beta sequences ("1b" and "2b") are identical apart from their
    identifier. The dataset is built via SequenceDataset.build with the
    literal 2 as second argument (presumably a batch/file size -- confirm
    against SequenceDataset.build) and "<path>sequences" as target path.
    """

    def beta_sequence(identifier):
        # A fresh metadata object per call, so the beta entries share no state.
        beta_metadata = SequenceMetadata(v_gene="TRBV1", j_gene="TRBJ1", chain=Chain.BETA, frame_type="IN",
                                         custom_params={"d_call": "TRBD1", "custom2": "cust1"})
        return ReceptorSequence(amino_acid_sequence="ATATAT", identifier=identifier, metadata=beta_metadata)

    alpha_metadata = SequenceMetadata(v_gene="TRAV1", j_gene="TRAJ1", chain=Chain.ALPHA, frame_type="IN",
                                      custom_params={"d_call": "TRAD1", "custom1": "cust1"})
    alpha = ReceptorSequence(amino_acid_sequence="AAATTT", identifier="1a", metadata=alpha_metadata)

    dataset_path = "{}sequences".format(path)
    return SequenceDataset.build([alpha, beta_sequence("1b"), beta_sequence("2b")], 2, dataset_path)
def test_encode_sequence(self):
    """IdentitySequenceEncoder must return ["AAA"] for the OUT, STOP and IN frame types."""
    for frame_type in ("OUT", "STOP", "IN"):
        sequence = ReceptorSequence(amino_acid_sequence="AAA",
                                    metadata=SequenceMetadata(frame_type=frame_type))
        encoder = IdentitySequenceEncoder()
        params = EncoderParams(model={}, label_config=LabelConfiguration(), result_path="")

        self.assertEqual(["AAA"], encoder.encode_sequence(sequence, params))
def create_dataset(self, path, dataset_size: int = 50):
    """Pickle `dataset_size` labelled sequences under `path` and wrap them in a SequenceDataset.

    Sequences at even positions are "AAACCC" with label l1=1, sequences at odd
    positions are "ACACAC" with label l1=2; identifiers are the stringified
    positions.

    Args:
        path: directory prefix (expected to end with a path separator since
            the file name is appended by plain string formatting).
        dataset_size: number of sequences to generate.

    Returns:
        SequenceDataset pointing at the single pickle file that was written.
    """
    sequences = [
        ReceptorSequence(amino_acid_sequence="AAACCC" if i % 2 == 0 else "ACACAC",
                         identifier=str(i),
                         metadata=SequenceMetadata(custom_params={"l1": 1 if i % 2 == 0 else 2}))
        for i in range(dataset_size)
    ]

    PathBuilder.build(path)
    filename = "{}sequences.pkl".format(path)
    with open(filename, "wb") as file:
        pickle.dump(sequences, file)

    # The LabelConfiguration that used to be built here was never used or
    # returned, so it has been removed.
    return SequenceDataset(params={"l1": [1, 2]}, filenames=[filename], identifier="d1")
def create_dummy_receptordataset(self, path):
    """Build a ReceptorDataset fixture holding two alpha/beta receptors.

    Receptor "1" carries the custom parameter "custom1" on both chains,
    receptor "2" carries "custom2". The dataset is built via
    ReceptorDataset.build with the literal 2 as second argument (presumably a
    batch/file size -- confirm against ReceptorDataset.build) and
    "<path>receptors" as target path.
    """

    def chain_sequence(amino_acids, identifier, chain, gene_prefix, custom_key):
        # gene_prefix is "TRA" or "TRB"; the V/J/D names are derived from it.
        metadata = SequenceMetadata(v_gene=gene_prefix + "V1", j_gene=gene_prefix + "J1", chain=chain,
                                    frame_type="IN",
                                    custom_params={"d_call": gene_prefix + "D1", custom_key: "cust1"})
        return ReceptorSequence(amino_acid_sequence=amino_acids, identifier=identifier, metadata=metadata)

    def receptor(identifier, alpha_aa, beta_aa, custom_key):
        # Alpha is constructed before beta, as keyword arguments evaluate left to right.
        return TCABReceptor(identifier=identifier,
                            alpha=chain_sequence(alpha_aa, identifier + "a", Chain.ALPHA, "TRA", custom_key),
                            beta=chain_sequence(beta_aa, identifier + "b", Chain.BETA, "TRB", custom_key))

    receptors = [
        receptor("1", "AAATTT", "ATATAT", "custom1"),
        receptor("2", "AAAAAA", "AAAAAA", "custom2"),
    ]

    return ReceptorDataset.build(receptors, 2, "{}receptors".format(path))
def test_process(self):
    """ChainRepertoireFilter keeps only the alpha-chain repertoire and rejects the GAMMA chain."""
    path = EnvironmentSettings.root_path + "test/tmp/chain_filter/"
    PathBuilder.build(path)

    # One single-sequence repertoire per chain: "A" first, "B" second.
    repertoires = [
        Repertoire.build_from_sequence_objects(
            [ReceptorSequence(amino_acids, metadata=SequenceMetadata(chain=chain), identifier=identifier)],
            path=path, metadata={})
        for amino_acids, chain, identifier in (("AAA", "A", "1"), ("AAC", "B", "2"))
    ]

    metadata_file = path + "metadata.csv"
    pd.DataFrame({"CD": [1, 0]}).to_csv(metadata_file)

    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata_file)

    filtered = ChainRepertoireFilter.process(dataset, {"keep_chain": "ALPHA", "result_path": path + "results/"})

    # The filtered dataset has one repertoire; the input dataset still has both.
    self.assertEqual(1, len(filtered.get_data()))
    self.assertEqual(2, len(dataset.get_data()))

    cd_values = filtered.get_metadata(["CD"])["CD"]
    self.assertEqual(1, len(cd_values))
    self.assertEqual(1, cd_values[0])

    for rep in filtered.get_data():
        self.assertEqual("AAA", rep.sequences[0].get_sequence())

    gamma_params = {"keep_chain": "GAMMA", "result_path": path + "results/"}
    self.assertRaises(AssertionError, ChainRepertoireFilter.process, dataset, gamma_params)

    shutil.rmtree(path)
def _construct_test_dataset(self, path):
    """Pickle three labelled sequences under `path` and return (dataset, label configuration).

    The label configuration declares both "l1" and "l2" with classes [1, 2];
    the SequenceDataset itself is created with params for "l1" only.
    """
    # (amino acid sequence, identifier, l1, l2)
    specs = (("AAAA", "1", 1, 1), ("ATA", "2", 2, 1), ("ATT", "3", 1, 2))
    sequences = [
        ReceptorSequence(amino_acid_sequence=amino_acids, identifier=identifier,
                         metadata=SequenceMetadata(custom_params={"l1": l1, "l2": l2}))
        for amino_acids, identifier, l1, l2 in specs
    ]

    filename = "{}sequences.pkl".format(path)
    with open(filename, "wb") as file:
        pickle.dump(sequences, file)

    label_config = LabelConfiguration()
    for label_name in ("l1", "l2"):
        label_config.add_label(label_name, [1, 2])

    dataset = SequenceDataset(params={"l1": [1, 2]}, filenames=[filename], identifier="d1")
    return dataset, label_config
def build(sequences: list, path: str, labels: dict = None, seq_metadata: list = None, subject_ids: list = None):
    """Create one repertoire per inner list of `sequences` and write a metadata.csv for them.

    Args:
        sequences: list of lists of amino acid sequences; one inner list per repertoire.
        path: directory prefix; repertoires go to "<path>repertoires/" and the
            metadata file to "<path>metadata.csv".
        labels: optional mapping label name -> per-repertoire values; added to
            each repertoire's metadata and to the metadata file.
        seq_metadata: optional list (per repertoire) of lists (per sequence) of
            keyword dicts passed to SequenceMetadata; when omitted, a fixed
            TRB default is used for every sequence.
        subject_ids: optional per-repertoire subject ids; when omitted,
            "rep_<index>" is generated.

    Returns:
        Tuple of (list of repertoires, path to the written metadata.csv).
    """
    if subject_ids is not None:
        assert len(subject_ids) == len(sequences)

    if seq_metadata is not None:
        assert len(sequences) == len(seq_metadata)
        for index, sequence_list in enumerate(sequences):
            assert len(sequence_list) == len(seq_metadata[index])

    PathBuilder.build(path)
    rep_path = PathBuilder.build(path + "repertoires/")

    default_metadata_kwargs = dict(v_subgroup="TRBV1", v_gene="TRBV1-1", v_allele="TRBV1-1*01",
                                   j_subgroup="TRBJ1", j_gene="TRBJ1-1", j_allele="TRBJ1-1*01",
                                   count=1, chain="TRB", region_type="IMGT_CDR3")

    if subject_ids is None:
        subject_ids = []

    repertoires = []
    for rep_index, sequence_list in enumerate(sequences):
        rep_sequences = ReceptorSequenceList()

        # Fill in a generated subject id while fewer ids than repertoires exist.
        if len(subject_ids) < len(sequences):
            subject_ids.append("rep_" + str(rep_index))

        for seq_index, sequence in enumerate(sequence_list):
            if seq_metadata is None:
                metadata_kwargs = default_metadata_kwargs
            else:
                metadata_kwargs = seq_metadata[rep_index][seq_index]
            rep_sequences.append(ReceptorSequence(amino_acid_sequence=sequence,
                                                  metadata=SequenceMetadata(**metadata_kwargs),
                                                  identifier=str(seq_index)))

        rep_metadata = {}
        if labels is not None:
            rep_metadata = {key: values[rep_index] for key, values in labels.items()}
        rep_metadata["subject_id"] = subject_ids[rep_index]

        repertoires.append(Repertoire.build_from_sequence_objects(rep_sequences, rep_path, rep_metadata))

    columns = {
        "filename": [f"{repertoire.identifier}_data.npy" for repertoire in repertoires],
        "subject_id": subject_ids,
        "repertoire_identifier": [repertoire.identifier for repertoire in repertoires],
    }
    if labels is not None:
        columns.update(labels)

    metadata_file = path + "metadata.csv"
    pd.DataFrame(columns).to_csv(metadata_file, index=False)

    return repertoires, metadata_file
def create_dummy_repertoire(self, path):
    """Build a two-sequence repertoire under `path` plus a matching one-row metadata.csv.

    Returns:
        Tuple of (repertoire, path to the written metadata.csv).
    """
    beta_sequence = ReceptorSequence(
        amino_acid_sequence="AAA", nucleotide_sequence="GCTGCTGCT", identifier="receptor_1",
        metadata=SequenceMetadata(v_gene="TRBV1", j_gene="TRBJ1", chain=Chain.BETA, count=5,
                                  region_type="IMGT_CDR3", frame_type="IN",
                                  custom_params={"d_call": "TRBD1", "custom_test": "cust1"}))

    # The alpha sequence additionally sets a V allele and has no frame type.
    alpha_sequence = ReceptorSequence(
        amino_acid_sequence="GGG", nucleotide_sequence="GGTGGTGGT", identifier="receptor_2",
        metadata=SequenceMetadata(v_gene="TRAV2", v_allele="TRAV2*01", j_gene="TRAJ2", chain=Chain.ALPHA,
                                  count=15, frame_type=None, region_type="IMGT_CDR3",
                                  custom_params={"d_call": "TRAD2", "custom_test": "cust2"}))

    repertoire = Repertoire.build_from_sequence_objects(sequence_objects=[beta_sequence, alpha_sequence],
                                                        path=path, metadata={"subject_id": "REP1"})

    metadata_file = path + "metadata.csv"
    # NOTE(review): the metadata file records subject_id "1" while the
    # repertoire metadata above uses "REP1" -- confirm this is intended.
    pd.DataFrame({
        "filename": [f"{repertoire.identifier}_data.npy"],
        "subject_id": ["1"],
        "repertoire_identifier": [repertoire.identifier],
    }).to_csv(metadata_file, index=False)

    return repertoire, metadata_file
def generate_receptor_dataset(receptor_count: int, chain_1_length_probabilities: dict, chain_2_length_probabilities: dict, labels: dict, path: str):
    """
    Creates receptor_count receptors where the length of the sequence in each chain is sampled independently
    for each sequence from the corresponding chain_n_length_probabilities distribution. Labels are assigned
    randomly to receptors from the distributions given in labels. Labels are multi-class, so each receptor
    gets exactly one class from each label; negative classes therefore have to be listed explicitly.

    Chain 1 and chain 2 refer to the alpha and beta chain of a T-cell receptor.

    Example of input parameters:

        receptor_count: 100 # generate 100 receptors
        chain_1_length_probabilities:
            14: 0.8 # 80% of generated chain 1 sequences will have length 14
            15: 0.2 # 20% of generated chain 1 sequences will have length 15
        chain_2_length_probabilities:
            14: 0.8 # 80% of generated chain 2 sequences will have length 14
            15: 0.2 # 20% of generated chain 2 sequences will have length 15
        labels:
            epitope1: # label name
                True: 0.5 # 50% of the receptors will have class True
                False: 0.5 # 50% of the receptors will have class False
            epitope2: # assigned independently of the previous label
                1: 0.3 # 30% of the receptors will have class 1
                0: 0.7 # 70% of the receptors will have class 0

    All receptors are pickled into a single file "batch01.pickle" inside path, and a ReceptorDataset
    referring to that file is returned.
    """
    RandomDatasetGenerator._check_receptor_dataset_generation_params(receptor_count, chain_1_length_probabilities,
                                                                     chain_2_length_probabilities, labels, path)

    alphabet = EnvironmentSettings.get_sequence_alphabet()
    PathBuilder.build(path)

    def random_chain_sequence(length_probabilities, chain, cell_id):
        # The length is drawn before the residues, so the random generator is
        # consumed in the same order as a nested call would consume it.
        length = random.choices(list(length_probabilities.keys()), length_probabilities.values())[0]
        residues = "".join(random.choices(alphabet, k=length))
        metadata = SequenceMetadata(count=1,
                                    v_subgroup=chain + "V1", v_gene=chain + "V1-1", v_allele=chain + "V1-1*01",
                                    j_subgroup=chain + "J1", j_gene=chain + "J1-1", j_allele=chain + "J1-1*01",
                                    chain=chain, cell_id=cell_id)
        return ReceptorSequence(residues, metadata=metadata)

    receptors = []
    for i in range(receptor_count):
        # Draw order per receptor: alpha chain, beta chain, then one class per label.
        alpha = random_chain_sequence(chain_1_length_probabilities, "TRA", i)
        beta = random_chain_sequence(chain_2_length_probabilities, "TRB", i)

        receptor_metadata = {label: random.choices(list(label_dict.keys()), label_dict.values(), k=1)[0]
                             for label, label_dict in labels.items()}
        receptor_metadata["subject"] = f"subj_{i + 1}"

        receptors.append(TCABReceptor(alpha=alpha, beta=beta, metadata=receptor_metadata))

    directory = path if path[-1] == '/' else path + '/'
    filename = f"{directory}batch01.pickle"

    with open(filename, "wb") as file:
        pickle.dump(receptors, file)

    dataset_params = {label: list(label_dict.keys()) for label, label_dict in labels.items()}
    return ReceptorDataset(params=dataset_params, filenames=[filename], file_size=receptor_count)
def __init__(self, amino_acid_sequence: str = None, nucleotide_sequence: str = None, identifier: str = None,
             annotation: SequenceAnnotation = None, metadata: SequenceMetadata = None):
    """Store the sequence data, annotation and metadata on the instance.

    Args:
        amino_acid_sequence: amino acid sequence string, if available.
        nucleotide_sequence: nucleotide sequence string, if available.
        identifier: identifier of this sequence.
        annotation: optional SequenceAnnotation.
        metadata: SequenceMetadata for this sequence. When omitted (or None),
            a new empty SequenceMetadata is created for this instance.

    The default used to be a single `SequenceMetadata()` evaluated once at
    function definition time, which made every sequence created without
    explicit metadata share (and mutate) the same object.
    """
    self.identifier = identifier
    self.amino_acid_sequence = amino_acid_sequence
    self.nucleotide_sequence = nucleotide_sequence
    self.annotation = annotation
    # A fresh object per instance prevents state leaking between sequences.
    self.metadata = metadata if metadata is not None else SequenceMetadata()
def _create_new_sequences(self, sequences, new_sequence_count, signal) -> "List[ReceptorSequence]":
    """Replace the last `new_sequence_count` sequences with newly created signal-carrying ones.

    For every replaced sequence a motif is chosen at random from
    `signal.motifs`, instantiated, and turned into a ReceptorSequence whose
    amino acid sequence is the motif instance, annotated with an
    ImplantAnnotation at position 0.

    Args:
        sequences: list of existing sequences; it is not modified.
        new_sequence_count: number of trailing sequences to replace.
        signal: object providing `id` and `motifs`.

    Returns:
        A new list with the retained leading sequences followed by the new ones.
    """
    # `sequences[:-n]` is empty for n == 0 (it equals `sequences[:0]`), which
    # silently dropped every sequence when nothing had to be replaced, so the
    # number of retained sequences is computed explicitly.
    kept_count = max(len(sequences) - new_sequence_count, 0)
    new_sequences = sequences[:kept_count]

    for _ in range(new_sequence_count):
        motif = random.choice(signal.motifs)
        motif_instance = motif.instantiate_motif()
        annotation = SequenceAnnotation([ImplantAnnotation(signal_id=signal.id,
                                                           motif_id=motif.identifier,
                                                           motif_instance=motif_instance.instance,
                                                           position=0)])
        metadata = SequenceMetadata(v_gene="TRBV6-1", j_gene="TRBJ2-7", count=1, chain="B")

        new_sequences.append(ReceptorSequence(amino_acid_sequence=motif_instance.instance,
                                              annotation=annotation, metadata=metadata))

    return new_sequences
def test_match_repertoire(self):
    """SequenceMatcher.match_repertoire reports per-sequence matches and the clonal percentage."""
    path = EnvironmentSettings.root_path + "test/tmp/seqmatchrep/"
    PathBuilder.build(path)

    # (amino acid sequence, identifier, count) of the repertoire sequences
    repertoire_specs = (("AAAAAA", "1", 3), ("CCCCCC", "2", 2), ("AAAACC", "3", 1), ("TADQVF", "4", 4))
    repertoire = Repertoire.build_from_sequence_objects(
        sequence_objects=[ReceptorSequence(amino_acid_sequence=amino_acids, identifier=identifier,
                                           metadata=SequenceMetadata(chain="A", count=count))
                          for amino_acids, identifier, count in repertoire_specs],
        metadata={"CD": True}, path=path)

    reference_sequences = [ReceptorSequence(amino_acids, metadata=SequenceMetadata(chain="A"))
                           for amino_acids in ("AAAACA", "TADQV")]

    matcher = SequenceMatcher()
    result = matcher.match_repertoire(repertoire, 0, reference_sequences, 2, SequenceMatchingSummaryType.COUNT)

    for expected_key in ("sequences", "repertoire", "repertoire_index"):
        self.assertTrue(expected_key in result)

    matched = result["sequences"]
    self.assertEqual(4, len(matched))

    # Expected number of matching reference sequences per repertoire sequence.
    for position, expected_matches in enumerate((1, 0, 1, 1)):
        self.assertEqual(expected_matches, len(matched[position]["matching_sequences"]))

    with_matches = [entry for entry in result["sequences"] if len(entry["matching_sequences"]) > 0]
    self.assertEqual(3, len(with_matches))
    self.assertTrue(result["metadata"]["CD"])

    result = matcher.match_repertoire(repertoire, 0, reference_sequences, 2,
                                      SequenceMatchingSummaryType.CLONAL_PERCENTAGE)
    self.assertEqual(0.8, result["clonal_percentage"])

    shutil.rmtree(path)
def test_match(self):
    """SequenceMatcher.match returns one entry per repertoire with match details and metadata."""
    path = EnvironmentSettings.root_path + "test/tmp/seqmatch/"
    PathBuilder.build(path)

    def make_sequence(amino_acids, j_gene, identifier):
        return ReceptorSequence(amino_acids,
                                metadata=SequenceMetadata(chain="A", v_gene="V1", j_gene=j_gene),
                                identifier=identifier)

    repertoire = Repertoire.build_from_sequence_objects(
        sequence_objects=[
            make_sequence("AAAAAA", "J2", "3"),
            make_sequence("CCCCCC", "J2", "4"),
            make_sequence("AAAACC", "J2", "5"),
            make_sequence("TADQVF", "J3", "6"),
        ],
        metadata={"CD": True}, path=path)

    dataset = RepertoireDataset(repertoires=[repertoire])

    reference_sequences = [make_sequence("AAAACA", "J2", "1"), make_sequence("TADQV", "J3", "2")]

    matcher = SequenceMatcher()
    result = matcher.match(dataset, reference_sequences, 2, SequenceMatchingSummaryType.PERCENTAGE)

    self.assertTrue("repertoires" in result)

    first_repertoire = result["repertoires"][0]
    self.assertEqual(1, len(first_repertoire["sequences"][3]["matching_sequences"]))
    self.assertTrue(first_repertoire["metadata"]["CD"])
    self.assertEqual(1, len(result["repertoires"]))

    shutil.rmtree(path)
def _make_sequence_object(self, row):
    """Convert one structured-array row into a ReceptorSequence.

    Fields missing from the row's dtype become None, except for the frame
    type, which falls back to "IN". Every field whose name contains "signal"
    and holds a truthy value is parsed with ast.literal_eval and turned into
    an ImplantAnnotation. Fields listed in self.fields but not in
    Repertoire.FIELDS are collected into the metadata's custom_params.
    """
    fields = row.dtype.names

    def field_or_none(name):
        return row[name] if name in fields else None

    implants = []
    for signal_key in [name for name in fields if "signal" in name]:
        serialized_implant = row[signal_key]
        if serialized_implant:
            implants.append(ImplantAnnotation(**ast.literal_eval(serialized_implant)))

    custom_params = {name: field_or_none(name) for name in set(self.fields) - set(Repertoire.FIELDS)}

    metadata = SequenceMetadata(v_gene=field_or_none("v_genes"),
                                j_gene=field_or_none("j_genes"),
                                v_subgroup=field_or_none("v_subgroups"),
                                j_subgroup=field_or_none("j_subgroups"),
                                v_allele=field_or_none("v_alleles"),
                                j_allele=field_or_none("j_alleles"),
                                chain=field_or_none("chains"),
                                count=field_or_none("counts"),
                                region_type=field_or_none("region_types"),
                                # unlike the other fields, a missing frame type defaults to "IN"
                                frame_type=row["frame_types"] if "frame_types" in fields else "IN",
                                cell_id=field_or_none("cell_ids"),
                                custom_params=custom_params)

    return ReceptorSequence(amino_acid_sequence=field_or_none("sequence_aas"),
                            nucleotide_sequence=field_or_none("sequences"),
                            identifier=field_or_none("sequence_identifiers"),
                            metadata=metadata,
                            annotation=SequenceAnnotation(implants=implants))
def import_sequence(row, metadata_columns=None) -> ReceptorSequence:
    """Build a ReceptorSequence from a row that supports `in` and item access by column name.

    A column contributes a value only if it is present in the row and its
    value is not None; otherwise the corresponding attribute is None. Gene,
    allele, sequence and identifier values are converted with str(), counts
    with int(); chain, region type and frame type are passed through as is.

    Args:
        row: mapping-like row (e.g. a pandas Series -- TODO confirm against callers).
        metadata_columns: names of extra columns copied into custom_params
            when present in the row; defaults to no extra columns.
    """
    if metadata_columns is None:
        metadata_columns = []

    def column_value(column, convert=None):
        if column in row and row[column] is not None:
            return row[column] if convert is None else convert(row[column])
        return None

    custom_params = {custom_col: row[custom_col] for custom_col in metadata_columns if custom_col in row}

    metadata = SequenceMetadata(v_gene=column_value("v_genes", str),
                                v_allele=column_value("v_alleles", str),
                                j_gene=column_value("j_genes", str),
                                j_allele=column_value("j_alleles", str),
                                chain=column_value("chains"),
                                region_type=column_value("region_types"),
                                count=column_value("counts", int),
                                frame_type=column_value("frame_types"),
                                custom_params=custom_params)

    return ReceptorSequence(amino_acid_sequence=column_value("sequence_aas", str),
                            nucleotide_sequence=column_value("sequences", str),
                            identifier=column_value("sequence_identifiers", str),
                            metadata=metadata)
def process_iris_chain(row, chain, dual_chain_id, all_genes):
    """Create receptor sequences for one chain of an IRIS row.

    The V and J gene columns ("TR<chain> - V gene (1)" / "TR<chain> - J gene (1)")
    hold " | "-separated names; the "TR<chain>" prefix and any remaining
    occurrence of `chain` are stripped from each name and duplicates are
    removed.

    Args:
        row: row with the gene columns and the "Chain: TR<chain> (<dual_chain_id>)" column.
        chain: chain letter used in the column names and stored in the metadata.
        dual_chain_id: id used in the sequence column name and stored as custom parameter.
        all_genes: if truthy, one sequence per (V, J) combination is created;
            otherwise a single sequence with one V and one J gene.

    Returns:
        ReceptorSequenceList with the created sequences.
    """
    sequences = ReceptorSequenceList()

    def strip_chain(gene):
        return gene.replace("TR{}".format(chain), "").replace(chain, "")

    v_alleles = {strip_chain(gene) for gene in row["TR{} - V gene (1)".format(chain)].split(" | ")}
    j_alleles = {strip_chain(gene) for gene in row["TR{} - J gene (1)".format(chain)].split(" | ")}

    def add_sequence(v_allele, j_allele):
        metadata = SequenceMetadata(v_gene=v_allele.split(Constants.ALLELE_DELIMITER)[0],
                                    v_allele=v_allele,
                                    v_subgroup=v_allele.split("-")[0],
                                    j_gene=j_allele.split(Constants.ALLELE_DELIMITER)[0],
                                    j_allele=j_allele,
                                    j_subgroup=j_allele.split("-")[0],
                                    chain=chain,
                                    custom_params={"dual_chain_id": dual_chain_id})
        sequences.append(ReceptorSequence(amino_acid_sequence=row[f"Chain: TR{chain} ({dual_chain_id})"],
                                          metadata=metadata))

    if all_genes:
        for v_allele in v_alleles:
            for j_allele in j_alleles:
                add_sequence(v_allele, j_allele)
    else:
        # set.pop() removes an arbitrary element, so one V and one J gene are picked
        picked_v = v_alleles.pop()
        picked_j = j_alleles.pop()
        add_sequence(picked_v, picked_j)

    return sequences
def test_encode(self):
    """EvennessProfileEncoder yields the expected first two profile values for both repertoires."""
    path = EnvironmentSettings.root_path + "test/tmp/evennessenc/"
    PathBuilder.build(path)

    def sequences_with_count(count):
        return [ReceptorSequence("AAA", metadata=SequenceMetadata(count=count)) for _ in range(1000)]

    # rep1 mixes three count levels, rep2 has uniform counts.
    rep1 = Repertoire.build_from_sequence_objects(
        sequence_objects=sequences_with_count(10) + sequences_with_count(100) + sequences_with_count(1),
        metadata={"l1": "test_1", "l2": 2}, path=path)
    rep2 = Repertoire.build_from_sequence_objects(
        sequence_objects=sequences_with_count(10),
        metadata={"l1": "test_2", "l2": 3}, path=path)

    lc = LabelConfiguration()
    lc.add_label("l1", ["test_1", "test_2"])
    lc.add_label("l2", [0, 3])

    dataset = RepertoireDataset(repertoires=[rep1, rep2])

    encoder = EvennessProfileEncoder.build_object(dataset, min_alpha=0, max_alpha=10, dimension=51)
    d1 = encoder.encode(dataset, EncoderParams(result_path=path + "1/", label_config=lc))

    # Second run with a smaller dimension and two workers; only checked for running through.
    encoder = EvennessProfileEncoder.build_object(dataset, min_alpha=0, max_alpha=10, dimension=11)
    encoder.encode(dataset, EncoderParams(result_path=path, label_config=lc, pool_size=2))

    examples = d1.encoded_data.examples
    self.assertAlmostEqual(examples[0, 0], 1)
    self.assertAlmostEqual(examples[0, 1], 0.786444)
    self.assertAlmostEqual(examples[1, 0], 1)
    self.assertAlmostEqual(examples[1, 1], 1)

    shutil.rmtree(path)
def test_repertoire(self):
    """Round-trip two sequences through Repertoire and check column accessors and rebuilt objects."""
    path = EnvironmentSettings.tmp_test_path + "sequencerepertoire/"
    PathBuilder.build(path)

    alpha_sequence = ReceptorSequence(amino_acid_sequence="AAA", identifier="1",
                                      metadata=SequenceMetadata(v_gene="V1", cell_id="1", chain=Chain.ALPHA,
                                                                custom_params={"cmv": "no", "coeliac": False}))
    beta_sequence = ReceptorSequence(amino_acid_sequence="CCC", identifier="2",
                                     metadata=SequenceMetadata(j_gene="J1", cell_id="1", chain=Chain.BETA,
                                                               custom_params={"cmv": "yes", "coeliac": True}))

    obj = Repertoire.build_from_sequence_objects([alpha_sequence, beta_sequence], path,
                                                 {"cmv": "yes", 'subject_id': "1"})

    self.assertTrue(os.path.isfile(obj.data_filename))
    self.assertTrue(isinstance(obj, Repertoire))

    # (expected column content, accessor) -- accessors are invoked one at a
    # time, right before the corresponding assertion.
    column_checks = (
        (["1", "2"], obj.get_sequence_identifiers),
        (["AAA", "CCC"], obj.get_sequence_aas),
        (["V1", None], obj.get_v_genes),
        ([None, "J1"], obj.get_j_genes),
        (["no", "yes"], lambda: obj.get_attribute("cmv")),
        ([False, True], lambda: obj.get_attribute("coeliac")),
    )
    for expected, accessor in column_checks:
        self.assertTrue(np.array_equal(np.array(expected), accessor()))

    self.assertEqual("yes", obj.metadata["cmv"])
    self.assertEqual("1", obj.metadata["subject_id"])

    rebuilt_sequences = obj.sequences

    self.assertTrue(all(isinstance(seq, ReceptorSequence) for seq in rebuilt_sequences))
    self.assertEqual(2, len(rebuilt_sequences))
    self.assertEqual("1", rebuilt_sequences[0].identifier)
    self.assertEqual("2", rebuilt_sequences[1].identifier)
    self.assertEqual("AAA", rebuilt_sequences[0].amino_acid_sequence)
    self.assertEqual("yes", rebuilt_sequences[1].metadata.custom_params["cmv"])

    obj.free_memory()

    # NOTE(review): both calls below receive a generator object, which is
    # always truthy, so they can never fail and the generators are never
    # evaluated. Wrapping them in all(...) would make them effective, but
    # that needs the state free_memory() leaves in obj.data -- confirm first.
    self.assertTrue(key in obj.data for key in Repertoire.FIELDS)
    self.assertTrue(obj.data[key] is None for key in Repertoire.FIELDS)

    shutil.rmtree(path)
def test_receptor(self):
    """A repertoire of seven chains spread over two cells yields 6 receptors and 2 cells."""
    path = EnvironmentSettings.tmp_test_path + "receptortestingpathrepertoire/"
    PathBuilder.build(path)

    def v_annotated(amino_acids, identifier, cell_id, chain):
        # sequences carrying only a V gene are labelled cmv "no" / coeliac False
        return ReceptorSequence(amino_acid_sequence=amino_acids, identifier=identifier,
                                metadata=SequenceMetadata(v_gene="V1", cell_id=cell_id, chain=chain,
                                                          custom_params={"cmv": "no", "coeliac": False}))

    def j_annotated(amino_acids, identifier, j_gene, cell_id, chain):
        # sequences carrying only a J gene are labelled cmv "yes" / coeliac True
        return ReceptorSequence(amino_acid_sequence=amino_acids, identifier=identifier,
                                metadata=SequenceMetadata(j_gene=j_gene, cell_id=cell_id, chain=chain,
                                                          custom_params={"cmv": "yes", "coeliac": True}))

    sequences = [
        v_annotated("AAA", "1", "1", Chain.ALPHA),
        j_annotated("CCC", "2", "J1", "1", Chain.BETA),
        v_annotated("FFF", "3", "1", Chain.ALPHA),
        j_annotated("EEE", "4", "J1", "1", Chain.BETA),
        v_annotated("FFF", "5", "2", Chain.GAMMA),
        j_annotated("EEE", "6", "J1", "2", Chain.DELTA),
        j_annotated("EEE", "7", "J2", "2", Chain.DELTA),
    ]

    obj = Repertoire.build_from_sequence_objects(sequences, path, {"cmv": "yes", 'subject_id': "1"})

    receptors = obj.receptors
    self.assertEqual(6, len(receptors))

    cells = obj.cells
    self.assertEqual(2, len(cells))

    shutil.rmtree(path)
def test(self):
    """KmerFreqSequenceEncoder encodes nine sequences; identical sequences get identical vectors."""
    # Nine sequences cycling through three amino acid strings; identifiers run
    # from "1" to "9" and label l1 alternates 1, 2, 1, ...
    amino_acid_cycle = ("AAACCC", "ACACAC", "CCCAAA")
    sequences = [
        ReceptorSequence(amino_acid_sequence=amino_acid_cycle[index % 3],
                         identifier=str(index + 1),
                         metadata=SequenceMetadata(custom_params={"l1": 1 + index % 2}))
        for index in range(9)
    ]

    path = EnvironmentSettings.tmp_test_path + "kmrefreqseqfacencoder/"
    PathBuilder.build(path)
    filename = "{}sequences.pkl".format(path)
    with open(filename, "wb") as file:
        pickle.dump(sequences, file)

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])

    dataset = SequenceDataset(params={"l1": [1, 2]}, filenames=[filename], identifier="d1")

    encoder_settings = {
        "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
        "reads": ReadsType.UNIQUE.name,
        "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
        "k": 3,
    }
    encoder = KmerFreqSequenceEncoder.build_object(dataset, **encoder_settings)

    encoder_params = EncoderParams(result_path=path + "2/", label_config=lc, pool_size=2, learn_model=True,
                                   model={}, filename="dataset.csv")
    encoded_dataset = encoder.encode(dataset, encoder_params)

    encoded = encoded_dataset.encoded_data
    self.assertEqual(9, encoded.examples.shape[0])
    self.assertTrue(all(identifier in encoded.example_ids
                        for identifier in [str(number) for number in range(1, 10)]))
    # Sequences "1" and "4" share the same amino acid string.
    self.assertTrue(numpy.array_equal(encoded.examples[0].A, encoded.examples[3].A))

    shutil.rmtree(path)
def _import_from_files(filenames: List[str], generic_params: DatasetImportParams) -> ReceptorDataset:
    """Import receptors from single-line-per-receptor files into a ReceptorDataset.

    For every file the configured columns are read, incomplete and duplicated
    rows are removed, columns are renamed according to the column mapping, the
    alpha/beta sequence columns are trimmed, gene columns are derived per
    chain, and one receptor object is built per remaining row.

    Args:
        filenames: paths of the files to read.
        generic_params: import parameters; this function reads separator,
            columns_to_load, column_mapping, receptor_chains, region_type,
            sequence_file_size and result_path from it.

    Returns:
        ReceptorDataset built from all receptors of all files.
    """
    elements = []

    for file in filenames:
        df = pd.read_csv(file, sep=generic_params.separator, usecols=generic_params.columns_to_load)

        # dropna() and drop_duplicates() return new frames; their results were
        # previously discarded, so neither call had any effect. The index is
        # reset so that later steps see a contiguous index, as after read_csv.
        df = df.dropna().drop_duplicates().reset_index(drop=True)
        df.rename(columns=generic_params.column_mapping, inplace=True)

        # Amino acid sequences lose their first and last character, nucleotide
        # sequences their first and last three (presumably the conserved
        # residues/codons flanking the region -- confirm against the format).
        for chain_prefix in ("alpha", "beta"):
            aa_column = chain_prefix + "_amino_acid_sequence"
            if aa_column in df:
                df[aa_column] = df[aa_column].str[1:-1]
        for chain_prefix in ("alpha", "beta"):
            nt_column = chain_prefix + "_nucleotide_sequence"
            if nt_column in df:
                df[nt_column] = df[nt_column].str[3:-3]

        chain_vals = list(generic_params.receptor_chains.value)
        chain_names = [Chain.get_chain(ch).name.lower() for ch in chain_vals]

        for chain_name in chain_names:
            df = SingleLineReceptorImport.make_gene_columns(df, ["v", "j"], chain_name)

        # Row fields whose name contains any of these fragments are not
        # copied into the receptor's metadata.
        excluded_fragments = ["v_gene", 'j_gene', "count", "identifier"] + chain_names

        for _, row in df.iterrows():
            sequences = {
                chain_val: ReceptorSequence(
                    amino_acid_sequence=row[chain_name + "_amino_acid_sequence"]
                    if chain_name + "_amino_acid_sequence" in row else None,
                    nucleotide_sequence=row[chain_name + "_nucleotide_sequence"]
                    if chain_name + "_nucleotide_sequence" in row else None,
                    metadata=SequenceMetadata(v_gene=row[f"{chain_name}_v_gene"],
                                              v_allele=row[f"{chain_name}_v_allele"],
                                              v_subgroup=row[f'{chain_name}_v_subgroup'],
                                              j_gene=row[f"{chain_name}_j_gene"],
                                              j_allele=row[f"{chain_name}_j_allele"],
                                              j_subgroup=row[f'{chain_name}_j_subgroup'],
                                              chain=chain_name,
                                              count=row["count"],
                                              region_type=generic_params.region_type.value))
                for chain_val, chain_name in zip(chain_vals, chain_names)
            }

            receptor_metadata = {key: row[key] for key in row.keys()
                                 if all(item not in key for item in excluded_fragments)}

            elements.append(ReceptorBuilder.build_object(sequences, row["identifier"], receptor_metadata))

    return ReceptorDataset.build(elements, generic_params.sequence_file_size, generic_params.result_path)