def test_epitope_unequal(self):
    # 'aaAA' and 'aaaA' differ only in the case of the letter at index 2;
    # epitopes built from them must not compare equal.
    first_epitope = Epitope(SeqRecord(Seq('aaAA')))
    second_epitope = Epitope(SeqRecord(Seq('aaaA')))

    self.assertNotEqual(first_epitope, second_epitope)
def test_get_epitopes_with_max_verified_regions(self):
    # Applies get_epitopes_with_max_verified_regions to the clusters loaded
    # from the first fixture pair and compares the result with a hand-written
    # dataset of three epitopes.
    expected_seqs = ('AaAA', 'bBBBB', 'DDdDdD')
    expected_dataset = EpitopesDataset(
        [Epitope(SeqRecord(Seq(seq))) for seq in expected_seqs])

    clusters = EpitopesClusters(EPITOPES_CLUSTERS1_PATH, EPITOPES_FASTA1_PATH)
    actual_dataset = get_epitopes_with_max_verified_regions(clusters)

    self.assertEqual(expected_dataset, actual_dataset)
def test_merge_identical_seqs(self):
    # After merge_identical_seqs() the dataset loaded from the batch fixtures
    # must iterate over exactly the epitopes listed below, in this order.
    def epitope_of(seq):
        # Shorthand for building an Epitope from a plain sequence string.
        return Epitope(SeqRecord(Seq(seq)))

    expected_epitopes = [
        add_verified_regions_lst(epitope_of('aaaA'),
                                 [(2, 3), (3, 3), (1, 2)]),
        epitope_of('B'),
        add_verified_regions_lst(epitope_of('bBBbb'), [(3, 4), (2, 3)]),
        add_verified_regions_lst(epitope_of('cccCC'), [(2, 3)]),
        add_verified_regions_lst(epitope_of('Dd'), [(1, 1)]),
    ]

    dataset = EpitopesDataset(EPITOPES_BATCHES_PATHS)
    dataset.merge_identical_seqs()

    self.assertEqual(expected_epitopes, list(dataset))
def __parse_clstr_file(clstr_file_path: str,
                       records_fasta_path: str) -> List[List[Epitope]]:
    """Group the records of a FASTA file according to a cluster listing file.

    The FASTA file is first loaded into an id -> record mapping. The cluster
    file is then read line by line: a line starting with ``CLUSTER_PREFIX``
    opens a new cluster, and every other non-blank line names one member
    whose id is the text found between ``EPITOPE_ID_PREFIX`` and
    ``EPITOPE_ID_SUFFIX``.

    :param clstr_file_path: path of the cluster listing file.
    :param records_fasta_path: path of the FASTA file holding the records
        referenced by the cluster file.
    :return: one list of ``Epitope`` objects per non-empty cluster, in the
        order the clusters appear in the cluster file.
    :raises KeyError: if a member id is missing from the FASTA records.
    :raises IndexError: if a member line does not contain
        ``EPITOPE_ID_PREFIX``.
    """
    with open(records_fasta_path) as records_fasta_file:
        records_dict = SeqIO.to_dict(
            SeqIO.parse(records_fasta_file, 'fasta'))

    epitopes_clusters_lst = []
    curr_cluster = []
    with open(clstr_file_path) as epitopes_ids_clusters_file:
        # Iterate the file object directly so lines are streamed instead of
        # being read into one big list up front.
        for raw_line in epitopes_ids_clusters_file:
            line = raw_line.strip()
            if not line:
                continue

            if line.startswith(CLUSTER_PREFIX):
                # A cluster header closes the cluster collected so far; on
                # the first header nothing has been collected yet.
                if curr_cluster:
                    epitopes_clusters_lst.append(curr_cluster)
                curr_cluster = []
            else:
                epitope_id = line.split(EPITOPE_ID_PREFIX)[1].split(
                    EPITOPE_ID_SUFFIX)[0]
                curr_cluster.append(Epitope(records_dict[epitope_id]))

    # The last cluster has no following header to flush it, so add it here.
    if curr_cluster:
        epitopes_clusters_lst.append(curr_cluster)
    return epitopes_clusters_lst
def test_epitope_init_verified_region3(self):
    # An epitope built from 'AAaa' is expected to expose a single verified
    # region, (0, 1).
    # NOTE(review): presumably regions are inclusive index ranges of the
    # upper-case runs - confirm against Epitope.
    expected_regions = [(0, 1)]

    epitope = Epitope(SeqRecord(Seq('AAaa')))
    actual_regions = epitope.verified_regions

    self.assertEqual(expected_regions, actual_regions)
def test_init1(self):
    # An epitope built from 'aaaA' must stringify back to 'aaaA' and expose
    # the single verified region (3, 3).
    sequence = 'aaaA'

    epitope = Epitope(SeqRecord(Seq(sequence)))
    regions = epitope.verified_regions

    self.assertEqual(sequence, str(epitope))
    self.assertEqual([(3, 3)], regions)
def __parse_records_batches_fasta_files(
        records_batches_fasta_paths: List[str]) -> List[Epitope]:
    """Read every FASTA batch file and wrap each of its records in an Epitope.

    :param records_batches_fasta_paths: paths of the FASTA files to read;
        they are processed in the given order.
    :return: a flat list holding one ``Epitope`` per parsed record, ordered
        by file and then by position inside the file.
    """
    epitopes = []
    for fasta_path in records_batches_fasta_paths:
        with open(fasta_path) as fasta_handle:
            for seq_record in SeqIO.parse(fasta_handle, 'fasta'):
                epitopes.append(Epitope(seq_record))
    return epitopes
def test_add_verified_region(self):
    # Adding region (3, 3) to an epitope built from 'AAaa' is expected to
    # extend verified_regions to [(0, 1), (3, 3)] and to turn the record
    # sequence into 'AAaA'.
    epitope = Epitope(SeqRecord(Seq('AAaa')))

    epitope.add_verified_region((3, 3))
    regions_after = epitope.verified_regions
    seq_after = str(epitope.record.seq)

    self.assertEqual([(0, 1), (3, 3)], regions_after)
    self.assertEqual('AAaA', seq_after)
def test_unequal(self):
    # The two datasets differ only in their second epitope ('A' vs 'B') and
    # therefore must not compare equal.
    first_seqs = ('a', 'A', 'aa', 'aa')
    second_seqs = ('a', 'B', 'aa', 'aa')

    first_dataset = EpitopesDataset(
        [Epitope(SeqRecord(Seq(seq))) for seq in first_seqs])
    second_dataset = EpitopesDataset(
        [Epitope(SeqRecord(Seq(seq))) for seq in second_seqs])

    self.assertNotEqual(first_dataset, second_dataset)
def test_init(self):
    # EpitopesClusters built from the cluster/FASTA fixture pair must iterate
    # over three clusters holding the epitopes listed below, in this order.
    #
    # Every sequence is wrapped in Seq before being handed to SeqRecord, the
    # same way the other tests in this file build their records; a bare str
    # is not the documented type for SeqRecord's seq argument.
    expected_epitopes_clusters_lst = [
        [
            Epitope(SeqRecord(Seq('AaAA'))),
            Epitope(SeqRecord(Seq('aaaa'))),
        ],
        [
            Epitope(SeqRecord(Seq('bBBBB'))),
        ],
        [
            Epitope(SeqRecord(Seq('ddDDD'))),
            Epitope(SeqRecord(Seq('DDdDdD'))),
            Epitope(SeqRecord(Seq('DDdddD'))),
        ],
    ]

    actual_epitopes_clusters = EpitopesClusters(EPITOPES_CLUSTERS_PATH,
                                                EPITOPES_FASTA_PATH)

    self.assertEqual(expected_epitopes_clusters_lst,
                     list(actual_epitopes_clusters))
def test_init_epitopes_dataset(self):
    # A dataset loaded from the batch fixtures must iterate over one epitope
    # per sequence below, in this order (duplicates included).
    expected_seqs = (
        'aaaA', 'aaAA', 'B',
        'aaaA', 'aAAa',
        'bBBbb', 'bbbBB', 'bbBBb',
        'cccCC', 'ccCCc',
        'Dd', 'dD',
    )
    expected_epitopes = [
        Epitope(SeqRecord(Seq(seq))) for seq in expected_seqs
    ]

    dataset = EpitopesDataset(EPITOPES_BATCHES_PATHS)

    self.assertEqual(expected_epitopes, list(dataset))
def test_split_epitopes_clusters_to_cv_groups_cv10(self):
    # Splits the clusters of the second fixture pair with cv_fold=10 and
    # cluster shuffling disabled, then compares the result with the
    # hard-coded datasets described by the table below.
    cv_fold = 10

    # One tuple per expected dataset; each holds that dataset's sequences in
    # order.
    expected_seqs_per_dataset = (
        ('aaaAA', 'Aaa', 'bbbBC', 'cccaaCd', 'DcDcDc', 'AAAAA'),
        ('aaaG', 'GCAcGcGa', 'aCGPfpc', 'cccccCCCccc', 'GgG', 'DDDDD'),
        ('EEeeeGGGDDD', 'BBBbbb', 'NMnMnM', 'KPkgK', 'AAAaaA', 'AAAaaaA'),
        ('BBBbbBbB', 'CCCccC', 'GGGggGG', 'CcCcCc', 'cCcccc', 'ccCccC'),
        ('CccCC', 'cccCCcC', 'CccCCCccC', 'CcccCCcccc', 'cccCCCccC',
         'cccCccccc'),
        ('aaAAAaaAA', 'BBBBbbBB', 'bbbBBBBB', 'GGGGgg', 'GGGG'),
        ('TTTtTTT', 'HHHHHHhhh', 'HHHhhhhKK', 'kkkkKKKk', 'UUUuuuU',
         'GFgGgF', 'CCCcCBBb', 'mmmmmmMMMmm', 'BBbb'),
        ('GGGg', 'AAaa', 'AAAa'),
    )
    expected_epitopes_cv_datasets = [
        EpitopesDataset(
            [Epitope(SeqRecord(Seq(seq))) for seq in dataset_seqs])
        for dataset_seqs in expected_seqs_per_dataset
    ]

    clusters = EpitopesClusters(EPITOPES_CLUSTERS2_PATH, EPITOPES_FASTA2_PATH)
    actual_epitopes_cv_datasets = split_epitopes_clusters_to_cv_datasets(
        clusters, cv_fold, shuffle_clusters=False)

    self.assertEqual(expected_epitopes_cv_datasets,
                     actual_epitopes_cv_datasets)