def test_dataset_with_gene_missing_reading_frame(self):
    """Dataset creation must raise ``ValueError`` when a gene has no reading frame.

    The ``reading_frame`` of every ``ArgKin`` record is set to ``None``
    before the records are handed to ``Dataset`` with ``aminoacids=True``.
    """
    tests_dir = os.path.abspath(os.path.dirname(__file__))
    sample_path = os.path.join(tests_dir, 'sample_data.txt')
    with open(sample_path, 'r') as handle:
        raw_records = json.load(handle)

    data = []
    for raw in raw_records:
        record = SeqRecordExpanded(
            raw['seq'],
            voucher_code=raw['voucher_code'],
            taxonomy=raw['taxonomy'],
            gene_code=raw['gene_code'],
            reading_frame=raw['reading_frame'],
            table=raw['table'],
        )
        if raw['gene_code'] == 'ArgKin':
            # Strip the reading frame so the dataset cannot be translated.
            record.reading_frame = None
        data.append(record)

    with self.assertRaises(ValueError):
        Dataset(
            data,
            format='NEXUS',
            codon_positions='ALL',
            partitioning='1st-2nd, 3rd',
            aminoacids=True,
        )
def test_degen_standard(self):
    """Check the default ``degenerate()`` output for reading frames 1 and 2."""
    cases = (
        (1, 'TCNGARTGGAARACNAARMGNCCN', 'Using reading_frame=1'),
        (2, 'YTNAAYGGNMGNCARAGYGTNCA', 'Using reading_frame=2'),
    )
    for frame, expected, message in cases:
        seq_record = SeqRecordExpanded(self.seq, reading_frame=frame, table=1)
        self.assertEqual(expected, seq_record.degenerate(), message)
def test_degen_sz(self):
    """Check ``degenerate(method='SZ')`` for reading frames 1 and 3."""
    cases = (
        (1, 'NNNGARTGGAARACNAARMGNCCN', 'Using reading_frame=1'),
        (3, 'TGAATGGARGAYAARGCNNNNA', 'Using reading_frame=3'),
    )
    for frame, expected, message in cases:
        seq_record = SeqRecordExpanded(self.seq, reading_frame=frame)
        self.assertEqual(expected, seq_record.degenerate(method='SZ'), message)
def test_translate_with_table_at_function_level(self):
    """Translate records built without a table, passing ``table=1`` to ``translate``."""
    cases = (
        ('TCTGAATGGAAGACAAAGCGTCCA', 1, 'SEWKTKRP'),
        ('ACACGTCGACTCCGGCAAGTCCACTACCACAGGA', 2, 'HVDSGKSTTTG'),
    )
    for seq, frame, expected in cases:
        seq_record = SeqRecordExpanded(seq, reading_frame=frame)
        self.assertEqual(expected, seq_record.translate(table=1))
def test_gapped_translation_with_mixed_codons(self):
    """Translating a sequence containing the codon ``N--`` must fail loudly.

    ``translate(table=1)`` is expected to raise
    ``TranslationErrorMixedGappedSeq`` and the error text must mention
    ``"Gene"``.
    """
    seq = 'TCTN--GAATGGAAGACAAAGCGTCCA'
    seq_record = SeqRecordExpanded(seq, reading_frame=1)

    # A single context manager both asserts that the exception is raised and
    # captures it, so the message check can never be skipped silently.
    with self.assertRaises(TranslationErrorMixedGappedSeq) as caught:
        seq_record.translate(table=1)
    self.assertIn("Gene", str(caught.exception))
def test_getting_codon_positions_reading_frame_3(self):
    """Check first, second and third codon positions with ``reading_frame=3``."""
    seq = 'GTCGTGGGGGCCCACGTGGACGTGG'
    seq_record = SeqRecordExpanded(seq, reading_frame=3)

    # Failure message corrected from 'Fist' to 'First'.
    self.assertEqual('CGGCCGCG', seq_record.first_codon_position(),
                     'First codon position')
    self.assertEqual('GGGGCGGGG', seq_record.second_codon_position(),
                     'Second codon position')
    self.assertEqual('TTGCATAT', seq_record.third_codon_position(),
                     'Third codon position')
def test_getting_codon_positions_reading_frame_1(self):
    """Check first, second and third codon positions with ``reading_frame=1``."""
    seq = 'GAATGGAAGACAAAGTCTCGTCCA'
    seq_record = SeqRecordExpanded(seq, reading_frame=1)

    # Failure message corrected from 'Fist' to 'First'.
    self.assertEqual('GTAAATCC', seq_record.first_codon_position(),
                     'First codon position')
    self.assertEqual('AGACACGC', seq_record.second_codon_position(),
                     'Second codon position')
    self.assertEqual('AGGAGTTA', seq_record.third_codon_position(),
                     'Third codon position')
def test_getting_codon_positions_reading_frame_2(self):
    """Check first, second and third codon positions with ``reading_frame=2``."""
    seq = 'ACACGTCGACTCCGGCAAGTCCACTACCACAGGACATTTGATTTACAAATGTGGTGGTATCGACAAGCGT'
    seq_record = SeqRecordExpanded(seq, reading_frame=2)

    # Failure message corrected from 'Fist' to 'First'.
    self.assertEqual('CGGTGATAAAGCTATATGGAGAC', seq_record.first_codon_position(),
                     'First codon position')
    self.assertEqual('ATACGACCCCGATTAAGGGTAAG', seq_record.second_codon_position(),
                     'Second codon position')
    self.assertEqual('ACCCCCGCTCAATGTCATTTCCGT', seq_record.third_codon_position(),
                     'Third codon position')
def test_degenerate(self):
    """Compare a NEXUS dataset built with ``degenerate='S'`` to the stored file.

    The expected text is read from ``dataset_degenerated.nex`` under
    ``NEXUS_DATA_PATH`` and stripped of surrounding whitespace.
    """
    tests_dir = os.path.abspath(os.path.dirname(__file__))
    with open(os.path.join(tests_dir, 'sample_data.txt'), 'r') as handle:
        raw_records = json.load(handle)

    data = [
        SeqRecordExpanded(
            raw['seq'],
            voucher_code=raw['voucher_code'],
            taxonomy=raw['taxonomy'],
            gene_code=raw['gene_code'],
            reading_frame=raw['reading_frame'],
            table=raw['table'],
        )
        for raw in raw_records
    ]

    expected_path = os.path.join(NEXUS_DATA_PATH, 'dataset_degenerated.nex')
    with open(expected_path, 'r') as handle:
        expected = handle.read().strip()

    dataset = Dataset(
        data,
        format='NEXUS',
        codon_positions='ALL',
        partitioning='by gene',
        degenerate='S',
    )
    self.assertEqual(expected, dataset.dataset_str)
def test_flatten_taxonomy(self):
    """``flatten_taxonomy`` joins the taxonomy ranks of a record with underscores.

    The record carries an empty ``species``; the expected string ends at
    the genus.
    """
    taxonomy = {
        "orden": "Lepidoptera",
        "family": "Nymphalidae",
        "subfamily": "Satyrinae",
        "tribe": "Satyrini",
        "subtribe": "Euptychiina",
        "genus": "Euptychia",
        "species": "",
    }
    seq_record = SeqRecordExpanded(
        '????????????',
        voucher_code="CP100-11",
        taxonomy=taxonomy,
        gene_code='ef1a',
        reading_frame=2,
        table=1,
    )
    dataset_block = DatasetBlock(data="", codon_positions="ALL", partitioning="")

    self.assertEqual(
        "_Lepidoptera_Nymphalidae_Satyrinae_Satyrini_Euptychiina_Euptychia",
        dataset_block.flatten_taxonomy(seq_record),
    )
def build_seq_obj(self, code, gene_code, accession_number, our_taxon_names, all_seqs):
    """Build a ``SeqRecordExpanded`` for one voucher/gene pair.

    When the extracted sequence is the placeholder ``'?'`` it is expanded to
    a run of ``'?'`` as long as the gene's ``length`` metadata. Spaces in
    ``code`` are replaced by underscores for the voucher code.

    Returns ``None`` when ``code`` is not a key of ``our_taxon_names``.
    """
    voucher_seqs = self.extract_sequence_from_all_seqs_in_db(all_seqs, code, gene_code)
    seq = (
        '?' * self.gene_codes_metadata[gene_code]['length']
        if voucher_seqs == '?'
        else self.create_seq_record(voucher_seqs)
    )

    if code not in our_taxon_names:
        return None

    lineage = self.get_lineage(code)
    gene_metadata = self.gene_codes_metadata[gene_code]
    return SeqRecordExpanded(
        seq,
        voucher_code=code.replace(" ", "_"),
        taxonomy=our_taxon_names[code],
        gene_code=gene_code,
        reading_frame=gene_metadata['reading_frame'],
        table=gene_metadata['genetic_code'],
        lineage=lineage,
        accession_number=accession_number,
    )
def test_dataset_with_gene_missing_reading_frame(self):
    """``Dataset`` must raise ``ValueError`` if ``ArgKin`` has no reading frame."""
    sample_path = os.path.join(
        os.path.abspath(os.path.dirname(__file__)),
        'sample_data.txt',
    )
    with open(sample_path, 'r') as handle:
        entries = json.load(handle)

    data = []
    for entry in entries:
        record = SeqRecordExpanded(
            entry['seq'],
            voucher_code=entry['voucher_code'],
            taxonomy=entry['taxonomy'],
            gene_code=entry['gene_code'],
            reading_frame=entry['reading_frame'],
            table=entry['table'],
        )
        if entry['gene_code'] == 'ArgKin':
            record.reading_frame = None
        data.append(record)

    dataset_options = {
        'format': 'NEXUS',
        'codon_positions': 'ALL',
        'partitioning': '1st-2nd, 3rd',
        'aminoacids': True,
    }
    self.assertRaises(ValueError, Dataset, data, **dataset_options)
def test_get_seq__amino_acid_with_missing_bp_as_dash(self):
    """A trailing ``-`` in the last codon is expected to translate to ``X``."""
    record = SeqRecordExpanded(
        'ATACGGTA-',
        table=1,
        reading_frame=1,
        voucher_code="CP100-10",
        gene_code="wingless",
    )
    translated = get_seq(record, codon_positions='ALL', aminoacids=True)
    self.assertEqual("IRX", translated.seq)
def setUp(self):
    """Load the sample JSON file and store its records in ``self.data``.

    ``maxDiff`` is disabled so assertion diffs are never truncated.
    """
    self.maxDiff = None
    with open(SAMPLE_DATA_PATH, 'r') as handle:
        raw_records = json.load(handle)

    self.data = [
        SeqRecordExpanded(
            raw['seq'],
            voucher_code=raw['voucher_code'],
            taxonomy=raw['taxonomy'],
            gene_code=raw['gene_code'],
            reading_frame=raw['reading_frame'],
            table=raw['table'],
        )
        for raw in raw_records
    ]
def test_dataset_when_seqrecord_taxonomy_is_none(self):
    """A record whose taxonomy is ``None`` is labelled by voucher code only.

    The first raw record has its taxonomy set to ``None``; the dataset
    text must then contain ``'CP100_10 '`` (voucher code followed by a space).
    """
    raw_data = get_test_data('raw_data')
    raw_data[0]['taxonomy'] = None

    seq_records = [
        SeqRecordExpanded(
            raw['seq'],
            voucher_code=raw['voucher_code'],
            taxonomy=raw['taxonomy'],
            gene_code=raw['gene_code'],
            reading_frame=raw['reading_frame'],
            table=raw['table'],
        )
        for raw in raw_data
    ]

    dataset = Dataset(seq_records, format='NEXUS', partitioning='by gene')
    # assertIn reports both operands on failure; assertTrue only said "False is not true".
    self.assertIn('CP100_10 ', dataset.dataset_str)
def get_test_data(filename="sample_data.txt"):
    """Return the records of a JSON fixture as ``SeqRecordExpanded`` objects.

    ``filename`` is resolved relative to the directory containing this module.
    """
    module_dir = os.path.abspath(os.path.dirname(__file__))
    with open(os.path.join(module_dir, filename), 'r') as handle:
        raw_records = json.load(handle)

    return [
        SeqRecordExpanded(
            raw['seq'],
            voucher_code=raw['voucher_code'],
            taxonomy=raw['taxonomy'],
            gene_code=raw['gene_code'],
            reading_frame=raw['reading_frame'],
            table=raw['table'],
        )
        for raw in raw_records
    ]
def test_dataset_when_seqrecord_taxonomy_is_has_family(self):
    """A taxonomy with family, genus and species appears in the taxon label.

    The first raw record gets the taxonomy ``Aussidae / Aus / aus``; the
    dataset text must then contain ``'CP100_10_Aussidae_Aus_aus '``.
    """
    raw_data = get_test_data('raw_data')
    raw_data[0]['taxonomy'] = {
        'family': 'Aussidae',
        'genus': 'Aus',
        'species': 'aus',
    }

    seq_records = [
        SeqRecordExpanded(
            raw['seq'],
            voucher_code=raw['voucher_code'],
            taxonomy=raw['taxonomy'],
            gene_code=raw['gene_code'],
            reading_frame=raw['reading_frame'],
            table=raw['table'],
        )
        for raw in raw_data
    ]

    dataset = Dataset(seq_records, format='NEXUS', partitioning='by gene')
    # assertIn reports both operands on failure; assertTrue only said "False is not true".
    self.assertIn('CP100_10_Aussidae_Aus_aus ', dataset.dataset_str)
def test_translate(self):
    """Translate sequences with table 1, including one containing ``?`` characters."""
    cases = (
        ('TCTGAATGGAAGACAAAGCGTCCA', 1, 'SEWKTKRP', 'Using reading_frame=1'),
        ('TCTGAATGGAA?ACAAAGCGT???', 1, 'SEWXTKRX', 'Using reading_frame=1'),
        ('ACACGTCGACTCCGGCAAGTCCACTACCACAGGA', 2, 'HVDSGKSTTTG', 'Using reading_frame=2'),
    )
    for seq, frame, expected, message in cases:
        seq_record = SeqRecordExpanded(seq, reading_frame=frame, table=1)
        self.assertEqual(expected, seq_record.translate(), message)
def get_test_data(type_of_data=None, filename="sample_data.txt"):
    """Load a JSON fixture located one directory above ``NEXUS_DATA_PATH``.

    Parameters:
        type_of_data (str): ``raw_data`` returns the decoded JSON unchanged;
            ``seq_records`` returns a list of ``SeqRecordExpanded`` objects
            built from it.
        filename (str): name of the fixture file.

    Returns:
        The data in the requested form, or ``None`` for any other value of
        ``type_of_data`` (including the default).
    """
    fixture_path = os.path.join(NEXUS_DATA_PATH, "..", filename)
    with open(fixture_path, "r") as handle:
        raw_data = json.load(handle)

    if type_of_data == 'raw_data':
        return raw_data
    if type_of_data != 'seq_records':
        return None

    return [
        SeqRecordExpanded(
            entry['seq'],
            voucher_code=entry['voucher_code'],
            taxonomy=entry['taxonomy'],
            gene_code=entry['gene_code'],
            reading_frame=entry['reading_frame'],
            table=entry['table'],
        )
        for entry in raw_data
    ]
def test_flatten_taxonomy__no_taxonomy(self):
    """``flatten_taxonomy`` returns an empty string for a record without taxonomy."""
    seq_record = SeqRecordExpanded(
        '????????????',
        voucher_code="CP100-11",
        gene_code='ef1a',
        reading_frame=2,
        table=1,
    )
    dataset_block = DatasetBlock(data="", codon_positions="ALL", partitioning="")

    self.assertEqual("", dataset_block.flatten_taxonomy(seq_record))
def build_seq_obj(self, code, gene_code, our_taxon_names, all_seqs):
    """Build a ``SeqRecordExpanded`` for one voucher/gene pair.

    When the extracted sequence is the placeholder ``'?'`` it is expanded to
    a run of ``'?'`` as long as the gene's ``length`` metadata.

    Returns ``None`` when ``code`` is not a key of ``our_taxon_names``.
    """
    voucher_seqs = self.extract_sequence_from_all_seqs_in_db(all_seqs, code, gene_code)
    seq = (
        '?' * self.gene_codes_metadata[gene_code]['length']
        if voucher_seqs == '?'
        else self.create_seq_record(voucher_seqs)
    )

    # The record is created before the membership check, as in the original flow.
    seq_record = SeqRecordExpanded(seq)
    if code not in our_taxon_names:
        return None

    gene_metadata = self.gene_codes_metadata[gene_code]
    seq_record.voucher_code = code
    seq_record.taxonomy = our_taxon_names[code]
    seq_record.gene_code = gene_code
    seq_record.reading_frame = gene_metadata['reading_frame']
    seq_record.table = gene_metadata['genetic_code']
    return seq_record
def setUp(self):
    """Create the ``wingless`` record for voucher ``CP100-10`` used by the tests."""
    record_options = {
        'table': 1,
        'reading_frame': 1,
        'voucher_code': "CP100-10",
        'gene_code': "wingless",
    }
    self.seq_record = SeqRecordExpanded('ATACGGTAG', **record_options)
def test_sorting_seq_records(self): """Test SeqRecordExpanded objects are sorted by gene_code and then by voucher_code. """ my_list = [ { 'seq': 'CTgCacGgTCaagacGTGCtGgaTgaggctGccGacTTttcggtCtGTAgGCGACGCCTTGAAGGACGGCTTCGACGGAGCGTCGCGGGTTATGATGCCCAATACGGAGTTGGAAGCACCAGCTCAGCGAAACGATGCCGCTCCGCACAGAGTCCCGCGACGAGACCGATACAGATTTCAACTwCGGCCGCACAATCCTGACCACAAAACACCCGGAGTCAAGGACCTAGTGTACTTGGAATCATCGCCGGGTTTCTGCGAAAAGAACCCGCGGCTGGGCATTCCAGGCACGCACGGGCGTTCCTGCAATGACACGAGTATCGGCGTCGACGGCTGCGACCTCATGTGCTGTGGCCGTGGCTACCGGACCGAGACAATGTTCGTTGTGGAGCGATGCAAC', 'voucher_code': "CP100-11", 'taxonomy': { 'genus': 'Aus', 'species': 'bus' }, 'gene_code': 'wingless', 'reading_frame': 2, 'table': 1 }, { 'seq': '???????????????????????????????????????????TCTGTAGGCGATGCCTTGAAGGACGGCTTCGACGGAGCGTCGCGGGTCATGATGCCCAATACGGAGTTAGAAGCGCCTGCTCAGCGAAACGACGCCGCCCCGCACAGAGTCCCGCGACGAGACCGATACAGATTTCAACTTCGGCCGCACAATCCTGACCACAAAACACCCGGA?TCAAGGACCTAGTGTACTTGGAATCATCGCCGGGTTTCTGCGAAAAGAACCCGCGGCTGGGCATTCCCGGCACGCACGGGCGTGCCTGCAACGACACGAGTATCGGCGTCGACGGCTGCGACCTCATGTGCTGCGGCCGTGGCTACCGGACCGAGACAATGTTCGTCGTGGAGCGATGCAAC', 'voucher_code': "CP100-10", 'taxonomy': { 'genus': 'Aus', 'species': 'aus' }, 'gene_code': 'wingless', 'reading_frame': 2, 'table': 1 }, { 'seq': '???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????', 'voucher_code': "CP100-11", 'taxonomy': { 'genus': 'Aus', 'species': 'bus' }, 'gene_code': 'RpS2', 'reading_frame': 3, 'table': 1 }, { 'seq': 
'???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????', 'voucher_code': "CP100-10", 'taxonomy': { 'genus': 'Aus', 'species': 'aus' }, 'gene_code': 'RpS2', 'reading_frame': 3, 'table': 1 }, { 'seq': '??????????????????????????????????????????????????????????????????????????AAAAAGAAACTTTTGGAAATTTaggtataATTTATGCTATATTAGCAATTGGATTATTAGGATTTATTGTGTGAGCTCATCATATATTTACTGTAGGTATAGATATTGATACTCGAGCTTATTTTACCTCTGCTACAATAATTATTGCAGTCCCAACAGGAATTAAAATTTTTAGTTGATTAGCAACTCTACATGGAACACAAATTAATTATAGTCCTTCCATACTTTGAAGACTAGGATTTATTTTTTTATTTACAGTAGGAGGATTAACTGGTGTAATTTTAGCTAATTCTTCAATTGATATTGCTCTTCATGATACTTATTATGTAgTAgCCCACTTTCATTATGTATTGTCTATAGGAGCAGTATTTGCTATTTTTGGAGGATTTGTCCATTGATATCCTTTATTTACAGGATTAATATTAAATCCATATTTATTAAAAATTCAATTTATTTCAATATTTATTGGAGTTAACTTAACTTTTTTCCCACAACATTTTTTAGGTTTAGCTGGTATACCTCGACGTTACTCAGATTACCCAGATAATTTTTTATCTTGAAATATTATTTCATCATTAGGATCTTATATTTCTCTATTTTCTATAATAATAATAaTTATTATTATATGAGAATCAATAACTTATCAACGTATAATTTTATTTTCATTAAATATACctTCTTCAATTGAGTGATATCAAAAaTTACCACCTGCCGAACATTCTTATAAtGAAC', 'voucher_code': "CP100-10", 'taxonomy': { 'genus': 'Aus', 'species': 'aus' }, 'gene_code': 'COI_end', 'reading_frame': 2, 'table': 5 }, { 'seq': 
'?????????????????????????????????????????????????????????????????????????AAAAAAGAAACTTTCGGAAGCTTAGGTATAATTTACGCTATATTAGCTATTGGATTATTAGGATtTATTGTATGAGCTCATCATATATTTACAGTAGGAATAGATATTGATACCCGAGCTTATTTTACTTCTGCTACAATAATTATTGCCGTACCAACAGGAATTAAAATTTTTAGCTGATTAGCAACTCTTCACGGAACTCAAATCAATTATAGTCCTTCCATACTTTGAAGATTAGGATTTATTTTTTTATTTACAGTAGGAGGACTAACTGGTGTAATTTTAGCTAATTCTTCAATTGATATTACTCTCCATGATACTTATTATGTTGTAGCTCATTTTCATTATGTTCTATCTATAGGAGCAGTATTTGCTATTTTCGGAGGATTTATCCACTGATACCCCTTATTTACAGGATTAATATTAAACCCATATTTATTAAAAATTCAATTCATTTCAATATTTATTGGAGTTAATTTAACTTTTTTTCCACAACATTTTTTAGGGTTAGCTGGTATACCTCGTCGTTATTCAGATTACCCAGATAATTTTTTATCTTGAAATATTATTTCATCATTAGGATCTTATATTTCATTATTTTCTATAATAATAATAATTATTATTATTTGAGAATCAATAATTTATCAACGTATAATTTTATTTACATTAAATATACCCTCTTCAATTGAATGATATCAAAATTTACCTCCTGCCGAACATTCTTATAATGAAC', 'voucher_code': "CP100-11", 'taxonomy': { 'genus': 'Aus', 'species': 'bus' }, 'gene_code': 'COI_end', 'reading_frame': 2, 'table': 5 }, { 'seq': '????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????
????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????', 'voucher_code': "CP100-10", 'taxonomy': { 'genus': 'Aus', 'species': 'aus' }, 'gene_code': 'ef1a', 'reading_frame': 2, 'table': 1 }, { 'seq': '????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????', 'voucher_code': "CP100-11", 'taxonomy': { 'genus': 'Aus', 'species': 'bus' }, 'gene_code': 'ef1a', 'reading_frame': 2, 'table': 1 }, ] seq_records = [] append = seq_records.append for i in my_list: seq_record = SeqRecordExpanded( i['seq'], voucher_code=i['voucher_code'], taxonomy=i['taxonomy'], gene_code=i['gene_code'], reading_frame=i['reading_frame'], table=i['table'], ) append(seq_record) expected = [ 
'COI_end', 'COI_end', 'ef1a', 'ef1a', 'RpS2', 'RpS2', 'wingless', 'wingless' ] result = Dataset(seq_records).seq_records self.assertEqual(expected, [i.gene_code for i in result]) expected = [ 'CP100_10', 'CP100_11', 'CP100_10', 'CP100_11', 'CP100_10', 'CP100_11', 'CP100_10', 'CP100_11' ] self.assertEqual(expected, [i.voucher_code for i in result])
def test_getting_first_and_second_codon_positions_reading_frame_1(self):
    """Check the combined first and second codon positions with ``reading_frame=1``."""
    seq_record = SeqRecordExpanded('GAATGGAAGACAAAGTCTCGTCCA', reading_frame=1)
    observed = seq_record.first_and_second_codon_positions()
    self.assertEqual('GATGAAACAATCCGCC', observed)
def test_degen_z(self):
    """Check ``degenerate(method='Z')`` with ``reading_frame=1``."""
    record = SeqRecordExpanded(self.seq, reading_frame=1)
    degenerated = record.degenerate(method='Z')
    self.assertEqual('TCNGARTGGAARACNAARMGNCCN', degenerated)
def test_gapped_translation(self):
    """A full gap codon (``---``) is expected to translate to ``-``."""
    record = SeqRecordExpanded('TCT---GAATGGAAGACAAAGCGTCCA', reading_frame=1)
    translated = record.translate(table=1)
    self.assertEqual('S-EWKTKRP', translated)