def test_get_synonymous_codons(self):
    """Smoke-test ``CodonUsageMemex.get_synonymous_codons`` with a fixed RNG.

    ``random.random`` is swapped for a stub that always returns 0.1
    (presumably so any random choice made during the lookup is
    repeatable) and is restored afterwards so the stub cannot leak into
    other tests. The result is only printed; nothing is asserted.
    """
    import random

    def fake_random():
        return 0.1

    real_random = random.random
    random.random = fake_random
    try:
        CODON_USAGE_MEMEX = CodonUsageMemex(build_codon_usage_dict())
        print(CODON_USAGE_MEMEX.get_synonymous_codons('TTA'))
    finally:
        # Always undo the monkey-patch, even if the lookup raises.
        random.random = real_random
def test_codon_rarity_profile(self):
    """Check ``CodonRarityFeatureProfile`` values for a tiny CDS.

    A 9-base CDS ('ATGTTTGGG') sits after 15 bases of upstream sequence.
    The profile is built with ``sliding_window_size`` forced to 15 and a
    hand-made usage table; each expected value is the mean of five usage
    figures from that table.
    """
    # Shrink the window (class-wide) so the short sequence is usable.
    CodonRarityFeatureProfile.sliding_window_size = 15

    upstream = 'TTTAAACCCTTTGGG'
    cds_seq = 'ATGTTTGGG'
    cds_start = len(upstream)
    cds_end = cds_start + len(cds_seq)

    record = SeqRecord(Seq(upstream + cds_seq, generic_dna))
    cds_feature = SeqFeature(
            FeatureLocation(cds_start, cds_end, strand=1),
            type='CDS', id=1)
    record.features.append(cds_feature)

    # Minimal usage table; note that 'TTT' is listed under two amino acids.
    usage_table = {
        'M': {
            'ATG': {'usage': 1.0},
        },
        'T': {
            'TTT': {'usage': 0.1},
            'AAA': {'usage': 0.9},
        },
        'R': {
            'CCC': {'usage': 0.8},
            'GGG': {'usage': 0.1},
            'TTT': {'usage': 0.1},
        }
    }
    memex = CodonUsageMemex(usage_table)

    # The same memex stands in for both the original and refactored usage.
    rarity_profile = CodonRarityFeatureProfile(
            cds_feature.id, record,
            original_codon_usage_memex=memex,
            refactored_codon_usage_memex=memex)

    expected_values = [
        (0.9 + 0.8 + 0.1 + 0.1 + 1.0) / 5,
        (0.8 + 0.1 + 0.1 + 1.0 + 0.1) / 5,
        (0.1 + 0.1 + 1.0 + 0.1 + 0.1) / 5,
    ]
    self.assertEqual(expected_values, rarity_profile.values)
def test_replacer__start_codons_unchanged(self):
    """Forbidden codons acting as the start codon must be left in place.

    'TTG' is on the removal list and appears twice in the feature: as the
    first codon and again internally. Only the internal copy is expected
    to be replaced (with 'CTT').
    """
    forbidden_codons = ['TTG', 'CTA']

    # Minimal codon table for the test.
    codon_table = {
        'L': {
            'TTG': {},
            'CTT': {},
            'CTA': {},
        },
        '*': {'TAG': {}, 'TAA': {}, 'AGC': {}},
    }
    memex = CodonUsageMemex(codon_table)

    # The feature spans the whole record.
    cds_seq = ''.join(['TTG', 'GCT', 'AAT', 'TTG', 'TAA'])
    record = SeqRecord(Seq(cds_seq, generic_dna))
    cds_feature = SeqFeature(
            FeatureLocation(0, len(cds_seq), strand=1),
            type='CDS', id='1')
    record.features.append(cds_feature)

    replacer = GraphSearchCodonReplacer(forbidden_codons, memex)
    result = replacer.replace_codons_in_feature(cds_feature.id, record)

    # The replacement must report success ...
    self.assertTrue(result['is_success'])

    # ... and only the non-start TTG may have changed.
    expected_seq = ''.join(['TTG', 'GCT', 'AAT', 'CTT', 'TAA'])
    self.assertEqual(expected_seq, str(result['new_feature_seq']))
def test_graph_search_codon_replacer_gc_content(self):
    """Run the graph-search replacer with a GC-content profile attached.

    The CDS 'ATGACCTAG' is embedded between flanking sequence; with 'ACC',
    'AGG' and 'TAG' forbidden, the expected replacement is 'ATGACGTAA'.
    """
    forbidden_codons = ['ACC', 'AGG', 'TAG']

    # Minimal codon table for the test.
    codon_table = {
        'T': {'ACC': {}, 'AAA': {}, 'ACG': {}},
        'R': {'AGG': {}, 'ATT': {}, 'AGC': {}},
        '*': {'TAG': {}, 'TAA': {}, 'AGC': {}},
    }

    upstream = 'CCCAAAGGGCCCAAATTTAAAGGGCCC'
    cds_seq = 'ATGACCTAG'
    downstream = 'TTTAAACCCTTT'

    record = SeqRecord(Seq(upstream + cds_seq + downstream, generic_dna))
    cds_start = len(upstream)
    cds_end = cds_start + len(cds_seq)
    cds_feature = SeqFeature(
            FeatureLocation(cds_start, cds_end, strand=1),
            type='CDS', id=1)
    record.features.append(cds_feature)

    gc_profile = GCContentFeatureProfile(
            cds_feature.id, record, sliding_window_size=20)

    memex = CodonUsageMemex(codon_table)
    replacer = GraphSearchCodonReplacer(
            forbidden_codons, memex, [gc_profile])

    result = replacer.replace_codons_in_feature(cds_feature.id, record)

    self.assertEqual('ATGACGTAA', str(result['new_feature_seq']))
def test_replacer__TGA_codon_mid_seq(self):
    """A mid-sequence TGA must not be replaced; the terminal one must.

    Per the original test description, an internal TGA codes for
    Selenocysteine. With 'TGA' forbidden, the internal TGA is expected to
    survive while the final TGA becomes 'TAG'.
    """
    forbidden_codons = ['TGA']

    # Minimal codon table for the test.
    codon_table = {
        '*': {'TGA': {}, 'TAG': {}},
    }
    memex = CodonUsageMemex(codon_table)

    # The feature spans the whole record.
    cds_seq = ''.join(['TTG', 'GCT', 'TGA', 'TTG', 'TGA'])
    record = SeqRecord(Seq(cds_seq, generic_dna))
    cds_feature = SeqFeature(
            FeatureLocation(0, len(cds_seq), strand=1),
            type='CDS', id='1')
    record.features.append(cds_feature)

    replacer = GraphSearchCodonReplacer(forbidden_codons, memex)
    result = replacer.replace_codons_in_feature(cds_feature.id, record)

    # The replacement must report success ...
    self.assertTrue(result['is_success'])

    # ... and only the terminal TGA may have changed.
    expected_seq = ''.join(['TTG', 'GCT', 'TGA', 'TTG', 'TAG'])
    self.assertEqual(expected_seq, str(result['new_feature_seq']))
def test_simple_codon_replacer(self):
    """Check ``SimpleCodonReplacer`` on a three-codon CDS.

    With 'ACC' and 'AGG' forbidden and exactly one alternative listed for
    each, 'ATGACCAGG' is expected to become 'ATGACUAGA'.
    """
    forbidden_codons = ['ACC', 'AGG']

    # Minimal codon table: one forbidden codon and one alternative each.
    codon_table = {
        'T': {
            'ACC': {},
            'ACU': {},
        },
        'R': {
            'AGG': {},
            'AGA': {},
        }
    }

    cds_seq = 'ATGACCAGG'
    downstream = 'TTTAAACCCTTT'
    record = SeqRecord(Seq(cds_seq + downstream, generic_dna))
    cds_feature = SeqFeature(
            FeatureLocation(0, len(cds_seq), strand=1),
            type='CDS', id=1)
    record.features.append(cds_feature)

    memex = CodonUsageMemex(codon_table)
    replacer = SimpleCodonReplacer(forbidden_codons, memex)

    result = replacer.replace_codons_in_feature(cds_feature.id, record)

    self.assertEqual('ATGACUAGA', str(result['new_feature_seq']))
def add_translation_info(genome, profile):
    '''Collect per-gene, per-codon and per-hexamer translation statistics.

    Walks every usable CDS feature of ``genome``, fetches its ribosome
    occupancy from ``profile`` via ``extract_region_profile`` and
    accumulates occupancy and RNA-seq (``rpkm``/``dpkm`` qualifier) figures
    per codon and per sliding hexamer. Per-codon totals are then normalised,
    rescaled relative to the maximum across codons (the ``*_cu_i`` values),
    combined with the cTE scores read from ``CTE_SCORE_FILE`` and rescaled
    once more to give the ``*_nte`` values.

    A CDS is silently skipped when it has no ``gene`` qualifier or is one of
    tufA/tufB/prfX/dnaX. It is skipped and recorded in ``bad_cds`` when it
    is shorter than 90 bases, not a multiple of three long, does not start
    with NTG, or does not end in TGA/TAA/TAG.

    Args:
        genome: SeqRecord-like object exposing ``features`` and ``seq``.
            Accepted features gain ``mean_rbs_occupancy`` and
            ``logmean_rbs_occupancy`` qualifiers (mutated in place).
        profile: Occupancy data; passed through unchanged to
            ``extract_region_profile``.

    Returns:
        Tuple ``(genome, codons, genes, hexamers, bad_cds)`` where
        ``codons``, ``genes`` and ``hexamers`` are defaultdicts of stat
        dicts keyed by codon / gene id / hexamer, and ``bad_cds`` maps a
        gene id to the reason the CDS was rejected.
    '''
    # Smallest positive normalised float; stands in for zero so that
    # logarithms and max() over empty data stay defined.
    FLOAT_MIN = sys.float_info.min

    codons = defaultdict(
        lambda: {
            'count': 0,
            'count_total': 0,
            'count_nterm': 0,
            'rnaseq_count': 0,
            'occup': 0,
            'logmean_occup': 0,
            'mean_occup': 0,
            'cte': 0,
            'rpkm': 0,
            'dpkm': 0,
            'rpkm_nterm': 0,
            'dpkm_nterm': 0,
            'occup_nterm': 0,
            'logmean_occup_nterm': 0,
            'mean_occup_nterm': 0
        })

    # NOTE(review): the defaults use 'mean_occup'-style keys while the loop
    # below stores 'mean_rbs_occupancy'-style keys -- confirm which set
    # callers read.
    genes = defaultdict(
        lambda: {
            'logmean_occup': 0,
            'mean_occup': 0,
            'logmean_occup_nterm': 0,
            'mean_occup_nterm': 0,
            'rpkm': 0,
            'dpkm': 0
        })

    hexamers = defaultdict(
        lambda: {
            'count': 0,
            'count_total': 0,
            'count_nterm': 0,
            'rnaseq_count': 0,
            'occup': 0,
            'logmean_occup': 0,
            'mean_occup': 0,
            'occup_nterm': 0,
            'logmean_occup_nterm': 0,
            'mean_occup_nterm': 0,
            'rpkm': 0,
            'dpkm': 0,
            'rpkm_nterm': 0,
            'dpkm_nterm': 0
        })

    bad_cds = {}

    for feature in genome.features:

        #only take CDS features:
        if feature.type != 'CDS':
            continue

        #skip if no gene id
        try:
            gene_id = feature.qualifiers['gene'][0]
        except KeyError:
            continue

        #Li/Weissman don't use these genes due to frameshift/homology
        if gene_id in ('tufA', 'tufB', 'prfX', 'dnaX'):
            continue

        strand = 'fwd' if feature.strand == 1 else 'rev'
        feature_len = len(feature)
        has_rnaseq = 'rpkm' in feature.qualifiers

        #skip if this gene is too short - less than 30 aa
        if feature_len < 90:
            bad_cds[gene_id] = 'Too short (%d)' % feature_len
            continue

        #get the DNA sequence for this feature
        feature_seq = genome.seq[slice(
                feature.location.start.position,
                feature.location.end.position)]
        if strand == 'rev':
            feature_seq = feature_seq.reverse_complement()

        #skip if this CDS is not a multiple of 3
        if len(feature_seq) % 3 != 0:
            bad_cds[gene_id] = 'Not codon divisible'
            continue

        #skip if this CDS does not start with 'NTG'
        if feature_seq[1:3].tostring() != 'TG':
            bad_cds[gene_id] = ('First codon is ' +
                    feature_seq[0:3].tostring())
            continue

        #skip if this CDS does not end with a stop codon (TGA, TAA, TAG)
        final_codon = feature_seq[feature_len - 3:feature_len].tostring()
        if final_codon not in ['TGA', 'TAA', 'TAG']:
            bad_cds[gene_id] = 'Last codon is ' + final_codon
            continue

        #get the ribosomal occupancy
        region_profile = extract_region_profile(
                profile, strand,
                feature.location.start.position,
                feature.location.end.position)
        profile_values = list(region_profile.values())

        #get ribosome occupancy for this gene, but we want to ignore the
        #first 10 and last 10 codons as Li does
        #replace 0 occupancy with float min to avoid log of 0 errors
        occupancies = [
                FLOAT_MIN if occ == 0 else occ
                for occ in profile_values[30:len(region_profile) - 30]]
        occupancies_nterm = [
                FLOAT_MIN if occ == 0 else occ
                for occ in profile_values[0:30]]

        #mean occup for this gene (and for first 10 aa)
        mean_occup = float(sum(occupancies)) / float(len(occupancies))
        mean_occup_nterm = float(sum(occupancies_nterm)) / float(30)

        #logmean occup for this gene (and for first 10 aa)
        logmean_occup = math.exp(
                sum(math.log(occ) for occ in occupancies) /
                float(len(occupancies)))
        logmean_occup_nterm = math.exp(
                sum(math.log(occ) for occ in occupancies_nterm) / float(30))

        #create new rbs occupancy qualifiers for this region
        feature.qualifiers['mean_rbs_occupancy'] = mean_occup
        feature.qualifiers['logmean_rbs_occupancy'] = logmean_occup

        gene_stats = genes[gene_id]
        gene_stats['mean_rbs_occupancy'] = mean_occup
        gene_stats['logmean_rbs_occupancy'] = logmean_occup
        gene_stats['mean_rbs_occupancy_nterm'] = mean_occup_nterm
        gene_stats['logmean_rbs_occupancy_nterm'] = logmean_occup_nterm

        if has_rnaseq:
            gene_stats['rpkm'] = feature.qualifiers['rpkm']
            gene_stats['dpkm'] = feature.qualifiers['dpkm']

        #for each codon in this feature (the final codon is not visited),
        #update occupancy and mRNA stats
        for codon_start, codon_end in zip(
                range(0, feature_len - 3, 3), range(3, feature_len, 3)):

            codon = feature_seq[codon_start:codon_end].tostring()
            codon_stats = codons[codon]

            # NOTE(review): `occupancies` has already had its first 30
            # positions trimmed, yet it is indexed with untrimmed feature
            # coordinates here -- confirm the intended offset.
            codon_occup = sum(occupancies[codon_start:codon_end + 1]) / 4

            codon_stats['count_total'] += 1
            codon_stats['mean_occup'] += mean_occup
            codon_stats['logmean_occup'] += logmean_occup

            #skip this if this feature does not have rna-seq data
            if has_rnaseq:
                codon_stats['rnaseq_count'] += 1

            #ignore the first and last 10 codons for occupancy measures
            if codon_start > 30 and (feature_len - codon_end > 30):
                codon_stats['count'] += 1
                codon_stats['occup'] += codon_occup / mean_occup
                # NOTE(review): mean/logmean were already added above, so
                # counted codons receive them twice -- confirm intent.
                codon_stats['mean_occup'] += mean_occup
                codon_stats['logmean_occup'] += logmean_occup
                if has_rnaseq:
                    codon_stats['rpkm'] += feature.qualifiers['rpkm']
                    codon_stats['dpkm'] += feature.qualifiers['dpkm']

            #but record the first 10 codons separately
            if codon_start < 30:
                codon_stats['count_nterm'] += 1
                codon_stats['occup_nterm'] += codon_occup / mean_occup
                codon_stats['mean_occup_nterm'] += mean_occup
                codon_stats['logmean_occup_nterm'] += logmean_occup
                if has_rnaseq:
                    codon_stats['rpkm_nterm'] += feature.qualifiers['rpkm']
                    codon_stats['dpkm_nterm'] += feature.qualifiers['dpkm']

        #do the same thing for all sliding hexamers
        # NOTE(review): windows starting in the last five positions run off
        # the end of the sequence and yield keys shorter than six bases.
        for hex_start, hex_end in zip(
                range(0, feature_len), range(6, feature_len + 6)):

            hexamer = feature_seq[hex_start:hex_end].tostring()
            hex_stats = hexamers[hexamer]
            hex_occup = sum(occupancies[hex_start:hex_end + 1]) / 7

            hex_stats['count_total'] += 1
            hex_stats['mean_occup'] += mean_occup
            hex_stats['logmean_occup'] += logmean_occup

            #some features do not have rnaseq data
            if has_rnaseq:
                hex_stats['rnaseq_count'] += 1

            #ignore the first and last 10 codons for occupancy measures
            if hex_start > 25 and (feature_len - hex_end > 30):
                hex_stats['count'] += 1
                hex_stats['occup'] += hex_occup / mean_occup
                hex_stats['mean_occup'] += mean_occup
                hex_stats['logmean_occup'] += logmean_occup
                if has_rnaseq:
                    hex_stats['rpkm'] += feature.qualifiers['rpkm']
                    hex_stats['dpkm'] += feature.qualifiers['dpkm']

            #but record the first 10 codons separately
            if hex_start < 25:
                hex_stats['count_nterm'] += 1
                # Fixed: this used `codon_occup`, a stale value left over
                # from the last iteration of the codon loop above.
                hex_stats['occup_nterm'] += hex_occup / mean_occup
                hex_stats['mean_occup_nterm'] += mean_occup
                hex_stats['logmean_occup_nterm'] += logmean_occup
                if has_rnaseq:
                    hex_stats['rpkm_nterm'] += feature.qualifiers['rpkm']
                    hex_stats['dpkm_nterm'] += feature.qualifiers['dpkm']

    #Next, let's compute the codon usage per amino acid
    codon_lookup = CodonUsageMemex(build_codon_usage_dict())
    aa_usage_counts = defaultdict(int)
    aa_usage_counts_nterm = defaultdict(int)

    #calc aa usages
    for codon, stats in codons.items():
        aa = codon_lookup.codon_to_aa_dict[codon]
        aa_usage_counts[aa] += stats['count']
        aa_usage_counts_nterm[aa] += stats['count_nterm']

    # divide by aa usages to get frequencies,
    # normalize occupancy and rna seq to count
    # NOTE(review): the divisions by 'count' and 'rnaseq_count' raise
    # ZeroDivisionError for a codon that was never counted.
    for codon, stats in codons.items():
        aa = codon_lookup.codon_to_aa_dict[codon]
        aa_total = aa_usage_counts[aa]
        aa_total_nterm = aa_usage_counts_nterm[aa]

        stats['freq'] = float(stats['count']) / float(aa_total)
        if aa_total_nterm > 0:
            stats['freq_nterm'] = (float(stats['count_nterm']) /
                    float(aa_total_nterm))
        else:
            stats['freq_nterm'] = 0

        stats['norm_occup'] = stats['occup'] / stats['count']
        # NOTE(review): divides by 'count', not 'count_nterm' -- looks like
        # a copy/paste slip but is left unchanged pending confirmation.
        stats['norm_occup_nterm'] = stats['occup_nterm'] / stats['count']

        stats['rpkm'] = stats['rpkm'] / stats['rnaseq_count']
        stats['dpkm'] = stats['dpkm'] / stats['rnaseq_count']
        stats['rpkm_nterm'] = stats['rpkm_nterm'] / stats['rnaseq_count']
        stats['dpkm_nterm'] = stats['dpkm_nterm'] / stats['rnaseq_count']
        stats['aa'] = aa

    #Finally, we need to rescale these into what Frydman calls the cu_i,
    #which is the translational level rescaled to 1 per AA

    #get adjusted values relative to max; FLOAT_MIN keeps max() defined
    #(and non-zero) when there are no codons
    get_max = lambda a_dict, a_key: max(
            [v[a_key] for v in a_dict.values()] + [FLOAT_MIN])

    max_m_occup = get_max(codons, 'mean_occup')
    max_m_occup_nt = get_max(codons, 'mean_occup_nterm')
    max_lm_occup = get_max(codons, 'logmean_occup')
    max_lm_occup_nt = get_max(codons, 'logmean_occup_nterm')
    max_drs_occup = get_max(codons, 'dpkm')
    max_drs_occup_nt = get_max(codons, 'dpkm_nterm')
    max_rs_occup = get_max(codons, 'rpkm')
    max_rs_occup_nt = get_max(codons, 'rpkm_nterm')

    for stats in codons.values():
        stats['mean_cu_i'] = stats['mean_occup'] / max_m_occup
        stats['logmean_cu_i'] = stats['logmean_occup'] / max_lm_occup
        stats['dpkm_cu_i'] = stats['dpkm'] / max_drs_occup
        stats['rpkm_cu_i'] = stats['rpkm'] / max_rs_occup
        stats['mean_cu_i_nt'] = stats['mean_occup_nterm'] / max_m_occup_nt
        stats['logmean_cu_i_nt'] = (
                stats['logmean_occup_nterm'] / max_lm_occup_nt)
        stats['dpkm_cu_i_nt'] = stats['dpkm_nterm'] / max_drs_occup_nt
        stats['rpkm_cu_i_nt'] = stats['rpkm_nterm'] / max_rs_occup_nt

    #get cTE scores and create nTE' scores
    # Each line holds a codon and its score, whitespace-separated. The file
    # is now closed deterministically.
    with open(CTE_SCORE_FILE) as cte_file:
        for line in cte_file:
            (codon, cte) = line.split()
            codons[codon]['cte'] = float(cte)

    # The N-terminal variants add FLOAT_MIN to the denominator; the
    # whole-gene variants do not.
    for stats in codons.values():
        cte = stats['cte']
        stats['mean_nte'] = cte / stats['mean_cu_i']
        stats['logmean_nte'] = cte / stats['logmean_cu_i']
        stats['drs_nte'] = cte / stats['dpkm_cu_i']
        stats['rs_nte'] = cte / stats['rpkm_cu_i']
        stats['mean_nte_nt'] = cte / (stats['mean_cu_i_nt'] + FLOAT_MIN)
        stats['logmean_nte_nt'] = cte / (
                stats['logmean_cu_i_nt'] + FLOAT_MIN)
        stats['drs_nte_nt'] = cte / (stats['dpkm_cu_i_nt'] + FLOAT_MIN)
        stats['rs_nte_nt'] = cte / (stats['rpkm_cu_i_nt'] + FLOAT_MIN)

    #get adjusted nTE' out of max to find nTE
    max_m_nte = get_max(codons, 'mean_nte')
    max_lm_nte = get_max(codons, 'logmean_nte')
    max_rs_nte = get_max(codons, 'rs_nte')
    max_drs_nte = get_max(codons, 'drs_nte')
    max_m_nte_nt = get_max(codons, 'mean_nte_nt')
    max_lm_nte_nt = get_max(codons, 'logmean_nte_nt')
    max_rs_nte_nt = get_max(codons, 'rs_nte_nt')
    max_drs_nte_nt = get_max(codons, 'drs_nte_nt')

    for stats in codons.values():
        stats['mean_nte'] = stats['mean_nte'] / max_m_nte
        stats['logmean_nte'] = stats['logmean_nte'] / max_lm_nte
        stats['rs_nte'] = stats['rs_nte'] / max_rs_nte
        stats['drs_nte'] = stats['drs_nte'] / max_drs_nte
        stats['mean_nte_nt'] = stats['mean_nte_nt'] / max_m_nte_nt
        stats['logmean_nte_nt'] = stats['logmean_nte_nt'] / max_lm_nte_nt
        stats['rs_nte_nt'] = stats['rs_nte_nt'] / max_rs_nte_nt
        # Fixed: this was computed from the already-normalised 'rs_nte_nt'
        # and max_rs_nte_nt instead of its own value and maximum.
        stats['drs_nte_nt'] = stats['drs_nte_nt'] / max_drs_nte_nt

    return (genome, codons, genes, hexamers, bad_cds)
def analyze_codon_usage(refactor_config_yaml, original_record_path,
        refactored_record_path):
    """Write a per-codon usage comparison of two genome records to CSV.

    Codon usage tables are computed for the original and the refactored
    record (three hard-coded feature ids are ignored for the refactored
    one) and written, one row per codon of the original table, to
    ``<refactored_record_path>.codon_usage_analysis.csv``.

    Args:
        refactor_config_yaml: Path to the YAML file describing this
            refactor; its 'forbidden_codons' and 'codon_usage' entries
            are read.
        original_record_path: Path handed to ``get_genome_record`` for the
            original genome.
        refactored_record_path: Path handed to ``get_genome_record`` for
            the refactored genome; also the prefix of the output file.
    """
    # NOTE(review): yaml.load without an explicit safe loader can build
    # arbitrary objects; acceptable only while the config is trusted.
    with open(refactor_config_yaml) as config_fh:
        config = yaml.load(config_fh)

    forbidden_codons = config['forbidden_codons']

    skipped_feature_ids = [
        'CDS_fdnG_ECMDS42_1186_1254228_1257276',
        'CDS_fdhF_ECMDS42_3518_3726321_3728469',
        'CDS_fdoG_ECMDS42_3333_3511985_3515036'
    ]

    def load_cds_record(record_path):
        # Both genomes are loaded with identical options.
        return get_genome_record(
                record_path,
                use_old_id_strategy=True,
                only_get_features=['CDS'])

    original_record = load_cds_record(original_record_path)
    refactored_record = load_cds_record(refactored_record_path)

    output_path = refactored_record_path + '.codon_usage_analysis.csv'

    column_names = [
        'codon',
        'attempted_remove',
        'amino_acid',
        'original_usage',
        'refactored_usage',
        'target_usage',
        'delta_usage',
        'original_count',
        'original_amino_acid_count',
        'refactored_count',
        'refactored_amino_acid_count',
    ]

    usage_report_path = os.path.join(CONFIG_DIR, config['codon_usage'])
    target_memex = CodonUsageMemex.build_from_removed_codons_usage_report(
            usage_report_path)

    print('Calculating original source usage...')
    original_usage_table = determine_codon_usage_table(original_record)

    print('Calculating refactored genome codon usage...')
    refactored_usage_table = determine_codon_usage_table(
            refactored_record, ignore_feature_ids=skipped_feature_ids)

    print('Writing result ...')
    two_decimals = "{0:.2f}".format
    with open(output_path, 'w') as csv_fh:
        writer = csv.DictWriter(csv_fh, column_names)
        writer.writeheader()
        for codon, before in original_usage_table.iteritems():
            after = refactored_usage_table[codon]
            usage_change = after['usage'] - before['usage']
            writer.writerow({
                'codon': codon,
                'attempted_remove': codon in forbidden_codons,
                'amino_acid': before['amino_acid'],
                'original_usage': two_decimals(before['usage']),
                'refactored_usage': two_decimals(after['usage']),
                'target_usage': target_memex.get_codon_usage(codon),
                'delta_usage': two_decimals(usage_change),
                'original_count': before['count'],
                'original_amino_acid_count': before['amino_acid_count'],
                'refactored_count': after['count'],
                'refactored_amino_acid_count': after['amino_acid_count'],
            })
def test_generate_permitted_feature_seq_variants(self):
    """Check the variants generated for full, partial and inner regions.

    The feature is 'ATGACCAGGACU' with 'ACC' and 'AGG' forbidden. Three
    kinds of region are exercised: the whole feature, a region running
    from inside the feature past its end, and a short region wholly
    inside the feature.
    """
    forbidden_codons = ['ACC', 'AGG']

    # Minimal codon table for the test.
    codon_table = {
        'M': {
            'ATG': {},
        },
        'T': {
            'ACC': {},
            'ATT': {},
            'ACU': {},
        },
        'R': {
            'AGG': {},
            'AGA': {},
            'GGG': {},
        }
    }
    memex = CodonUsageMemex(codon_table)
    memex.start_codons = ['ATG']

    cds_seq = 'ATGACCAGGACU'
    trailing_seq = 'TTTTCCCTTCGGTT'
    record = SeqRecord(cds_seq + trailing_seq)
    cds_feature = SeqFeature(
            FeatureLocation(0, len(cds_seq), strand=1),
            type='CDS', id=1)
    record.features.append(cds_feature)

    def check_region(region, expected_variants):
        # Generate variants for `region` and compare both the number of
        # results and their contents with the expected set.
        actual_variants = _generate_permitted_feature_seq_variants(
                cds_feature,
                None,  # other_feature
                'NA_test',  # conflict_type
                record,
                region,
                forbidden_codons,
                memex)
        self.assertEqual(len(expected_variants), len(actual_variants))
        self.assertEqual(expected_variants, set(actual_variants))

    # Region covers whole feature.
    check_region((0, len(cds_seq)), set([
        'ATGATTAGAACU',
        'ATGATTGGGACU',
        'ATGACUAGAACU',
        'ATGACUGGGACU',
        'ATGATTAGAATT',
        'ATGATTGGGATT',
        'ATGACUAGAATT',
        'ATGACUGGGATT',
    ]))

    # Region covers partial feature.
    partial_expected = set([
        'ATGACCAGAACU',
        'ATGACCGGGACU',
        'ATGACCAGAATT',
        'ATGACCGGGATT'
    ])
    for region_start in [6, 7, 8]:
        check_region((region_start, 15), partial_expected)

    # Region covers inner part of feature.
    inner_expected = set(['ATGACCAGAACU', 'ATGACCGGGACU'])
    for region_start in [6, 7, 8]:
        for region_end in range(region_start + 1, 9):
            check_region((region_start, region_end), inner_expected)
# Module-level settings, read from YAML_CONFIG_DICT and built at import time.

# Taken from the 'num_cores' config entry; defaults to 1 when absent.
NUM_CORES = YAML_CONFIG_DICT.get('num_cores', 1)

SELENOCYSTEINE_CODON = 'TGA'

# Design constraints based on Gen9 Rules.
# See http://gen9bio.com/faq/
# One limit per base (presumably the longest permitted run of that base --
# confirm against the code that consumes these).
HOMOPOLYMER_RUN_LIMIT_A = 8
HOMOPOLYMER_RUN_LIMIT_C = 8
HOMOPOLYMER_RUN_LIMIT_G = 5
HOMOPOLYMER_RUN_LIMIT_T = 8

# Required config entry: a missing 'forbidden_codons' key raises KeyError.
CODONS_TO_REMOVE = YAML_CONFIG_DICT['forbidden_codons']

# Path of the codon usage report, resolved relative to CONFIG_DIR.
CODON_USAGE_REPORT = os.path.join(CONFIG_DIR, YAML_CONFIG_DICT['codon_usage'])

# Memex built from the default usage dict, and one built from the usage
# report above.
ORIGINAL_CODON_USAGE_MEMEX = CodonUsageMemex(build_codon_usage_dict())
REFACTORED_CODON_USAGE_MEMEX = (
        CodonUsageMemex.build_from_removed_codons_usage_report(CODON_USAGE_REPORT))


###############################################################################
# Manual feature handling.
#
# Functions that capture manual changes that aren't handled by any other part
# of the pipeline. These should be executed before the rest of the refactoring
# pipeline. There might be a more generic way to do this, but just doing it
# manually for now until we figure that out.
###############################################################################

def handle_prfB(genome_record):