Example #1
0
    def test_get_synonymous_codons(self):
        """Smoke-test CodonUsageMemex.get_synonymous_codons().

        random.random is temporarily replaced by a stub that always returns
        0.1 (presumably so that any random choice made by the memex is
        deterministic -- confirm against CodonUsageMemex). The real function
        is put back when the test finishes, even on failure, so the stub
        cannot leak into other tests.

        NOTE(review): the test only prints the result for 'TTA'; it makes no
        assertion about it.
        """
        # Re-assign random function.
        import random

        def fake_random():
            return 0.1

        real_random = random.random
        random.random = fake_random
        try:
            CODON_USAGE_MEMEX = CodonUsageMemex(build_codon_usage_dict())

            # Parenthesised single-argument form prints the same thing under
            # both the Python 2 print statement and the print function.
            print(CODON_USAGE_MEMEX.get_synonymous_codons('TTA'))
        finally:
            # Always undo the monkeypatch.
            random.random = real_random
    def test_codon_rarity_profile(self):
        """Check the values computed by CodonRarityFeatureProfile.

        Builds a record made of a 15 bp upstream region followed by a 9 bp
        CDS, uses one small hand-written codon usage table as both the
        original and the refactored memex, and compares profile.values with
        hand-computed averages of five usage numbers each.

        The class-level sliding_window_size override is undone when the test
        finishes (even on failure) so it cannot leak into other tests.
        """
        # Override sliding_window_size for testing, remembering whether the
        # class defined its own value so the override can be undone exactly.
        profile_cls = CodonRarityFeatureProfile
        had_own_window_size = 'sliding_window_size' in vars(profile_cls)
        saved_window_size = vars(profile_cls).get('sliding_window_size')
        profile_cls.sliding_window_size = 15

        try:
            before = 'TTTAAACCCTTTGGG'
            feature_1_seq = 'ATGTTTGGG'
            whole_seq = before + feature_1_seq
            seq = Seq(whole_seq, generic_dna)
            seq_record = SeqRecord(seq)

            feature_1_loc = FeatureLocation(
                    len(before), len(before) + len(feature_1_seq), strand=1)
            feature_1 = SeqFeature(feature_1_loc, type='CDS', id=1)
            seq_record.features.append(feature_1)

            # Simple table for testing.
            AA_TO_CODON_LIST_DICT = {
                    'M': {
                        'ATG': {'usage': 1.0},
                    },
                    'T': {
                        'TTT': {'usage': 0.1},
                        'AAA': {'usage': 0.9},
                    },
                    'R': {
                        'CCC': {'usage': 0.8},
                        'GGG': {'usage': 0.1},
                        'TTT': {'usage': 0.1},
                    }
            }
            CODON_USAGE_MEMEX = CodonUsageMemex(AA_TO_CODON_LIST_DICT)

            kwargs = {
                    'original_codon_usage_memex': CODON_USAGE_MEMEX,
                    'refactored_codon_usage_memex': CODON_USAGE_MEMEX
            }

            profile = CodonRarityFeatureProfile(
                    feature_1.id, seq_record, **kwargs)

            # Each expected value is the mean of five codon usage numbers.
            # NOTE(review): this is an exact float comparison; switch to
            # per-element assertAlmostEqual if it ever proves brittle.
            EXPECTED_PROFILE_VALUES = [
                    (0.9 + 0.8 + 0.1 + 0.1 + 1.0) / 5,
                    (0.8 + 0.1 + 0.1 + 1.0 + 0.1) / 5,
                    (0.1 + 0.1 + 1.0 + 0.1 + 0.1) / 5,
            ]
            self.assertEqual(EXPECTED_PROFILE_VALUES, profile.values)
        finally:
            # Put the class attribute back exactly as it was found.
            if had_own_window_size:
                profile_cls.sliding_window_size = saved_window_size
            else:
                del profile_cls.sliding_window_size
Example #3
0
    def test_replacer__start_codons_unchanged(self):
        """Tests that the replacer doesn't swap out a forbidden codon that is
        serving as the start codon of the feature.
        """
        forbidden_codons = ['TTG', 'CTA']

        # Minimal synonymous-codon table for this test.
        codon_table = {
                'L': {codon: {} for codon in ('TTG', 'CTT', 'CTA')},
                '*': {codon: {} for codon in ('TAG', 'TAA', 'AGC')},
        }
        memex = CodonUsageMemex(codon_table)

        # The CDS spans the whole record; it starts with TTG and contains a
        # second, internal TTG.
        cds_seq = ''.join(['TTG', 'GCT', 'AAT', 'TTG', 'TAA'])
        record = SeqRecord(Seq(cds_seq, generic_dna))

        cds_feature = SeqFeature(
                FeatureLocation(0, len(cds_seq), strand=1),
                type='CDS', id='1')
        record.features.append(cds_feature)

        replacer = GraphSearchCodonReplacer(forbidden_codons, memex)

        # Run the replacement on the single feature.
        result = replacer.replace_codons_in_feature(cds_feature.id, record)

        # The replacer must report success.
        self.assertTrue(result['is_success'])

        # Only the internal TTG is swapped (for CTT); the leading TTG stays.
        expected_seq = ''.join(['TTG', 'GCT', 'AAT', 'CTT', 'TAA'])
        self.assertEqual(expected_seq, str(result['new_feature_seq']))
Example #4
0
    def test_graph_search_codon_replacer_gc_content(self):
        """Run GraphSearchCodonReplacer with a GC-content profile attached
        and check the sequence it produces for a small CDS.
        """
        forbidden_codons = ['ACC', 'AGG', 'TAG']

        # Minimal synonymous-codon table for this test.
        codon_table = {
                'T': {codon: {} for codon in ('ACC', 'AAA', 'ACG')},
                'R': {codon: {} for codon in ('AGG', 'ATT', 'AGC')},
                '*': {codon: {} for codon in ('TAG', 'TAA', 'AGC')},
        }

        # Record layout: upstream flank, then the CDS, then downstream flank.
        upstream = 'CCCAAAGGGCCCAAATTTAAAGGGCCC'
        cds_seq = 'ATGACCTAG'
        downstream = 'TTTAAACCCTTT'
        record = SeqRecord(Seq(upstream + cds_seq + downstream, generic_dna))

        cds_start = len(upstream)
        cds_end = cds_start + len(cds_seq)
        cds_feature = SeqFeature(
                FeatureLocation(cds_start, cds_end, strand=1),
                type='CDS', id=1)
        record.features.append(cds_feature)

        gc_profile = GCContentFeatureProfile(
                cds_feature.id, record, sliding_window_size=20)
        memex = CodonUsageMemex(codon_table)
        replacer = GraphSearchCodonReplacer(
                forbidden_codons, memex, [gc_profile])

        # Run the replacement on the single feature.
        result = replacer.replace_codons_in_feature(cds_feature.id, record)

        expected_seq = 'ATGACGTAA'
        self.assertEqual(expected_seq, str(result['new_feature_seq']))
Example #5
0
    def test_replacer__TGA_codon_mid_seq(self):
        """Tests that a TGA codon in the middle of a sequence is not
        replaced, as it codes for Selenocysteine.
        """
        forbidden_codons = ['TGA']

        # Minimal synonymous-codon table for this test.
        codon_table = {
                '*': {codon: {} for codon in ('TGA', 'TAG')},
        }
        memex = CodonUsageMemex(codon_table)

        # The CDS spans the whole record and holds two TGA codons: one
        # internal, one at the very end.
        cds_seq = ''.join(['TTG', 'GCT', 'TGA', 'TTG', 'TGA'])
        record = SeqRecord(Seq(cds_seq, generic_dna))

        cds_feature = SeqFeature(
                FeatureLocation(0, len(cds_seq), strand=1),
                type='CDS', id='1')
        record.features.append(cds_feature)

        replacer = GraphSearchCodonReplacer(forbidden_codons, memex)

        # Run the replacement on the single feature.
        result = replacer.replace_codons_in_feature(cds_feature.id, record)

        # The replacer must report success.
        self.assertTrue(result['is_success'])

        # Only the terminal TGA is swapped (for TAG); the internal TGA stays.
        expected_seq = ''.join(['TTG', 'GCT', 'TGA', 'TTG', 'TAG'])
        self.assertEqual(expected_seq, str(result['new_feature_seq']))
Example #6
0
    def test_simple_codon_replacer(self):
        """Check that SimpleCodonReplacer swaps each forbidden codon of a
        small CDS for the only alternative listed in the test table.
        """
        forbidden_codons = ['ACC', 'AGG']

        # Minimal synonymous-codon table: one forbidden codon and one
        # alternative per amino acid.
        codon_table = {
                'T': {codon: {} for codon in ('ACC', 'ACU')},
                'R': {codon: {} for codon in ('AGG', 'AGA')},
        }

        # The CDS sits at the start of the record, followed by filler.
        cds_seq = 'ATGACCAGG'
        filler = 'TTTAAACCCTTT'
        record = SeqRecord(Seq(cds_seq + filler, generic_dna))

        cds_feature = SeqFeature(
                FeatureLocation(0, len(cds_seq), strand=1),
                type='CDS', id=1)
        record.features.append(cds_feature)

        memex = CodonUsageMemex(codon_table)
        replacer = SimpleCodonReplacer(forbidden_codons, memex)

        # Run the replacement on the single feature.
        result = replacer.replace_codons_in_feature(cds_feature.id, record)

        expected_seq = 'ATGACUAGA'
        self.assertEqual(expected_seq, str(result['new_feature_seq']))
def add_translation_info(genome, profile):
    '''
    Generate per-gene, per-codon and per-hexamer translation statistics.

    Walks every usable CDS feature of `genome`, looks up its ribosome
    occupancy in `profile`, and accumulates occupancy and (when the feature
    carries 'rpkm'/'dpkm' qualifiers) expression statistics. Afterwards the
    per-codon numbers are normalized, cTE scores are read from
    CTE_SCORE_FILE, and nTE scores are derived from them.

    A CDS is skipped when it has no 'gene' qualifier, is one of the genes
    excluded below, is shorter than 90 nt, is not a multiple of 3 long, does
    not start with NTG, or does not end in a stop codon. All but the first
    two cases are recorded in bad_cds.

    Args:
        genome: Record whose `features` and `seq` are read. Each accepted
            CDS feature gets 'mean_rbs_occupancy' and
            'logmean_rbs_occupancy' qualifiers added in place.
        profile: Ribosome occupancy data, passed unchanged to
            extract_region_profile().

    Returns:
        A 5-tuple (genome, codons, genes, hexamers, bad_cds):
            genome: the object that was passed in (mutated in place).
            codons: dict mapping codon -> dict of statistics, including
                the cte/nte scores.
            genes: dict mapping gene id -> dict of occupancy statistics
                (plus rpkm/dpkm when available).
            hexamers: dict mapping each sliding 6-mer -> dict of statistics.
            bad_cds: dict mapping gene id -> reason the CDS was skipped.
    '''
    #smallest positive float; stands in for 0 to avoid log(0) and x/0
    float_min = sys.float_info.min

    codons = defaultdict(
        lambda: {
            'count': 0,
            'count_total': 0,
            'count_nterm': 0,
            'rnaseq_count': 0,
            'occup': 0,
            'logmean_occup': 0,
            'mean_occup': 0,
            'cte': 0,
            'rpkm': 0,
            'dpkm': 0,
            'rpkm_nterm': 0,
            'dpkm_nterm': 0,
            'occup_nterm': 0,
            'logmean_occup_nterm': 0,
            'mean_occup_nterm': 0
        })

    #NOTE(review): the loop below stores occupancy under '*_rbs_occupancy*'
    #keys, not under the '*_occup*' keys initialised here.
    genes = defaultdict(
        lambda: {
            'logmean_occup': 0,
            'mean_occup': 0,
            'logmean_occup_nterm': 0,
            'mean_occup_nterm': 0,
            'rpkm': 0,
            'dpkm': 0
        })

    hexamers = defaultdict(
        lambda: {
            'count': 0,
            'count_total': 0,
            'count_nterm': 0,
            'rnaseq_count': 0,
            'occup': 0,
            'logmean_occup': 0,
            'mean_occup': 0,
            'occup_nterm': 0,
            'logmean_occup_nterm': 0,
            'mean_occup_nterm': 0,
            'rpkm': 0,
            'dpkm': 0,
            'rpkm_nterm': 0,
            'dpkm_nterm': 0
        })

    bad_cds = {}

    for feature in genome.features:

        #only take CDS features:
        if feature.type != 'CDS':
            continue

        #skip if no gene id
        try:
            gene_id = feature.qualifiers['gene'][0]
        except KeyError:
            continue

        #Li/Weissman don't use these genes due to frameshift/homology
        if gene_id in ('tufA', 'tufB', 'prfX', 'dnaX'):
            continue

        #get the profile for this region
        strand = 'fwd' if feature.strand == 1 else 'rev'

        feature_len = len(feature)

        #skip if this gene is too short - less than 30 aa
        if feature_len < 90:
            bad_cds[gene_id] = 'Too short (%d)' % feature_len
            continue

        #get the DNA sequence for this feature
        feature_location_slice = slice(feature.location.start.position,
                                       feature.location.end.position)
        feature_seq = genome.seq[feature_location_slice]
        if strand == 'rev':
            feature_seq = feature_seq.reverse_complement()

        #skip if this CDS is not a multiple of 3
        if len(feature_seq) % 3 != 0:
            bad_cds[gene_id] = 'Not codon divisible'
            continue

        #skip if this CDS does not start with 'NTG'
        if str(feature_seq[1:3]) != 'TG':
            bad_cds[gene_id] = 'First codon is ' + str(feature_seq[0:3])
            continue

        #skip if this CDS does not end with a stop codon (TGA, TAA, TAG)
        final_codon = str(feature_seq[feature_len - 3:feature_len])
        if final_codon not in ('TGA', 'TAA', 'TAG'):
            bad_cds[gene_id] = 'Last codon is ' + final_codon
            continue

        #get the ribosomal occupancy
        region_profile = extract_region_profile(
            profile, strand, feature.location.start.position,
            feature.location.end.position)

        #get ribosome occupancy for this gene, but we want to ignore the
        #first 10 and last 10 codons as Li does
        profile_values = list(region_profile.values())
        occupancies = profile_values[30:len(region_profile) - 30]
        occupancies_nterm = profile_values[0:30]

        #replace 0 occupancy with float min to avoid log of 0 errors
        occupancies = [float_min if occ == 0 else occ
                       for occ in occupancies]
        occupancies_nterm = [float_min if occ == 0 else occ
                             for occ in occupancies_nterm]

        #mean occup for this gene (and for first 10 aa)
        mean_occup = float(sum(occupancies)) / float(len(occupancies))
        mean_occup_nterm = float(sum(occupancies_nterm)) / float(30)

        #logmean occup for this gene (and for first 10 aa)
        logmean_occup = math.exp(
            sum(math.log(occ) for occ in occupancies) /
            float(len(occupancies)))
        logmean_occup_nterm = math.exp(
            sum(math.log(occ) for occ in occupancies_nterm) / float(30))

        #create new rbs occupancy qualifiers for this region
        feature.qualifiers['mean_rbs_occupancy'] = mean_occup
        feature.qualifiers['logmean_rbs_occupancy'] = logmean_occup
        genes[gene_id]['mean_rbs_occupancy'] = mean_occup
        genes[gene_id]['logmean_rbs_occupancy'] = logmean_occup
        genes[gene_id]['mean_rbs_occupancy_nterm'] = mean_occup_nterm
        genes[gene_id]['logmean_rbs_occupancy_nterm'] = logmean_occup_nterm

        #some features do not have rna-seq data
        has_rnaseq = 'rpkm' in feature.qualifiers
        if has_rnaseq:
            rpkm = feature.qualifiers['rpkm']
            dpkm = feature.qualifiers['dpkm']
            genes[gene_id]['rpkm'] = rpkm
            genes[gene_id]['dpkm'] = dpkm

        #for each codon in this feature, update occupancy and mRNA stats;
        #the range bound leaves out the final codon of the feature
        for codon_start in range(0, feature_len - 3, 3):
            codon_end = codon_start + 3

            codon = str(feature_seq[codon_start:codon_end])
            #NOTE(review): `occupancies` was already trimmed by 30 positions
            #at each end, yet it is indexed here with feature-relative
            #coordinates, and the slice spans 4 positions for a 3-nt codon.
            #Left as found - confirm against the intended method.
            codon_occup = sum(occupancies[codon_start:codon_end + 1]) / 4
            codon_stats = codons[codon]
            codon_stats['count_total'] += 1
            codon_stats['mean_occup'] += mean_occup
            codon_stats['logmean_occup'] += logmean_occup

            #skip this if this feature does not have rna-seq data
            if has_rnaseq:
                codon_stats['rnaseq_count'] += 1

            #ignore the first and last 10 codons for occupancy measures
            #NOTE(review): mean_occup/logmean_occup are added a second time
            #here for interior codons. Left as found.
            if codon_start > 30 and (feature_len - codon_end > 30):
                codon_stats['count'] += 1
                codon_stats['occup'] += codon_occup / mean_occup
                codon_stats['mean_occup'] += mean_occup
                codon_stats['logmean_occup'] += logmean_occup
                if has_rnaseq:
                    codon_stats['rpkm'] += rpkm
                    codon_stats['dpkm'] += dpkm

            #but record the first 10 codons separately
            if codon_start < 30:
                codon_stats['count_nterm'] += 1
                codon_stats['occup_nterm'] += codon_occup / mean_occup
                codon_stats['mean_occup_nterm'] += mean_occup
                codon_stats['logmean_occup_nterm'] += logmean_occup
                if has_rnaseq:
                    codon_stats['rpkm_nterm'] += rpkm
                    codon_stats['dpkm_nterm'] += dpkm

        #do the same thing for all sliding hexamers (windows starting near
        #the end of the feature are shorter than 6 nt)
        for hex_start in range(0, feature_len):
            hex_end = hex_start + 6

            hexamer = str(feature_seq[hex_start:hex_end])
            hex_occup = sum(occupancies[hex_start:hex_end + 1]) / 7
            hex_stats = hexamers[hexamer]
            hex_stats['count_total'] += 1
            hex_stats['mean_occup'] += mean_occup
            hex_stats['logmean_occup'] += logmean_occup

            #some features do not have rnaseq data
            if has_rnaseq:
                hex_stats['rnaseq_count'] += 1

            #ignore the first and last 10 codons for occupancy measures
            if hex_start > 25 and (feature_len - hex_end > 30):
                hex_stats['count'] += 1
                hex_stats['occup'] += hex_occup / mean_occup
                hex_stats['mean_occup'] += mean_occup
                hex_stats['logmean_occup'] += logmean_occup
                if has_rnaseq:
                    hex_stats['rpkm'] += rpkm
                    hex_stats['dpkm'] += dpkm

            #but record the first 10 codons separately
            if hex_start < 25:
                hex_stats['count_nterm'] += 1
                #use this hexamer's own occupancy (previously the stale
                #codon_occup left over from the codon loop was used)
                hex_stats['occup_nterm'] += hex_occup / mean_occup
                hex_stats['mean_occup_nterm'] += mean_occup
                hex_stats['logmean_occup_nterm'] += logmean_occup
                if has_rnaseq:
                    hex_stats['rpkm_nterm'] += rpkm
                    hex_stats['dpkm_nterm'] += dpkm

    #Next, let's compute the codon usage per amino acid
    codon_lookup = CodonUsageMemex(build_codon_usage_dict())
    aa_usage_counts = defaultdict(int)
    aa_usage_counts_nterm = defaultdict(int)

    #calc aa usages
    for codon, stats in codons.items():
        aa = codon_lookup.codon_to_aa_dict[codon]
        aa_usage_counts[aa] += stats['count']
        aa_usage_counts_nterm[aa] += stats['count_nterm']

    # divide by aa usages to get frequencies,
    # normalize occupancy and rna seq to count
    for codon, stats in codons.items():
        aa = codon_lookup.codon_to_aa_dict[codon]
        aa_total = aa_usage_counts[aa]
        aa_total_nterm = aa_usage_counts_nterm[aa]
        stats['freq'] = float(stats['count']) / float(aa_total)
        if aa_total_nterm > 0:
            stats['freq_nterm'] = (float(stats['count_nterm']) /
                                   float(aa_total_nterm))
        else:
            stats['freq_nterm'] = 0

        stats['norm_occup'] = stats['occup'] / stats['count']
        #occup_nterm is accumulated once per N-terminal occurrence, so it is
        #normalized by count_nterm (it was divided by the interior 'count');
        #guarded like freq_nterm for codons never seen near the N-terminus
        if stats['count_nterm'] > 0:
            stats['norm_occup_nterm'] = (stats['occup_nterm'] /
                                         stats['count_nterm'])
        else:
            stats['norm_occup_nterm'] = 0

        stats['rpkm'] = stats['rpkm'] / stats['rnaseq_count']
        stats['dpkm'] = stats['dpkm'] / stats['rnaseq_count']
        stats['rpkm_nterm'] = stats['rpkm_nterm'] / stats['rnaseq_count']
        stats['dpkm_nterm'] = stats['dpkm_nterm'] / stats['rnaseq_count']

        stats['aa'] = aa

    #Finally, we need to rescale these into what Frydman calls the cu_i,
    #which is the translational level rescaled to 1 per AA

    def get_max(a_dict, a_key):
        #largest value of a_key over all entries, never below float min so
        #it is always safe to divide by
        return max([v[a_key] for v in a_dict.values()] + [float_min])

    #get adjusted values relative to max
    max_m_occup = get_max(codons, 'mean_occup')
    max_m_occup_nt = get_max(codons, 'mean_occup_nterm')
    max_lm_occup = get_max(codons, 'logmean_occup')
    max_lm_occup_nt = get_max(codons, 'logmean_occup_nterm')
    max_drs_occup = get_max(codons, 'dpkm')
    max_drs_occup_nt = get_max(codons, 'dpkm_nterm')
    max_rs_occup = get_max(codons, 'rpkm')
    max_rs_occup_nt = get_max(codons, 'rpkm_nterm')

    for stats in codons.values():
        stats['mean_cu_i'] = stats['mean_occup'] / max_m_occup
        stats['logmean_cu_i'] = stats['logmean_occup'] / max_lm_occup
        stats['dpkm_cu_i'] = stats['dpkm'] / max_drs_occup
        stats['rpkm_cu_i'] = stats['rpkm'] / max_rs_occup

        stats['mean_cu_i_nt'] = stats['mean_occup_nterm'] / max_m_occup_nt
        stats['logmean_cu_i_nt'] = (stats['logmean_occup_nterm'] /
                                    max_lm_occup_nt)
        stats['dpkm_cu_i_nt'] = stats['dpkm_nterm'] / max_drs_occup_nt
        stats['rpkm_cu_i_nt'] = stats['rpkm_nterm'] / max_rs_occup_nt

    #get cTE scores and create nTE' scores; each line of the file is
    #"<codon> <cte>". The file is closed as soon as it has been read.
    with open(CTE_SCORE_FILE) as cte_file:
        for line in cte_file:
            (codon, cte) = line.split()
            codons[codon]['cte'] = float(cte)

    for codon, stats in codons.items():
        stats['mean_nte'] = stats['cte'] / stats['mean_cu_i']
        stats['logmean_nte'] = stats['cte'] / stats['logmean_cu_i']
        stats['drs_nte'] = stats['cte'] / stats['dpkm_cu_i']
        stats['rs_nte'] = stats['cte'] / stats['rpkm_cu_i']

        #the N-terminal cu_i values may be 0, hence the float-min offset
        stats['mean_nte_nt'] = stats['cte'] / (stats['mean_cu_i_nt'] +
                                               float_min)
        stats['logmean_nte_nt'] = stats['cte'] / (stats['logmean_cu_i_nt'] +
                                                  float_min)
        stats['drs_nte_nt'] = stats['cte'] / (stats['dpkm_cu_i_nt'] +
                                              float_min)
        stats['rs_nte_nt'] = stats['cte'] / (stats['rpkm_cu_i_nt'] +
                                             float_min)

    #get adjusted nTE' out of max to find nTE
    max_m_nte = get_max(codons, 'mean_nte')
    max_lm_nte = get_max(codons, 'logmean_nte')
    max_rs_nte = get_max(codons, 'rs_nte')
    max_drs_nte = get_max(codons, 'drs_nte')

    max_m_nte_nt = get_max(codons, 'mean_nte_nt')
    max_lm_nte_nt = get_max(codons, 'logmean_nte_nt')
    max_rs_nte_nt = get_max(codons, 'rs_nte_nt')
    max_drs_nte_nt = get_max(codons, 'drs_nte_nt')

    for stats in codons.values():
        stats['mean_nte'] = stats['mean_nte'] / max_m_nte
        stats['logmean_nte'] = stats['logmean_nte'] / max_lm_nte
        stats['rs_nte'] = stats['rs_nte'] / max_rs_nte
        stats['drs_nte'] = stats['drs_nte'] / max_drs_nte

        stats['mean_nte_nt'] = stats['mean_nte_nt'] / max_m_nte_nt
        stats['logmean_nte_nt'] = stats['logmean_nte_nt'] / max_lm_nte_nt
        stats['rs_nte_nt'] = stats['rs_nte_nt'] / max_rs_nte_nt
        #normalize drs_nte_nt by its own maximum (it was previously
        #overwritten with the already-normalized rs_nte_nt / max_rs_nte_nt)
        stats['drs_nte_nt'] = stats['drs_nte_nt'] / max_drs_nte_nt

    return (genome, codons, genes, hexamers, bad_cds)
Example #8
0
def analyze_codon_usage(refactor_config_yaml, original_record_path,
                        refactored_record_path):
    """Compare codon usage of a genome before and after refactoring.

    Loads both genome records (CDS features only), computes a codon usage
    table for each, and writes one CSV row per codon of the original table
    to ``<refactored_record_path>.codon_usage_analysis.csv``.

    Args:
        refactor_config_yaml: File containing the configuration for this
            particular refactor. The keys 'forbidden_codons' and
            'codon_usage' are read from it.
        original_record_path: Location of the original genome record,
            passed to get_genome_record().
        refactored_record_path: Location of the refactored genome record,
            passed to get_genome_record(); also the prefix of the output
            file name.
    """
    # NOTE(review): yaml.load() without an explicit Loader can construct
    # arbitrary Python objects; consider yaml.safe_load() if this config can
    # ever come from an untrusted source.
    with open(refactor_config_yaml) as config_fh:
        config = yaml.load(config_fh)

    forbidden_codons = config['forbidden_codons']

    # Features left out of the refactored genome's usage calculation.
    skipped_feature_ids = [
        'CDS_fdnG_ECMDS42_1186_1254228_1257276',
        'CDS_fdhF_ECMDS42_3518_3726321_3728469',
        'CDS_fdoG_ECMDS42_3333_3511985_3515036'
    ]

    original_record = get_genome_record(
        original_record_path,
        use_old_id_strategy=True,
        only_get_features=['CDS'])

    refactored_record = get_genome_record(
        refactored_record_path,
        use_old_id_strategy=True,
        only_get_features=['CDS'])

    report_path = refactored_record_path + '.codon_usage_analysis.csv'
    columns = [
        'codon',
        'attempted_remove',
        'amino_acid',
        'original_usage',
        'refactored_usage',
        'target_usage',
        'delta_usage',
        'original_count',
        'original_amino_acid_count',
        'refactored_count',
        'refactored_amino_acid_count',
    ]

    usage_report_path = os.path.join(CONFIG_DIR, config['codon_usage'])
    target_memex = CodonUsageMemex.build_from_removed_codons_usage_report(
        usage_report_path)

    print('Calculating original source usage...')
    usage_before = determine_codon_usage_table(original_record)

    print('Calculating refactored genome codon usage...')
    usage_after = determine_codon_usage_table(
        refactored_record, ignore_feature_ids=skipped_feature_ids)

    # Usage values are reported with two decimal places.
    two_decimals = "{0:.2f}".format

    print('Writing result ...')
    with open(report_path, 'w') as report_fh:
        report = csv.DictWriter(report_fh, columns)
        report.writeheader()

        for codon, before in usage_before.iteritems():
            after = usage_after[codon]
            report.writerow({
                'codon': codon,
                'attempted_remove': codon in forbidden_codons,
                'amino_acid': before['amino_acid'],
                'original_usage': two_decimals(before['usage']),
                'refactored_usage': two_decimals(after['usage']),
                'target_usage': target_memex.get_codon_usage(codon),
                'delta_usage': two_decimals(
                    after['usage'] - before['usage']),
                'original_count': before['count'],
                'original_amino_acid_count': before['amino_acid_count'],
                'refactored_count': after['count'],
                'refactored_amino_acid_count': after['amino_acid_count'],
            })
    def test_generate_permitted_feature_seq_variants(self):
        """Check _generate_permitted_feature_seq_variants() for regions that
        cover the whole feature, its tail, and a single inner codon.
        """
        forbidden_codons = ['ACC', 'AGG']

        # Minimal synonymous-codon table for this test.
        codon_table = {
            'M': {'ATG': {}},
            'T': {codon: {} for codon in ('ACC', 'ATT', 'ACU')},
            'R': {codon: {} for codon in ('AGG', 'AGA', 'GGG')},
        }
        memex = CodonUsageMemex(codon_table)
        memex.start_codons = ['ATG']

        # The CDS sits at the start of the record, followed by filler.
        cds_seq = 'ATGACCAGGACU'
        filler = 'TTTTCCCTTCGGTT'
        record = SeqRecord(cds_seq + filler)

        cds_feature = SeqFeature(
            FeatureLocation(0, len(cds_seq), strand=1), type='CDS', id=1)
        record.features.append(cds_feature)

        def check_region(region, expected_variants):
            # Generate the variants for `region` and compare both the number
            # of results and their contents with the expected set.
            actual_variants = _generate_permitted_feature_seq_variants(
                cds_feature,
                None,  # other_feature
                'NA_test',  # conflict_type
                record,
                region,
                forbidden_codons,
                memex)
            self.assertEqual(len(expected_variants), len(actual_variants))
            self.assertEqual(expected_variants, set(actual_variants))

        # Region covers whole feature.
        check_region(
            (0, len(cds_seq)),
            {
                'ATGATTAGAACU',
                'ATGATTGGGACU',
                'ATGACUAGAACU',
                'ATGACUGGGACU',
                'ATGATTAGAATT',
                'ATGATTGGGATT',
                'ATGACUAGAATT',
                'ATGACUGGGATT',
            })

        # Region covers partial feature.
        tail_variants = {
            'ATGACCAGAACU', 'ATGACCGGGACU', 'ATGACCAGAATT', 'ATGACCGGGATT'
        }
        for region_start in (6, 7, 8):
            check_region((region_start, 15), tail_variants)

        # Region covers inner part of feature.
        inner_variants = {'ATGACCAGAACU', 'ATGACCGGGACU'}
        for region_start in (6, 7, 8):
            for region_end in range(region_start + 1, 9):
                check_region((region_start, region_end), inner_variants)
Example #10
0
# Value of the 'num_cores' config entry; falls back to 1 when it is absent.
NUM_CORES = YAML_CONFIG_DICT.get('num_cores', 1)

# Codon treated as selenocysteine-coding.
# NOTE(review): TGA is also a stop codon; how the two roles are told apart is
# decided by the code that uses this constant - confirm there.
SELENOCYSTEINE_CODON = 'TGA'

# Design constraints based on Gen9 Rules.
# See http://gen9bio.com/faq/
# One limit per nucleotide; presumably the longest run of that base allowed
# in a synthesized sequence - verify against the code that enforces them.
HOMOPOLYMER_RUN_LIMIT_A = 8
HOMOPOLYMER_RUN_LIMIT_C = 8
HOMOPOLYMER_RUN_LIMIT_G = 5
HOMOPOLYMER_RUN_LIMIT_T = 8

# Codons listed under 'forbidden_codons' in the config (required key).
CODONS_TO_REMOVE = YAML_CONFIG_DICT['forbidden_codons']
# Path of the codon usage report named by the 'codon_usage' config entry,
# resolved relative to CONFIG_DIR.
CODON_USAGE_REPORT = os.path.join(CONFIG_DIR, YAML_CONFIG_DICT['codon_usage'])

# Codon usage lookup built from build_codon_usage_dict(); created at import
# time.
ORIGINAL_CODON_USAGE_MEMEX = CodonUsageMemex(build_codon_usage_dict())

# Codon usage lookup built from CODON_USAGE_REPORT; also created at import
# time.
REFACTORED_CODON_USAGE_MEMEX = (
    CodonUsageMemex.build_from_removed_codons_usage_report(CODON_USAGE_REPORT))

###############################################################################
# Manual feature handling.
#
# Functions that capture manual changes that aren't handled by any other part
# of the pipeline. These should be executed before the rest of the refactoring
# pipeline. There might be a more generic way to do this, but just doing it
# manually for now until we figure that out.
###############################################################################


def handle_prfB(genome_record):