Beispiel #1
0
def handle_codon_usage(genes):
    """ Generate codon usage histograms

    For each of the marker codons AAA, GAA and CAA the per-gene codon usage
    values are collected, binned with a fixed bin width and written to
    'results/gene_codon_usages.json'. Afterwards the R plotting script is
    invoked (its output is discarded).

    :param genes: gene records, passed on to
        DNAAnalyzer.get_gene_codon_usages
    :raises subprocess.CalledProcessError: if the R script exits non-zero
    """
    def extract(marker, data):
        """ Extract marked codon usage from each gene

        Genes whose usage entry for the marker is None are skipped.
        """
        return [
            codu[marker] for codu in data.values()
            if codu[marker] is not None
        ]

    print('Computing codon statistics')
    dnana = DNAAnalyzer(strict=False)
    data = dnana.get_gene_codon_usages(genes)

    plot_data = []
    bin_width = 0.01
    for marker in ['AAA', 'GAA', 'CAA']:
        counts, edges = do_binning(extract(marker, data), bin_width)
        plot_data.append({
            'marker': marker,
            'counts': counts,
            'edges': edges
        })

    # Close (and thereby flush) the file before the R script is started, so
    # the script is guaranteed to see the complete JSON document.
    with open('results/gene_codon_usages.json', 'w') as fd:
        json.dump(plot_data, fd)

    print('Plotting')
    subprocess.check_call(
        ['Rscript', 'plotting/codon_usage_histogram_maker.R'],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL)
Beispiel #2
0
 def setUp(self):
     """ Create the analyzer, a single sequence and four gene records
     """
     gene_sequences = ('AAAAAAAAG', 'AAAAAGAAA', 'TTTTTCTTT', 'TTCTTTTTC')

     self.dnana = DNAAnalyzer()
     self.seq = 'AAAAAGAAA'
     self.genes = [SeqRecord(Seq(bases)) for bases in gene_sequences]
Beispiel #3
0
def get_codu_per_group(groups):
    """ Map every group label to the average codon usage of its genes

    :param groups: mapping of label -> genes
    :return: dict of label -> result of DNAAnalyzer.get_avg_codon_usage
    """
    analyzer = DNAAnalyzer(strict=False)
    return {
        label: analyzer.get_avg_codon_usage(members)
        for label, members in groups.items()
    }
def generate_codon_usage_summary(groups, out_fname):
    """ Write one pretty-printed codon usage table per group to a file

    Each group is written as its label on a line of its own, followed by
    the pretty-printed average codon usage and an empty line.

    :param groups: mapping of label -> genes
    :param out_fname: path of the file to (over)write
    """
    analyzer = DNAAnalyzer(strict=False)
    with open(out_fname, 'w') as out:
        for label, members in groups.items():
            usage_table = analyzer.get_avg_codon_usage(members)

            out.write(label + '\n')
            pprint.pprint(usage_table, stream=out)
            out.write('\n')
Beispiel #5
0
def main():
    """ Write the average codon usage of a FASTA file to a text table

    The FASTA file name is read from the first command line argument; the
    result goes to 'results/plain_codon_usage_table.txt'.
    """
    genes = FastaParser(sys.argv[1]).parse()
    codu = DNAAnalyzer(strict=False).get_avg_codon_usage(genes)

    with open('results/plain_codon_usage_table.txt', 'w') as table_file:
        output_data(codu, table_file)
Beispiel #6
0
def get_codu(genes, group):
    """ Compute average codon usage for all genes or for one expression group

    If exactly two command line arguments were given, the second one is
    read as an expression level file and only the genes of `group` are
    used. Otherwise all genes are used.

    :return: tuple of (codon usage, name of the selected group)
    """
    if len(sys.argv) == 3:
        exprs = extract_expression_levels(sys.argv[2])
    else:
        exprs = None

    if exprs is None:
        groups = {'all': genes}
        select = 'all'
    else:
        groups = group_expression_levels(genes, exprs)
        select = group

    analyzer = DNAAnalyzer(strict=False)
    return analyzer.get_avg_codon_usage(groups[select]), select
Beispiel #7
0
def store_low_CAA_genes(genes):
    """ Create list of genes where CAA usage < 0.9

    For every such gene the codon usages of AAA, GAA and CAA are stored
    together with the gene's LYS/GLU/GLN codon frequencies relative to the
    corresponding average frequencies over all genes. The result is written
    to 'results/low_CAA_genes.csv'.

    :param genes: gene records (their `id` and `seq` attributes are used)
    """
    # compute codon usage
    print('Computing codon statistics')
    dnana = DNAAnalyzer(strict=False)
    data = dnana.get_gene_codon_usages(genes)

    def compute_norm(gene, *args):
        """ Compute normalized occurrence frequency of aa

        Returns the summed count of the given codons, scaled to a sequence
        length of 1000.
        """
        all_codon_num = dnana._count_codons(str(gene.seq))
        aa_num = sum(all_codon_num[codon] for codon in args)
        norm = aa_num * 1000 / len(gene.seq)
        return norm

    avg_codon_freqs = dnana.get_codon_freqs(genes)
    # The per-amino-acid averages do not change per gene: compute them once.
    lys_avg = avg_codon_freqs['AAA'] + avg_codon_freqs['AAG']
    glu_avg = avg_codon_freqs['GAA'] + avg_codon_freqs['GAG']
    gln_avg = avg_codon_freqs['CAA'] + avg_codon_freqs['CAG']
    print('  LYS freq: %f\n' % lys_avg +
          '  GLU freq: %f\n' % glu_avg +
          '  GLN freq: %f' % gln_avg)

    # filter for genes
    low_CAA_genes = []
    for gene, codu in data.items():
        if codu['CAA'] is None or not codu['CAA'] < 0.9:
            continue

        lys_freq = (compute_norm(gene, 'AAA', 'AAG') / 1000) / lys_avg
        glu_freq = (compute_norm(gene, 'GAA', 'GAG') / 1000) / glu_avg
        gln_freq = (compute_norm(gene, 'CAA', 'CAG') / 1000) / gln_avg

        low_CAA_genes.append(
            (gene.id, extract_gene_name(gene), lys_freq, codu['AAA'],
             glu_freq, codu['GAA'], gln_freq, codu['CAA']))

    # store results
    # newline='' lets the csv module control line endings itself; without it
    # every row is followed by a blank line on platforms that translate
    # '\n' on write.
    with open('results/low_CAA_genes.csv', 'w', newline='') as fd:
        wrtr = csv.writer(fd)
        wrtr.writerow([
            'ID', 'name', 'LYS rel freq', 'CU: AAA', 'GLU rel freq', 'CU: GAA',
            'GLN rel freq', 'CU: CAA'
        ])
        wrtr.writerows(low_CAA_genes)
Beispiel #8
0
class TestCodonUsage(TestCase):
    """ Checks codon counting, codon usage and codon frequencies of DNAAnalyzer
    """
    def setUp(self):
        gene_sequences = ('AAAAAAAAG', 'AAAAAGAAA', 'TTTTTCTTT', 'TTCTTTTTC')

        self.dnana = DNAAnalyzer()
        self.seq = 'AAAAAGAAA'
        self.genes = [SeqRecord(Seq(bases)) for bases in gene_sequences]

    def test_codon_counter(self):
        """ Raw codon counts of the single test sequence
        """
        count = self.dnana._count_codons(self.seq)

        expected_counts = (('AAA', 2), ('AAG', 1), ('AAT', 0), ('AAC', 0))
        for codon, expected in expected_counts:
            self.assertEqual(count[codon], expected)

    def test_codon_usage(self):
        """ Codon usage of the single test sequence
        """
        codu = self.dnana.get_codon_usage(self.seq)

        for codon, expected in (('AAA', 0.6666), ('AAG', 0.3333)):
            self.assertEqual(round(codu[codon], 3), round(expected, 3))
        for codon in ('AAT', 'AAC'):
            self.assertEqual(codu[codon], None)

    def test_average_codon_usage(self):
        """ Codon usage averaged over all test genes
        """
        avg_codu = self.dnana.get_avg_codon_usage(self.genes)

        for codon, expected in (('AAA', 0.6666), ('AAG', 0.3333)):
            self.assertEqual(round(avg_codu[codon], 3), round(expected, 3))
        for codon in ('AAT', 'AAC'):
            self.assertEqual(avg_codu[codon], None)
        for codon in ('TTT', 'TTC'):
            self.assertEqual(round(avg_codu[codon], 3), round(0.5, 3))

    def test_codon_frequencies(self):
        """ Codon frequencies over all test genes
        """
        avg_cod_freqs = self.dnana.get_codon_freqs(self.genes)

        expected_freqs = (('AAA', 0.333), ('AAG', 0.167), ('TTT', 0.25),
                          ('TTC', 0.25))
        for codon, expected in expected_freqs:
            self.assertEqual(round(avg_cod_freqs[codon], 3), expected)
Beispiel #9
0
class LysineAbundanceFilter(BaseFilter):
    """ Accept only genes with more than 76.6 lysine codons (AAA or AAG) per 1000 bases
    """
    def __init__(self):
        self.dnaa = DNAAnalyzer(strict=False)

    def apply(self, record):
        """ Return True if the record's scaled lysine codon count exceeds 76.6
        """
        codon_counts = self.dnaa._count_codons(str(record.seq))
        lysine_codons = codon_counts['AAA'] + codon_counts['AAG']

        # scale the raw count to a sequence length of 1000
        return lysine_codons * 1000 / len(record.seq) > 76.6
Beispiel #10
0
def find_special_AAA_freqs(genes):
    """ Print length and AAA/AAG codon counts for a hard-coded set of gene IDs

    IDs that do not occur in `genes` are silently skipped.
    """
    # Alternative ID sets, kept for reference:
    #   AAA=0:
    #     ['DDB0305421|DDB_G0276433', 'DDB0347990|DDB_G0289359',
    #      'DDB0347948|DDB_G0270662', 'DDB0349097|DDB_G0279651',
    #      'DDB0306784|DDB_G0293038', 'DDB0218505|DDB_G0283527',
    #      'DDB0348150|DDB_G0285779', 'DDB0347690|DDB_G0286087']
    #   AAA=1:
    #     ['DDB0230164|DDB_G0293360', 'DDB0186263|DDB_G0284929',
    #      'DDB0232396|DDB_G0282423', 'DDB0238636|DDB_G0269008',
    #      'DDB0234236|DDB_G0289721', 'DDB0229439|DDB_G0270122']
    # Active set: long AAA=1
    id_filter = [
        'DDB0348668|DDB_G0276223',
        'DDB0307442|DDB_G0269954',
        'DDB0307413|DDB_G0269350',
        'DDB0308362|DDB_G0269090',
        'DDB0216219|DDB_G0269132',
    ]

    def get_record(gene_id):
        # first gene with a matching id, or None if there is none
        return next((gene for gene in genes if gene.id == gene_id), None)

    dnaa = DNAAnalyzer()
    for gid in id_filter:
        rec = get_record(gid)
        if rec is None:
            continue

        codon_counts = None
        print(rec.id)
        print(' ', 'gene length:', len(rec.seq))
        codon_counts = dnaa._count_codons(str(rec.seq))
        print(' ', 'AAA:', codon_counts['AAA'])
        print(' ', 'AAG:', codon_counts['AAG'])
        print()
Beispiel #11
0
def group_genes(Classifier, genes, fname_out):
    """ Group genes given in filename and save results elsewhere

    The genes are preprocessed and grouped by means of `Classifier`. Within
    each group, genes rejected by any active post-annotation filter are
    dropped; groups without remaining genes are skipped. The cumulative
    codon usage of every remaining group is written as JSON to
    `fname_out` inside `Classifier.RESULTS_DIR`.

    :param Classifier: classifier class providing `preprocess` and
        `RESULTS_DIR`; also handed to GeneGrouper
    :param genes: genes to group
    :param fname_out: file name of the JSON output (relative to
        `Classifier.RESULTS_DIR`)
    """
    gegro = GeneGrouper(Classifier)
    genes = Classifier.preprocess(genes)
    groups = gegro.group(genes)

    results = []
    filter_stats = collections.defaultdict(int)
    dnana = DNAAnalyzer(strict=False)
    for group_name, members in groups.items():
        # apply post-annotation filters
        filters = parse_filters(post_annotation=True)
        kept = []
        for gene in members:
            rejected = False
            # deliberately no early exit: every rejecting filter is counted
            for f in filters:
                if not f.skip and not f().apply(gene):
                    filter_stats[f.__name__] += 1
                    rejected = True
            if not rejected:
                kept.append(gene)

        if not kept:
            continue

        # compute codon usage
        results.append({
            'group': group_name,
            'cumulative_codon_usage': dnana.get_cum_codon_usage(kept)
        })

    if filter_stats:
        print('Post-Annotation filters:')
    for name, count in filter_stats.items():
        print(' ', name, '->', count)

    # Use a context manager so the output file is closed (and flushed)
    # deterministically instead of whenever the handle is garbage-collected.
    with open(os.path.join(Classifier.RESULTS_DIR, fname_out), 'w') as fd:
        json.dump(results, fd)
Beispiel #12
0
def stretch_codu_histogram(genes):
    """ Generate 2D histogram of stretch length versus codon usage

    For each codon pair the stretches consisting only of these codons are
    searched in every gene. Stretch lengths (in codons) and the usage of the
    pair's first codon within each stretch are binned, written to
    'results/longest_stretches.json' and plotted by an R script.
    """
    dnana = DNAAnalyzer()

    def get_stretches(gene, codons):
        """ Find stretches in ORF of given gene and codon usage
        """
        alternatives = '|'.join('(?:%s)' % codon for codon in codons)
        pattern = re.compile(r'((?:' + alternatives + ')+)')

        def first_codon_usage(gene, stretch):
            # usage of the first codon of the pair within the matched stretch
            return dnana.get_codon_usage(stretch.group())[codons[0]]

        # NOTE(review): `overlapped` is not a keyword of the standard
        # library's `re` finditer; presumably `re` is bound to the
        # third-party `regex` package here - confirm against the imports.
        matches = pattern.finditer(str(gene.seq), overlapped=True)
        return parse_stretches(gene, matches, first_codon_usage)

    data = []
    for codon_pair in [('CAA', 'CAG'), ('AAA', 'AAG'), ('AAT', 'AAC')]:
        stretch_lens = []
        stretch_codus = []

        for gene in genes:
            found, usages = get_stretches(gene, codon_pair)
            # stretch length is measured in codons (3 bases each)
            stretch_lens += [len(stretch) / 3. for stretch in found]
            stretch_codus += usages

        # make 2D-Histogram
        # NOTE(review): max() raises ValueError if no stretch was found for
        # this codon pair.
        longest = max(stretch_lens)
        coords = do_2d_binning(stretch_lens, stretch_codus, 1, 0.01, longest,
                               1)

        data.append({'codon': ','.join(codon_pair), 'data': coords})

    with open('results/longest_stretches.json', 'w') as out:
        json.dump(data, out)

    print('Plotting')
    plot_cmd = ['Rscript', 'plotting/stretch_histogram2.R']
    subprocess.check_call(plot_cmd,
                          stdout=subprocess.DEVNULL,
                          stderr=subprocess.DEVNULL)
Beispiel #13
0
 def __init__(self):
     """ Create the non-strict DNAAnalyzer instance stored on this object
     """
     self.dnaa = DNAAnalyzer(strict=False)