Exemple #1
0
def find_longest_A_stretch(genes, threshold=8):
    """ Find the longest run of consecutive 'A' in every gene

    Two JSON files are written to the current working directory:

    * `longest_A_stretches.json`: one entry per gene whose longest run is
      strictly longer than `threshold`, ordered from longest to shortest
    * `all_A_stretch_lengths.json`: lengths of all runs (two or more 'A')
      found in any gene

    Genes that contain no run at all are skipped.

    :param genes: iterable of records providing `.seq` and `.id`
    :param threshold: minimal run length (exclusive) for a gene to be
        listed in `longest_A_stretches.json`
    """
    regex = re.compile(r"(A+A)")  # two or more consecutive 'A'

    data = []
    A_lengths = []
    for gene in genes:
        stretches = regex.findall(str(gene.seq))
        if not stretches:
            # max() of an empty list would raise ValueError
            continue
        A_lengths.extend(len(s) for s in stretches)

        longest_stretch = max(stretches, key=len)
        if len(longest_stretch) > threshold:
            data.append({
                'gene_id': gene.id,
                'gene_name': utils.extract_gene_name(gene),
                'gene_len': len(gene.seq),
                'stretch': longest_stretch,
                'stretch_len': len(longest_stretch)
            })

    # longest first; sort ascending and reverse so that the relative order
    # of equally long stretches stays the same as before
    data.sort(key=operator.itemgetter('stretch_len'))
    data.reverse()

    with open('longest_A_stretches.json', 'w') as fd:
        json.dump(data, fd)

    with open('all_A_stretch_lengths.json', 'w') as fd:
        json.dump(A_lengths, fd)
Exemple #2
0
    def test_gene_name_extraction(self):
        """ `utils.extract_gene_name` must yield 'argE' for a record whose
        description contains `gene: argE`
        """
        header = 'DDB0191165|DDB_G0267380 |DNA coding sequence|gene: argE on chromosome: 1 position 414980 to 416538'
        rec = SeqRecord(Seq('AGTC'), description=header)

        extracted = utils.extract_gene_name(rec)
        self.assertEqual(extracted, 'argE')
Exemple #3
0
def store_low_CAA_genes(genes):
    """ Create list of genes where CAA usage < 0.9

    For every gene whose CAA codon usage is defined and below 0.9, the
    LYS (AAA/AAG), GLU (GAA/GAG) and GLN (CAA/CAG) frequencies of that gene
    are divided by the corresponding values from `get_codon_freqs(genes)`.
    The result is written, together with the AAA/GAA/CAA codon usages, to
    `results/low_CAA_genes.csv`.

    :param genes: records passed on to `DNAAnalyzer`; each needs `.seq`
        and `.id`
    """
    # compute codon usage
    print('Computing codon statistics')
    dnana = DNAAnalyzer(strict=False)
    data = dnana.get_gene_codon_usages(genes)

    def compute_norm(gene, *codons):
        """ Compute normalized occurrence frequency of aa
        """
        all_codon_num = dnana._count_codons(str(gene.seq))
        aa_num = sum(all_codon_num[codon] for codon in codons)
        # NOTE(review): divides by the sequence length (not the number of
        # codons) -- confirm this matches the scale of `get_codon_freqs`
        norm = aa_num * 1000 / len(gene.seq)
        return norm

    # reference frequencies are the same for every gene, compute them once
    avg_codon_freqs = dnana.get_codon_freqs(genes)
    lys_avg = avg_codon_freqs['AAA'] + avg_codon_freqs['AAG']
    glu_avg = avg_codon_freqs['GAA'] + avg_codon_freqs['GAG']
    gln_avg = avg_codon_freqs['CAA'] + avg_codon_freqs['CAG']
    print('  LYS freq: %f\n'
          '  GLU freq: %f\n'
          '  GLN freq: %f' % (lys_avg, glu_avg, gln_avg))

    # filter for genes
    low_CAA_genes = []
    for gene, codu in data.items():
        if codu['CAA'] is not None and codu['CAA'] < 0.9:
            lys_freq = (compute_norm(gene, 'AAA', 'AAG') / 1000) / lys_avg
            glu_freq = (compute_norm(gene, 'GAA', 'GAG') / 1000) / glu_avg
            gln_freq = (compute_norm(gene, 'CAA', 'CAG') / 1000) / gln_avg

            low_CAA_genes.append(
                (gene.id, extract_gene_name(gene), lys_freq, codu['AAA'],
                 glu_freq, codu['GAA'], gln_freq, codu['CAA']))

    # store results
    # newline='' lets the csv module control line endings itself; without
    # it, extra blank rows appear on platforms that translate '\n'
    with open('results/low_CAA_genes.csv', 'w', newline='') as fd:
        wrtr = csv.writer(fd)
        wrtr.writerow([
            'ID', 'name', 'LYS rel freq', 'CU: AAA', 'GLU rel freq', 'CU: GAA',
            'GLN rel freq', 'CU: CAA'
        ])
        wrtr.writerows(low_CAA_genes)
Exemple #4
0
    def get_direct_annotation(self, record):
        """ Extract direct annotation information from amigo query result

        Only the first row of the result table is inspected. The lower-cased
        text of every link in that row which points to a GO term page is
        collected.

        :param record: record whose gene name is used for the amigo query
        :return: list of annotation strings; empty if the query result
            has no result table, no table body or no rows
        """
        go_term_prefix = 'http://amigo.geneontology.org/amigo/term/GO:'

        # presumably a BeautifulSoup tree -- `find` yields None on no match
        soup = self._query_amigo(utils.extract_gene_name(record))

        table = soup.find('table',
                          attrs={'class': 'bbop-js-search-pane-results-table'})
        if table is None:
            return []

        table_body = table.find('tbody')
        if table_body is None:
            return []

        rows = table_body.find_all('tr')
        if not rows:
            return []

        # only check first row; anchors without `href` are skipped
        return [
            ele.text.lower() for ele in rows[0].find_all('a')
            if ele.get('href', '').startswith(go_term_prefix)
        ]
Exemple #5
0
    def get_groupname(self, record):
        """ Collect the names of all groups the record belongs to

        The list always starts with 'all'. 'rte' is added if the gene name
        ends with '_RTE', followed by every entry of `self.keywords` that
        occurs (case-insensitively) in the record's manual annotations.
        If nothing besides 'all' was found, 'other' is added.
        """
        groups = ['all']

        if extract_gene_name(record).endswith('_RTE'):
            groups.append('rte')

        # search all manual annotations at once
        haystack = ' | '.join(record.annotations['manual']).lower()
        groups.extend(
            keyword for keyword in self.keywords
            if keyword.lower() in haystack)

        if groups == ['all']:
            groups.append('other')

        return groups
Exemple #6
0
 def apply(self, record):
     """ Return True unless the record's gene name ends with '_RTE'
     """
     is_rte = extract_gene_name(record).endswith('_RTE')
     return not is_rte
Exemple #7
0
 def apply(self, record):
     """ Return True unless the record's gene name starts with 'DDB_'
     """
     has_ddb_prefix = extract_gene_name(record).startswith('DDB_')
     return not has_ddb_prefix