Ejemplo n.º 1
0
    def get_mirna_precursors(self):
        """
        Get miRNA precursors from miRBase for mature miRNAs.
        """
        sql = """
		SELECT
		    t1.upi as precursor, t4.upi as mature, t1.taxid
		FROM xref t1
		INNER JOIN rnc_accessions t2
		ON (t1.ac = t2.accession)
		INNER JOIN rnc_accessions t3
		ON (t2.external_id = t3.external_id)
		INNER JOIN xref t4
		ON (t3.accession = t4.ac)
		WHERE
            t1.dbid = 4
            AND t1.deleted = 'N'
            AND t2.accession != t3.accession
            AND t2.feature_name = 'precursor_RNA'
            AND t3.feature_name != 'precursor_RNA'
            AND t4.dbid = t1.dbid
            AND t1.upi != t4.upi
            AND t1.taxid = t4.taxid
        """
        self.logger.info('Looking for miRBase miRNA precursors')
        with cursor() as cur:
            cur.execute(sql)
            for result in cur:
                upi_taxid = '%s_%i' % (result['mature'], result['taxid'])
                self.precursor_rna[upi_taxid].append(result['precursor'])
        count = len(self.precursor_rna)
        self.logger.info('Found %i mature RNAs with precursors', count)
Ejemplo n.º 2
0
 def write_gpi_file(self):
     """
     Write the GPI file to `self.filepath`.

     Selects active rows of `rnc_rna_precomputed` that have a taxid, an RNA
     type and a description, writes the GPI version header followed by one
     formatted line per row, and then runs the unique-id check, the taxid
     check and `self.gzip_file` on the written file.

     In test mode only miRNA rows are queried and the loop stops once the
     row index exceeds `self.test_entries`.
     """
     query = """
     SELECT upi, taxid, description, rna_type
     FROM rnc_rna_precomputed
     WHERE taxid IS NOT NULL
     AND rna_type IS NOT NULL
     AND description IS NOT NULL
     AND is_active = true
     {test}
     """
     # extra WHERE clause that narrows the query in test mode
     extra_filter = "AND rna_type='miRNA'" if self.test else ''
     with cursor() as cur:
         cur.execute(query.format(test=extra_filter))
         with open(self.filepath, 'w') as gpi:
             gpi.write('!gpi-version: 1.2\n')
             for index, row in enumerate(cur):
                 gpi.write(self.format_gpi_line(row))
                 # the limit is checked after the row has been written
                 if self.test and index > self.test_entries:
                     break
     assert os.path.exists(self.filepath)
     self.test_unique_ids(self.filepath)
     self.test_none_taxids(self.filepath)
     self.gzip_file(self.filepath)
Ejemplo n.º 3
0
    def get_mirna_precursors(self):
        """
        Get miRNA precursors from miRBase for mature miRNAs.
        """
        sql = """
		SELECT
		    t1.upi as precursor, t4.upi as mature, t1.taxid
		FROM xref t1
		INNER JOIN rnc_accessions t2
		ON (t1.ac = t2.accession)
		INNER JOIN rnc_accessions t3
		ON (t2.external_id = t3.external_id)
		INNER JOIN xref t4
		ON (t3.accession = t4.ac)
		WHERE
            t1.dbid = 4
            AND t1.deleted = 'N'
            AND t2.accession != t3.accession
            AND t2.feature_name = 'precursor_RNA'
            AND t3.feature_name != 'precursor_RNA'
            AND t4.dbid = t1.dbid
            AND t1.upi != t4.upi
            AND t1.taxid = t4.taxid
        """
        self.logger.info('Looking for miRBase miRNA precursors')
        with cursor() as cur:
            cur.execute(sql)
            for result in cur:
                upi_taxid = '%s_%i' % (result['mature'], result['taxid'])
                self.precursor_rna[upi_taxid].append(result['precursor'])
        count = len(self.precursor_rna)
        self.logger.info('Found %i mature RNAs with precursors', count)
Ejemplo n.º 4
0
 def write_gpi_file(self):
     """
     Write the GPI file to `self.filepath`.

     Selects rows of `rnc_rna_precomputed` that have a taxid, an RNA type
     and a description, writes the GPI version header followed by one
     formatted line per row, and then runs the unique-id check, the taxid
     check and `self.gzip_file` on the written file.

     In test mode only miRNA rows are queried and the loop stops once the
     row index exceeds `self.test_entries`.
     """
     query = """
     SELECT upi, taxid, description, rna_type
     FROM rnc_rna_precomputed
     WHERE taxid IS NOT NULL
     AND rna_type IS NOT NULL
     AND description IS NOT NULL
     {test}
     """
     # extra WHERE clause that narrows the query in test mode
     extra_filter = "AND rna_type='miRNA'" if self.test else ''
     with cursor() as cur:
         cur.execute(query.format(test=extra_filter))
         with open(self.filepath, 'w') as gpi:
             gpi.write('!gpi-version: 1.2\n')
             for index, row in enumerate(cur):
                 gpi.write(self.format_gpi_line(row))
                 # the limit is checked after the row has been written
                 if self.test and index > self.test_entries:
                     break
     assert os.path.exists(self.filepath)
     self.test_unique_ids(self.filepath)
     self.test_none_taxids(self.filepath)
     self.gzip_file(self.filepath)
Ejemplo n.º 5
0
        def process_xref_entries():
            """
            Write output for each xref.

            Runs the query returned by `self.get_xrefs_sql()` and writes one
            tab-separated line per row to `self.filehandles['xrefs']`. A
            second line carrying the row's `optional_id` is written when the
            database is listed under `accession_source['optional_id']`. For
            the first `self.examples` rows the last line produced for the row
            is also written to `self.filehandles['example']`.

            In test mode the loop stops once the row index exceeds
            `self.test_entries`.
            """
            def format_output(accession):
                """
                Format the current row and `accession` into one output line.

                Reads upi, database, taxid, gene and rna_type from the
                enclosing loop iteration.
                """
                template = '{upi}\t{database}\t{accession}\t{taxid}\t{rna_type}\t{gene}\n'
                return template.format(upi=upi,
                                       database=database,
                                       accession=accession,
                                       taxid=taxid,
                                       gene=gene,
                                       rna_type=rna_type)

            with cursor() as cur:
                cur.execute(self.get_xrefs_sql())
                for counter, result in enumerate(cur):
                    if self.test and counter > self.test_entries:
                        break
                    accession_source = get_accession_source()
                    upi = result['upi']
                    database = result['descr']
                    taxid = result['taxid']
                    gene = result['gene'] or ''
                    # ncRNA rows use ncrna_class as the RNA type; anything
                    # missing falls back to the generic 'RNA'
                    if result['feature_name'] == 'ncRNA':
                        rna_type = result['ncrna_class'] or 'RNA'
                    else:
                        rna_type = result['feature_name'] or 'RNA'
                    # tabs inside the gene name would break the column layout
                    gene = gene.replace('\t', '')

                    # use PDB instead of PDBE because PDB ids come from wwPDB
                    if database == 'PDBE':
                        database = 'PDB'

                    if database in accession_source['xref']:
                        line = format_output(result['accession'])
                    else:
                        line = format_output(result['external_id'])
                    self.filehandles['xrefs'].write(line)
                    # write out optional ids, if necessary
                    if database in accession_source['optional_id']:
                        line = format_output(result['optional_id'])
                        self.filehandles['xrefs'].write(line)
                    # `line` is the last line written for this row, i.e. the
                    # optional-id line when one was produced
                    if counter < self.examples:
                        self.filehandles['example'].write(line)
Ejemplo n.º 6
0
        def process_xref_entries():
            """
            Write output for each xref.

            Runs the query returned by `self.get_xrefs_sql()` and writes one
            tab-separated line per row to `self.filehandles['xrefs']`. A
            second line carrying the row's `optional_id` is written when the
            database is listed under `accession_source['optional_id']`. For
            the first `self.examples` rows the last line produced for the row
            is also written to `self.filehandles['example']`.

            In test mode the loop stops once the row index exceeds
            `self.test_entries`.
            """
            def format_output(accession):
                """
                Format the current row and `accession` into one output line.

                Reads upi, database, taxid, gene and rna_type from the
                enclosing loop iteration.
                """
                template = '{upi}\t{database}\t{accession}\t{taxid}\t{rna_type}\t{gene}\n'
                return template.format(upi=upi,
                                       database=database,
                                       accession=accession,
                                       taxid=taxid,
                                       gene=gene,
                                       rna_type=rna_type)

            with cursor() as cur:
                cur.execute(self.get_xrefs_sql())
                for counter, result in enumerate(cur):
                    if self.test and counter > self.test_entries:
                        break
                    accession_source = get_accession_source()
                    upi = result['upi']
                    database = result['descr']
                    taxid = result['taxid']
                    gene = result['gene'] or ''
                    # ncRNA rows use ncrna_class as the RNA type; anything
                    # missing falls back to the generic 'RNA'
                    if result['feature_name'] == 'ncRNA':
                        rna_type = result['ncrna_class'] or 'RNA'
                    else:
                        rna_type = result['feature_name'] or 'RNA'
                    # tabs inside the gene name would break the column layout
                    gene = gene.replace('\t', '')

                    # use PDB instead of PDBE because PDB ids come from wwPDB
                    if database == 'PDBE':
                        database = 'PDB'

                    if database in accession_source['xref']:
                        line = format_output(result['accession'])
                    else:
                        line = format_output(result['external_id'])
                    self.filehandles['xrefs'].write(line)
                    # write out optional ids, if necessary
                    if database in accession_source['optional_id']:
                        line = format_output(result['optional_id'])
                        self.filehandles['xrefs'].write(line)
                    # `line` is the last line written for this row, i.e. the
                    # optional-id line when one was produced
                    if counter < self.examples:
                        self.filehandles['example'].write(line)
Ejemplo n.º 7
0
 def export_md5(self):
     """
     Write one `upi<TAB>md5` line per row of the distinct-md5 query.

     Every line goes to `self.filehandles['md5']`; lines for the first
     `self.examples` rows are also written to
     `self.filehandles['md5_example']`. In test mode the loop stops once
     the row index exceeds `self.test_entries`.

     Any exception is passed to `self.log_database_error` and the process
     exits with status 1.
     """
     template = '{upi}\t{md5}\n'
     try:
         with cursor() as cur:
             cur.execute(self.get_distinct_md5_sql())
             for index, row in enumerate(cur):
                 if self.test and index > self.test_entries:
                     break
                 line = template.format(upi=row['upi'], md5=row['md5'])
                 self.filehandles['md5'].write(line)
                 if index < self.examples:
                     self.filehandles['md5_example'].write(line)
     except Exception as exc:
         self.log_database_error(exc)
         sys.exit(1)
Ejemplo n.º 8
0
 def export_md5(self):
     """
     Write one `upi<TAB>md5` line per row of the distinct-md5 query.

     Every line goes to `self.filehandles['md5']`; lines for the first
     `self.examples` rows are also written to
     `self.filehandles['md5_example']`. In test mode the loop stops once
     the row index exceeds `self.test_entries`.

     Any exception is passed to `self.log_database_error` and the process
     exits with status 1.
     """
     template = '{upi}\t{md5}\n'
     try:
         with cursor() as cur:
             cur.execute(self.get_distinct_md5_sql())
             for index, row in enumerate(cur):
                 if self.test and index > self.test_entries:
                     break
                 line = template.format(upi=row['upi'], md5=row['md5'])
                 self.filehandles['md5'].write(line)
                 if index < self.examples:
                     self.filehandles['md5_example'].write(line)
     except Exception as exc:
         self.log_database_error(exc)
         sys.exit(1)
Ejemplo n.º 9
0
 def export_active_sequences(self):
     """
     Export RNA sequences with active cross-references.

     For each distinct consecutive UPI returned by the active-sequences
     query, the FASTA record is written to the `seq_active` file, to the
     `seq_example` file while the row index is below `self.examples`, and
     to either `nhmmer_db` or `nhmmer_db_excluded` depending on whether
     the sequence consists only of the accepted nucleotide letters. One
     species-specific record per taxid of the non-deleted xrefs is written
     to the `species_specific` file.

     In test mode the method returns once the row index reaches
     `self.test_entries`. A `psycopg2.Error` is logged and ends the
     process with status 1.
     """
     allowed_letters = re.compile('^[ABCDGHKMNRSTVWXYU]+$', re.IGNORECASE)
     species_template = ">{upi}_{taxid} {description}\n{sequence}"
     last_upi = ''
     try:
         with cursor() as cur:
             cur.execute(self.get_active_sequences_sql())
             for index, row in enumerate(cur):
                 if self.test and index >= self.test_entries:
                     return
                 upi = row['upi']
                 # consecutive rows with the same UPI are exported once
                 if upi == last_upi:
                     continue
                 last_upi = upi

                 rna = Rna(upi=upi,
                           seq_short=row['seq_short'],
                           seq_long=row['seq_long'])
                 fasta = rna.get_sequence_fasta()
                 self.filehandles['seq_active'].write(fasta)
                 if index < self.examples:
                     self.filehandles['seq_example'].write(fasta)

                 if allowed_letters.match(rna.get_sequence()):
                     nhmmer_target = 'nhmmer_db'
                 else:
                     nhmmer_target = 'nhmmer_db_excluded'
                 self.filehandles[nhmmer_target].write(fasta)

                 # species specific identifiers: reuse the FASTA body
                 # without its header line
                 sequence = re.sub(r'^>.+?\n', '', fasta)
                 active_xrefs = rna.xrefs.filter(deleted='N')
                 taxids = set(active_xrefs.values_list('taxid', flat=True))
                 for taxid in taxids:
                     description = rna.get_description(taxid=taxid)
                     record = species_template.format(
                         upi=upi,
                         taxid=taxid,
                         sequence=sequence,
                         description=description)
                     self.filehandles['species_specific'].write(record)
     except psycopg2.Error as exc:
         self.log_database_error(exc)
         sys.exit(1)
Ejemplo n.º 10
0
 def export_active_sequences(self):
     """
     Export RNA sequences with active cross-references.

     For each distinct consecutive UPI returned by the active-sequences
     query, the FASTA record is written to the `seq_active` file, to the
     `seq_example` file while the row index is below `self.examples`, and
     to either `nhmmer_db` or `nhmmer_db_excluded` depending on whether
     the sequence consists only of the accepted nucleotide letters. One
     species-specific record per taxid of the non-deleted xrefs is written
     to the `species_specific` file.

     In test mode the method returns once the row index reaches
     `self.test_entries`. A `psycopg2.Error` is logged and ends the
     process with status 1.
     """
     allowed_letters = re.compile('^[ABCDGHKMNRSTVWXYU]+$', re.IGNORECASE)
     species_template = ">{upi}_{taxid} {description}\n{sequence}"
     last_upi = ''
     try:
         with cursor() as cur:
             cur.execute(self.get_active_sequences_sql())
             for index, row in enumerate(cur):
                 if self.test and index >= self.test_entries:
                     return
                 upi = row['upi']
                 # consecutive rows with the same UPI are exported once
                 if upi == last_upi:
                     continue
                 last_upi = upi

                 rna = Rna(upi=upi,
                           seq_short=row['seq_short'],
                           seq_long=row['seq_long'])
                 fasta = rna.get_sequence_fasta()
                 self.filehandles['seq_active'].write(fasta)
                 if index < self.examples:
                     self.filehandles['seq_example'].write(fasta)

                 if allowed_letters.match(rna.get_sequence()):
                     nhmmer_target = 'nhmmer_db'
                 else:
                     nhmmer_target = 'nhmmer_db_excluded'
                 self.filehandles[nhmmer_target].write(fasta)

                 # species specific identifiers: reuse the FASTA body
                 # without its header line
                 sequence = re.sub(r'^>.+?\n', '', fasta)
                 active_xrefs = rna.xrefs.filter(deleted='N')
                 taxids = set(active_xrefs.values_list('taxid', flat=True))
                 for taxid in taxids:
                     description = rna.get_description(taxid=taxid)
                     record = species_template.format(
                         upi=upi,
                         taxid=taxid,
                         sequence=sequence,
                         description=description)
                     self.filehandles['species_specific'].write(record)
     except psycopg2.Error as exc:
         self.log_database_error(exc)
         sys.exit(1)
Ejemplo n.º 11
0
 def export_inactive_sequences(self):
     """
     Export RNA sequences without active cross-references.

     Writes the FASTA record of each distinct consecutive UPI returned by
     the inactive-sequences query to `self.filehandles['seq_inactive']`.
     In test mode the method returns once the row index exceeds
     `self.test_entries`. A `psycopg2.Error` is logged and ends the
     process with status 1.
     """
     last_upi = ''
     try:
         with cursor() as cur:
             cur.execute(self.get_inactive_sequences_sql())
             for index, row in enumerate(cur):
                 if self.test and index > self.test_entries:
                     return
                 upi = row['upi']
                 # consecutive rows with the same UPI are exported once
                 if upi == last_upi:
                     continue
                 last_upi = upi
                 rna = Rna(upi=upi,
                           seq_short=row['seq_short'],
                           seq_long=row['seq_long'])
                 self.filehandles['seq_inactive'].write(
                     rna.get_sequence_fasta())
     except psycopg2.Error as exc:
         self.log_database_error(exc)
         sys.exit(1)
Ejemplo n.º 12
0
 def export_inactive_sequences(self):
     """
     Export RNA sequences without active cross-references.

     Writes the FASTA record of each distinct consecutive UPI returned by
     the inactive-sequences query to `self.filehandles['seq_inactive']`.
     In test mode the method returns once the row index exceeds
     `self.test_entries`. A `psycopg2.Error` is logged and ends the
     process with status 1.
     """
     last_upi = ''
     try:
         with cursor() as cur:
             cur.execute(self.get_inactive_sequences_sql())
             for index, row in enumerate(cur):
                 if self.test and index > self.test_entries:
                     return
                 upi = row['upi']
                 # consecutive rows with the same UPI are exported once
                 if upi == last_upi:
                     continue
                 last_upi = upi
                 rna = Rna(upi=upi,
                           seq_short=row['seq_short'],
                           seq_long=row['seq_long'])
                 self.filehandles['seq_inactive'].write(
                     rna.get_sequence_fasta())
     except psycopg2.Error as exc:
         self.log_database_error(exc)
         sys.exit(1)