Esempio n. 1
0
 def import_ncrna_org_regions(self):
     '''
     Build the SQL that creates and fills the temporary "bed" table.

     Reads genomereference/pipeline/data/<genome>/<genome>.bed under the
     directory returned by get_vespucci_path() and emits one INSERT per
     non-blank line. Lines are tab separated and used as:
     chrom, start, end, name, <unused>, strand.

     If the file cannot be opened, no INSERTs are emitted and only the
     CREATE TABLE statement is returned.

     Returns the CREATE TABLE statement followed by the INSERT statements
     as a single string.
     '''
     path_to_file = os.path.join(get_vespucci_path(),
                    'genomereference/pipeline/data/{0}/{0}.bed'.format(self.genome))
     
     output = []
     try: 
         f_bed = open(path_to_file)
     except IOError: 
         # No ncRNA regions for this genome. Silently skip.
         pass
     else:
         # Context manager: the handle is closed even if a malformed line
         # raises part-way through the file.
         with f_bed:
             for l in f_bed:
                 # Drop the line terminator so it cannot leak into the
                 # strand value when strand is the last column.
                 l = l.rstrip('\r\n')
                 if not l.strip():
                     # Blank lines carry no region.
                     continue
                 fields = l.split('\t')
                 # Example line:
                 #chr8    119597933       119597953       FR408228        1000    +
                 output.append("""
                 INSERT into "{schema_name}"."bed" 
                     ("name","chrom","strand","start","end") 
                     VALUES ('{0}', '{1}', '{2}', {3}, {4});
                 """.format(fields[3], fields[0], fields[5], fields[1], fields[2],
                            schema_name=self.schema_name))
     
     s = """
     CREATE TABLE "{schema_name}"."bed" (
         "name" varchar(50) NOT NULL DEFAULT NULL,
         "chrom" varchar(25) NOT NULL DEFAULT NULL,
         "strand" varchar(1) NOT NULL DEFAULT NULL,
         "start" int8 NOT NULL DEFAULT NULL,
         "end" int8 NOT NULL DEFAULT NULL
     );
     """.format(schema_name=self.schema_name) \
     + '\n'.join(output)
     
     return s
Esempio n. 2
0
 def insert_chromosome_values(self):
     '''
     Build the INSERT statements for the "chromosome" table.

     Pulls from chromInfo.txt file, which is the UCSC
     list at http://hgdownload.soe.ucsc.edu/goldenPath/mm9/database/chromInfo.txt.gz

     Only rows whose first column matches current_settings.CHR_MATCH are
     kept. A kept row with a missing or empty size column is inserted
     with a NULL length.

     Returns the INSERT statements joined by newlines (an empty string
     when no row matches).
     '''
     table_name = 'chromosome'
     path_to_file = os.path.join(get_vespucci_path(),
                    'genomereference/pipeline/data/{0}/chromInfo.txt'.format(self.genome))
     output = []
     # Context manager: the handle is closed even if a row fails to parse.
     with open(path_to_file) as f:
         for l in f:
             fields = l.rstrip('\r\n').split('\t')

             # Only include non-random chromosomes, and those without weird suffixes:
             if not re.match(current_settings.CHR_MATCH, fields[0]):
                 continue

             # We want length inclusive, not last basepair,
             # so add 1 to the value from UCSC, which is last basepair.
             # A missing size is kept as None so that it is written as NULL
             # below instead of raising inside int().
             size = fields[1].strip() if len(fields) > 1 else ''
             length = int(size) + 1 if size else None
             output.append("""
                 INSERT INTO "{schema_name}"."{table_name}" (name, length) 
                     VALUES ('{0}', {1});
                 """.format(fields[0], length or 'NULL',
                            schema_name=self.schema_name, table_name=table_name))
     return '\n'.join(output)
Esempio n. 3
0
 def import_ucsc_sequence_values(self):
     '''
     Create a temp table to be normalized and associated appropriately.
     
     File downloaded at: 
     http://hgdownload.soe.ucsc.edu/goldenPath/mm9/database/refGene.txt.gz
     
     This is hardcoded to work with the UCSC download as it is. 
     More flexible import logic can be created here.

     Each non-empty, tab-separated line contributes one INSERT built from
     its columns 1-5, used as name, chrom, strand, txStart, txEnd
     (column 0 is ignored).

     Returns the CREATE TABLE statement for "refGene" followed by the
     INSERT statements as a single string.
     '''
     path_to_file = os.path.join(get_vespucci_path(),
                    'genomereference/pipeline/data/{0}/refGene.txt'.format(self.genome))
     output = []
     # Context manager: the handle is closed even if a row fails to parse.
     with open(path_to_file) as f:
         for l in f:
             l = l.rstrip('\r\n')
             if not l:
                 # An empty line (e.g. at the end of the file) holds no record.
                 continue
             fields = l.split('\t')
             output.append("""
             INSERT into "{schema_name}"."refGene" 
                 ("name","chrom","strand","txStart","txEnd") 
                 VALUES ('{0}', '{1}', '{2}', {3}, {4});
             """.format(fields[1], fields[2], fields[3], fields[4], fields[5],
                        schema_name=self.schema_name))
     
     return """
     CREATE TABLE "{schema_name}"."refGene" (
         "name" varchar(50) NOT NULL DEFAULT NULL,
         "chrom" varchar(25) NOT NULL DEFAULT NULL,
         "strand" varchar(1) NOT NULL DEFAULT NULL,
         "txStart" int8 NOT NULL DEFAULT NULL,
         "txEnd" int8 NOT NULL DEFAULT NULL
     );
     """.format(schema_name=self.schema_name) \
     + '\n'.join(output)
Esempio n. 4
0
 def import_ncrna_org_values(self):
     '''
     Create a temp table to be normalized and associated appropriately.
     
     File downloaded at: http://www.ncrna.org/frnadb/files/summary.zip
     and: http://www.ncrna.org/frnadb/files/mm9_bed.zip --> mm9_bed/mm9.bed
     
     Note that the dm3 file excludes the set of fly smallRNA, which is large.
     This can easily be added by downloading the file dm3.bed,
     but is not included in the codebase to avoid bloat.
     
     The hg19 file was derived from the fRNAdb hg18 file using UCSC's
     liftOver utility.
     
     This is hardcoded to work with the fRNAdb download as it is. 
     More flexible import logic can be created here.

     Only rows whose organism column contains the 'name' entry of
     current_settings.GENOME_CHOICES[self.genome] are kept. The header
     row, blank rows and rows without an ID are skipped. Every value is
     cut to 255 characters and has its apostrophes doubled for SQL.

     Returns the CREATE TABLE statement for "summary" followed by the
     INSERT statements as a single string.
     '''
     path_to_file = os.path.join(get_vespucci_path(),
                    'genomereference/pipeline/data/summary.csv')
     # Loop invariant: the name to look for is the same for every row.
     genome_name = current_settings.GENOME_CHOICES[self.genome]['name']
     output = []
     # Context manager: the handle is closed once the rows are consumed,
     # or if a row fails to parse.
     with open(path_to_file, 'rt') as f_csv:
         for fields in csv.reader(f_csv):
             # ID,acc,Description,SO name,Organism,Xref,Length
             # csv.reader yields an empty list for a blank line; skip that
             # together with the header row and rows that have no ID.
             if not fields or not fields[0] or fields[0] == 'ID':
                 continue

             if genome_name not in fields[4]:
                 continue

             # Shorten and clean up some apostrophes
             fields = [val[:255].replace("'", "''") for val in fields]
             output.append("""
             INSERT into "{schema_name}"."summary" 
                 ("ID","Description","SO name","Xref") 
                 VALUES ('{0}', '{1}', '{2}', '{3}');
             """.format(fields[0], fields[2], fields[3], fields[5],
                        schema_name=self.schema_name))
     
     s = """
     CREATE TABLE "{schema_name}"."summary" (
         "ID" varchar(50) NOT NULL DEFAULT NULL,
         "Description" varchar(255) NOT NULL DEFAULT NULL,
         "SO name" varchar(255) NOT NULL DEFAULT NULL,
         "Xref" varchar(255) NOT NULL DEFAULT NULL
     );
     """.format(schema_name=self.schema_name) \
     + '\n'.join(output)
     
     return s
Esempio n. 5
0
    # For each chromosome id, append SQL to q that:
    #   - copies the rows with that chromosome_id out of the table named by
    #     SequenceTranscriptionRegion._meta.db_table into a new table called
    #     "<db_table>_<chr_id>",
    #   - renames transcription_start / transcription_end to start / end,
    #   - adds a boolean "refseq" column defaulting to true.
    # NOTE(review): q and chr_ids are defined above this excerpt; q is
    # presumably an SQL string built earlier. Verify against the full function.
    for chr_id in chr_ids:
        q += """CREATE TABLE "{0}_{chr_id}" AS 
                    SELECT * FROM "{0}" WHERE chromosome_id = {chr_id};
                ALTER TABLE "{0}_{chr_id}" 
                    RENAME COLUMN "transcription_start" TO "start";
                ALTER TABLE "{0}_{chr_id}" 
                    RENAME COLUMN "transcription_end" TO "end";
                ALTER TABLE "{0}_{chr_id}" 
                    ADD COLUMN "refseq" bool DEFAULT true;
                    """.format(SequenceTranscriptionRegion._meta.db_table,
                               chr_id=chr_id)
    # Hand the accumulated SQL to execute_query in a single call.
    execute_query(q)

    try:
        print('Adding data...')
        # Directory that holds the shell scripts invoked below.
        path = os.path.join(get_vespucci_path(), 'atlas/pipeline/scripts')
        # Run set_up_database.sh with --prep for the configured genome and
        # print whatever the script wrote to stdout.
        # NOTE(review): both commands are built by string formatting and run
        # with shell=True; current_settings.GENOME is presumably trusted
        # configuration. Confirm before exposing it to external input.
        print(subprocess.check_output(path +
                                      '/set_up_database.sh -g {0} -c refseq --prep'.format(
                                          current_settings.GENOME),
                                      shell=True))
        # Run transcripts_from_tags.sh against the genome_reference_<genome>
        # schema, using sequence_transcription_region as the tag table, with
        # the stitching and density flags given below, and print its stdout.
        print(subprocess.check_output(
            path + '/transcripts_from_tags.sh -g {0} -c refseq '.format(
                current_settings.GENOME)
            + ' --schema_name=genome_reference_{0} '.format(
                current_settings.GENOME)
            + ' --tag_table=sequence_transcription_region '
            + ' --stitch --stitch_processes=2 --set_density '
            + ' --no_extended_gaps', shell=True))

    except Exception as e:
        # Any failure in the block above, including the CalledProcessError
        # that check_output raises on a non-zero exit status, is only
        # printed; execution continues past this point.
        print(e)