def main():

    special = None
    no_threads = 1
    db = connect_to_mysql()

    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)
    total = 0
    for species in all_species:
        print species
        switch_to_db(cursor, ensembl_db_name[species])
        qry = "select count(1) from usearch_exon"
        rows = search_db(cursor, qry)
        count = int(rows[0][0])
        print "\t usearch exons: ", count
        total += count
        qry = "select count(1) from sw_exon"
        rows = search_db(cursor, qry)
        count = int(rows[0][0])
        print "\t sw exons: ", count
        total += count
    print
    print 'total: ', total
    cursor.close()
    db.close()
def main():
    
    special    = None
    no_threads = 1
    db  = connect_to_mysql()

    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species (cursor)
    total = 0
    for species in all_species:
        print species
        switch_to_db (cursor, ensembl_db_name[species])
        qry  = "select count(1) from usearch_exon"
        rows = search_db (cursor, qry)
        count = int(rows[0][0])
        print "\t usearch exons: ", count 
        total += count
        qry  = "select count(1) from sw_exon"
        rows = search_db (cursor, qry)
        count = int(rows[0][0])
        print "\t sw exons: ", count 
        total += count
    print
    print 'total: ', total
    cursor.close()
    db.close()
def  make_parameter_table (cursor):

    """
    Creates parameter table in the config database.
    @param [cursor] db cursor, assumed top be pointing to the config database
    @retval True  on success
    @retval False on failure;  in that case the seach_db() call is repeated in verbose mode.
    """

    table = 'parameter'

    print "making ", table

    qry  = "create table " + table + "  (id int(10) primary key auto_increment)"
    rows = search_db (cursor, qry, verbose=True)
    if (rows):
        return False

    # make the columns
    for column  in  ['name', 'value']:
        qry = "alter table  %s  add  %s  varchar (50)" % (table, column)
        rows = search_db (cursor, qry, verbose=True)
        if (rows):
            return False
    return False
Esempio n. 4
0
def main():

    db = connect_to_mysql()
    cr = ConfigurationReader()

    cursor = db.cursor()
    fasta_path = cr.get_path('ensembl_fasta')

    [all_species, ensembl_db_name] = get_species (cursor)

    for species in all_species:
    #for species in ['danio_rerio']:
        print species
        dna_path = "{0}/{1}/dna".format(fasta_path, species)
        if (not os.path.exists(dna_path)):
            print "problem:", dna_path, "not found"
            exit(1)

        fasta_files = []
        for r,d,files in os.walk(dna_path):
            for file in files:
                if (not file[-3:] == ".fa"):
                    continue
                fasta_files.append(file)
        
        name2file = {}
        for file in fasta_files:
            print dna_path, file
            cmd = "grep '>' {0}/{1}".format(dna_path, file)
            ret = commands.getoutput(cmd)
            headers = ret.split("\n")
            print "number of headers: ", len(headers)
            for hdr in headers:
                fields = hdr.split(" ")
                name = fields[0].replace (">", "")
                #print name
                if (not name2file.has_key(name)):
                    name2file[name] = []
                name2file[name].append(file)

        qry = "use "+ensembl_db_name[species]
        search_db (cursor, qry)

        for name in name2file.keys():
            file_names = ""
            for file in  name2file[name]:
                if file_names:
                    file_names += " "
                file_names += file
            store_seq_filenames (cursor, name, file_names)
 
    cursor.close()
    db    .close()
Esempio n. 5
0
def check_table_sizes(cursor, all_species, ensembl_db_name):
    for species in all_species:
        print
        print "##########################"
        print species
        qry = "use " + ensembl_db_name[species]
        search_db(cursor, qry)
        qry = "show tables"
        rows = search_db(cursor, qry)
        for row in rows:
            table = row[0]
            qry = " select count(1) from " + table
            rows = search_db(cursor, qry)
            table_size = rows[0][0]
            print "\t ", table, table_size
Esempio n. 6
0
def check_table_sizes (cursor, all_species, ensembl_db_name):
    for species in all_species:
        print
        print "##########################"
        print species
        qry  = "use "+ensembl_db_name[species]
        search_db(cursor, qry)
        qry  = "show tables"
        rows = search_db(cursor, qry)
        for row in rows:
            table = row[0]
            qry = " select count(1) from "+table
            rows = search_db(cursor, qry)
            table_size = rows[0][0]
            print "\t ", table, table_size
def main():

    db     = connect_to_mysql()
    cursor = db.cursor()

    [all_species, ensembl_db_name] = get_species (cursor)

    for species in all_species:
        print species

        switch_to_db (cursor, ensembl_db_name[species])

        qry  = "select seq_region.name, seq_region.file_name from seq_region, gene "
        qry += " where gene.biotype='protein_coding' and gene.seq_region_id =  seq_region.seq_region_id "
            

        rows = search_db (cursor, qry)
        if (not rows):
            print "\t no seq region info found "
            continue
        tot = 0
        no_file = 0
        for row in rows:
            [name,  file_name] = row
            #print name, file_name
            tot += 1
            if (not file_name):
                no_file += 1
                print name, file_name
                #exit (1)

        print "\t tot seq_regions: ", tot, " no file: ", no_file
 
    cursor.close()
    db    .close()
def main ():

    
    db_name = "exolocator_db"
    db      = connect_to_mysql(user="******", passwd="tooiram")
    cursor  = db.cursor()
    switch_to_db (cursor, db_name)

    cfg     = ConfigurationReader (user="******", passwd="tooiram", check=False)

    inpath = cfg.get_path('afs_dumps')
    indir   = "%s/exon_map"     % inpath
    infile  = "%s/exon_map.sql" % indir
    if (not os.path.exists(infile)):
        print "not found: ", infile
        sys.exit(1)
    print "reading", infile

    qry = "drop table exon_map"
    rows = search_db(cursor, qry)
    # I could not get this to run, though it runs fine directly from the mysql shell:
    #qry = "source %s" % infile
    #rows = search_db(cursor, qry, verbose=True)
    cursor.close()
    db.close()

    credentials = " -u marioot -ptooiram"
    cmd = "mysql %s  exolocator_db  <  %s" % (credentials, infile)
    print cmd
    ret = commands.getoutput(cmd)
    print ret

 
    return True
def get_seq_region_info(cursor, name):
    qry = "select * from seq_region where name = '%s'" % name
    rows = search_db (cursor, qry)
    if(len(rows) > 1):
        print "more than one entry associated with ", name
        exit (1)
    return rows[0]
Esempio n. 10
0
def main():

    db_name = "exolocator_db"
    db = connect_to_mysql(user="******", passwd="tooiram")
    cursor = db.cursor()
    switch_to_db(cursor, db_name)

    cfg = ConfigurationReader(user="******", passwd="tooiram", check=False)

    inpath = cfg.get_path('afs_dumps')
    indir = "%s/exon_map" % inpath
    infile = "%s/exon_map.sql" % indir
    if (not os.path.exists(infile)):
        print "not found: ", infile
        sys.exit(1)
    print "reading", infile

    qry = "drop table exon_map"
    rows = search_db(cursor, qry)
    # I could not get this to run, though it runs fine directly from the mysql shell:
    #qry = "source %s" % infile
    #rows = search_db(cursor, qry, verbose=True)
    cursor.close()
    db.close()

    credentials = " -u marioot -ptooiram"
    cmd = "mysql %s  exolocator_db  <  %s" % (credentials, infile)
    print cmd
    ret = commands.getoutput(cmd)
    print ret

    return True
def search_description(cursor, gene_name):
    """Find the first gene whose description contains gene_name.

    Returns the first row (gene_id, description) of the result, or
    ["", ""] when the query returns nothing.
    """
    like_pattern = "%" + gene_name + "%"
    query = ("select gene_id, description from gene "
             "where description like '" + like_pattern + "'")
    hits = search_db(cursor, query)
    return hits[0] if hits else ["", ""]
def search_description (cursor, gene_name):
    """Find the first gene whose description contains gene_name.

    Returns the first row (gene_id, description) of the result, or
    ["", ""] when the query returns nothing.
    """
    like_pattern = "%" + gene_name + "%"
    query = ("select gene_id, description from gene "
             "where description like '" + like_pattern + "'")
    hits = search_db (cursor, query)
    return hits[0] if hits else ["", ""]
def get_exon_end(cursor, exon_id):
 
    qry  = "select seq_region_end from exon "
    qry += "where exon_id = %d " % exon_id
    rows = search_db (cursor, qry)
    if (not rows or 'Error' in rows[0]):
        print "start not found for ", exon_id
        return None

    return rows[0][0]
Esempio n. 14
0
def get_phase(cursor, exon_id):
    """Look up an exon in gene2exon.

    Returns [is_coding, phase, gene_id] from the first matching row, or
    [0, 0, 0] when the query returns nothing.
    """
    query = "select is_coding, phase, gene_id from gene2exon where exon_id = %d" % exon_id
    hits = search_db(cursor, query)
    if not hits:
        return [0, 0, 0]

    is_coding, phase, gene_id = hits[0]
    return [is_coding, phase, gene_id]
def get_exon_end(cursor, exon_id):

    qry = "select seq_region_end from exon "
    qry += "where exon_id = %d " % exon_id
    rows = search_db(cursor, qry)
    if (not rows or 'Error' in rows[0]):
        print "start not found for ", exon_id
        return None

    return rows[0][0]
def  cleanup_endphase (cursor, exon):
    """Overwrite exon.phase and exon.end_phase with the values in the exon table.

    Both attributes are set to 0 when the query for exon.exon_id returns
    nothing.  The exon object is modified in place; nothing is returned.
    """
    query = "select phase, end_phase from exon where exon_id =  %d " % exon.exon_id
    hits = search_db (cursor, query)
    if hits:
        first = hits[0]
        exon.phase = first[0]
        exon.end_phase = first[1]
    else:
        exon.phase = 0
        exon.end_phase = 0
Esempio n. 17
0
def  make_path_table (cursor, table):

    print "making ", table

    qry  = "create table " + table + "  (id int(10) primary key auto_increment)"
    rows = search_db (cursor, qry, verbose=True)
    if (rows):
        return False

    # make the columns
    column  = 'name'
    qry = "alter table  %s  add  %s  varchar (20)" % (table, column)
    rows = search_db (cursor, qry, verbose=True)
    if (rows):
        return False

    column  = 'path'
    qry = "alter table  %s  add  %s  blob" % (table, column)
    rows = search_db (cursor, qry, verbose=True)
    if (rows):
        return False
def transcript_id2exon_ids (cursor, transcript_id):
    """Return the list of exon ids that exon_transcript holds for a transcript.

    An empty list is returned when the query returns nothing.
    """
    query = ("select exon_id from exon_transcript "
             " where transcript_id = %d ") % transcript_id
    hits = search_db (cursor, query)
    return [hit[0] for hit in hits] if hits else []
def gene_name2gene_id(cursor, gene_name):
    """Translate a gene synonym into an ensembl_id.

    Joins object_xref and external_synonym on xref_id, restricted to
    objects of type 'Gene'.  Returns the first column of the first row,
    or "" when the query returns nothing.
    """
    query = ("select ensembl_id from object_xref, external_synonym "
             "where object_xref.ensembl_object_type = 'Gene'  "
             "and object_xref.xref_id= external_synonym.xref_id "
             "and external_synonym.synonym = '%s' "
             "group by synonym") % gene_name
    hits = search_db(cursor, query)
    return hits[0][0] if hits else ""
def  map_cleanup (cursor, ensembl_db_name, human_exons):
    """Delete selected exon_map rows for the given human exons.

    Works in the homo_sapiens database.  For each exon, the rows matching
    its exon_id and is_known flag that have cognate_exon_known > 1 and a
    NULL similarity are deleted.  Always returns True.
    """
    delete_template = ("delete from exon_map where exon_id = %d "
                       " and exon_known = %d "
                       " and cognate_exon_known > 1 "
                       " and similarity is NULL")

    switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
    for exon in human_exons:
        search_db (cursor, delete_template % (exon.exon_id, exon.is_known),
                   verbose=False)

    return True
def gene_name2gene_id(cursor, gene_name):
    """Translate a gene synonym into an ensembl_id.

    Joins object_xref and external_synonym on xref_id, restricted to
    objects of type 'Gene'.  Returns the first column of the first row,
    or "" when the query returns nothing.
    """
    query = ("select ensembl_id from object_xref, external_synonym "
             "where object_xref.ensembl_object_type = 'Gene'  "
             "and object_xref.xref_id= external_synonym.xref_id "
             "and external_synonym.synonym = '%s' "
             "group by synonym") % gene_name
    hits = search_db (cursor, query)
    return hits[0][0] if hits else ""
def transcript_id2exon_ids(cursor, transcript_id):
    """Return the list of exon ids that exon_transcript holds for a transcript.

    An empty list is returned when the query returns nothing.
    """
    query = ("select exon_id from exon_transcript "
             " where transcript_id = %d ") % transcript_id
    hits = search_db(cursor, query)
    return [hit[0] for hit in hits] if hits else []
Esempio n. 23
0
def  make_seqregion2file_table (cursor):
    """
    Creates the 'seqregion2file' table.

    The table gets an integer primary key 'seqregion_id', a varchar(100)
    column 'seq_name' and a blob column 'file_name'.

    @param [cursor] db cursor
    @retval True  on success (previously the function fell off the end and
                  returned None, which callers could not tell from failure)
    @retval False as soon as search_db() returns a non-empty result
    """

    table = 'seqregion2file'

    qry = "create table " + table + "  (seqregion_id int(10) primary key)"
    if search_db (cursor, qry):
        return False

    # make the columns
    for column, column_type in [('seq_name', 'varchar (100)'),
                                ('file_name', 'blob')]:
        qry = "alter table  %s  add  %s  %s" % (table, column, column_type)
        if search_db (cursor, qry):
            return False

    return True
def main():

    db_name = "exolocator_db"
    db      = connect_to_mysql(user="******", passwd="tooiram")
    cursor  = db.cursor()
    switch_to_db (cursor, db_name)

    cfg      = ConfigurationReader (user="******", passwd="tooiram", check=False)
    in_path  = cfg.get_path('afs_dumps')
    in_path += "/para_dump"
    if (not os.path.exists(in_path)):
        print in_path, "not found"
        sys.exit(1) # exit on non-existent outdir

    
    ###############
    if 1:
        qry = "drop table paralog"
        search_db (cursor, qry)
        qry = "create table paralog (id int(10) primary key auto_increment) "
        search_db (cursor, qry)
        qry = "alter table paralog  ADD gene_id1 varchar(30) " 
        search_db (cursor, qry)
        qry = "alter table paralog  ADD gene_id2 varchar(30) " 
        search_db (cursor, qry)
        create_index (cursor, db_name,'gene_id_index', 'paralog', ['gene_id1', 'gene_id2'])
        

    ###############
    os.chdir(in_path)
    filenames = glob.glob("*_para_dump.txt")

    ###############
    for infile in filenames:
        print infile
        store(cursor, infile)

    cursor.close()
    db    .close()
Esempio n. 25
0
def main():

    db_name = "exolocator_db"
    db = connect_to_mysql(user="******", passwd="tooiram")
    cursor = db.cursor()
    switch_to_db(cursor, db_name)

    cfg = ConfigurationReader(user="******", passwd="tooiram", check=False)
    in_path = cfg.get_path('afs_dumps')
    in_path += "/para_dump"
    if (not os.path.exists(in_path)):
        print in_path, "not found"
        sys.exit(1)  # exit on non-existent outdir

    ###############
    if 1:
        qry = "drop table paralog"
        search_db(cursor, qry)
        qry = "create table paralog (id int(10) primary key auto_increment) "
        search_db(cursor, qry)
        qry = "alter table paralog  ADD gene_id1 varchar(30) "
        search_db(cursor, qry)
        qry = "alter table paralog  ADD gene_id2 varchar(30) "
        search_db(cursor, qry)
        create_index(cursor, db_name, 'gene_id_index', 'paralog',
                     ['gene_id1', 'gene_id2'])

    ###############
    os.chdir(in_path)
    filenames = glob.glob("*_para_dump.txt")

    ###############
    for infile in filenames:
        print infile
        store(cursor, infile)

    cursor.close()
    db.close()
def check_ccds(cursor, transcript_stable_id):
    """Return a CCDS hit name supporting the transcript, or "" if none.

    Looks through the dna_align_feature hit names attached to the
    transcript as supporting features; when several contain 'CCDS', the
    last one in the query result is returned.
    """
    query = ("select dna_align_feature.hit_name "
             "from dna_align_feature, transcript, transcript_supporting_feature "
             "   where dna_align_feature.dna_align_feature_id =  transcript_supporting_feature.feature_id "
             "   and transcript_supporting_feature.feature_type ='dna_align_feature' "
             "   and transcript_supporting_feature.transcript_id =transcript.transcript_id "
             "   and transcript.stable_id = '%s' ") % transcript_stable_id

    hits = search_db(cursor, query)
    if not hits:
        return ""

    ccds_names = [hit[0] for hit in hits if 'CCDS' in hit[0]]
    return ccds_names[-1] if ccds_names else ""
def dump_orthos (species_list, db_info):

    
    [local_db, ensembl_db_name] = db_info
    db     = connect_to_mysql()
    cfg    = ConfigurationReader()
    cursor = db.cursor()

     # find db ids adn common names for each species db
    [all_species, ensembl_db_name] = get_species (cursor)

    # in the afa headers use 'trivial' names for the species: cow, dog, pig, ...
    trivial_name   = translate_to_trivial(cursor, all_species)

    out_path = cfg.get_path('afs_dumps')
    outfile  = "{0}/orthologue_dump.txt".format(out_path)
    print outfile
    of       = erropen (outfile,"w")

    species  = 'homo_sapiens'
    switch_to_db (cursor,  ensembl_db_name[species])


    qry = "select * from orthologue"
    rows = search_db (cursor, qry)
    for row in rows:
        [pair_id, human_gene_id, cognate_gene_id, genome_db_id, source] =  row
        species = genome_db_id2species (cursor, genome_db_id)
        switch_to_db (cursor,  ensembl_db_name['homo_sapiens'])
        human_stable_id = gene2stable(cursor, human_gene_id)
        switch_to_db (cursor,  ensembl_db_name[species])
        cognate_stable_id = gene2stable(cursor, cognate_gene_id)
        print  >>of,  orthos_tabstring ([human_stable_id, cognate_stable_id, species, trivial_name[species]])


    of.close()
    
    cursor.close()
    db    .close()
def check_ccds (cursor, transcript_stable_id):
    """Return a CCDS hit name supporting the transcript, or "" if none.

    Looks through the dna_align_feature hit names attached to the
    transcript as supporting features; when several contain 'CCDS', the
    last one in the query result is returned.
    """
    query = ("select dna_align_feature.hit_name "
             "from dna_align_feature, transcript, transcript_supporting_feature "
             "   where dna_align_feature.dna_align_feature_id =  transcript_supporting_feature.feature_id "
             "   and transcript_supporting_feature.feature_type ='dna_align_feature' "
             "   and transcript_supporting_feature.transcript_id =transcript.transcript_id "
             "   and transcript.stable_id = '%s' ") % transcript_stable_id

    hits = search_db(cursor, query)
    if not hits:
        return ""

    ccds_names = [hit[0] for hit in hits if 'CCDS' in hit[0]]
    return ccds_names[-1] if ccds_names else ""
Esempio n. 29
0
def get_theme_ids(cursor, cfg, theme_name):
    resources = cfg.dir_path['resources']
    fnm = resources + '/' + theme_name+'.txt'
    if not os.path.exists(fnm):
        print fnm, "not found"
        exit(1)

    if not os.path.getsize(fnm) > 0:
        print fnm, "empty"
        exit(1)
        
    inf = erropen(fnm, "r")
    gene_ids = []
    for line in inf:
        line.rstrip()
        [stable_id, name] = line.split("\t")
        qry = "select gene_id, description from gene where stable_id='%s'"% stable_id
        rows = search_db (cursor, qry)
        if not rows: continue
        gene_ids.append(rows[0][0])
    inf.close()

    return gene_ids
def alt_splice_almt (cursor, cfg, acg, species, ensembl_db_name):

    flank_length = 10

    print "############################"
    print 'checking alt splicing in ', species

    qry = "use " + ensembl_db_name[species]
    search_db(cursor, qry)
    gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1)

    if species == 'homo_sapiens':
        spec_short = 'HSA'
    else:
        spec_short = 'MMU'

    outdir   = "{0}/alt/{1}".format(cfg.dir_path['afs_dumps'], spec_short)
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    ########################################
    ########################################
    ########################################

    #gene_ids.reverse()

    for gene_id in gene_ids:
    #for gene_id in [429349]:
    #for count in range(1000):
        #gene_id = choice (gene_ids)

        stable_gene_id = gene2stable(cursor, gene_id)
        if verbose: print  gene_id, stable_gene_id, get_description (cursor, gene_id)
        transcript_ids = get_transcript_ids(cursor, gene_id)

        tr_w_ccds = []
        for [tr_id, tr_stable] in transcript_ids:
            ccds = check_ccds (cursor, tr_stable)
            if not ccds: continue
            tr_w_ccds.append([tr_id, tr_stable])

        if not tr_w_ccds: continue

        # get all exons for this gene
        all_exons    = gene2exon_list (cursor, gene_id)
        
        exons_w_ccds = set([]) # get the unique_ids

        # find exons which are on the ccds list
        for [tr_id, tr_stable] in tr_w_ccds:
            exon_ids =  transcript_id2exon_ids (cursor, tr_id)
            exons_w_ccds.update( set(exon_ids))
           
        # for these exons check sequence
        is_known = 1
        bad_exon = set([])
        for exon_id in exons_w_ccds:
            exon = get_exon      (cursor, exon_id, is_known)
            seq  = get_exon_seqs (cursor, exon_id, is_known)
            if not seq:
                bad_exon.add(exon_id)
                continue
            [exon_seq_id, protein_seq, pepseq_transl_start, 
             pepseq_transl_end, left_flank, right_flank, dna_seq] = seq

            if exon.covering_exon < 0:
                if not dna_seq:
                     bad_exon.add(exon_id)
            else:
                if exon.covering_exon_known and exon.covering_exon in exons_w_ccds:
                    pass
                else:
                    all_exon_ids =  map(lambda exon: exon.exon_id, all_exons)
                    if not exon.covering_exon in all_exon_ids:
                        bad_exon.add(exon_id)
                        
        # which transcripts seem to be completely ok?
        if verbose: print  "reconstructing alt splice almts for "
        if verbose: print  gene_id,  gene2stable(cursor, gene_id), get_description (cursor, gene_id)
        if verbose: print "there are ", len(tr_w_ccds), " transscripts with ccds"

        # get the gene_sequence
        ret = get_gene_seq(acg, cursor, gene_id, species)
        [gene_seq, canonical_exon_pepseq, file_name, seq_name, seq_region_start, seq_region_end]  = ret
        output_seq    = {}
        global_boundaries = []
        local_boundaries  = {}



        # sort exons by the order in which they appear in the gene
        all_exons.sort(key=lambda exon: exon.start_in_gene)


        # a bit of a cleanup
        for exon in all_exons:
            cleanup_endphase (cursor, exon)


        # check if any of the translations is complete:
        no_ok_transcripts = True
        for [tr_id, tr_stable] in tr_w_ccds:
            tr_exon_ids =  transcript_id2exon_ids (cursor, tr_id)
            if bad_exon & set(tr_exon_ids): continue

            if verbose: print tr_stable, " ok "
            no_ok_transcripts = False

        if no_ok_transcripts:
            if verbose: print " no ok transcripts found"
            continue

        # main loop
        cary = "" # for patching up codons split by intron
        for [tr_id, tr_stable] in tr_w_ccds:
            tr_exon_ids =  transcript_id2exon_ids (cursor, tr_id)
            if bad_exon & set(tr_exon_ids): continue

            # translation is from where to where?
            ret = get_translation_coords (cursor, tr_id)
            if not ret:
                continue
            [seq_start, start_exon_id, seq_end, end_exon_id] = ret
            for exon in all_exons:
                if exon.exon_id == start_exon_id: start_exon=exon
                if exon.exon_id == end_exon_id:   end_exon=exon
           
            transl_start_in_gene = start_exon.start_in_gene + seq_start
            transl_end_in_gene   =   end_exon.start_in_gene + seq_end
                

            local_boundaries[tr_stable] = []
            output_seq[tr_stable] = "-"*len(gene_sequence)
            output_seq[tr_stable+"_pep"] = "-"*len(gene_sequence)
            transl_end = ""

            for exon in all_exons:
                if not exon.exon_id in tr_exon_ids: continue
                
                start       = exon.start_in_gene
                start_flank = exon.start_in_gene - flank_length
                if start_flank  < 0: 
                    start_flank  = 0
                else:
                    if not start_flank-1 in global_boundaries: global_boundaries.append(start_flank-1)
                    local_boundaries[tr_stable].append(start_flank)

                end       = exon.end_in_gene
                end_flank = exon.end_in_gene + flank_length
                if end_flank > len(gene_sequence): 
                    end_flank = len(gene_sequence)
                else:
                    if not end_flank in global_boundaries: global_boundaries.append(end_flank)
                    local_boundaries[tr_stable].append(end_flank)
                
                tmp_dna  = output_seq[tr_stable][:start_flank]  + gene_sequence[start_flank:start].lower()
                tmp_dna += gene_sequence[start:end]
                tmp_dna += gene_sequence[end:end_flank].lower() + output_seq[tr_stable][end_flank:]

                output_seq[tr_stable] = tmp_dna


                #################################################
                # now try and handle translation to protein
                prev_transl_end = transl_end

                # where does translation start:
                if exon.end_in_gene < transl_start_in_gene:
                    transl_start = -1
                elif exon.exon_id == start_exon_id:
                    # if this is the first exon, the transl start given above
                    transl_start =  exon.start_in_gene+seq_start-1
                else:
                    # otherwise it is the exon start - except that if this is not the
                    # first exon and the codon is split, we want to start with the
                    # translation of the stitched up exon
                    transl_start = exon.start_in_gene
                    start_flank  = exon.phase

                # where does translation end: 
                if exon.start_in_gene >  transl_end_in_gene:
                    transl_end  = -1
                elif exon.exon_id == end_exon_id:
                    # if this is the first exon, the transl start given above
                    transl_end = exon.start_in_gene+seq_end
                else:
                    # otherwise it is the exon start - except that if this is not the
                    # first exon and the codon is split, we want to start with the
                    # translation of the stitched up exon
                    transl_end  = exon.end_in_gene - exon.end_phase+1
                    end_flank   = exon.end_phase


                if transl_start < 0 or transl_end < 0 :
                     continue

                if exon.phase > 0 and  prev_transl_end:
                    cary = gene_sequence[prev_transl_end:prev_transl_end+exon.phase]
                else:
                    cary = ""

                [phase, pepseq] = translate (cary + gene_sequence[transl_start:transl_end], 
                                             0,  mitochondrial, strip_stop = False)
                prev_transl_end = transl_end

                pepseq_padded = ""
                for aa in pepseq:
                    pepseq_padded += "-"+aa+"-"

                pepseq_name = tr_stable+"_pep"

                tmp_pep  = output_seq[pepseq_name][:transl_start-len(cary)] 
                tmp_pep += pepseq_padded
                tmp_pep += output_seq[pepseq_name][transl_end:]

                output_seq[pepseq_name] = tmp_pep



        global_boundaries.sort()
        for [tr_id, tr_stable] in tr_w_ccds:
            seq =  output_seq[tr_stable]
            tmp_seq   = ""
            prev_bdry = 0
            for bdry in global_boundaries:
                tmp_seq += seq[prev_bdry:bdry] 
                if bdry >= len(seq): continue
                if bdry in local_boundaries[tr_stable]:
                    marker = "-Z-"
                else:
                    marker = "---" 
                tmp_seq += marker 

                prev_bdry = bdry

            output_seq[tr_stable] = tmp_seq

            pepseq_name = tr_stable+"_pep"
            seq =  output_seq[pepseq_name]
            tmp_seq   = ""
            prev_bdry = 0
            for bdry in global_boundaries:
                tmp_seq += seq[prev_bdry:bdry] 
                if bdry >= len(seq): continue
                if bdry in local_boundaries[tr_stable]: # note here
                    marker = "-Z-"
                else:
                    marker = "---" 
                tmp_seq += marker 

                prev_bdry = bdry

            output_seq[pepseq_name] = tmp_seq


        output_seq = strip_gaps(output_seq)


        # define the order in which we  want the sequences output
        name_order = []
        for [tr_id, tr_stable] in tr_w_ccds:
           pepseq_name = tr_stable+"_pep"
           name_order.append (pepseq_name)
           name_order.append (tr_stable)


        afa_fnm  = "{0}/{1}.afa".format(outdir, stable_gene_id)
        ret = output_fasta (afa_fnm, name_order, output_seq)

        print afa_fnm

    return True
def get_translated_region_talkative(cursor, gene_id, species):

    # get the region on the gene
    is_known = (species == 'homo_sapiens')
    ret = get_gene_region(cursor, gene_id, is_known)
    if ret:
        [gene_seq_id, gene_region_start, gene_region_end,
         gene_region_strand] = ret
    else:
        print "region not retrived for ", species, gene_id, species
        return []

    canonical_transcript_id = get_canonical_transcript_id(cursor, gene_id)
    transcript_ids = get_transcript_ids(cursor, gene_id)
    print transcript_ids
    print "canonical: ", canonical_transcript_id

    transl_region_start = gene_region_end
    transl_region_end = gene_region_start

    print "transl region start:", transl_region_start
    print "transl region end:", transl_region_end

    for [transcript_id, transcript_stable] in transcript_ids:

        qry = "SELECT seq_start, start_exon_id, seq_end, end_exon_id "
        qry += " FROM translation WHERE transcript_id=%d" % transcript_id
        rows = search_db(cursor, qry)
        if (not rows):
            continue
        exon_seq_start = rows[0][0]
        start_exon_id = rows[0][1]
        exon_seq_end = rows[0][2]
        end_exon_id = rows[0][3]

        print
        if transcript_id == canonical_transcript_id:
            print "canonical: "
        print "transcript id: ", transcript_id
        print "start exon id:", start_exon_id, "transl start (in the exon) ", exon_seq_start
        print "end exon id:", end_exon_id, "transl end (in the exon)", exon_seq_end

        if (gene_region_strand > 0):
            start = {}

            start[start_exon_id] = get_exon_start(cursor, start_exon_id)
            start[end_exon_id] = get_exon_start(cursor, end_exon_id)

            this_translation_region_start = start[
                start_exon_id] + exon_seq_start - 1
            this_translation_region_end = start[end_exon_id] + exon_seq_end - 1

        else:
            end = {}

            end[start_exon_id] = get_exon_end(cursor, start_exon_id)
            end[end_exon_id] = get_exon_end(cursor, end_exon_id)

            this_translation_region_start = end[end_exon_id] - exon_seq_end + 1
            this_translation_region_end = end[
                start_exon_id] - exon_seq_start + 1

        if (this_translation_region_start <= transl_region_start):
            transl_region_start = this_translation_region_start

        if (this_translation_region_end >= transl_region_end):
            transl_region_end = this_translation_region_end

    return
Esempio n. 32
0
def pep_seqs (cursor, gene_id, exons):
    

    for exon in exons:
        #####################################                
        if (not exon.is_coding):
            if verbose: print exon.exon_id,  "is not coding "
            continue
        if (exon.covering_exon > 0):
            if verbose: print exon.exon_id,  "has covering exon"
            continue 
        exon_seqs = get_exon_seqs(cursor, exon.exon_id, exon.is_known)
        if (not exon_seqs):
            if verbose: print exon.exon_id,  "no exon_seqs"
            continue                   
        [exon_seq_id, pepseq, pepseq_transl_start, 
         pepseq_transl_end, left_flank, right_flank, dna_seq] = exon_seqs
        if len(dna_seq)<4:
            if verbose: print exon.exon_id,  "short dna"
            continue

        #####################################                
        mitochondrial        = is_mitochondrial(cursor, gene_id)
        [seq_start, seq_end] = translation_bounds (cursor, exon.exon_id, verbose)
        if verbose: print " ** ", seq_start, seq_end
        dna_cropped          = crop_dna (seq_start, seq_end, dna_seq)
        if verbose: print " ** ", dna_cropped
        [offset, length_translated, pepseq, phase_corrected] = translate (dna_cropped, exon.phase, mitochondrial, verbose)

        if ( offset < 0): #  translation failure; usually some short pieces (end in pos 4 and such)
            if verbose: 
                print exon.exon_id,  "translation failure"
                print "mitochondrial:", mitochondrial
                print seq_start, seq_end
            continue

        if seq_start is None: seq_start = 1
        if seq_start == 0: seq_start = 1
        start = seq_start+offset-1
        end   = start + length_translated

        dnaseq  = Seq (dna_seq[start:end], generic_dna)
        if (mitochondrial):
            pepseq2 = dnaseq.translate(table="Vertebrate Mitochondrial").tostring()
        else:
            pepseq2 = dnaseq.translate().tostring()

        if (not pepseq == pepseq2):
            start = -10
            end   = -10
            
        if verbose: 
            print exon.exon_id
            print "pep from translate:", pepseq
            print "dna transl:", pepseq2
            print "start:" , start
            print "end:",  end
            print

        if True:
            qry  = "update exon_seq "
            qry += " set protein_seq   = '%s',  " %  pepseq
            qry += " pepseq_transl_start =  %d, " %  start
            qry += " pepseq_transl_end   =  %d  " %  end
            qry += " where exon_seq_id =  %d    " %  exon_seq_id
            rows = search_db (cursor, qry)
            if (rows):
                rows = search_db (cursor, qry, verbose = True)
                continue
def get_translated_region_talkative(cursor, gene_id, species):

    # get the region on the gene
    is_known = (species == 'homo_sapiens')
    ret = get_gene_region (cursor, gene_id, is_known)
    if  ret:
        [gene_seq_id,gene_region_start, gene_region_end, 
         gene_region_strand] = ret
    else:
        print "region not retrived for ", species, gene_id, species
        return []

    canonical_transcript_id = get_canonical_transcript_id (cursor, gene_id)
    transcript_ids = get_transcript_ids(cursor, gene_id)
    print transcript_ids
    print "canonical: ", canonical_transcript_id

    transl_region_start = gene_region_end
    transl_region_end   = gene_region_start

    print "transl region start:", transl_region_start
    print "transl region end:", transl_region_end

    for[ transcript_id, transcript_stable] in transcript_ids:
   
        qry  = "SELECT seq_start, start_exon_id, seq_end, end_exon_id " 
        qry += " FROM translation WHERE transcript_id=%d"  %  transcript_id
        rows = search_db (cursor, qry)
        if (not rows):
            continue
        exon_seq_start = rows[0][0]
        start_exon_id  = rows[0][1]
        exon_seq_end   = rows[0][2]
        end_exon_id    = rows[0][3]
        
        print
        if transcript_id == canonical_transcript_id:
            print "canonical: "
        print "transcript id: ", transcript_id
        print "start exon id:",  start_exon_id, "transl start (in the exon) ", exon_seq_start
        print "end exon id:",    end_exon_id,   "transl end (in the exon)", exon_seq_end
        

        if (gene_region_strand > 0):
            start = {}
 
            start[start_exon_id] = get_exon_start(cursor, start_exon_id)
            start[end_exon_id]   = get_exon_start(cursor, end_exon_id)

            this_translation_region_start = start[start_exon_id] + exon_seq_start - 1
            this_translation_region_end   = start[end_exon_id]   + exon_seq_end   - 1

        else: 
            end   = {}  

            end[start_exon_id] = get_exon_end (cursor, start_exon_id)
            end[end_exon_id]   = get_exon_end (cursor, end_exon_id)

            this_translation_region_start = end[end_exon_id]   - exon_seq_end   + 1
            this_translation_region_end   = end[start_exon_id] - exon_seq_start + 1

        if (this_translation_region_start <= transl_region_start):
            transl_region_start = this_translation_region_start
        
        if (this_translation_region_end >= transl_region_end):
            transl_region_end = this_translation_region_end

    return
def check_alt_splices (cursor, species, ensembl_db_name):

    
    print "############################"
    print 'checking alt splicing in ', species

    qry = "use " + ensembl_db_name[species]
    search_db(cursor, qry)
    gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1)

    no_cover_and_no_seq = 0  
    no_cover_and_no_dna_seq = 0  
    no_seq_info_in_database = 0
    all_ok   = 0   
    cover_already_present  = 0
    cover_not_in_exon_set  = 0     
    cov_exon_not_in_ccds   = 0 
    genes_w_ccds           = 0
    tot_exons = 0
        
    #for gene_id in gene_ids[:100]:
    #for gene_id in [413198]:
    for count in range(1000):
        gene_id = choice (gene_ids)

        #print  gene_id,  gene2stable(cursor, gene_id), get_description (cursor, gene_id)
        transcript_ids = get_transcript_ids(cursor, gene_id)

        tr_w_ccds = []
        for [tr_id, tr_stable] in transcript_ids:
            ccds = check_ccds (cursor, tr_stable)
            if not ccds: continue
            tr_w_ccds.append([tr_id, tr_stable])

        if not tr_w_ccds: continue

        genes_w_ccds += 1

        # get all exons for this gene
        all_exons    = gene2exon_list (cursor, gene_id)
        
        exons_w_ccds = set([]) # get the unique_ids

        # find exons which are on the ccds list
        for [tr_id, tr_stable] in tr_w_ccds:
            exon_ids =  transcript_id2exon_ids (cursor, tr_id)
            exons_w_ccds.update( set(exon_ids))
           
        # for these exons check sequence
        is_known=1
        for exon_id in exons_w_ccds:
            tot_exons += 1
            exon = get_exon      (cursor, exon_id, is_known)
            seq  = get_exon_seqs (cursor, exon_id, is_known)
            if not seq:
                #print  gene_id,  gene2stable(cursor, gene_id), get_description (cursor, gene_id)
                #print exon_id, " no seq", "covered: ", exon.covering_exon
                #exit (1)
                no_seq_info_in_database += 1
                continue
            [exon_seq_id, protein_seq, pepseq_transl_start, 
             pepseq_transl_end, left_flank, right_flank, dna_seq] = seq

            #print exon_id, exon_seq_id, 
            #print "  %7d   %7d  %7d  " % (exon.start_in_gene, exon.end_in_gene,  exon.covering_exon),
            
            if exon.covering_exon < 0:
                #print protein_seq
                if not protein_seq:
                    no_cover_and_no_seq += 1
                elif not dna_seq:
                    no_cover_and_no_dna_seq += 1
                else:
                    all_ok += 1
            else:
                if exon.covering_exon_known and exon.covering_exon in exons_w_ccds:
                    #print " <<<< "
                    cover_already_present += 1
                else:
                    all_exon_ids =  map(lambda exon: exon.exon_id, all_exons)
                    if not exon.covering_exon in all_exon_ids:
                        cover_not_in_exon_set += 1
                        print "cover_not_in_exon_set: "
                        print  gene_id,  gene2stable(cursor, gene_id), get_description (cursor, gene_id)
                        print  exon_id, exon_seq_id, 
                        print "  %7d   %7d  %7d  " % (exon.start_in_gene, exon.end_in_gene,  exon.covering_exon)
                        #print covering_exon
                        print " ************"
                        for e in all_exons:
                            print e
                        exit (1)

                    elif not exon.covering_exon in exons_w_ccds:
                        cov_exon_not_in_ccds += 1
                        #print "covering exon is not in ccds set "
                       


    print "genes_w_ccds", genes_w_ccds
    print "tot_exons", tot_exons
    print "no_seq_info_in_database ",   no_seq_info_in_database
    print "all_ok",  all_ok 
    print "cover_already_present ", cover_already_present
    print "no_cover_and_no_seq   ", no_cover_and_no_seq 
    print "no_cover_and_no_dna_seq   ", no_cover_and_no_dna_seq 
    print "cov_exon_not_in_ccds",   cov_exon_not_in_ccds
    print "cover_not_in_exon_set ", cover_not_in_exon_set

    return True
def check_alt_splices(cursor, species, ensembl_db_name):

    print "############################"
    print 'checking alt splicing in ', species

    qry = "use " + ensembl_db_name[species]
    search_db(cursor, qry)
    gene_ids = get_gene_ids(cursor, biotype='protein_coding', is_known=1)

    no_cover_and_no_seq = 0
    no_cover_and_no_dna_seq = 0
    no_seq_info_in_database = 0
    all_ok = 0
    cover_already_present = 0
    cover_not_in_exon_set = 0
    cov_exon_not_in_ccds = 0
    genes_w_ccds = 0
    tot_exons = 0

    #for gene_id in gene_ids[:100]:
    #for gene_id in [413198]:
    for count in range(1000):
        gene_id = choice(gene_ids)

        #print  gene_id,  gene2stable(cursor, gene_id), get_description (cursor, gene_id)
        transcript_ids = get_transcript_ids(cursor, gene_id)

        tr_w_ccds = []
        for [tr_id, tr_stable] in transcript_ids:
            ccds = check_ccds(cursor, tr_stable)
            if not ccds: continue
            tr_w_ccds.append([tr_id, tr_stable])

        if not tr_w_ccds: continue

        genes_w_ccds += 1

        # get all exons for this gene
        all_exons = gene2exon_list(cursor, gene_id)

        exons_w_ccds = set([])  # get the unique_ids

        # find exons which are on the ccds list
        for [tr_id, tr_stable] in tr_w_ccds:
            exon_ids = transcript_id2exon_ids(cursor, tr_id)
            exons_w_ccds.update(set(exon_ids))

        # for these exons check sequence
        is_known = 1
        for exon_id in exons_w_ccds:
            tot_exons += 1
            exon = get_exon(cursor, exon_id, is_known)
            seq = get_exon_seqs(cursor, exon_id, is_known)
            if not seq:
                #print  gene_id,  gene2stable(cursor, gene_id), get_description (cursor, gene_id)
                #print exon_id, " no seq", "covered: ", exon.covering_exon
                #exit (1)
                no_seq_info_in_database += 1
                continue
            [
                exon_seq_id, protein_seq, pepseq_transl_start,
                pepseq_transl_end, left_flank, right_flank, dna_seq
            ] = seq

            #print exon_id, exon_seq_id,
            #print "  %7d   %7d  %7d  " % (exon.start_in_gene, exon.end_in_gene,  exon.covering_exon),

            if exon.covering_exon < 0:
                #print protein_seq
                if not protein_seq:
                    no_cover_and_no_seq += 1
                elif not dna_seq:
                    no_cover_and_no_dna_seq += 1
                else:
                    all_ok += 1
            else:
                if exon.covering_exon_known and exon.covering_exon in exons_w_ccds:
                    #print " <<<< "
                    cover_already_present += 1
                else:
                    all_exon_ids = map(lambda exon: exon.exon_id, all_exons)
                    if not exon.covering_exon in all_exon_ids:
                        cover_not_in_exon_set += 1
                        print "cover_not_in_exon_set: "
                        print gene_id, gene2stable(cursor,
                                                   gene_id), get_description(
                                                       cursor, gene_id)
                        print exon_id, exon_seq_id,
                        print "  %7d   %7d  %7d  " % (exon.start_in_gene,
                                                      exon.end_in_gene,
                                                      exon.covering_exon)
                        #print covering_exon
                        print " ************"
                        for e in all_exons:
                            print e
                        exit(1)

                    elif not exon.covering_exon in exons_w_ccds:
                        cov_exon_not_in_ccds += 1
                        #print "covering exon is not in ccds set "

    print "genes_w_ccds", genes_w_ccds
    print "tot_exons", tot_exons
    print "no_seq_info_in_database ", no_seq_info_in_database
    print "all_ok", all_ok
    print "cover_already_present ", cover_already_present
    print "no_cover_and_no_seq   ", no_cover_and_no_seq
    print "no_cover_and_no_dna_seq   ", no_cover_and_no_dna_seq
    print "cov_exon_not_in_ccds", cov_exon_not_in_ccds
    print "cover_not_in_exon_set ", cover_not_in_exon_set

    return True
Esempio n. 36
0
def  feed_trivial_names (cursor, all_species):

    tax_id  = {}
    trivial = {}

    trivial['ailuropoda_melanoleuca'] = 'panda' 
    trivial['anas_platyrhynchos']     = 'duck'
    trivial['anolis_carolinensis']    = 'anole_lizard' 
    trivial['astyanax_mexicanus']     = 'blind_cavefish'
    trivial['bos_taurus']             = 'cow' 
    trivial['callithrix_jacchus']     = 'marmoset' 
    trivial['canis_familiaris']       = 'dog' 
    trivial['cavia_porcellus']        = 'guinea_pig' 
    trivial['choloepus_hoffmanni']    = 'sloth' 
    trivial['danio_rerio']            = 'zebrafish' 
    trivial['dasypus_novemcinctus']   = 'armadillo' 
    trivial['dipodomys_ordii']        = 'kangaroo_rat' 
    trivial['echinops_telfairi']      = 'madagascar_hedgehog' 
    trivial['equus_caballus']         = 'horse' 
    trivial['erinaceus_europaeus']    = 'european_hedgehog' 
    trivial['felis_catus']            = 'cat' 
    trivial['ficedula_albicollis']    = 'flycatcher'
    trivial['gadus_morhua']           = 'cod' 
    trivial['gallus_gallus']          = 'chicken' 
    trivial['gasterosteus_aculeatus'] = 'stickleback' 
    trivial['gorilla_gorilla']        = 'gorilla' 
    trivial['homo_sapiens']           = 'human' 
    trivial['ictidomys_tridecemlineatus']  = 'squirrel' 
    trivial['latimeria_chalumnae']         = 'coelacanth' 
    trivial['lepisosteus_oculatus']        = 'spotted_gar'
    trivial['loxodonta_africana']          = 'elephant' 
    trivial['macaca_mulatta']              = 'macaque' 
    trivial['macropus_eugenii']            = 'wallaby' 
    trivial['meleagris_gallopavo']         = 'turkey' 
    trivial['microcebus_murinus']          = 'lemur' 
    trivial['monodelphis_domestica']       = 'opossum' 
    trivial['mus_musculus']                = 'mouse' 
    trivial['mustela_putorius_furo']       = 'ferret' 
    trivial['myotis_lucifugus']            = 'bat' 
    trivial['nomascus_leucogenys']         = 'gibbon' 
    trivial['ochotona_princeps']           = 'pika' 
    trivial['oreochromis_niloticus']       = 'tilapia' 
    trivial['ornithorhynchus_anatinus']    = 'platypus' 
    trivial['oryctolagus_cuniculus']       = 'rabbit' 
    trivial['oryzias_latipes']             = 'medaka' 
    trivial['otolemur_garnettii']          = 'galago_lemur' 
    trivial['ovis_aries']                  = 'sheep'
    trivial['pan_troglodytes']             = 'chimpanzee' 
    trivial['papio_anubis']                = 'baboon' 
    trivial['pelodiscus_sinensis']         = 'turtle' 
    trivial['petromyzon_marinus']          = 'lamprey' 
    trivial['poecilia_formosa']            = 'amazon_molly'
    trivial['pongo_abelii']                = 'orangutan' 
    trivial['procavia_capensis']           = 'hyrax' 
    trivial['pteropus_vampyrus']           = 'flying_fox' 
    trivial['rattus_norvegicus']           = 'rat' 
    trivial['sarcophilus_harrisii']        = 'tasmanian_devil' 
    trivial['sorex_araneus']               = 'european_shrew' 
    trivial['sus_scrofa']                  = 'pig' 
    trivial['taeniopygia_guttata']         = 'zebra_finch' 
    trivial['takifugu_rubripes']           = 'fugu' 
    trivial['tarsius_syrichta']            = 'tarsier' 
    trivial['tetraodon_nigroviridis']      = 'pufferfish' 
    trivial['tupaia_belangeri']            = 'tree_shrew' 
    trivial['tursiops_truncatus']          = 'dolphin' 
    trivial['vicugna_pacos']               = 'alpaca' 
    trivial['xenopus_tropicalis']          = 'xenopus' 
    trivial['xiphophorus_maculatus']       = 'platyfish' 



    db_name = get_compara_name (cursor)
    if (not db_name):
        print "compara db not found"
        exit(1)
    qry = "use %s " % db_name
    search_db (cursor, qry)
    for species in all_species:
        tax_id[species] = species2taxid (cursor, species)

    # switch to ncbi taxonomy database
    db_name = get_ncbi_tax_name (cursor)
    if (not db_name):
        print "ncbi taxonomy db not found"
        exit(1)

    qry = "use %s " % db_name
    search_db (cursor, qry)
    for species in all_species:
        if trivial.has_key(species):
            fixed_fields  = {}
            update_fields = {}
            fixed_fields ['tax_id']     = tax_id[species]
            fixed_fields ['name_class'] = 'trivial'
            update_fields['name_txt']   = trivial[species]
            store_or_update (cursor, 'names', fixed_fields, update_fields)
        else:
            print "trivial for ", species, " not found "
            trivial[species] = ""

    return True
Esempio n. 37
0
def  feed_name_shorthands (cursor, all_species):

    short = {}
    short['ailuropoda_melanoleuca'] = 'AME'
    short['anas_platyrhynchos']     = 'APL'
    short['anolis_carolinensis']    = 'ACA'
    short['astyanax_mexicanus']     = 'AMX'
    short['bos_taurus']             = 'BTA'
    short['callithrix_jacchus'] = 'CJA'
    short['canis_familiaris'] = 'CAF'
    short['cavia_porcellus'] = 'CPO'
    short['choloepus_hoffmanni'] = 'CHO'
    short['danio_rerio'] = 'DAR'
    short['dasypus_novemcinctus'] = 'DNO'
    short['dipodomys_ordii'] = 'DOR'
    short['echinops_telfairi'] = 'ETE'
    short['equus_caballus'] = 'ECA'
    short['erinaceus_europaeus'] = 'EEU'
    short['felis_catus'] = 'FCA'
    short['ficedula_albicollis'] = 'FAL'
    short['gadus_morhua'] = 'GMO'
    short['gallus_gallus'] = 'GAL'
    short['gasterosteus_aculeatus'] = 'GAC'
    short['gorilla_gorilla'] = 'GGO'
    short['homo_sapiens'] = ''
    short['ictidomys_tridecemlineatus'] = 'STO'
    short['latimeria_chalumnae'] = 'LAC'
    short['lepisosteus_oculatus'] = 'LOC'
    short['loxodonta_africana'] = 'LAF'
    short['macaca_mulatta'] = 'MMU'
    short['macropus_eugenii'] = 'MEU'
    short['meleagris_gallopavo'] = 'MGA'
    short['microcebus_murinus'] = 'MIC'
    short['monodelphis_domestica'] = 'MOD'
    short['mus_musculus'] = 'MUS'
    short['mustela_putorius_furo'] = 'MPU'
    short['myotis_lucifugus'] = 'MLU'
    short['nomascus_leucogenys'] = 'NLE'
    short['ochotona_princeps'] = 'OPR'
    short['oreochromis_niloticus'] = 'ONI'
    short['ornithorhynchus_anatinus'] = 'OAN'
    short['oryctolagus_cuniculus'] = 'OCU'
    short['oryzias_latipes'] = 'ORL'
    short['ovis_aries'] = 'OAR'
    short['otolemur_garnettii'] = 'OGA'
    short['pan_troglodytes'] = 'PTR'
    short['papio_anubis'] = 'PAN'
    short['poecilia_formosa'] = 'PFO'
    short['pelodiscus_sinensis'] = 'PSI'
    short['petromyzon_marinus'] = 'PMA'
    short['pongo_abelii'] = 'PPY'
    short['procavia_capensis'] = 'PCA'
    short['pteropus_vampyrus'] = 'PVA'
    short['rattus_norvegicus'] = 'RNO'
    short['sarcophilus_harrisii'] = 'SHA'
    short['sorex_araneus'] = 'SAR'
    short['sus_scrofa'] = 'SSC'
    short['taeniopygia_guttata'] = 'TGU'
    short['takifugu_rubripes'] = 'TRU'
    short['tarsius_syrichta'] = 'TSY'
    short['tetraodon_nigroviridis'] = 'TNI'
    short['tupaia_belangeri'] = 'TBE'
    short['tursiops_truncatus'] = 'TTR'
    short['vicugna_pacos'] = 'VPA'
    short['xenopus_tropicalis'] = 'XET'
    short['xiphophorus_maculatus'] = 'XMA'


    db_name = get_compara_name (cursor)
    qry = "use %s " % db_name
    search_db (cursor, qry)

    table = 'species_name_shorthands'
    # if the table does not exist, make it
    if not check_table_exists (cursor, db_name, table):
        qry  = "CREATE TABLE " + table + "  (id INT(10) PRIMARY KEY AUTO_INCREMENT)"
        rows = search_db (cursor, qry)
        if (rows): return False

        qry = "ALTER TABLE %s  ADD %s VARCHAR(100)" % (table, 'species')
        rows = search_db (cursor, qry)
        if (rows): return False
        qry = "ALTER TABLE %s  ADD %s VARCHAR(10)" % (table, 'shorthand')
        rows = search_db (cursor, qry)
        if (rows): return False


    for species in all_species:
        if short.has_key(species):
            fixed_fields  = {}
            update_fields = {}
            fixed_fields  ['species']   = species
            update_fields ['shorthand'] = short[species]
            store_or_update (cursor, table, fixed_fields, update_fields)
        else:
            print "short for ", species, " not found "
            short[species] = ""
Esempio n. 38
0
def main():

    parameter = {}
    # in case I ever have to handle multiple versions of ensembl
    # (but for now I don't have enough space)
    # note though that there are functions in el_utils/mysql.py that assume
    # that whatever ensembl stuff is available to the mysql server corresponds to the same release 
    release_number = '76'
    parameter['ensembl_release_number'] = release_number
    parameter['blastp_e_value']         = "1.e-10" # it will be used as a string  when fmting the blastp cmd
    parameter['min_accptbl_exon_sim']   = 0.33333 #minimum acceptable exon similarity

    dir_path = {}
    dir_path['ensembl_fasta'] = '/mnt/ensembl-mirror/release-'+release_number+'/fasta'
    # local juggling of data from one database base to the other
    dir_path['afs_dumps']     = '/afs/bii.a-star.edu.sg/dept/biomodel_design/Group/ivana/'
    dir_path['afs_dumps']    += 'ExoLocator/results/dumpster'
    dir_path['resources']     = '/afs/bii.a-star.edu.sg/dept/biomodel_design/Group/ivana/'
    dir_path['resources']    += 'pypeworks/exolocator/resources'
    dir_path['scratch']       = '/tmp'
    dir_path['maxentscan']    = '/afs/bii.a-star.edu.sg/dept/biomodel_design/Group/ivana/'
    dir_path['maxentscan']   += 'pypeworks/exolocator/pl_utils/maxentscan'

    util_path = {}
    util_path['mafft']    = '/usr/bin/mafft'
    util_path['blastall'] = '/usr/bin/blastall'
    util_path['fastacmd'] = '/usr/bin/fastacmd'
    util_path['sw#']      = '/usr/bin/swsharp'
    util_path['usearch']  = '/usr/bin/usearch'
    util_path['score3']   = dir_path['maxentscan'] + '/score3.pl'
    util_path['score5']   = dir_path['maxentscan'] + '/score5.pl'

    if 1:
        # check if the paths are functioning (at this point at least)
        for util in util_path.values():
            if (not os.path.exists(util)):
                print util, " not found "
                sys.exit (1)

        for dir in dir_path.values():
            if (not os.path.exists(dir)):
                print dir, " not found "
                sys.exit (1)
            if (not os.path.isdir (dir)):
                print dir, " is not a directory "
                sys.exit (1)
            
    db     = connect_to_mysql()
    cursor = db.cursor()


    #######################################################
    # check if the config db exists -- if not, make it
    db_name   = "exolocator_config"
    qry  = "show databases like'%s'" % db_name
    rows = search_db (cursor, qry)
    if (not rows):
        print db_name, "database not found"
        qry = "create database %s " % db_name
        rows = search_db (cursor, qry)
        if (rows):
            print "some problem creating the database ..."
            rows = search_db (cursor, qry, verbose = True)
    else:
        print db_name, "database found"

    qry = "use %s " % db_name
    search_db (cursor, qry)
        
    # make tables
    for table in ['util_path', 'dir_path', 'parameter']:
        if ( check_table_exists (cursor, db_name, table)):
            print table, " found in ", db_name
        else:
            print table, " not found in ", db_name
            make_table (cursor, table)
   
    # fill util, dir and path tables 
    fixed_fields  = {}
    update_fields = {}
    for [name, path] in util_path.iteritems():
        fixed_fields['name']  = name
        update_fields['path'] = path
        store_or_update (cursor, 'util_path', fixed_fields, update_fields)

    fixed_fields  = {}
    update_fields = {}
    for [name, path] in dir_path.iteritems():
        fixed_fields['name'] = name
        update_fields['path'] = path
        store_or_update (cursor, 'dir_path', fixed_fields, update_fields)

    fixed_fields  = {}
    update_fields = {}
    for [name, value] in parameter.iteritems():
        fixed_fields['name']  = name
        update_fields['value'] = value
        store_or_update (cursor, 'parameter', fixed_fields, update_fields)

    #######################################################
    # add trivial names to ncbi_taxonomy.names
    [all_species, ensembl_db_name] = get_species (cursor)
    feed_trivial_names (cursor, all_species)

    #######################################################
    # add species shorthands (used in ENS* names formation)
    # though we will not needed unit the paralogue alignment reconstruction point)
    feed_name_shorthands (cursor, all_species)

    cursor.close()
    db.close()