Example #1
0
def main():

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()

    [all_species, ensembl_db_name] = get_species(cursor)
    cursor.close()
    db.close()

    outpath = cfg.get_path('afs_dumps')
    outdir = "{0}/exon_map".format(outpath)
    if (not os.path.exists(outdir)):
        mkdir_p(outdir)

    outfile = "{0}/exon_map.sql".format(outdir)
    if os.path.exists('.creds'):
        [user, passwd, host, port] = read_creds()
    else:
        print "creds not found"
        exit(1)
    credentials = " -h {0} -P {1} -u {2}  -p{3}".format(
        host, port, user, password)
    cmd = "mysqldump {0} {1} exon_map > {2}".format(
        credentials, ensembl_db_name['homo_sapiens'], outfile)

    print cmd
    ret = commands.getoutput(cmd)

    print ret

    return True
def main ():

    
    db     = connect_to_mysql()
    cfg    = ConfigurationReader()
    cursor = db.cursor()

    [all_species, ensembl_db_name] = get_species (cursor)
    cursor.close()
    db    .close()

    outpath = cfg.get_path('afs_dumps')
    outdir   = "{0}/exon_map".format(outpath)
    if (not os.path.exists(outdir)):
        mkdir_p(outdir)

    outfile  = "{0}/exon_map.sql".format(outdir)
    if os.path.exists('.creds'):
        [user, passwd, host, port] = read_creds()
    else:
        print "creds not found"
        exit(1)
    credentials = " -h {0} -P {1} -u {2}  -p{3}".format(host, port, user, password)
    cmd = "mysqldump {0} {1} exon_map > {2}".format (credentials, ensembl_db_name['homo_sapiens'], outfile)

    print cmd
    ret = commands.getoutput(cmd)
    
    print ret

    return True
Example #3
0
def main():
    """Create a fresh exon_map table, with its indices, per representative species.

    The representative species are read from exolocator_meta.taxonomy_groups.
    In each of their databases check_and_drop_table() is called when exon_map
    already exists, a new table is made with make_exon_map_table(), and the
    'exon_index' and 'cognate_exon_index' indices are created on it.
    """
    db = connect_to_mysql(Config.mysql_conf_file)
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)

    table = "exon_map"
    # (index name, indexed columns)
    index_specs = (
        ('exon_index', ('exon_id',)),
        ('cognate_exon_index', ('cognate_exon_id', 'cognate_genome_db_id')),
    )

    qry = "select representative_species from exolocator_meta.taxonomy_groups"
    representatives = [row[0] for row in hard_landing_search(cursor, qry)]

    for species in representatives:
        print(f"adding exon_map to {species}")
        db_name = ensembl_db_name[species]

        # an existing table goes through check_and_drop_table first;
        # in either case a new table is made afterwards
        if check_table_exists(cursor, db_name, table):
            check_and_drop_table(cursor, db_name, table)
        else:
            print(table, " not found in ", db_name)
        make_exon_map_table(cursor, db_name)

        for index_name, columns in index_specs:
            create_index(cursor, db_name, table, index_name, list(columns))

    cursor.close()
    db.close()
Example #4
0
def main():
    """Run the sanity checks on the species databases.

    Two checks are available and toggled by the flags below: the genome size
    check (currently on) and the table size check (currently off).
    """
    run_genome_size_check = True
    run_table_size_check = False

    db = connect_to_mysql()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)

    if run_genome_size_check:
        check_genome_sizes(cursor, all_species, ensembl_db_name)

    if run_table_size_check:
        check_table_sizes(cursor, all_species, ensembl_db_name)

    cursor.close()
    db.close()
Example #5
0
def main():
    """Check genome sizes for all species; the table size check is disabled."""
    # flip these switches to choose which checks are run
    genomes_enabled, tables_enabled = True, False

    db = connect_to_mysql()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)
    check_args = (cursor, all_species, ensembl_db_name)

    if genomes_enabled:
        check_genome_sizes(*check_args)

    if tables_enabled:
        check_table_sizes(*check_args)

    cursor.close()
    db.close()
Example #6
0
def main():
    """Rebuild the gene2exon table and its indices in every species database.

    For each species the script switches to its database, calls
    check_and_drop_table() and make_table() for gene2exon, runs
    "optimize table" on it, and creates the 'eg_index' and 'gene_id_idx'
    indices.  Only gene2exon is handled; no other table is touched here.
    """
    db = connect_to_mysql(Config.mysql_conf_file)
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)

    tables_to_rebuild = ['gene2exon']
    # (index name, indexed columns) for gene2exon
    gene2exon_indices = (
        ('eg_index', ('exon_id', 'gene_id')),
        ('gene_id_idx', ('gene_id',)),
    )

    for species in all_species:
        print(species)
        db_name = ensembl_db_name[species]
        switch_to_db(cursor, db_name)

        for table in tables_to_rebuild:
            check_and_drop_table(cursor, db_name, table)
            make_table(cursor, db_name, table)

        print("optimizing gene2exon")
        print(search_db(cursor, "optimize table gene2exon"))

        # create_index signature: (cursor, db_name, table, index_name, columns, verbose=False)
        for index_name, columns in gene2exon_indices:
            create_index(cursor, db_name, 'gene2exon', index_name,
                         list(columns), verbose=True)

    cursor.close()
    db.close()
Example #7
0
def main():
    
    db     = connect_to_mysql()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species (cursor)
    
    tree   = Tree()
    for species in all_species:
        leaf = Node(species)
        tree.leafs.append(leaf)

    tree.build(cursor)

    print
    print tree.nhx_string()
    print
    
    cursor.close()
    db.close()
Example #8
0
def main():
    """Print the species tree, as given by nhx_string(), for all species.

    A leaf Node is added to a new Tree for each species name, the tree is
    built using the database cursor, and its string form is printed with a
    blank line before and after.
    """
    db = connect_to_mysql(Config.mysql_conf_file)
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)

    tree = Tree()
    for name in all_species:
        tree.leafs.append(Node(name))
    tree.build(cursor)

    print()
    print(tree.nhx_string())
    print()

    cursor.close()
    db.close()
def main():

    parameter = {}
    # in case I ever have to handle multiple versions of ensembl
    # (but for now I don't have enough space)
    # note though that there are functions in el_utils/mysql.py that assume
    # that whatever ensembl stuff is available to the mysql server corresponds to the same release 
    release_number = '76'
    parameter['ensembl_release_number'] = release_number
    parameter['blastp_e_value']         = "1.e-10" # it will be used as a string  when fmting the blastp cmd
    parameter['min_accptbl_exon_sim']   = 0.33333 #minimum acceptable exon similarity

    dir_path = {}
    dir_path['ensembl_fasta'] = '/mnt/ensembl-mirror/release-'+release_number+'/fasta'
    # local juggling of data from one database base to the other
    dir_path['afs_dumps']     = '/afs/bii.a-star.edu.sg/dept/biomodel_design/Group/ivana/'
    dir_path['afs_dumps']    += 'ExoLocator/results/dumpster'
    dir_path['resources']     = '/afs/bii.a-star.edu.sg/dept/biomodel_design/Group/ivana/'
    dir_path['resources']    += 'pypeworks/exolocator/resources'
    dir_path['scratch']       = '/tmp'
    dir_path['maxentscan']    = '/afs/bii.a-star.edu.sg/dept/biomodel_design/Group/ivana/'
    dir_path['maxentscan']   += 'pypeworks/exolocator/pl_utils/maxentscan'

    util_path = {}
    util_path['mafft']    = '/usr/bin/mafft'
    util_path['blastall'] = '/usr/bin/blastall'
    util_path['fastacmd'] = '/usr/bin/fastacmd'
    util_path['sw#']      = '/usr/bin/swsharp'
    util_path['usearch']  = '/usr/bin/usearch'
    util_path['score3']   = dir_path['maxentscan'] + '/score3.pl'
    util_path['score5']   = dir_path['maxentscan'] + '/score5.pl'

    if 1:
        # check if the paths are functioning (at this point at least)
        for util in util_path.values():
            if (not os.path.exists(util)):
                print util, " not found "
                sys.exit (1)

        for dir in dir_path.values():
            if (not os.path.exists(dir)):
                print dir, " not found "
                sys.exit (1)
            if (not os.path.isdir (dir)):
                print dir, " is not a directory "
                sys.exit (1)
            
    db     = connect_to_mysql()
    cursor = db.cursor()


    #######################################################
    # check if the config db exists -- if not, make it
    db_name   = "exolocator_config"
    qry  = "show databases like'%s'" % db_name
    rows = search_db (cursor, qry)
    if (not rows):
        print db_name, "database not found"
        qry = "create database %s " % db_name
        rows = search_db (cursor, qry)
        if (rows):
            print "some problem creating the database ..."
            rows = search_db (cursor, qry, verbose = True)
    else:
        print db_name, "database found"

    qry = "use %s " % db_name
    search_db (cursor, qry)
        
    # make tables
    for table in ['util_path', 'dir_path', 'parameter']:
        if ( check_table_exists (cursor, db_name, table)):
            print table, " found in ", db_name
        else:
            print table, " not found in ", db_name
            make_table (cursor, table)
   
    # fill util, dir and path tables 
    fixed_fields  = {}
    update_fields = {}
    for [name, path] in util_path.iteritems():
        fixed_fields['name']  = name
        update_fields['path'] = path
        store_or_update (cursor, 'util_path', fixed_fields, update_fields)

    fixed_fields  = {}
    update_fields = {}
    for [name, path] in dir_path.iteritems():
        fixed_fields['name'] = name
        update_fields['path'] = path
        store_or_update (cursor, 'dir_path', fixed_fields, update_fields)

    fixed_fields  = {}
    update_fields = {}
    for [name, value] in parameter.iteritems():
        fixed_fields['name']  = name
        update_fields['value'] = value
        store_or_update (cursor, 'parameter', fixed_fields, update_fields)

    #######################################################
    # add trivial names to ncbi_taxonomy.names
    [all_species, ensembl_db_name] = get_species (cursor)
    feed_trivial_names (cursor, all_species)

    #######################################################
    # add species shorthands (used in ENS* names formation)
    # though we will not needed unit the paralogue alignment reconstruction point)
    feed_name_shorthands (cursor, all_species)

    cursor.close()
    db.close()
def main():

    db     = connect_to_mysql()
    cursor = db.cursor()

    [all_species, ensembl_db_name] = get_species (cursor)    

    for species in all_species:

        if not species=='homo_sapiens': continue

        print
        print species

        switch_to_db (cursor,  ensembl_db_name[species])

        if (species=='homo_sapiens'):
            gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1)
        else:
            gene_ids = get_gene_ids (cursor, biotype='protein_coding')

        tot_exons   = 0
        no_exon_seq = 0
        short_dna   = 0
        pepseq_ok   = 0
        mismatch    = 0
        stored_incorrect = 0
        translation_fail = 0
        #####################################
        #for gene_id in [10092907]:
        for gene_id in gene_ids:
        #for tot in range(1000):
            #gene_id = choice(gene_ids)

            # get _all_ exons
            exons = gene2exon_list(cursor, gene_id)
            if (not exons):
                print 'no exons for gene', gene_id
                sys.exit(1)

            for exon in exons:

                #####################################                
                if not exon.is_coding:
                    print exon.exon_id, " not coding "
                    continue
                if exon.covering_exon >0:
                    print exon.exon_id, " is covered by ", exon.covering_exon 
                    continue
                    

                tot_exons += 1
                # exons seqs are its aa translation, left_flank, right_flank, and dna_seq
                exon_seqs = get_exon_seqs(cursor, exon.exon_id, exon.is_known)
                if (not exon_seqs):
                    no_exon_seq += 1
                    print "no exon seqs for  ", gene_id, exon.exon_id
                    #exit(1)
                    continue                   

                [exon_seq_id, pepseq, pepseq_transl_start, 
                 pepseq_transl_end, left_flank, right_flank, dna_seq] = exon_seqs

                if len(dna_seq)<3:
                    short_dna += 1
                    print "short_dna:", dna_seq
                    continue

                if (pepseq_transl_start == -10): # ??? what is this shit? adn what happens downstream if the pepseq_transl_start is None?
                    translation_fail += 1
                    print "pepseq_transl_start:", pepseq_transl_start
                    continue

                mitochondrial        = is_mitochondrial(cursor, gene_id)
                dnaseq  = Seq (dna_seq[pepseq_transl_start:pepseq_transl_end], generic_dna)
                if (mitochondrial):
                    pepseq2 = dnaseq.translate(table="Vertebrate Mitochondrial").tostring()
                else:
                    pepseq2 = dnaseq.translate().tostring()

                if True:
                    print exon.exon_id
                    print "pep stored:", pepseq
                    print "dna transl:", pepseq2
                    print "dna begin:", dna_seq[:12]
                    print "start:" , pepseq_transl_start, 
                    print "end:",  pepseq_transl_end
                    print

                if (not pepseq == pepseq2):
                    stored_incorrect += 1
                else:
                    pepseq_ok += 1

        print "total coding exons ", tot_exons
        print "no exon seq info   ", no_exon_seq
        print "short dna          ", short_dna
        print "transl failure     ", translation_fail
        print "stored pepseq does not correspond to the translation of stored dna:   ", stored_incorrect
        print "pepseq ok          ", pepseq_ok

    cursor.close()
    db    .close()
def main():
    """Store the taxonomy groups and their representative species.

    The species tree is read from exolocator_meta.taxonomy, or built and
    stored there when absent.  For each taxonomic group listed in
    trivial_name, the transcript and gene counts of all its species are
    collected; the species with the most transcripts becomes the group's
    representative, and the group is written to the taxonomy_groups table.
    """
    db = connect_to_mysql(Config.mysql_conf_file)
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)

    # do we have the tree already? if not, build it and store it
    stored_tree = error_intolerant_search(
        cursor,
        "select value from exolocator_meta.taxonomy where name = 'species_tree'"
    )
    if stored_tree:
        print("reading the species tree ...")
        tree = Tree(stored_tree[0][0])
        print("                     tree done.")
    else:
        print("building the species tree ...")
        tree = species_tree(cursor, all_species)
        print("                     tree done.")
        print("storing")
        qry = f"insert into  exolocator_meta.taxonomy  (name,value) values ('species_tree','{tree.nhx_string()}') "
        error_intolerant_search(cursor, qry)

    # tree node name -> trivial name of the group
    trivial_name = {
        'Archosauria': "birds_and_crocs",
        'Testudines': "turtles",
        'Lepidosauria': "lizards_and_snakes",
        'Eutheria': "mammals",
        'Marsupialia': "marsupials",
        'ornithorhynchus_anatinus': "platypus",
        'Anura': "frogs",
        'latimeria_chalumnae': "coelacanth",
        'Euteleosteomorpha': "euteleosts",
        'Otomorpha': "otomorpha",
        'Osteoglossiformes': "osteoglossiforms",  # ray-finned fish
        'lepisosteus_oculatus': "spotted_gar",
        'erpetoichthys_calabaricus': "snakefish",  # more ray-finned fish
        # Australian ghostshark or elephant shark
        'callorhinchus_milii': "elephant_shark",
        'Cyclostomata': "nightmare stuff"
    }

    switch_to_db(cursor, "exolocator_meta")
    for tax_group, group_nickname in trivial_name.items():
        node = tree.get_node(tax_group)
        print()
        print(tax_group)

        if node.is_leaf:
            group_species = [node.name]
        else:
            group_species = node.subtree_leafs()

        number_of_transcripts = {}
        number_of_genes = {}
        for species in group_species:
            ensembl_db = ensembl_db_name[species]
            for counted_table, counts in (("transcript", number_of_transcripts),
                                          ("gene", number_of_genes)):
                qry = f"select count(*) from {ensembl_db}.{counted_table}"
                counts[species] = hard_landing_search(cursor, qry)[0][0]

        # the reported number of transcripts serves as an ad hoc measure of
        # how reliable the genome annotation is: most transcripts first
        ranked_species = sorted(group_species,
                                key=number_of_transcripts.__getitem__,
                                reverse=True)
        report_line = "%50s:      transcripts:  %6d      genes:  %6d"
        for species in ranked_species:
            print(report_line % (species, number_of_transcripts[species],
                                 number_of_genes[species]))

        fixed_fields = {'name': tax_group}
        update_fields = {
            'trivial_name': group_nickname,
            'representative_species': ranked_species[0],
            'members': ",".join(ranked_species[1:])
        }
        print(fixed_fields)
        print(update_fields)
        print()
        store_or_update(cursor, "taxonomy_groups", fixed_fields, update_fields)

    cursor.close()
    db.close()
Example #12
0
def main():

    db = connect_to_mysql()
    cursor = db.cursor()

    [all_species, ensembl_db_name] = get_species(cursor)

    for species in all_species:

        if not species == 'homo_sapiens': continue

        print
        print species

        switch_to_db(cursor, ensembl_db_name[species])

        if (species == 'homo_sapiens'):
            gene_ids = get_gene_ids(cursor,
                                    biotype='protein_coding',
                                    is_known=1)
        else:
            gene_ids = get_gene_ids(cursor, biotype='protein_coding')

        tot_exons = 0
        no_exon_seq = 0
        short_dna = 0
        pepseq_ok = 0
        mismatch = 0
        stored_incorrect = 0
        translation_fail = 0
        #####################################
        #for gene_id in [10092907]:
        for gene_id in gene_ids:
            #for tot in range(1000):
            #gene_id = choice(gene_ids)

            # get _all_ exons
            exons = gene2exon_list(cursor, gene_id)
            if (not exons):
                print 'no exons for gene', gene_id
                sys.exit(1)

            for exon in exons:

                #####################################
                if not exon.is_coding:
                    print exon.exon_id, " not coding "
                    continue
                if exon.covering_exon > 0:
                    print exon.exon_id, " is covered by ", exon.covering_exon
                    continue

                tot_exons += 1
                # exons seqs are its aa translation, left_flank, right_flank, and dna_seq
                exon_seqs = get_exon_seqs(cursor, exon.exon_id, exon.is_known)
                if (not exon_seqs):
                    no_exon_seq += 1
                    print "no exon seqs for  ", gene_id, exon.exon_id
                    #exit(1)
                    continue

                [
                    exon_seq_id, pepseq, pepseq_transl_start,
                    pepseq_transl_end, left_flank, right_flank, dna_seq
                ] = exon_seqs

                if len(dna_seq) < 3:
                    short_dna += 1
                    print "short_dna:", dna_seq
                    continue

                if (
                        pepseq_transl_start == -10
                ):  # ??? what is this shit? adn what happens downstream if the pepseq_transl_start is None?
                    translation_fail += 1
                    print "pepseq_transl_start:", pepseq_transl_start
                    continue

                mitochondrial = is_mitochondrial(cursor, gene_id)
                dnaseq = Seq(dna_seq[pepseq_transl_start:pepseq_transl_end],
                             generic_dna)
                if (mitochondrial):
                    pepseq2 = dnaseq.translate(
                        table="Vertebrate Mitochondrial").tostring()
                else:
                    pepseq2 = dnaseq.translate().tostring()

                if True:
                    print exon.exon_id
                    print "pep stored:", pepseq
                    print "dna transl:", pepseq2
                    print "dna begin:", dna_seq[:12]
                    print "start:", pepseq_transl_start,
                    print "end:", pepseq_transl_end
                    print

                if (not pepseq == pepseq2):
                    stored_incorrect += 1
                else:
                    pepseq_ok += 1

        print "total coding exons ", tot_exons
        print "no exon seq info   ", no_exon_seq
        print "short dna          ", short_dna
        print "transl failure     ", translation_fail
        print "stored pepseq does not correspond to the translation of stored dna:   ", stored_incorrect
        print "pepseq ok          ", pepseq_ok

    cursor.close()
    db.close()