コード例 #1
0
def main():

    
    no_threads = 1
    
    db_name =  "exolocator_db"
    db      = connect_to_mysql(user="******", passwd="tooiram")
    cursor  = db.cursor()
    switch_to_db (cursor, db_name)

    cfg     = ConfigurationReader(user="******", passwd="tooiram", check=False)
    in_path = cfg.get_path('resources')
    if (not os.path.exists(in_path)):
        print in_path, "not found"

    
    ###############
    if not check_table_exists (cursor, db_name, 'name_resolution'):
        make_name_resolution_table (cursor)
   
    ###############
    os.chdir(in_path)
    filenames = glob.glob("*name_resolution.txt")
    
    for infile in filenames:
        store (cursor, in_path, infile)

    ###############
    cursor.close()
    db    .close()
コード例 #2
0
def main ():

    
    db     = connect_to_mysql()
    cfg    = ConfigurationReader()
    cursor = db.cursor()

    [all_species, ensembl_db_name] = get_species (cursor)
    cursor.close()
    db    .close()

    outpath = cfg.get_path('afs_dumps')
    outdir   = "{0}/exon_map".format(outpath)
    if (not os.path.exists(outdir)):
        mkdir_p(outdir)

    outfile  = "{0}/exon_map.sql".format(outdir)
    if os.path.exists('.creds'):
        [user, passwd, host, port] = read_creds()
    else:
        print "creds not found"
        exit(1)
    credentials = " -h {0} -P {1} -u {2}  -p{3}".format(host, port, user, password)
    cmd = "mysqldump {0} {1} exon_map > {2}".format (credentials, ensembl_db_name['homo_sapiens'], outfile)

    print cmd
    ret = commands.getoutput(cmd)
    
    print ret

    return True
コード例 #3
0
def main ():

    
    db_name = "exolocator_db"
    db      = connect_to_mysql(user="******", passwd="tooiram")
    cursor  = db.cursor()
    switch_to_db (cursor, db_name)

    cfg     = ConfigurationReader (user="******", passwd="tooiram", check=False)

    inpath = cfg.get_path('afs_dumps')
    indir   = "%s/exon_map"     % inpath
    infile  = "%s/exon_map.sql" % indir
    if (not os.path.exists(infile)):
        print "not found: ", infile
        sys.exit(1)
    print "reading", infile

    qry = "drop table exon_map"
    rows = search_db(cursor, qry)
    # I could not get this to run, though it runs fine directly from the mysql shell:
    #qry = "source %s" % infile
    #rows = search_db(cursor, qry, verbose=True)
    cursor.close()
    db.close()

    credentials = " -u marioot -ptooiram"
    cmd = "mysql %s  exolocator_db  <  %s" % (credentials, infile)
    print cmd
    ret = commands.getoutput(cmd)
    print ret

 
    return True
コード例 #4
0
def main():

    
    no_threads = 1
    
    db_name =  "exolocator_db"
    db      = connect_to_mysql(user="******", passwd="tooiram")
    cursor  = db.cursor()
    switch_to_db (cursor, db_name)

    cfg      = ConfigurationReader(user="******", passwd="tooiram", check=False)
    # afs is killing me here ...
    in_path  = cfg.get_path('afs_dumps')+"/exons"
    if (not os.path.exists(in_path)):
        print in_path, "not found"


    
    cursor.close()
    db    .close()
    
    ###############
    os.chdir(in_path)
    filenames = glob.glob("*exon_dump.txt")
    
    parallelize (no_threads, load_from_infiles, filenames, in_path)
コード例 #5
0
def main():

    db_name = "exolocator_db"
    db = connect_to_mysql(user="******", passwd="tooiram")
    cursor = db.cursor()
    switch_to_db(cursor, db_name)

    cfg = ConfigurationReader(user="******", passwd="tooiram", check=False)

    inpath = cfg.get_path('afs_dumps')
    indir = "%s/exon_map" % inpath
    infile = "%s/exon_map.sql" % indir
    if (not os.path.exists(infile)):
        print "not found: ", infile
        sys.exit(1)
    print "reading", infile

    qry = "drop table exon_map"
    rows = search_db(cursor, qry)
    # I could not get this to run, though it runs fine directly from the mysql shell:
    #qry = "source %s" % infile
    #rows = search_db(cursor, qry, verbose=True)
    cursor.close()
    db.close()

    credentials = " -u marioot -ptooiram"
    cmd = "mysql %s  exolocator_db  <  %s" % (credentials, infile)
    print cmd
    ret = commands.getoutput(cmd)
    print ret

    return True
コード例 #6
0
def main():

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()

    [all_species, ensembl_db_name] = get_species(cursor)
    cursor.close()
    db.close()

    outpath = cfg.get_path('afs_dumps')
    outdir = "{0}/exon_map".format(outpath)
    if (not os.path.exists(outdir)):
        mkdir_p(outdir)

    outfile = "{0}/exon_map.sql".format(outdir)
    if os.path.exists('.creds'):
        [user, passwd, host, port] = read_creds()
    else:
        print "creds not found"
        exit(1)
    credentials = " -h {0} -P {1} -u {2}  -p{3}".format(
        host, port, user, password)
    cmd = "mysqldump {0} {1} exon_map > {2}".format(
        credentials, ensembl_db_name['homo_sapiens'], outfile)

    print cmd
    ret = commands.getoutput(cmd)

    print ret

    return True
コード例 #7
0
def main():

    db = connect_to_mysql()
    cr = ConfigurationReader()

    cursor = db.cursor()
    fasta_path = cr.get_path('ensembl_fasta')

    [all_species, ensembl_db_name] = get_species (cursor)

    for species in all_species:
    #for species in ['danio_rerio']:
        print species
        dna_path = "{0}/{1}/dna".format(fasta_path, species)
        if (not os.path.exists(dna_path)):
            print "problem:", dna_path, "not found"
            exit(1)

        fasta_files = []
        for r,d,files in os.walk(dna_path):
            for file in files:
                if (not file[-3:] == ".fa"):
                    continue
                fasta_files.append(file)
        
        name2file = {}
        for file in fasta_files:
            print dna_path, file
            cmd = "grep '>' {0}/{1}".format(dna_path, file)
            ret = commands.getoutput(cmd)
            headers = ret.split("\n")
            print "number of headers: ", len(headers)
            for hdr in headers:
                fields = hdr.split(" ")
                name = fields[0].replace (">", "")
                #print name
                if (not name2file.has_key(name)):
                    name2file[name] = []
                name2file[name].append(file)

        qry = "use "+ensembl_db_name[species]
        search_db (cursor, qry)

        for name in name2file.keys():
            file_names = ""
            for file in  name2file[name]:
                if file_names:
                    file_names += " "
                file_names += file
            store_seq_filenames (cursor, name, file_names)
 
    cursor.close()
    db    .close()
コード例 #8
0
def main():

    db_name = "exolocator_db"
    db      = connect_to_mysql(user="******", passwd="tooiram")
    cursor  = db.cursor()
    switch_to_db (cursor, db_name)

    cfg      = ConfigurationReader (user="******", passwd="tooiram", check=False)
    in_path  = cfg.get_path('afs_dumps')
    in_path += "/para_dump"
    if (not os.path.exists(in_path)):
        print in_path, "not found"
        sys.exit(1) # exit on non-existent outdir

    
    ###############
    if 1:
        qry = "drop table paralog"
        search_db (cursor, qry)
        qry = "create table paralog (id int(10) primary key auto_increment) "
        search_db (cursor, qry)
        qry = "alter table paralog  ADD gene_id1 varchar(30) " 
        search_db (cursor, qry)
        qry = "alter table paralog  ADD gene_id2 varchar(30) " 
        search_db (cursor, qry)
        create_index (cursor, db_name,'gene_id_index', 'paralog', ['gene_id1', 'gene_id2'])
        

    ###############
    os.chdir(in_path)
    filenames = glob.glob("*_para_dump.txt")

    ###############
    for infile in filenames:
        print infile
        store(cursor, infile)

    cursor.close()
    db    .close()
コード例 #9
0
def dump_orthos (species_list, db_info):

    
    [local_db, ensembl_db_name] = db_info
    db     = connect_to_mysql()
    cfg    = ConfigurationReader()
    cursor = db.cursor()

     # find db ids adn common names for each species db
    [all_species, ensembl_db_name] = get_species (cursor)

    # in the afa headers use 'trivial' names for the species: cow, dog, pig, ...
    trivial_name   = translate_to_trivial(cursor, all_species)

    out_path = cfg.get_path('afs_dumps')
    outfile  = "{0}/orthologue_dump.txt".format(out_path)
    print outfile
    of       = erropen (outfile,"w")

    species  = 'homo_sapiens'
    switch_to_db (cursor,  ensembl_db_name[species])


    qry = "select * from orthologue"
    rows = search_db (cursor, qry)
    for row in rows:
        [pair_id, human_gene_id, cognate_gene_id, genome_db_id, source] =  row
        species = genome_db_id2species (cursor, genome_db_id)
        switch_to_db (cursor,  ensembl_db_name['homo_sapiens'])
        human_stable_id = gene2stable(cursor, human_gene_id)
        switch_to_db (cursor,  ensembl_db_name[species])
        cognate_stable_id = gene2stable(cursor, cognate_gene_id)
        print  >>of,  orthos_tabstring ([human_stable_id, cognate_stable_id, species, trivial_name[species]])


    of.close()
    
    cursor.close()
    db    .close()
コード例 #10
0
def main():

    db_name = "exolocator_db"
    db = connect_to_mysql(user="******", passwd="tooiram")
    cursor = db.cursor()
    switch_to_db(cursor, db_name)

    cfg = ConfigurationReader(user="******", passwd="tooiram", check=False)
    in_path = cfg.get_path('afs_dumps')
    in_path += "/para_dump"
    if (not os.path.exists(in_path)):
        print in_path, "not found"
        sys.exit(1)  # exit on non-existent outdir

    ###############
    if 1:
        qry = "drop table paralog"
        search_db(cursor, qry)
        qry = "create table paralog (id int(10) primary key auto_increment) "
        search_db(cursor, qry)
        qry = "alter table paralog  ADD gene_id1 varchar(30) "
        search_db(cursor, qry)
        qry = "alter table paralog  ADD gene_id2 varchar(30) "
        search_db(cursor, qry)
        create_index(cursor, db_name, 'gene_id_index', 'paralog',
                     ['gene_id1', 'gene_id2'])

    ###############
    os.chdir(in_path)
    filenames = glob.glob("*_para_dump.txt")

    ###############
    for infile in filenames:
        print infile
        store(cursor, infile)

    cursor.close()
    db.close()
コード例 #11
0
def dump_exons(species_list, db_info):

    [local_db, ensembl_db_name] = db_info
    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()

    out_path = "{0}/exons".format(cfg.get_path('afs_dumps'))
    if not os.path.exists(out_path):
        print out_path, "not found"
        exit(1)  # exit on failed output dir check

    for species in species_list:
        #if (not species=='homo_sapiens'):
        #    continue
        outfile = "{0}/{1}_exon_dump.txt".format(out_path, species)
        of = erropen(outfile, "w")
        if not of: continue
        switch_to_db(cursor, ensembl_db_name[species])

        if (species == 'homo_sapiens'):
            gene_ids = get_gene_ids(cursor,
                                    biotype='protein_coding',
                                    is_known=1,
                                    ref_only=True)
        else:
            gene_ids = get_gene_ids(cursor, biotype='protein_coding')

        source = get_analysis_dict(cursor)

        ct = 0
        for gene_id in gene_ids:
            ct += 1
            if (not ct % 1000):
                print species, ct, len(gene_ids)

            # get _all_ exons
            exons = gene2exon_list(cursor, gene_id)
            if (not exons):
                print 'no exons for ', gene_id
                continue

            for exon in exons:

                if exon.covering_exon > 0: continue
                # exons seqs are its aa translation, left_flank, right_flank, and dna_seq
                exon_seqs = get_exon_seqs(cursor, exon.exon_id, exon.is_known)
                if (not exon_seqs):
                    continue
                # human readable string describing the source of annotation for this exon
                if exon.is_known == 2:
                    analysis = 'sw_sharp'
                elif exon.is_known == 3:
                    analysis = 'usearch'
                else:
                    analysis = source[exon.analysis_id]
                # the first field return by get_exon_seqs is the exon_seq_id, so get rid of it
                gene_stable_id = gene2stable(cursor, gene_id)
                if (exon.is_known == 1):
                    exon_stable_id = exon2stable(cursor, exon.exon_id)
                elif (exon.is_known == 2):
                    exon_stable_id = 'sw_sharp_' + str(exon.exon_id)
                elif (exon.is_known == 3):
                    exon_stable_id = 'usearch_' + str(exon.exon_id)
                else:
                    exon_stable_id = "anon"

                print >> of, exon_tabstring(exon, gene_stable_id,
                                            exon_stable_id, species, analysis,
                                            exon_seqs[1:])

        of.close()
        print species, "done"

    cursor.close()
    db.close()
コード例 #12
0
def dump_exons (species_list, db_info):

    
    [local_db, ensembl_db_name] = db_info
    db     = connect_to_mysql()
    cfg    = ConfigurationReader()
    cursor = db.cursor()

    out_path = "{0}/exons".format(cfg.get_path('afs_dumps'))
    if not os.path.exists(out_path):
        print out_path, "not found"
        exit (1) # exit on failed output dir check

    for species in species_list:
        #if (not species=='homo_sapiens'):
        #    continue
        outfile  = "{0}/{1}_exon_dump.txt".format(out_path, species)
        of       = erropen (outfile,"w")
        if not of:  continue
        switch_to_db (cursor,  ensembl_db_name[species])

        if (species=='homo_sapiens'):
            gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1, ref_only=True)
        else:
            gene_ids = get_gene_ids (cursor, biotype='protein_coding')

        source = get_analysis_dict(cursor)

        ct     = 0
        for gene_id in gene_ids:
            ct += 1
            if (not  ct%1000):
                print species, ct, len(gene_ids)

            # get _all_ exons
            exons = gene2exon_list(cursor, gene_id)
            if (not exons):
                print 'no exons for ', gene_id
                continue

            for exon in exons:

                if exon.covering_exon  > 0: continue
                # exons seqs are its aa translation, left_flank, right_flank, and dna_seq
                exon_seqs = get_exon_seqs(cursor, exon.exon_id, exon.is_known)
                if (not exon_seqs):
                    continue
                # human readable string describing the source of annotation for this exon
                if exon.is_known==2:
                    analysis = 'sw_sharp'
                elif exon.is_known==3:
                    analysis = 'usearch'
                else:
                    analysis = source[exon.analysis_id] 
                # the first field return by get_exon_seqs is the exon_seq_id, so get rid of it
                gene_stable_id = gene2stable(cursor,gene_id)
                if ( exon.is_known == 1):
                    exon_stable_id = exon2stable(cursor,exon.exon_id)
                elif ( exon.is_known == 2):
                    exon_stable_id = 'sw_sharp_'+str(exon.exon_id)
                elif ( exon.is_known == 3):
                    exon_stable_id = 'usearch_'+str(exon.exon_id)
                else:
                    exon_stable_id = "anon"

                print >> of, exon_tabstring (exon, gene_stable_id, exon_stable_id, species, analysis, exon_seqs[1:])


        of.close()
        print species, "done"
    
    cursor.close()
    db    .close()