Example #1
def get_pairwise_connexions(accession_1, accession_2, biodb):
    import manipulate_biosqldb
    import numpy
    import pandas

    server, db = manipulate_biosqldb.load_db(biodb)

    sql1 = 'select seqfeature_id, start, stop from biosqldb.orthology_detail_%s where accession in ("%s","%s") ' % (biodb,
                                                                                                                   accession_1,
                                                                                                                   accession_2)
    seqfeature_id2location = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql1,))

    print seqfeature_id2location.keys()[0:10]

    sql2 = 'select accession, taxon_id from biodatabase t1 inner join bioentry t2 on t1.biodatabase_id=t2.biodatabase_id' \
           ' where t1.name="%s"' % biodb
    print sql2


    accession2taxon_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql2,))

    comp1_sql = 'select locus_1,locus_2,identity from (select * from ' \
                ' comparative_tables.identity_closest_homolog2_%s where taxon_1=%s and taxon_2=%s) A ' \
                ' inner join biosqldb.orthology_detail_%s B on A.locus_1=B.seqfeature_id;' % (biodb,
                                                                                              accession2taxon_id[accession_1],
                                                                                              accession2taxon_id[accession_2],
                                                                                              biodb)


    data = server.adaptor.execute_and_fetchall(comp1_sql,)

    comparison_table = []
    for row in data:
        print row
        try:
            start1 = seqfeature_id2location[int(row[0])][0]
            stop1 = seqfeature_id2location[int(row[0])][1]
            start2 = seqfeature_id2location[int(row[1])][0]
            stop2 = seqfeature_id2location[int(row[1])][1]
            identity = row[2]
            comparison_table.append([start1, stop1, start2, stop2, identity])
        except (KeyError, ValueError):
            # skip rows whose seqfeature_id has no start/stop entry
            pass
    data = numpy.array(comparison_table)
    columns = ['start1', 'end1', 'start2', 'end2', 'identity']
    df = pandas.DataFrame(data, columns=columns)
    return df
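
A minimal usage sketch (hedged: the biodatabase name and the two accessions below are placeholders and must exist in the local BioSQL/chlamdb installation):

df = get_pairwise_connexions("NC_000117", "NC_002620", "chlamydia_04_16")
# one row per closest-homolog pair; columns: start1, end1, start2, end2, identity
print(df.head())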
Example #2
def get_profile_fasta(biodb, taxon_id):
    '''
    Build a phylogenetic profile FASTA from the orthology table:

    - ordered taxa
    - transposed orthology table => each row is a different taxon

    :return: nothing; writes the profiles to "profiles_all.fasta"
    '''

    import manipulate_biosqldb
    import pandas
    import numpy
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio import SeqIO

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'select taxon_id, accession from biodatabase t1 inner join bioentry t2 on t1.biodatabase_id=t2.biodatabase_id' \
          ' where (t1.name="%s" and t2.description not like "%%%%plasmid%%%%")' % biodb

    taxon2accession = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    taxon_id_filter = '`' + '`,`'.join(taxon2accession.keys()) + '`'

    sql = 'select t2.locus_tag,%s from comparative_tables.orthology_%s t1 inner join orthology_detail_%s t2 on t1.orthogroup=t2.orthogroup' \
          ' where t2.taxon_id=%s' % (taxon_id_filter, biodb, biodb, taxon_id)
    sql3 = 'show columns from comparative_tables.orthology_%s' % (biodb)

    data = numpy.array(
        [list(i) for i in server.adaptor.execute_and_fetchall(sql, )])

    all_cols = [i[0] for i in server.adaptor.execute_and_fetchall(sql3, )]

    count_df = pandas.DataFrame(data, columns=all_cols)

    count_df = count_df.set_index(['orthogroup'])
    count_df = count_df.apply(pandas.to_numeric, errors='coerce')
    count_df[(count_df > 1)] = 1
    #print count_df
    transposed_table = count_df.transpose()
    #print transposed_table
    #transposed_table.columns = []
    all_records = []
    for taxon, row in transposed_table.iterrows():
        #print taxon, row
        profile_dat = [str(i) for i in row]
        profile = ''.join(profile_dat)
        simple_seq = Seq(profile)
        simple_seq_r = SeqRecord(simple_seq)
        simple_seq_r.id = taxon2accession[taxon]
        simple_seq_r.description = ""
        all_records.append(simple_seq_r)
    with open("profiles_all.fasta", 'w') as tt:
        SeqIO.write(all_records, tt, 'fasta')
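
A hypothetical call (the taxon_id below is illustrative and must exist in the biodatabase). The function writes its result to "profiles_all.fasta" instead of returning it:

get_profile_fasta("chlamydia_04_16", 67)
# profiles_all.fasta now contains one record per genome accession,
# each a 0/1 string encoding the presence/absence of every orthogroup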
def get_module_count_all_db(biodb, category=False):
    '''

    :param biodb: <biodatabase name>
    :param category: KEGG module category (optional)
    :return: for each module, the total KO count in KEGG and the number of those KOs present in <biodb>
    '''

    import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)

    sql_biodb_id = 'select biodatabase_id from biodatabase where name="%s"' % biodb

    database_id = server.adaptor.execute_and_fetchall(sql_biodb_id, )[0][0]

    if category:

        sql_pathway_count = 'select BB.module_name,count_all,count_db,count_db/count_all from (select module_id, count(*) ' \
                            ' as count_db from (select distinct ko_id from enzyme.locus2ko_%s) as t1' \
                            ' inner join enzyme.module2ko as t2 on t1.ko_id=t2.ko_id group by module_id) AA ' \
                            ' right join (select t1.module_id,module_name, count_all from (select module_id, count(*) as count_all ' \
                            'from enzyme.module2ko group by module_id) t1 inner join enzyme.kegg_module as t2 ' \
                            'on t1.module_id=t2.module_id where module_sub_cat="%s")BB on AA.module_id=BB.module_id;' % (biodb, category) # where pathway_category!="1.0 Global and overview maps"
    else:
        # select distinct KO
        # join with module
        sql_pathway_count = 'select BB.module_name,count_all,count_db,count_db/count_all from (select module_id, count(*) ' \
                            ' as count_db from (select distinct ko_id from enzyme.locus2ko_%s) as t1' \
                            ' inner join enzyme.module2ko as t2 on t1.ko_id=t2.ko_id group by module_id) AA ' \
                            ' right join (select t1.module_id,module_name, count_all from (select module_id, count(*) as count_all ' \
                            'from enzyme.module2ko group by module_id) t1 inner join enzyme.kegg_module as t2 ' \
                            'on t1.module_id=t2.module_id)BB on AA.module_id=BB.module_id;' % (biodb) # where pathway_category!="1.0 Global and overview maps"

    map2count = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql_pathway_count, ))
    return map2count
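
A hedged example of consuming the returned dictionary (the category string is a placeholder that must match a module_sub_cat value in enzyme.kegg_module; the exact value layout depends on manipulate_biosqldb.to_dict):

module2counts = get_module_count_all_db("chlamydia_04_16", category="Central carbohydrate metabolism")
for module_name in module2counts:
    # expected values: (count_all, count_db, count_db/count_all); count_db may be
    # NULL/None for modules without any KO detected in the biodatabase (right join)
    print('%s\t%s' % (module_name, module2counts[module_name]))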
def plot_tree(ete3_tree,
              orthogroup,
              biodb,
              mysql_host="localhost",
              mysql_user="******",
              mysql_pwd="baba",
              mysql_db="blastnr"):

    import MySQLdb
    import manipulate_biosqldb
    from ete3 import Tree, TreeStyle, faces, AttrFace

    conn = MySQLdb.connect(
        host=mysql_host,  # your host, usually localhost
        user=mysql_user,  # your username
        passwd=mysql_pwd,  # your password
        db=mysql_db)  # name of the data base

    cursor = conn.cursor()

    locus_list = [lf.name for lf in ete3_tree.iter_leaves()]

    filter = '"' + '","'.join(locus_list) + '"'

    print('get uniprot taxonomy')
    # NB: this first sql1 is immediately overwritten by the orthogroup-based query below
    sql1 = 'select subject_accession,subject_scientific_name,t2.phylum from blast_swissprot_%s t1 ' \
          ' inner join blastnr_taxonomy as t2 on t1.subject_taxid=t2.taxon_id where subject_accession in (%s);' % (biodb,
                                                                                                                   filter)
    sql1 = 'select subject_accession,subject_scientific_name,t4.phylum from biosqldb.orthology_detail_%s t1 ' \
              ' inner join custom_tables.locus2seqfeature_id_%s t2 ' \
              ' on t1.locus_tag=t2.locus_tag ' \
              ' inner join blast_swissprot_%s t3 on t2.seqfeature_id=t3.seqfeature_id ' \
              ' inner join blastnr_taxonomy as t4 on t3.subject_taxid=t4.taxon_id ' \
              ' where t1.orthogroup="%s"' % (biodb,
                                            biodb,
                                            biodb,
                                            orthogroup)
    print('get refseq taxonomy')
    cursor.execute(sql1, )
    accession2name_and_phylum = manipulate_biosqldb.to_dict(cursor.fetchall())
    sql2 = 'select subject_accession,subject_scientific_name,t4.phylum from biosqldb.orthology_detail_%s t1 ' \
              ' inner join custom_tables.locus2seqfeature_id_%s t2 ' \
              ' on t1.locus_tag=t2.locus_tag ' \
              ' inner join blastnr_%s t3 on t2.seqfeature_id=t3.seqfeature_id ' \
              ' inner join blastnr_taxonomy as t4 on t3.subject_taxid=t4.taxon_id ' \
              ' where t1.orthogroup="%s"' % (biodb,
                                            biodb,
                                            biodb,
                                            orthogroup)

    print(sql2)
    cursor.execute(sql2, )
    accession2name_and_phylum.update(
        manipulate_biosqldb.to_dict(cursor.fetchall()))

    print('plotting tree')
    phylum_list = list(
        set([
            accession2name_and_phylum[i][1]
            for i in accession2name_and_phylum.keys()
        ]))

    sql = 'select locus_tag, organism from biosqldb.orthology_detail_%s' % biodb
    cursor.execute(sql, )
    locus2organism = manipulate_biosqldb.to_dict(cursor.fetchall())

    # get_spaced_colors is assumed to be defined elsewhere in this module
    phylum2col = dict(zip(phylum_list, get_spaced_colors(len(phylum_list))))

    R = ete3_tree.get_midpoint_outgroup()
    # and set it as tree outgroup
    ete3_tree.set_outgroup(R)

    for lf in ete3_tree.iter_leaves():

        try:
            col = phylum2col[accession2name_and_phylum[lf.name][1]]
            lf.name = '%s|%s-%s' % (lf.name,
                                    accession2name_and_phylum[lf.name][0],
                                    accession2name_and_phylum[lf.name][1])

            ff = AttrFace("name", fsize=12)
            #ff.background.color = 'red'
            ff.fgcolor = col

            lf.add_face(ff, column=0)

            #nameFace = AttrFace(lf.name, fsize=30, fgcolor=phylum2col[accession2name_and_phylum[lf.name][1]])
            #faces.add_face_to_node(nameFace, lf, 0, position="branch-right")
            #
            #nameFace.border.width = 1
        except KeyError:
            col = 'red'
            try:
                lf.name = '%s| %s' % (lf.name, locus2organism[lf.name])
            except KeyError:
                lf.name = '%s| ??' % (lf.name)
            ff = AttrFace("name", fsize=12)
            #ff.background.color = 'red'
            ff.fgcolor = col

            lf.add_face(ff, column=0)
    ts = TreeStyle()
    ts.show_leaf_name = False
    ts.show_branch_support = True
    return ete3_tree, ts
            outname="%s_swiss_homologs.faa" % grp,
            swissprot=True,
            refseq=True)

        if alignment:
            t = aafasta2phylogeny("%s_swiss_homologs.faa" % grp)

            tree, ts = plot_tree(t, grp, "chlamydia_04_16", mysql_pwd=sqlpsw)
            out_name = "%s.svg" % grp
            tree.render(out_name, tree_style=ts)
    else:
        server, db = manipulate_biosqldb.load_db(args.biodb)
        sql = 'select orthogroup, count(*) as n from orthology_detail_%s group by orthogroup' % args.biodb

        print('getting orthogroup2n_hits refseq')
        orthgroup2orthogroup_size = manipulate_biosqldb.to_dict(
            server.adaptor.execute_and_fetchall(sql, ))
        filter = '"' + '","'.join(exclude) + '"'
        sql2 = 'select orthogroup, count(*) from ' \
               ' (select locus_tag, count(*) as n from custom_tables.locus2seqfeature_id_%s t1 ' \
               ' inner join blastnr.blastnr_%s as t2 on t1.seqfeature_id=t2.seqfeature_id ' \
               ' inner join blastnr.blastnr_taxonomy t3 on t2.subject_taxid=t3.taxon_id ' \
               ' where t3.phylum not in (%s) group by t1.seqfeature_id) A ' \
               ' inner join biosqldb.orthology_detail_%s B on A.locus_tag=B.locus_tag ' \
               ' group by orthogroup;' % (args.biodb,
                                          args.biodb,
                                          filter,
                                          args.biodb)

        group2n_blast_refseq = manipulate_biosqldb.to_dict(
            server.adaptor.execute_and_fetchall(sql2, ))
        print('getting orthogroup2n_hits swissprot')
Example #6
def get_set_data(biodb,
                 set_list_restrict=[],
                 frequency=False,
                 six_frame_translation=False,
                 return_lists=False,
                 score_cutoff=0):
    '''

    :param biodb:
    :param set_list_restrict: restrict the analysis to specific sets (an empty list means all sets)
    :param frequency: return the ratio n genes identified / n genes in the set (as a percentage)
    :param six_frame_translation: get data from the six-frame translation analysis
    :param return_lists: return the taxon2list dictionary instead of set2taxon2count
    :param score_cutoff: only count HMM hits with a bitscore above this cutoff
    :return: (set2taxon2count, set_list) or, if return_lists, (taxon2list, set_list)
    '''
    import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'select biodatabase_id from biodatabase where name="%s"' % biodb

    db_id = server.adaptor.execute_and_fetchall(sql, )[0][0]

    if six_frame_translation:
        hmm_table = 'hmm_hits_six_frame_genome'
    else:
        hmm_table = 'hmm_hits_annotated_genome'

    sql = 'select taxon_id,set_id, count(*) from ' \
          ' (select t1.*,t2.set_id from hmm.%s_%s t1 ' \
          ' inner join hmm.hmm_sets_entry t2 on t1.hmm_id=t2.hmm_id where t1.bitscore>%s' \
          ' group by taxon_id,set_id,t1.hmm_id) A group by taxon_id,set_id;' % (hmm_table, biodb, score_cutoff)

    data = server.adaptor.execute_and_fetchall(sql, )

    if frequency:
        sql = 'select taxon_id,count(*) as n from COG.locus_tag2gi_hit_%s t1 ' \
              ' inner join COG.cog_names_2014 t2 on t1.COG_id=t2.COG_id ' \
              ' inner join biosqldb.bioentry as t3 on t1.accession=t3.accession ' \
              ' where biodatabase_id=%s group by taxon_id;' % (biodb, db_id)
        taxon_id2count = manipulate_biosqldb.to_dict(
            server.adaptor.execute_and_fetchall(sql, ))

    sql = 'select * from hmm.hmm_sets'
    set_id2description = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    set2taxon2count = {}
    taxon2list = {}
    set_list = []
    for row in data:
        if row[0] not in taxon2list:
            taxon2list[row[0]] = [row[1]]
        else:
            taxon2list[row[0]].append(row[1])

        set_name = set_id2description[str(row[1])]
        if set_name not in set_list:
            set_list.append(set_name)
        if len(set_list_restrict) > 0:
            if set_name not in set_list_restrict:
                continue
        if set_name not in set2taxon2count:
            set2taxon2count[set_name] = {}

        if frequency:
            freq = round(
                (float(row[2]) / float(taxon_id2count[str(row[0])])) * 100,
                2)
            set2taxon2count[set_name][str(row[0])] = freq
        else:
            set2taxon2count[set_name][str(row[0])] = int(row[2])

    if not return_lists:
        return set2taxon2count, set_list
    else:
        return taxon2list, set_list
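
Illustrative usage (the biodatabase name and bitscore cutoff are placeholders; set names depend on the contents of hmm.hmm_sets):

set2taxon2count, set_list = get_set_data("chlamydia_04_16", score_cutoff=30)
for one_set in set_list:
    # taxon_id -> number of distinct HMMs of the set detected in that genome
    print('%s\t%s' % (one_set, set2taxon2count.get(one_set, {})))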
Example #7
    import argparse
    import manipulate_biosqldb
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", '--input', type=str, help="input genbank")
    parser.add_argument("-l", '--locus', type=str, help="locus_tag_prefix")
    args = parser.parse_args()

    target_aa = ['U', 'C', 'u', 'c']

    server, db = manipulate_biosqldb.load_db('chlamydia_03_15')

    sql = 'select locus_tag, SP, TM from orthology_detail_chlamydia_03_15'

    sql2 = 'select protein_id, locus_tag from orthology_detail_chlamydia_03_15'

    protein_id2locus_tag = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql2, ))
    #print protein_id2locus_tag
    data = server.adaptor.execute_and_fetchall(sql, )
    locus_tag2SP_TM = {}
    for i in data:
        locus_tag2SP_TM[i[0]] = [i[1], i[2]]

    from Bio import SeqIO
    handle = open(args.input, "rU")
    print 'protein_id\tlocus\ttransmembrane_domains\tsignal_peptide\tcysteine(%)\tn_C_U\tprotein_length\tdescription'
    for record in SeqIO.parse(handle, "fasta"):
        target_n = 0
        protein_length = len(record.seq)
        for aa in record.seq:
            if aa in target_aa:
                target_n += 1
Example #8
def get_whole_db_uniprot_crossref(biodb):

    # get gi from all database locus

    import MySQLdb
    from datetime import datetime
    import httplib
    import time
    import manipulate_biosqldb
    import re
    import urllib2
    import os
    #sqlpsw = os.environ['SQLPSW']
    from tempfile import NamedTemporaryFile
    conn = MySQLdb.connect(host="localhost", # your host, usually localhost
                                user="******", # your username
                                passwd="estrella3", # your password
                                db="custom_tables") # name of the data base

    cursor = conn.cursor()

    sql1 = 'CREATE TABLE IF NOT EXISTS uniprot_id2seqfeature_id_%s (seqfeature_id INT UNIQUE, uniprot_id INT AUTO_INCREMENT,' \
           ' uniprot_accession varchar(400), uniprot_status varchar(400), annotation_score INT, insert_date varchar(300), INDEX uniprot_id(uniprot_id))' % biodb

    sql2 = 'CREATE TABLE IF NOT EXISTS db_xref (db_xref_id INT AUTO_INCREMENT, db_xref_name varchar(200) UNIQUE, INDEX db_xref_id(db_xref_id))'

    sql3 = 'CREATE TABLE IF NOT EXISTS uniprot_db_xref_%s (uniprot_id INT, db_xref_id INT, db_accession varchar(200), ' \
           ' INDEX db_xref_id(db_xref_id), index uniprot_id(uniprot_id))' % biodb

    sql4 = 'CREATE TABLE IF NOT EXISTS uniprot_go_terms_%s (seqfeature_id INT, go_term_id varchar(400), go_description TEXT, ' \
           ' INDEX seqfeature_id(seqfeature_id))' % biodb

    sql5 = 'CREATE TABLE IF NOT EXISTS uniprot_annotation_%s (seqfeature_id INT, comment_function TEXT,' \
           ' ec_number TEXT,comment_similarity TEXT,comment_catalyticactivity TEXT,comment_pathway TEXT,keywords TEXT,' \
           ' comment_subunit TEXT, gene TEXT, recommendedName_fullName TEXT, proteinExistence TEXT, ' \
           ' developmentalstage TEXT, index seqfeature_id(seqfeature_id))' % biodb

    print sql1
    cursor.execute(sql1, )
    cursor.execute(sql2, )
    cursor.execute(sql3, )
    cursor.execute(sql4, )
    cursor.execute(sql5, )
    conn.commit()



    sql1 = 'select locus_tag, seqfeature_id from locus2seqfeature_id_%s' % biodb
    sql2 = 'select locus_tag, old_locus_tag from biosqldb.locus_tag2old_locus_tag'

    # attention EDIT!!!!!!!!!!!!!!
    sql3 = 'select locus_tag, protein_id from biosqldb.orthology_detail_%s where protein_id not like "%%%%CHUV%%%%"' % biodb


    sql4 = 'select locus_tag,t2.seqfeature_id from locus2seqfeature_id_%s t1 inner join uniprot_annotation_%s t2' \
           ' on t1.seqfeature_id=t2.seqfeature_id group by locus_tag;' % (biodb, biodb)
    sql5 = 'select locus_tag, organism from biosqldb.orthology_detail_%s' % biodb
    sql6 = 'select locus_tag, translation from biosqldb.orthology_detail_%s' % biodb

    sql7 = 'select locus_tag, accession from biosqldb.orthology_detail_%s' % biodb


    cursor.execute(sql1, )
    locus2seqfeature_id = manipulate_biosqldb.to_dict(cursor.fetchall())

    cursor.execute(sql2, )
    locus2old_locus = manipulate_biosqldb.to_dict(cursor.fetchall())

    cursor.execute(sql3, )
    locus2protein_id = manipulate_biosqldb.to_dict(cursor.fetchall())

    cursor.execute(sql4, )
    locus2uniprot_id = manipulate_biosqldb.to_dict(cursor.fetchall())

    cursor.execute(sql5, )
    locus2organism = manipulate_biosqldb.to_dict(cursor.fetchall())

    cursor.execute(sql6, )
    locus2sequence = manipulate_biosqldb.to_dict(cursor.fetchall())

    cursor.execute(sql7, )
    locus2genome_accession = manipulate_biosqldb.to_dict(cursor.fetchall())

    for i, locus in enumerate(locus2protein_id):
        print "%s -- %s : %s / %s" % (locus, locus2protein_id[locus],i, len(locus2protein_id))

        # already into database
        if locus in locus2uniprot_id:
            continue
        genome_accession = locus2genome_accession[locus]

        uniprot_id = ncbi_accession2uniprotid(locus2protein_id[locus], genome_accession=genome_accession)

        if not uniprot_id:
            uniprot_id = ncbi_accession2uniprotid(locus2protein_id[locus])

        if not uniprot_id:
            try:
                old_locus = locus2old_locus[locus]
            except KeyError:
                old_locus = False
            if old_locus:
                genus = locus2organism[locus].split(' ')[0]
                print 'trying with old_locus_tag'
                uniprot_id = ncbi_accession2uniprotid(old_locus, gene=True, organism=genus)
            if not uniprot_id:
                print 'trying to match with sequence: %s' % locus
                try:
                    uniprot_id = sequence2uniprot_id(locus2sequence[locus])
                    print 'ok: %s' % uniprot_id
                except:
                    continue
        if uniprot_id:
            # insert uniprot_id into mysql table
            # 1. get seqfeatureid of the corresponding locus
            seqid = locus2seqfeature_id[locus]
            try:
                uniprot_score, uniprot_status, go_data = uniprot_accession2go_and_status(uniprot_id)
            except:
                print 'failed, continue'
                continue
            # add go data
            if go_data:
                for one_go in go_data:
                    sql = 'insert into uniprot_go_terms_%s (seqfeature_id, go_term_id, go_description) ' \
                          'values(%s, "%s", "%s")' % (biodb,
                                                      seqid,
                                                      one_go,
                                                      go_data[one_go])
                    cursor.execute(sql, )
                    conn.commit()

            # insert uniprot_id
            now = datetime.now()
            str_date = "%s-%s-%s" % (now.year, now.month, now.day)

            sql = 'insert into uniprot_id2seqfeature_id_%s (seqfeature_id,uniprot_accession, uniprot_status, annotation_score,insert_date) ' \
                  ' values (%s, "%s", "%s", %s,"%s")' % (biodb,
                                                       seqid,
                                                       uniprot_id,
                                                       uniprot_status,
                                                       uniprot_score,
                                                       str_date)
            try:
                cursor.execute(sql, )
                conn.commit()
            # if seqfeature id already already inserted, no need to insert it again
            except conn.IntegrityError:
                print '%s already into uniprot_id2seqfeature_id_%s' % (seqid, biodb)
                pass
            sqlid = 'select t1.uniprot_id from uniprot_id2seqfeature_id_%s as t1 where t1.seqfeature_id=%s' % (biodb,
                                                                                                  locus2seqfeature_id[locus])
            #print sqlid
            cursor.execute(sqlid, )
            uniprot_db_id = cursor.fetchall()[0][0]
            #print 'uniprotdb id', uniprot_db_id

            uniprot_record = uniprot_id2record(uniprot_id)

            if not uniprot_record:
                import time
                time.sleep(5)
                uniprot_record = uniprot_id2record(uniprot_id)

            alldbref = uniprot_record2db_refs(uniprot_record)

            annotation = uniprot_record2annotations(uniprot_record)

            # add annotation
            sql = 'insert into uniprot_annotation_%s (seqfeature_id, comment_function,' \
                  ' ec_number,comment_similarity,comment_catalyticactivity,comment_pathway,keywords,' \
                  ' comment_subunit, gene, recommendedName_fullName, proteinExistence,developmentalstage) values' \
                  '  (%s, "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")' % (biodb,
                                                                                                seqid,
                                                                                                re.sub('"','',annotation["comment_function"]),
                                                                                                re.sub('"','',annotation["ec_number"]),
                                                                                                re.sub('"','',annotation["comment_similarity"]),
                                                                                                re.sub('"','',annotation["comment_catalyticactivity"]),
                                                                                                re.sub('"','',annotation["comment_pathway"]),
                                                                                                re.sub('"','',annotation["keywords"]),
                                                                                                re.sub('"','',annotation["comment_subunit"]),
                                                                                                re.sub('"','',annotation["gene"]),
                                                                                                re.sub('"','',annotation["recommendedName_fullName"]),
                                                                                                re.sub('"','',annotation["proteinExistence"]),
                                                                                                re.sub('"','',annotation["developmentalstage"]))
            cursor.execute(sql, )
            conn.commit()

            # add dbxrefs
            if alldbref:
                for database in alldbref:
                    # 1. check if cross ref database already in the database list
                    sql1 = 'select db_xref_id from db_xref where db_xref_name="%s"' % database
                    try:
                        cursor.execute(sql1, )
                        database_index = cursor.fetchall()[0][0]

                    except:
                        # insert new database name
                        sql2 = 'insert into db_xref (db_xref_name) values ("%s")' % database
                        cursor.execute(sql2, )
                        conn.commit()
                        cursor.execute(sql1, )
                        database_index = cursor.fetchall()[0][0]


                    for crossref in alldbref[database]:
                        # insert cross reference into database
                        sql3 = 'insert into uniprot_db_xref_%s (uniprot_id, db_xref_id, db_accession) values (%s, %s, "%s")' % (biodb,
                                                                                                                        uniprot_db_id,
                                                                                                                        database_index,
                                                                                                                        crossref)
                        #print sql3
                        cursor.execute(sql3, )
                        conn.commit()
            else:
                print 'failure ----------------'
                print 'failure ----------------'
        else:
            print 'UNIPROT ID NOT FOUND'
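
Note that this function relies on helpers defined elsewhere in the same module (ncbi_accession2uniprotid, sequence2uniprot_id, uniprot_accession2go_and_status, uniprot_id2record, uniprot_record2db_refs, uniprot_record2annotations) and on an existing custom_tables.locus2seqfeature_id_<biodb> table. A minimal, hypothetical invocation:

get_whole_db_uniprot_crossref("chlamydia_04_16")
# populates uniprot_id2seqfeature_id_<biodb>, uniprot_db_xref_<biodb>,
# uniprot_go_terms_<biodb> and uniprot_annotation_<biodb> in the custom_tables database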
Example #9
def find_clusters_of_locus(db_name, identity_cutoff, distance_cutoff=20000):

    '''
    WARNING: all paralogs are taken into account.
    If there is 1 protein in genome A and 3 in genome B,
    protein 1A will be compared to its best hit in B,
    and proteins 1B, 2B and 3B will each be compared to 1A.

    In any case this approach is redundant, because A vs B and B vs A are always both compared...

    :param db_name: biodatabase name
    :param identity_cutoff: average ortholog identity cutoff: if genomes are too closely related, do not identify clusters (too many clusters)
    :param distance_cutoff: size of the considered window (bp)
    :return: (locus2linked_locus, locus2linked_taxons)
    '''

    import manipulate_biosqldb
    import mysqldb_plot_genomic_feature
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    import pylibmc

    # for memory storage of all biorecords
    mc = pylibmc.Client(["127.0.0.1"], binary=True,
                    behaviors={"tcp_nodelay": True,
                               "ketama": True})

    server, db = manipulate_biosqldb.load_db(db_name)

    sql_locus = 'select locus_tag from orthology_detail_%s' % db_name
    all_locus_list = [i[0] for i in server.adaptor.execute_and_fetchall(sql_locus,)]

    sql = 'select locus_tag, orthogroup from orthology_detail_%s' % db_name
    locus2orthogroup = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,))

    sql = 'select locus_tag, start, stop from orthology_detail_%s' % db_name

    locus2start_end = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,))
    orthogroup2locus_list = {}
    for locus in locus2orthogroup:
        if locus2orthogroup[locus] not in orthogroup2locus_list:
            orthogroup2locus_list[locus2orthogroup[locus]] = [locus]
        else:
            orthogroup2locus_list[locus2orthogroup[locus]].append(locus)

    sql_identity = 'select taxon_1, taxon_2, median_identity from comparative_tables.shared_orthogroups_average_identity_%s' % db_name
    taxon2taxon_median_id = {}

    for row in server.adaptor.execute_and_fetchall(sql_identity,):
        if row[0] not in taxon2taxon_median_id:
            taxon2taxon_median_id[row[0]] = {}
            taxon2taxon_median_id[row[0]][row[1]] = row[2]
        else:
            taxon2taxon_median_id[row[0]][row[1]] = row[2]
    sql = 'select locus_tag, accession from orthology_detail_%s' % db_name
    locus2accession = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,))
    accession_list = set(locus2accession.values())
    accession2record = {}

    locus2closest_locus_list = {}
    sql = 'select locus_1,locus_2 from comparative_tables.identity_closest_homolog_%s' % db_name
    data = server.adaptor.execute_and_fetchall(sql,)
    for i in data:
        if i[0] not in locus2closest_locus_list:
            locus2closest_locus_list[i[0]] = [i[1]]
        else:
            locus2closest_locus_list[i[0]].append(i[1])
        #if i[1] not in locus2closest_locus_list:
        #    locus2closest_locus_list[i[1]] = [i[0]]
        #else:
        #    locus2closest_locus_list[i[1]].append(i[0])
    # storage of all records into memory
    for accession in accession_list:
        #print accession
        rec_raw = db.lookup(accession=accession)
        try:
            new_record_reformat = mc[db_name + "_" + accession]
            print accession, 'in memory'
        except KeyError:
            print accession, 'NOT in memory'
            new_record_reformat = SeqRecord(Seq(rec_raw.seq.data, rec_raw.seq.alphabet),
                                                             id=rec_raw.id,
                                                             name=rec_raw.name,
                                                             description=rec_raw.description,
                                                             dbxrefs =rec_raw.dbxrefs,
                                                             features=rec_raw.features,
                                                             annotations=rec_raw.annotations)


            mc[db_name + "_" + accession]= new_record_reformat
        accession2record[accession] = new_record_reformat
    accession2taxon = manipulate_biosqldb.accession2taxon_id(server, db_name)

    # iter all orthogroups
    locus2linked_locus = {}

    locus2linked_taxons = {}
    all_pairs = []

    # iterate all orthogroups
    for t, ref_ortho in enumerate(list(set(locus2orthogroup.values()))):#: #: enumerate(["group_53"])
        print 'group %s / %s' % (t, len(list(set(locus2orthogroup.values()))))

        tmp_dico = {}

        # locus list of the considered group
        locus_list = orthogroup2locus_list[ref_ortho]

        # if there is a single locus, skip
        if len(locus_list) == 1:
            continue

        # iter all locus of the orthogroup
        for x, locus_a in enumerate(locus_list):

            locus2linked_locus[locus_a] = {}
            tmp_dico[locus_a] = {}
            locus2linked_taxons[locus_a] = {}
            # for each locus, initiate the count of comparisons
            comp_count = 0

            # extract region
            start_a = locus2start_end[locus_a][0]
            end_a = locus2start_end[locus_a][1]
            record = accession2record[locus2accession[locus_a]]
            size = distance_cutoff/2

            region_a = mysqldb_plot_genomic_feature.get_feature_neighborhood(start_a, end_a, record, size, 'rec')

            # get list of orthogroups in the neighborhood & get correspondence between locus and groups
            grp_list = []
            grp2locus = {}
            for feature in region_a.features:
                if feature.type == 'CDS' and 'pseudo' not in feature.qualifiers:
                    locus_b = feature.qualifiers['locus_tag'][0]
                    orthogroup_locus = locus2orthogroup[locus_b]
                    if orthogroup_locus not in grp2locus:
                        grp2locus[orthogroup_locus] = [locus_b]
                    else:
                        grp2locus[orthogroup_locus].append(locus_b)
                    if orthogroup_locus not in grp_list:
                        grp_list.append(orthogroup_locus)

            # compare neighbours of all other locus to the reference locus
            try:
                closest_locus = locus2closest_locus_list[locus_a]
            except KeyError:
                continue
            for locus_b in closest_locus:  #locus_list[x+1:len(locus_list)]:

                taxon_a = accession2taxon[locus2accession[locus_a]]
                taxon_b = accession2taxon[locus2accession[locus_b]]

                # if both locus are encoded by the same taxon, skip the comparison
                if taxon_a == taxon_b:
                    continue

                try:
                    identity = taxon2taxon_median_id[taxon_a][taxon_b]
                except KeyError:
                    identity = taxon2taxon_median_id[taxon_b][taxon_a]

                # if the 2 considered genomes are too closely related, skip the comparison
                if identity < identity_cutoff:
                    # increment the number of effective comparisons
                    comp_count+=1
                    start_b = locus2start_end[locus_b][0]
                    # if border of contig/chromosome
                    if start_b < 0:
                        start_b = 0
                    end_b = locus2start_end[locus_b][1]
                    record_b = accession2record[locus2accession[locus_b]]
                    region_b = mysqldb_plot_genomic_feature.get_feature_neighborhood(start_b, end_b, record_b, size, 'rec')

                    # get group list b
                    grp_list_b = []
                    for feature in region_b.features:
                        if feature.type == 'CDS' and 'pseudo' not in feature.qualifiers:
                            locus_c = feature.qualifiers['locus_tag'][0]
                            orthogroup_locus = locus2orthogroup[locus_c]
                            if orthogroup_locus not in grp_list_b:
                                grp_list_b.append(orthogroup_locus)

                    # get list of common groups
                    common = list(set(grp_list).intersection(set(grp_list_b)))
                    # remove ref group
                    try:
                        common.pop(common.index(ref_ortho))
                    except ValueError:
                        with open('problems.txt', 'a') as f:
                            f.write('%s\t%s\t%s\n' % (ref_ortho, locus_a,locus_b))
                    if len(common)>0:
                        # store locus and taxons linked
                        for linked_group in common:
                            # store reciprocal relationship between the 2 genomes and the 2 locus
                            # we can have more than one locus/group (i.e identical genes side by side)
                            for linked_locus in grp2locus[linked_group]:
                                # if reverse comparison was already made
                                #if linked_locus in locus2linked_locus:
                                #    continue
                                if linked_locus not in tmp_dico[locus_a]: #locus2linked_locus
                                    # store n link and n comparisons
                                    #locus2linked_locus[locus_a][linked_locus] = [1, '-']
                                    tmp_dico[locus_a][linked_locus] = [1, '-']
                                    locus2linked_taxons[locus_a][linked_locus] = [[ref_ortho, linked_group, taxon_a,taxon_b]]
                                else:
                                    #locus2linked_locus[locus_a][linked_locus][0] += 1
                                    tmp_dico[locus_a][linked_locus][0] += 1
                                    locus2linked_taxons[locus_a][linked_locus].append([ref_ortho, linked_group, taxon_a,taxon_b])

            # end of loop for locus_a: store the number of comparisons done
            for linked_locus in tmp_dico[locus_a]: # locus2linked_locus
                #locus2linked_locus[locus_a][linked_locus][1] = comp_count
                tmp_dico[locus_a][linked_locus][1] = comp_count
            #if len(locus2linked_locus[locus_a]) == 0:
            #    del locus2linked_locus[locus_a]
            if len(tmp_dico[locus_a]) > 0:
                #print 'insert!'
                #print tmp_dico
                for locus_b in tmp_dico[locus_a]:
                    # only keep links supported by at least 50% of the comparisons
                    if tmp_dico[locus_a][locus_b][0]/float(tmp_dico[locus_a][locus_b][1]) > 0.5:
                        sql = 'insert into interactions.colocalization_table_locus_%s (locus_1, locus_2, n_links, n_comparisons, ratio)' \
                              ' values ("%s","%s",%s,%s,%s)' % (db_name,
                                                                locus_a,
                                                                locus_b,
                                                                tmp_dico[locus_a][locus_b][0],
                                                                tmp_dico[locus_a][locus_b][1],
                                                                tmp_dico[locus_a][locus_b][0]/float(tmp_dico[locus_a][locus_b][1]))
                        server.adaptor.execute(sql,)
                        server.adaptor.commit()

            if len(locus2linked_taxons[locus_a]) == 0:
                del locus2linked_taxons[locus_a]
            #print locus2linked_locus
    return locus2linked_locus, locus2linked_taxons
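
A hypothetical run (the cutoff values are placeholders; the function also inserts rows into interactions.colocalization_table_locus_<db_name>, which must already exist):

locus2linked_locus, locus2linked_taxons = find_clusters_of_locus("chlamydia_04_16", 30, distance_cutoff=20000)
# locus2linked_taxons maps each locus to the loci found in a conserved neighbourhood,
# together with the orthogroups and taxon pairs supporting each link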
Example #10
def find_clusters_of_orthogroups(db_name, identity_cutoff, distance_cutoff=10000):

    '''
    WARNING: all paralogs are taken into account.
    If there is 1 protein in genome A and 3 in genome B,
    protein 1A will be compared to its best hit in B,
    and proteins 1B, 2B and 3B will each be compared to 1A.

    In any case this approach is redundant, because A vs B and B vs A are always both compared...

    :param db_name: biodatabase name
    :param identity_cutoff: average ortholog identity cutoff: if genomes are too closely related, do not identify clusters (too many clusters)
    :param distance_cutoff: size of the considered window (bp)
    :return: (group2linked_groups, group2linked_taxons)
    '''

    import manipulate_biosqldb
    import mysqldb_plot_genomic_feature
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    import pylibmc

    # for memory storage of all biorecords
    mc = pylibmc.Client(["127.0.0.1"], binary=True,
                    behaviors={"tcp_nodelay": True,
                               "ketama": True})

    server, db = manipulate_biosqldb.load_db(db_name)

    sql_locus = 'select locus_tag from orthology_detail_%s' % db_name
    all_locus_list = [i[0] for i in server.adaptor.execute_and_fetchall(sql_locus,)]

    sql = 'select locus_tag, orthogroup from orthology_detail_%s' % db_name
    locus2orthogroup = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,))

    sql = 'select locus_tag, start, stop from orthology_detail_%s' % db_name

    locus2start_end = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,))
    orthogroup2locus_list = {}
    for locus in locus2orthogroup:
        if locus2orthogroup[locus] not in orthogroup2locus_list:
            orthogroup2locus_list[locus2orthogroup[locus]] = [locus]
        else:
            orthogroup2locus_list[locus2orthogroup[locus]].append(locus)

    sql_identity = 'select taxon_1, taxon_2, median_identity from comparative_tables.shared_orthogroups_average_identity_%s' % db_name
    taxon2taxon_median_id = {}

    for row in server.adaptor.execute_and_fetchall(sql_identity,):
        if row[0] not in taxon2taxon_median_id:
            taxon2taxon_median_id[row[0]] = {}
            taxon2taxon_median_id[row[0]][row[1]] = row[2]
        else:
            taxon2taxon_median_id[row[0]][row[1]] = row[2]
    sql = 'select locus_tag, accession from orthology_detail_%s' % db_name
    locus2accession = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,))
    accession_list = set(locus2accession.values())
    accession2record = {}

    locus2closest_locus_list = {}
    sql = 'select locus_1,locus_2 from comparative_tables.identity_closest_homolog_%s' % db_name
    data = server.adaptor.execute_and_fetchall(sql,)
    for i in data:
        if i[0] not in locus2closest_locus_list:
            locus2closest_locus_list[i[0]] = [i[1]]
        else:
            locus2closest_locus_list[i[0]].append(i[1])
        if i[1] not in locus2closest_locus_list:
            locus2closest_locus_list[i[1]] = [i[0]]
        else:
            locus2closest_locus_list[i[1]].append(i[0])
    # storage of all records into memory
    for accession in accession_list:
        #print accession
        rec_raw = db.lookup(accession=accession)
        try:
            new_record_reformat = mc[db_name + "_" + accession]
        except KeyError:
            #print accession, 'not in memory'
            new_record_reformat = SeqRecord(Seq(rec_raw.seq.data, rec_raw.seq.alphabet),
                                                             id=rec_raw.id, name=rec_raw.name,
                                                             description=rec_raw.description,
                                                             dbxrefs =rec_raw.dbxrefs,
                                                             features=rec_raw.features,
                                                             annotations=rec_raw.annotations)


            mc[db_name + "_" + accession]= new_record_reformat
        accession2record[accession] = new_record_reformat
    accession2taxon = manipulate_biosqldb.accession2taxon_id(server, db_name)


    # iter all orthogroups
    group2linked_groups = {}
    group2linked_taxons = {}
    all_pairs = []
    for t, ref_ortho in enumerate(list(set(locus2orthogroup.values()))):#: #: enumerate(["group_53"])
        #print t, len(list(set(locus2orthogroup.values())))
        comp_count = 0
        #reference_grp = locus2orthogroup[locus]
        group2linked_groups[ref_ortho] = {}
        locus_list = orthogroup2locus_list[ref_ortho]

        # if no homologs, skip
        if len(locus_list) == 1:
            continue

        # iter all locus of the orthogroup
        for x, locus_a in enumerate(locus_list):

            # extract region
            start_a = locus2start_end[locus_a][0]
            end_a = locus2start_end[locus_a][1]
            record = accession2record[locus2accession[locus_a]]
            size = distance_cutoff/2

            region_a = mysqldb_plot_genomic_feature.get_feature_neighborhood(start_a, end_a, record, size, 'rec')
            grp_list = []
            for feature in region_a.features:
                if feature.type == 'CDS' and 'pseudo' not in feature.qualifiers:
                    locus_b = feature.qualifiers['locus_tag'][0]
                    orthogroup_locus = locus2orthogroup[locus_b]
                    if orthogroup_locus not in grp_list:
                        grp_list.append(orthogroup_locus)

            # compare neighbours of all other locus to the reference locus
            try:
                closest_locus = locus2closest_locus_list[locus_a]
            except KeyError:
                continue
            for locus_b in locus_list[x+1:len(locus_list)]:

                # only consider "best hit", locus with the highest identity
                # if there is a 1 vs 3 relationship,
                # there will be a single comparison for genome A vs B but 3 for the comparison B vs A...
                # or: if there are several pairs, only compare the closest pair.
                # in any case this will link groups that include multiple paralogs.
                # keep all comparisons in memory and do it only once?
                # case of multiple paralogs side by side
                if locus_b not in closest_locus:
                    continue

                taxon_a = accession2taxon[locus2accession[locus_a]]
                taxon_b = accession2taxon[locus2accession[locus_b]]

                # if both locus are encoded by the same taxon, skip the comparison
                if taxon_a == taxon_b:
                    continue
                try:
                    identity = taxon2taxon_median_id[taxon_a][taxon_b]
                except KeyError:
                    identity = taxon2taxon_median_id[taxon_b][taxon_a]

                # if the 2 considered genomes are too closely related, skip the comparison
                if identity < identity_cutoff:
                    comp_count+=1
                    #print comp_count, "comp_count"
                    start_b = locus2start_end[locus_b][0]
                    if start_b < 0:
                        start_b = 0
                    end_b = locus2start_end[locus_b][1]
                    record_b = accession2record[locus2accession[locus_b]]
                    region_b = mysqldb_plot_genomic_feature.get_feature_neighborhood(start_b, end_b, record_b, size, 'rec')
                    grp_list_b = []
                    for feature in region_b.features:
                        if feature.type == 'CDS' and 'pseudo' not in feature.qualifiers:
                            #print feature
                            locus_c = feature.qualifiers['locus_tag'][0]
                            orthogroup_locus = locus2orthogroup[locus_c]
                            if orthogroup_locus not in grp_list_b:
                                grp_list_b.append(orthogroup_locus)


                    common = list(set(grp_list).intersection(set(grp_list_b)))
                    # remove ref group
                    try:
                        common.pop(common.index(ref_ortho))
                    except ValueError:
                        pass
                    if len(common)>0:
                        # store groups and taxons linked
                        for linked_group in common:
                            # store reciprocal relationship between the 2 genomes and the 2 groups
                            # group a vs group b == group b vs group a
                            try:
                                if [taxon_a, taxon_b] not in group2linked_taxons[ref_ortho][linked_group]:
                                    group2linked_taxons[ref_ortho][linked_group].append([taxon_a, taxon_b])
                            except KeyError:
                                try:
                                    # remove potential redundant pairs due to paralogs
                                    # all paralogs are taken into account
                                    if [taxon_a, taxon_b] not in group2linked_taxons[linked_group][ref_ortho]:
                                        group2linked_taxons[linked_group][ref_ortho].append([taxon_a, taxon_b])
                                except KeyError:
                                    if ref_ortho in group2linked_taxons:
                                        group2linked_taxons[ref_ortho][linked_group] = [[taxon_a, taxon_b]]
                                    elif linked_group in group2linked_taxons:
                                        group2linked_taxons[linked_group][ref_ortho] = [[taxon_a, taxon_b]]
                                    else:
                                        group2linked_taxons[ref_ortho] = {}
                                        group2linked_taxons[ref_ortho][linked_group] = [[taxon_a, taxon_b]]
                            # store counts of links out of the total number of comparisons
                            if linked_group in group2linked_groups[ref_ortho]:
                                group2linked_groups[ref_ortho][linked_group][0] += 1
                            else:
                                group2linked_groups[ref_ortho][linked_group] = [1]
                    else:
                        pass
                        #print 'no common groups'
                    # check if multiple common elements
        for linked_group in group2linked_groups[ref_ortho]:
            group2linked_groups[ref_ortho][linked_group].append(comp_count)
    return group2linked_groups, group2linked_taxons
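
A hedged sketch of filtering the result, mirroring the 50% rule used in the locus-level variant above (the biodatabase name and identity cutoff are placeholders):

group2links, group2taxons = find_clusters_of_orthogroups("chlamydia_04_16", 30)
for group in group2links:
    for linked_group in group2links[group]:
        n_links, n_comparisons = group2links[group][linked_group]
        if n_comparisons > 0 and n_links / float(n_comparisons) > 0.5:
            print('%s\t%s\t%s/%s' % (group, linked_group, n_links, n_comparisons))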
Example #11
def edit_svg_map(map_path, keep_ko_list, biodb_name, map_name, taxon_id=False):

    import manipulate_biosqldb
    import re

    server, db = manipulate_biosqldb.load_db(biodb_name)

    sql = 'select description,pathway_name from enzyme.kegg_pathway;'

    description2map = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    from xml.etree import ElementTree
    tree = ElementTree.parse(map_path)
    #print tree
    for element in tree.iter():
        if element.tag.split("}")[1] == 'text':
            #print element.tag
            #print element.attrib

            for child in element:
                #print child.tag
                #print child.attrib
                if child.text[0] != 'K':
                    #print child.text
                    try:
                        if not taxon_id:
                            add = 'window.open("/chlamdb/KEGG_mapp_ko/%s", "_top");' % (
                                description2map[child.text])
                        else:
                            add = 'window.open("/chlamdb/KEGG_mapp_ko_organism/%s/%s", "_top");' % (
                                description2map[child.text], taxon_id)
                    except:
                        continue
                    mystyle = element.get("style")

                    add4 = "this.style.stroke = '#ff0000'; this.style['stroke-width'] = 1;"
                    add5 = "this.style.stroke = '#000000'; this.style['stroke-width'] = 0;"

                    element.set("onclick", add)
                    #element.set("target", add2)
                    element.set("onmouseover", add4)
                    element.set("onmouseout", add5)

                if child.text in keep_ko_list:
                    #print 'match-----------'
                    add = 'window.open("/chlamdb/fam/%s/ko", "_top");' % (
                        child.text)
                    mystyle = element.get("style")

                    add4 = "this.style.stroke = '#ff0000'; this.style['stroke-width'] = 1;"
                    add5 = "this.style.stroke = '#000000'; this.style['stroke-width'] = 0;"

                    element.set("onclick", add)
                    #element.set("target", add2)
                    element.set("onmouseover", add4)
                    element.set("onmouseout", add5)
                if '...' in child.text:
                    #print 'match-----------'
                    add = 'window.open("/chlamdb/kegg_multi/%s/%s/", "_top");' % (
                        map_name, re.sub('\.\.\.', '', child.text))
                    mystyle = element.get("style")

                    add4 = "this.style.stroke = '#ff0000'; this.style['stroke-width'] = 1;"
                    add5 = "this.style.stroke = '#000000'; this.style['stroke-width'] = 0;"

                    element.set("onclick", add)
                    #element.set("target", add2)
                    element.set("onmouseover", add4)
                    element.set("onmouseout", add5)
    return tree
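
Illustrative usage (the SVG path, KO identifiers and map name are placeholders). Since xml.etree.ElementTree.parse returns an ElementTree object, the edited map can simply be written back out:

tree = edit_svg_map("map00010.svg", ["K00844", "K01810"], "chlamydia_04_16", "map00010")
tree.write("map00010_edited.svg")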
Example #12
def biodb2all_connections(biodb):

    import manipulate_biosqldb
    import time
    import re

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'select db_accession from custom_tables.uniprot_id2seqfeature_id_%s t0 ' \
          ' inner join custom_tables.uniprot_db_xref_%s t1 on t0.uniprot_id=t1.uniprot_id ' \
          ' inner join custom_tables.db_xref t2 on t1.db_xref_id=t2.db_xref_id where db_xref_name="string" and db_accession like "%%%%CPn%%%%";' % (biodb, biodb)

    all_string_accessions = [
        i[0] for i in server.adaptor.execute_and_fetchall(sql, )
    ]

    sql = 'select seqfeature_id, taxon_id from custom_tables.locus2seqfeature_id_%s' % biodb

    seqfeature_id2taxon_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    sql = 'select locus_tag,seqfeature_id from custom_tables.locus2seqfeature_id_%s' % biodb

    new_locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    sql = 'select old_locus_tag,seqfeature_id from custom_tables.seqfeature_id2old_locus_tag_%s' % biodb

    old_locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    sql = 'create table if not exists string.interactions_%s (taxon_id INT, ' \
          ' seqfeature_id_1 INT, ' \
          ' seqfeature_id_2 INT,' \
          ' old_locus_tag_1 varchar(400), ' \
          ' old_locus_tag_2 varchar (400), ' \
          ' label_1 varchar(400), ' \
          ' label_2 varchar (400), ' \
          ' global_score FLOAT,' \
          ' neighborhood FLOAT,' \
          ' gene_fusion FLOAT,' \
          ' cooccurence FLOAT,' \
          ' coexpression FLOAT,' \
          ' experiments FLOAT,' \
          ' biodatabases FLOAT,' \
          ' textmining FLOAT, ' \
          ' index seqfeature_id_1 (seqfeature_id_1),' \
          ' index seqfeature_id_2 (seqfeature_id_2),' \
          ' INDEX old_locus_tag_1 (old_locus_tag_1),' \
          ' index old_locus_tag_2 (old_locus_tag_2))' % biodb
    print sql
    #server.adaptor.execute(sql,)

    ref_locus_list = []

    for n, string_accession in enumerate(all_string_accessions):
        print "%s / %s" % (n, len(all_string_accessions))
        interactions = string_id2connexions(string_accession)

        if not interactions:
            while interactions is False:
                print 'trying again...'
                time.sleep(10)
                interactions = string_id2connexions(string_accession)

        for one_interaction in interactions:
            print string_accession, one_interaction
            gscore = 0
            fscore = 0
            pscore = 0
            nscore = 0
            ascore = 0
            escore = 0
            dscore = 0
            tscore = 0

            if string_accession in one_interaction[0]:
                ref_locus = one_interaction[0].split(':')[1].split('.')[1]
                link_locus = one_interaction[1].split(':')[1].split('.')[1]

            elif string_accession in one_interaction[1]:
                ref_locus = one_interaction[1].split(':')[1].split('.')[1]
                link_locus = one_interaction[0].split(':')[1].split('.')[1]
            else:
                # the interaction does not involve the reference accession, skipping
                continue
            ref_locus_list.append(ref_locus)
            if link_locus in ref_locus_list:
                # not a new connection
                continue

            label_1 = one_interaction[2]
            label_2 = one_interaction[3]

            # locus tag correspondence between the old and the new RefSeq annotation
            try:
                ref_locus_seqfeature_id = old_locus_tag2seqfeature_id[
                    ref_locus]
            except:
                # special case: Chlamydia trachomatis locus tags were renamed from CTxxx to CT_xxx
                try:
                    ref_locus = re.sub('CT', 'CT_', ref_locus)
                    ref_locus_seqfeature_id = new_locus_tag2seqfeature_id[
                        ref_locus]
                except:
                    ref_locus_seqfeature_id = 'NULL'
            print 'ref_locus', ref_locus
            # locus tag correspondence OK, but the feature may be a pseudogene (no taxon_id entry)
            try:
                taxon_id = seqfeature_id2taxon_id[str(ref_locus_seqfeature_id)]
            except:
                taxon_id = 'NULL'
            if taxon_id is None:
                taxon_id = 'NULL'
            # locus tag correspondence between the old and the new RefSeq annotation
            try:
                link_locus_seqfeature_id = old_locus_tag2seqfeature_id[
                    link_locus]
            except:
                try:
                    link_locus = re.sub('CT', 'CT_', link_locus)
                    link_locus_seqfeature_id = new_locus_tag2seqfeature_id[
                        link_locus]
                except:
                    link_locus_seqfeature_id = 'NULL'

            scores = one_interaction[4].split('|')
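            # pipe-separated STRING channel scores: score=combined, nscore=neighborhood,
            # fscore=gene fusion, pscore=cooccurrence, ascore=coexpression,
            # escore=experiments, dscore=database, tscore=textmining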

            for one_score in scores:
                score, value = one_score.split(':')
                #print ref_locus, link_locus, score, value
                if score == 'score':
                    gscore = value
                elif score == 'nscore':
                    nscore = value
                elif score == 'fscore':
                    fscore = value
                elif score == 'pscore':
                    pscore = value
                elif score == 'ascore':
                    ascore = value
                elif score == 'escore':
                    escore = value
                elif score == 'dscore':
                    dscore = value
                elif score == 'tscore':
                    tscore = value
                else:
                    print 'unknown score type', score, value
            # insert order: taxon_id, seqfeature_id_1, seqfeature_id_2, old_locus_tag_1, old_locus_tag_2, label_1, label_2, global_score, neighborhood, gene_fusion, cooccurence, coexpression, experiments, biodatabases, textmining
            sql = 'insert into string.interactions_%s values ' \
                  ' (%s, %s, %s, "%s", "%s", "%s", "%s", %s, %s, %s, %s, %s, %s, %s, %s)' % (biodb,
                                                                                                 taxon_id,
                                                                                                 ref_locus_seqfeature_id,
                                                                                                 link_locus_seqfeature_id,
                                                                                                 ref_locus,
                                                                                                 link_locus,
                                                                                                 label_1,
                                                                                                 label_2,
                                                                                                 gscore,
                                                                                                 nscore,
                                                                                                 fscore,
                                                                                                 pscore,
                                                                                                 ascore,
                                                                                                 escore,
                                                                                                 dscore,
                                                                                                 tscore)
            print taxon_id, sql
            server.adaptor.execute(sql, )
        server.commit()
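
Note: the helper string_id2connexions used above is not part of this example. A minimal sketch of what it might look like is given below, under the assumption that the legacy STRING REST endpoint http://string-db.org/api/psi-mi-tab/interactions is used and that it returns tab-separated rows whose first two fields are STRING identifiers, the next two the protein labels, and the fifth the pipe-separated channel scores, i.e. the layout the parsing loop above expects. Both the URL and the column layout are assumptions, not part of the original module; the helper returns False on any network error so the caller can sleep and retry.

def string_id2connexions(string_accession):
    # hypothetical helper, not part of the original module (see note above)
    import urllib2

    url = ('http://string-db.org/api/psi-mi-tab/interactions?identifier=%s'
           % string_accession)  # legacy STRING API endpoint (assumed)
    try:
        data = urllib2.urlopen(url, timeout=30).read()
    except Exception:
        # signal failure so the caller can retry after a delay
        return False
    interactions = []
    for line in data.strip().split('\n'):
        if line:
            interactions.append(line.split('\t'))
    return interactions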
Example n. 13
def biodb2string_pmid_data(biodb):

    import manipulate_biosqldb
    import pubmed_utils
    import time
    import re

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'select db_accession from custom_tables.uniprot_id2seqfeature_id_%s t0 ' \
          ' inner join custom_tables.uniprot_db_xref_%s t1 on t0.uniprot_id=t1.uniprot_id ' \
          ' inner join custom_tables.db_xref t2 on t1.db_xref_id=t2.db_xref_id where db_xref_name="string" and db_accession like "%%%%CPn%%%%";' % (biodb, biodb)

    all_string_accessions = [
        i[0] for i in server.adaptor.execute_and_fetchall(sql, )
    ]

    sql = 'create table if not exists string.seqfeature_id2string_pmid_%s (taxon_id INT, ' \
          ' seqfeature_id INT, ' \
          ' pmid INT, ' \
          ' authors TEXT,' \
          ' title TEXT,' \
          ' abstract TEXT, ' \
          ' source TEXT,' \
          ' INDEX seqfeature_id(seqfeature_id))' % biodb

    server.adaptor.execute(sql, )
    server.commit()

    sql = 'select seqfeature_id, taxon_id from custom_tables.locus2seqfeature_id_%s' % biodb

    seqfeature_id2taxon_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    sql = 'select old_locus_tag, seqfeature_id from custom_tables.seqfeature_id2old_locus_tag_%s' % biodb

    old_locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    sql = 'select locus_tag,seqfeature_id from custom_tables.locus2seqfeature_id_%s' % biodb

    new_locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    for n, string_accession in enumerate(all_string_accessions):
        print "%s / %s" % (n, len(all_string_accessions))

        old_locus_tag = string_accession.split('.')[1]
        try:
            seqfeature_id = old_locus_tag2seqfeature_id[old_locus_tag]
        except:
            try:
                # special case: Chlamydia trachomatis locus tags were renamed from CTxxx to CT_xxx
                old_locus_tag = re.sub('CT', 'CT_', old_locus_tag)
                seqfeature_id = new_locus_tag2seqfeature_id[old_locus_tag]
            except:
                continue
        taxon_id = seqfeature_id2taxon_id[str(seqfeature_id)]
        if taxon_id is None:
            taxon_id = 'NULL'
        pmid_list = string_id2pubmed_id_list(string_accession)
        print 'pmid_list', pmid_list
        # string_id2pubmed_id_list returns False on failure: retry until it succeeds
        while pmid_list is False:
            print 'trying again'
            time.sleep(10)
            pmid_list = string_id2pubmed_id_list(string_accession)

        if len(pmid_list) == 0:
            print '0 pmid for', string_accession
            continue
        else:
            for one_pmid in pmid_list:
                abstract_data = pubmed_utils.pmid2abstract_info(one_pmid)
                print 'data', abstract_data
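                # strip single quotes and escape literal '%' characters so the
                # text survives the '%'-interpolation into the SQL string below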
                abstract = re.sub("'", "", abstract_data['abstract'])
                abstract = re.sub("%", "%%%%", abstract)
                title = re.sub("'", "", abstract_data['title'])
                title = re.sub("%", "%%%%", title)
                source = re.sub("'", "", abstract_data['source'])
                source = re.sub("%", "%%%%", source)

                sql = '''insert into string.seqfeature_id2string_pmid_%s values (%s, %s, %s, '%s', '%s', '%s', '%s')''' % (
                    biodb, taxon_id, seqfeature_id, abstract_data['pmid'],
                    re.sub("'", "", str(
                        abstract_data['authors'])), title, abstract, source)
                print sql
                server.adaptor.execute(sql, )
            server.commit()
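
Note: pubmed_utils.pmid2abstract_info and string_id2pubmed_id_list are defined elsewhere; the loop above only relies on the former returning a dictionary with 'pmid', 'title', 'abstract', 'authors' and 'source' keys. A minimal sketch of such a function, using Biopython's Entrez and Medline modules, could look as follows. The implementation is an assumption rather than the original one, and Entrez.email must be set to a real address.

def pmid2abstract_info(pmid):
    # hypothetical re-implementation returning the keys expected above
    from Bio import Entrez, Medline

    Entrez.email = 'your.name@example.org'  # placeholder, required by NCBI
    handle = Entrez.efetch(db='pubmed', id=str(pmid),
                           rettype='medline', retmode='text')
    record = Medline.read(handle)
    handle.close()
    return {'pmid': record.get('PMID', str(pmid)),
            'title': record.get('TI', ''),
            'abstract': record.get('AB', ''),
            'authors': record.get('AU', []),
            'source': record.get('SO', '')}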
Example n. 14
def plot_cog_eatmap(biodb,
                    ref_tree,
                    taxon_id_list=[],
                    frequency=False,
                    group_by_cog_id=False):
    import manipulate_biosqldb
    import ete_motifs

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'select biodatabase_id from biodatabase where name="%s"' % biodb

    db_id = server.adaptor.execute_and_fetchall(sql, )[0][0]

    # RESTRICT TO A SUBSET OF THE AVAILABLE TAXA

    sql = ''

    if len(taxon_id_list) > 0:
        taxon_filter = ','.join(taxon_id_list)

        sql = 'select taxon_id, code, count(*) as n from COG.seqfeature_id2best_COG_hit_%s t1 ' \
              ' inner join biosqldb.bioentry t2 on t1.bioentry_id=t2.bioentry_id' \
              ' inner join COG.cog_id2cog_category t3 on t1.hit_cog_id=t3.COG_id ' \
              ' inner join COG.code2category t4 on t3.category_id=t4.category_id ' \
              ' where t2.biodatabase_id=%s and taxon_id in (%s)' \
              ' group by taxon_id, code;' % (biodb,
                                             db_id,
                                             taxon_filter)

        print(sql)
    else:
        if not group_by_cog_id:
            sql = 'select taxon_id,functon,count(*) as n ' \
                  ' from COG.locus_tag2gi_hit_%s t1 ' \
                  ' inner join COG.cog_names_2014 t2 on t1.COG_id=t2.COG_id ' \
                  ' inner join biosqldb.bioentry as t3 on t1.accession=t3.accession ' \
                  ' where biodatabase_id=%s group by taxon_id,functon' % (biodb, db_id)
        else:
            sql = ' select A.taxon_id,B.functon,count(*) from (select t1.COG_id, t3.taxon_id from COG.locus_tag2gi_hit_%s t1 ' \
                  ' inner join biosqldb.orthology_detail_%s t3 on t1.locus_tag=t3.locus_tag ' \
                  ' group by taxon_id,t1.COG_id) A inner join COG.cog_names_2014 B on A.COG_id=B.COG_id ' \
                  ' group by A.taxon_id,B.functon;' % (biodb, biodb)

    data = server.adaptor.execute_and_fetchall(sql, )

    if frequency:
        # ATTENTION: frequencies are computed relative to the number of
        # COG-annotated proteins per taxon, not to the genome size
        sql = 'select taxon_id, count(*) as n from COG.seqfeature_id2best_COG_hit_%s t1' \
              ' inner join biosqldb.bioentry t2 on t1.bioentry_id=t2.bioentry_id' \
              ' where t2.biodatabase_id=%s group by taxon_id;' % (biodb, db_id)
        taxon_id2count = manipulate_biosqldb.to_dict(
            server.adaptor.execute_and_fetchall(sql, ))

        code2taxon2count = {}
        cog_list = []

    else:
        sql = 'select taxon_id, count(*) from biosqldb.orthology_detail_%s t1 left join COG.locus_tag2gi_hit_%s t2 ' \
              ' on t1.locus_tag=t2.locus_tag where COG_id is NULL group by t1.taxon_id;' % (biodb,  biodb)

        taxon2count_no_GOG = manipulate_biosqldb.to_dict(
            server.adaptor.execute_and_fetchall(sql, ))

        sql = 'select taxon_id, count(*) from orthology_detail_%s group by taxon_id' % biodb

        taxon2proteome_size = manipulate_biosqldb.to_dict(
            server.adaptor.execute_and_fetchall(sql, ))

        code2taxon2count = {}
        code2taxon2count['-'] = {}
        code2taxon2count['TOTAL'] = {}
        for taxon in taxon2count_no_GOG:
            if taxon in taxon_id_list:
                code2taxon2count['-'][taxon] = int(taxon2count_no_GOG[taxon])
                code2taxon2count['TOTAL'][taxon] = int(
                    taxon2proteome_size[taxon])

        cog_list = ['TOTAL', '-']

    sql = 'select code, description from COG.code2category;'
    code2description = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    for row in data:
        descr = "%s (%s)" % (code2description[row[1]], row[1])
        if descr not in cog_list:
            cog_list.append(descr)
        if descr not in code2taxon2count:
            code2taxon2count[descr] = {}
        if frequency:
            code2taxon2count[descr][str(row[0])] = round(
                (float(row[2]) / float(taxon_id2count[str(row[0])])) * 100, 2)
        else:
            code2taxon2count[descr][str(row[0])] = int(row[2])

    tree2 = ete_motifs.multiple_profiles_heatmap(biodb,
                                                 cog_list,
                                                 code2taxon2count,
                                                 show_labels=True,
                                                 column_scale=True,
                                                 tree=ref_tree,
                                                 as_float=frequency)
    return tree2
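
A hypothetical usage sketch for plot_cog_eatmap follows; the database name, newick file and taxon ids are placeholders, and the returned object is whatever ete_motifs.multiple_profiles_heatmap produces (typically an ETE tree that can then be rendered to a file).

if __name__ == '__main__':
    # placeholders: adapt the database name, reference tree and taxon ids
    reference_tree = 'reference_species_tree.nwk'
    taxa = ['67', '68', '69']
    annotated_tree = plot_cog_eatmap('chlamydia_04_16',
                                     reference_tree,
                                     taxon_id_list=taxa,
                                     frequency=True)
    # e.g. annotated_tree.render('COG_heatmap.svg') if an ETE tree is returned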