Example #1
0
def locus_tag2identity_best_hit_all_genomes(biodb_name,
                                            locus,
                                            group_name,
                                            locus_tag2taxonomic_id_dict=""):
    """Return the best identity between `locus` and each genome's group members.

    The identity table of `group_name` is read through
    `get_orthogroup_identity_table`. Its first column is used as the list of
    locus tags, and the value at row *i*, column *j + 1* is used as the
    identity between the *i*-th and the *j*-th locus tag.

    Args:
        biodb_name: name of the biosql database to load.
        locus: reference locus tag; it must appear in the first column of
            the identity table.
        group_name: orthogroup whose identity table is read.
        locus_tag2taxonomic_id_dict: optional mapping from locus tag to
            taxon id. When empty (the default) it is loaded from the
            database with `manipulate_biosqldb.locus_tag2genome_taxon_id`.

    Returns:
        A dict with one entry per genome returned by
        `manipulate_biosqldb.get_genome_taxons_list`. The value is the
        highest identity (float) found for that genome, or 0 when no hit
        exceeded zero. If the identity table cannot be built, every genome
        maps to 0.

    Raises:
        ValueError: if `locus` is not listed in the identity table.
        KeyError: if a locus tag has no taxon id, or its taxon id is not
            one of the genome keys.
    """
    server, _ = manipulate_biosqldb.load_db(biodb_name)
    genomes = manipulate_biosqldb.get_genome_taxons_list(server, biodb_name)

    # Every genome starts without a hit; this is also the fallback result.
    genome2best_hit = {genome: 0 for genome in genomes}

    try:
        identity_table = np.array(
            get_orthogroup_identity_table(biodb_name, group_name))
        all_locus = identity_table[:, 0]
    except Exception:
        # Best effort: a missing or empty table (which cannot be sliced as
        # a 2-D array) means no hit in any genome.
        return genome2best_hit

    # Only query the database when the caller did not supply the mapping.
    if not locus_tag2taxonomic_id_dict:
        locus_tag2taxonomic_id_dict = manipulate_biosqldb.locus_tag2genome_taxon_id(
            server, biodb_name)

    reference_row = identity_table[list(all_locus).index(locus)]

    # Column 0 holds the locus tag, so the identity against the j-th locus
    # sits in column j + 1.
    for column, target_locus in enumerate(all_locus, 1):
        identity = float(reference_row[column])
        # NOTE(review): lookups use str(taxon id), so this assumes the genome
        # list holds taxon ids as strings -- confirm against the helper.
        target_taxon = str(locus_tag2taxonomic_id_dict[target_locus])
        if identity > genome2best_hit[target_taxon]:
            genome2best_hit[target_taxon] = identity
    return genome2best_hit
Example #2
0
def locus_list2presence_absence_all_genomes(locus_list, biodb_name):
    """Build a tab-separated orthogroup-by-genome table for a list of loci.

    The first line is a header: ``orthogroup`` followed by one shortened
    genome description per genome. Each following line holds the orthogroup
    of one locus from `locus_list` (in input order) followed by the value
    returned by `heatmap_presence_absence` for every genome. Every cell,
    including the last one of a line, is followed by a tab, and every line
    ends with a newline.

    Args:
        locus_list: iterable of locus tags; each one must be a key of the
            mapping returned by
            `manipulate_biosqldb.locus_tag2seqfeature_id_dict`.
        biodb_name: name of the biosql database to load.

    Returns:
        The whole table as a single string.

    Raises:
        KeyError: if a locus tag is unknown or a genome has no description.
    """
    import re

    server, _ = manipulate_biosqldb.load_db(biodb_name)

    locus_tag2seqfeature_id = manipulate_biosqldb.locus_tag2seqfeature_id_dict(
        server, biodb_name)

    taxon_id2description = manipulate_biosqldb.taxon_id2genome_description(
        server, biodb_name)

    # Boilerplate stripped from the descriptions to keep the header short.
    # The substitutions are applied in this exact order, because some
    # patterns overlap (e.g. ", complete genome." vs " complete genome.").
    boilerplate_patterns = (
        r" subsp\. aureus",
        r", complete genome\.",
        r", complete sequence\.",
        r"strain ",
        r"str\. ",
        r" complete genome sequence\.",
        r" complete genome\.",
        r" chromosome",
        r"Staphylococcus aureus ",
    )
    for taxon_id in taxon_id2description.keys():
        description = taxon_id2description[taxon_id]
        for pattern in boilerplate_patterns:
            description = re.sub(pattern, "", description)
        taxon_id2description[taxon_id] = description

    genomes = manipulate_biosqldb.get_genome_taxons_list(server, biodb_name)

    # Each line is its cells joined by tabs plus one trailing tab.
    header_cells = ['orthogroup']
    header_cells.extend(taxon_id2description[genome] for genome in genomes)
    lines = ['\t'.join(header_cells) + '\t']

    for locus_tag in locus_list:
        seqfeature_id = locus_tag2seqfeature_id[locus_tag]
        orthogroup = manipulate_biosqldb.seqfeature_id2orthogroup(
            server, seqfeature_id, biodb_name)
        genome2value = heatmap_presence_absence(biodb_name, orthogroup)

        row_cells = ['%s' % orthogroup]
        row_cells.extend('%s' % genome2value[genome] for genome in genomes)
        lines.append('\t'.join(row_cells) + '\t')

    return ''.join(line + '\n' for line in lines)
Example #3
0
def heatmap_presence_absence(biodb_name, group_name):
    """Return the per-genome values stored for one orthogroup.

    Reads the row of table ``orthology_<biodb_name>`` whose ``orthogroup``
    column equals `group_name`, selecting one column per genome returned by
    `manipulate_biosqldb.get_genome_taxons_list`.

    Args:
        biodb_name: name of the biosql database; also the suffix of the
            orthology table name.
        group_name: orthogroup identifier to look up.

    Returns:
        A dict mapping each genome to the integer stored in its column.
        An empty dict is returned when the database lists no genome.

    Raises:
        IndexError: if no row matches `group_name`.
    """
    server, _ = manipulate_biosqldb.load_db(biodb_name)
    genomes = manipulate_biosqldb.get_genome_taxons_list(server, biodb_name)
    if len(genomes) == 0:
        # Nothing to select; an empty column list would be invalid SQL.
        return {}

    # One back-quoted column per genome, each listed exactly once.
    columns = ', '.join('`%s`' % genome for genome in genomes)

    # NOTE(review): the query is assembled by string interpolation; confirm
    # that biodb_name and group_name never come from untrusted input.
    sql = 'select %s from orthology_%s where orthogroup = "%s"' % (columns, biodb_name, group_name)

    first_row = server.adaptor.execute_and_fetchall(sql,)[0]
    counts = [int(value) for value in first_row]
    return dict(zip(genomes, counts))