Ejemplo n.º 1
0
def generate_reports_for_experiment(weight_cutoff,
                                    n_clusters,
                                    neighborhoods=None,
                                    clustered_profiles=None,
                                    target_profiles=None,
                                    profile2def=None):

    if not target_profiles:
        target_profiles = t.target_profiles()
    if not profile2def:
        profile2def = t.map_profile2def()
    if not neighborhoods:
        neighborhoods = pickle.load(open('files/neighborhoods.p'))
    if not clustered_profiles:
        clustered_profiles_file = os.path.join(
            gv.project_data_path,
            'clustering/10/cluster_profiles_cutoff_%1.1f.txt' % weight_cutoff)
        clustered_profiles = [
            l.strip().split()
            for l in open(clustered_profiles_file).readlines()
        ]

    reports_file_dir = os.path.join('reports', str(n_clusters))
    if not os.path.exists(reports_file_dir):
        os.mkdir(reports_file_dir)

    reports_file_dir = os.path.join(reports_file_dir,
                                    'weight_cutoff_' + str(weight_cutoff))
    if not os.path.exists(reports_file_dir):
        os.mkdir(reports_file_dir)
    print 'Data ready. Starting report generation.'
    cnt = 1
    for cl in clustered_profiles:
        print 'Cluster no:', cnt
        tmp_neighborhoods = []
        for nbr in neighborhoods:
            tmp_profiles = []
            for g in nbr.genes:
                if g.cogid not in ["", None]:
                    tmp_profiles += g.cogid.split()
            prof_cnt = 0
            for prof in set(tmp_profiles):
                if prof in cl:
                    prof_cnt += 1

            if prof_cnt >= 7:
                tmp_neighborhoods.append(nbr)

        xls_file = os.path.join(reports_file_dir, "cl_no_%d.xls" % cnt)
        write_to_xls(xls_file, tmp_neighborhoods, cl, target_profiles,
                     profile2def)
        cnt += 1
Ejemplo n.º 2
0
def generate_reports_for_experiment(weight_cutoff, n_clusters, neighborhoods=None, clustered_profiles=None, target_profiles=None, profile2def=None):

    if not target_profiles:
        target_profiles = t.target_profiles()
    if not profile2def:
        profile2def = t.map_profile2def()
    if not neighborhoods:
        neighborhoods = pickle.load(open('files/neighborhoods.p'))
    if not clustered_profiles:
        clustered_profiles_file = os.path.join(gv.project_data_path, 'clustering/10/cluster_profiles_cutoff_%1.1f.txt' % weight_cutoff)
        clustered_profiles = [l.strip().split() for l in open(clustered_profiles_file).readlines()]

    reports_file_dir = os.path.join('reports', str(n_clusters))
    if not os.path.exists(reports_file_dir):
        os.mkdir(reports_file_dir)

    reports_file_dir = os.path.join(reports_file_dir, 'weight_cutoff_'+str(weight_cutoff))
    if not os.path.exists(reports_file_dir):
        os.mkdir(reports_file_dir)
    print 'Data ready. Starting report generation.'
    cnt = 1
    for cl in clustered_profiles:
        print 'Cluster no:', cnt
        tmp_neighborhoods = []
        for nbr in neighborhoods:
            tmp_profiles = []
            for g in nbr.genes:
                if g.cogid not in ["", None]:
                    tmp_profiles += g.cogid.split()
            prof_cnt = 0
            for prof in set(tmp_profiles):
                if prof in cl:
                    prof_cnt += 1

            if prof_cnt >= 7:
                tmp_neighborhoods.append(nbr)

        xls_file = os.path.join(reports_file_dir, "cl_no_%d.xls" % cnt)
        write_to_xls(xls_file, tmp_neighborhoods, cl, target_profiles, profile2def)
        cnt += 1
Ejemplo n.º 3
0
# NOTE(review): this `elif` is a fragment -- the matching `if` branch
# (presumably a non-Linux path setup) is not visible in this excerpt.
elif sys.platform=='linux2':
    # Make the project's shared libraries importable on the Linux host.
    sys.path.append('/home/hudaiber/Projects/lib/BioPy/')
    sys.path.append('/home/hudaiber/Projects/SystemFiles/')

import global_variables as gv
sys.path.append(gv.project_code_path)

from lib.db.archea import db_tools, neighborhoods_path
from lib.utils import tools as t
import os

target_profiles = t.target_profiles()

# NOTE(review): the assignment above is dead -- target_profiles is
# immediately overwritten by the hard-coded file below; keep one of the two.
target_profiles = [l.strip() for l in open('/Volumes/pan1/patternquest/Projects/NewSystems/data/Archea/arCOG/selected_arcogs.txt').readlines()]

profile2def = t.map_profile2def()
gid2arcog_cdd = t.map_gid2arcog_cdd()
neighborhood_files_path = neighborhoods_path()

# NOTE(review): hard-coded override of the path returned by
# neighborhoods_path() above -- looks like debug leftover; confirm.
neighborhood_files_path = '/Volumes/pan1/patternquest/Projects/NewSystems/data/Archea/genes_and_flanks/win_10/pty/'


def write_to_xls(xls_file, kplets):
    """Write the given k-plets to `xls_file`.

    NOTE(review): this definition is truncated in the visible excerpt --
    the loop below only handles the `f in _file2kplets` branch and the
    body continues past the end of this chunk; do not modify blind.
    """
    # Union of all profile codes across the k-plets (side-effect-only
    # list comprehension; a plain for-loop would be clearer).
    community = set()
    [community.update(kplet.codes) for kplet in kplets]
    # file -> k-plets that reference it.
    _file2kplets = {}
    for kplet in kplets:
        for f in kplet.files:
            if f in _file2kplets:
                _file2kplets[f].append(kplet)
Ejemplo n.º 4
0
from lib.db.archea import triplets as tr
from lib.db.archea import duplets as d
from lib.db.archea.db_tools import file2src_src2org_map
from lib.db.archea import neighborhoods_path

# import report_generation as r
from lib.utils import reporting as r
import lib.utils.merging as merging
import lib.utils.tools as t
import pickle

if __name__ == '__main__':

    # Load the shared lookup tables once, up front.
    print 'Pre-Loading dictionaries'
    target_profiles = t.target_profiles()
    profile2def = t.map_profile2def()
    gid2arcog_cdd = t.map_gid2arcog_cdd()
    neighborhood_files_path = neighborhoods_path()
    print "\n"

    # Effectively "no limit"; also names the pickle subdirectory below.
    limit_to = 1000000
    data_path = os.path.join(gv.project_data_path, 'Archea/pickle/')
    fname = os.path.join(data_path, str(limit_to),
                         'pentaplets_merged_across.p.bz2')
    pentaplets = t.load_compressed_pickle(fname)

    report_dir = 'all'
    # NOTE(review): this script is truncated in the visible excerpt --
    # report_files_dir is built here but its use lies past this chunk.
    report_files_dir = os.path.join(gv.project_data_path,
                                    'Archea/reports/merged_across_orders/',
                                    report_dir)
Ejemplo n.º 5
0
def calculate_profile_based_baiticity(bacteria_loci, loci_gis, loci_organisms,
                                      arcog_path_file, bait_profiles,
                                      filter_threshold, save_path):

    print "Loding global maps"
    global_profile2orgs2gis = load_maps_simple(arcog_path_file, loci_gis)
    print "Loading weights"
    gnm2weight = t.map_genome2weight()
    print "Loading CDD definitions"
    profile2def = t.map_cdd_profile2def()

    profile2def.update(t.map_profile2def())

    print "Counting in loci"

    profile2orgs2obj = {}

    gi_checklist = set()

    for locus in bacteria_loci:
        for gene in locus:

            if gene.gid in gi_checklist:
                continue

            for _cogid in gene.cogid.split():

                if _cogid in bait_profiles:
                    continue

                if _cogid not in profile2orgs2obj:
                    profile2orgs2obj[_cogid] = {}
                    for _org in loci_organisms:
                        _orgObj = ProfileInOrganismCount(_org, _cogid)

                        if _cogid in global_profile2orgs2gis:
                            _orgObj.outside = len(global_profile2orgs2gis[_cogid][_org]) \
                                              if _org in global_profile2orgs2gis[_cogid] \
                                              else 0
                        else:
                            _orgObj.outside = 0

                        profile2orgs2obj[_cogid][_org] = _orgObj

                profile2orgs2obj[_cogid][gene.organism].in_locus += 1

            gi_checklist.update([gene.gid])

    print len(profile2orgs2obj['arCOG08578'])
    # print profile2orgs2obj['arCOG08578'].keys()

    for org, obj in profile2orgs2obj['arCOG08578'].items():
        if obj.in_locus + obj.outside > 0:
            print org, obj.in_locus, obj.outside

    sys.exit()
    out_file = os.path.join(save_path, 'baiticity.tab')

    profiles = []

    in_loci_count = []
    baiticity_count = []

    in_loci_weight = []
    baiticity_weight = []

    rare_profiles_file = open(os.path.join(save_path, 'rare_profiles.tab'),
                              'w')
    rare_profiles_file.write(
        "Profile\tOccurence everywhere\tOccurrence in loci\tBaiticity\tDefinition\n"
    )

    print "Writing to file:", out_file
    with open(out_file, 'w') as outf:

        outf.write(
            "Profile\tOccurrence in loci(count)\tBaiticity(count)\tOccurrence in loci(weight)\tBaiticity(weight)\tDefinition\n"
        )

        for profile in profile2orgs2obj:

            if profile == 'arCOG14077':
                continue

            in_locus_count = 0
            everywhere_count = 0
            in_locus_weight = 0
            everywhere_weight = 0

            for org in profile2orgs2obj[profile]:

                if org in [
                        'Nitrosoarchaeum_koreensis_MY1_MY1',
                        'Nitrosoarchaeum_limnia_SFB1'
                ]:
                    continue

                _org = profile2orgs2obj[profile][org]

                in_locus_count += _org.in_locus
                everywhere_count += (_org.in_locus + _org.outside)

                in_locus_weight += _org.in_locus * gnm2weight[org]
                everywhere_weight += (_org.in_locus +
                                      _org.outside) * gnm2weight[org]

            _baiticity_count = 1.0 * in_locus_count / everywhere_count
            _baiticity_weight = in_locus_weight / everywhere_weight

            if everywhere_weight < filter_threshold:

                rare_profiles_file.write(
                    "%s\t%f\t%f\t%f\t%s\n" %
                    (profile, everywhere_count, in_locus_count,
                     _baiticity_count, profile2def[profile]))
                continue

            in_loci_count.append(in_locus_count)
            baiticity_count.append(_baiticity_count)
            in_loci_weight.append(in_locus_weight)
            baiticity_weight.append(_baiticity_weight)

            profiles.append(profile)
            outf.write(
                "%s\t%f\t%f\t%f\t%f\t%s\n" %
                (profile, in_locus_count, _baiticity_count, in_locus_weight,
                 _baiticity_weight, profile2def[profile]))

    in_loci_weight = np.asarray(in_loci_weight)
    in_loci_weight = np.log10(in_loci_weight)
    baiticity_weight = np.asarray(baiticity_weight)

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(in_loci_weight, baiticity_weight, s=1)

    plt.xlabel("Effective orcurrence in loci (log10)")
    plt.ylabel("Baiticity")

    image_file = os.path.join(save_path, 'baiticity.png')
    plt.savefig(image_file)

    # for i, profile in enumerate(profiles_all):
    #     ax.annotate(profile, (in_loci_all[i], crispricity_all[i]))
    # fig.savefig('second.png')
    # plt.savefig('second.png')

    rare_profiles_file.close()
Ejemplo n.º 6
0
def calculate_profile_based_baiticity(bacteria_loci, loci_gis,
                                      loci_organisms,
                                      arcog_path_file,
                                      bait_profiles,
                                      filter_threshold,
                                      save_path):

    print "Loding global maps"
    global_profile2orgs2gis = load_maps_simple(arcog_path_file, loci_gis)
    print "Loading weights"
    gnm2weight = t.map_genome2weight()
    print "Loading CDD definitions"
    profile2def = t.map_cdd_profile2def()

    profile2def.update(t.map_profile2def())

    print "Counting in loci"

    profile2orgs2obj = {}

    gi_checklist = set()

    for locus in bacteria_loci:
        for gene in locus:

            if gene.gid in gi_checklist:
                continue

            for _cogid in gene.cogid.split():

                if _cogid in bait_profiles:
                    continue

                if _cogid not in profile2orgs2obj:
                    profile2orgs2obj[_cogid] = {}
                    for _org in loci_organisms:
                        _orgObj = ProfileInOrganismCount(_org, _cogid)

                        if _cogid in global_profile2orgs2gis:
                            _orgObj.outside = len(global_profile2orgs2gis[_cogid][_org]) \
                                              if _org in global_profile2orgs2gis[_cogid] \
                                              else 0
                        else:
                            _orgObj.outside = 0

                        profile2orgs2obj[_cogid][_org] = _orgObj

                profile2orgs2obj[_cogid][gene.organism].in_locus += 1

            gi_checklist.update([gene.gid])

    print len(profile2orgs2obj['arCOG08578'])
    # print profile2orgs2obj['arCOG08578'].keys()

    for org, obj in profile2orgs2obj['arCOG08578'].items():
        if obj.in_locus + obj.outside > 0:
            print org, obj.in_locus, obj.outside

    sys.exit()
    out_file = os.path.join(save_path, 'baiticity.tab')

    profiles = []

    in_loci_count = []
    baiticity_count = []

    in_loci_weight = []
    baiticity_weight = []

    rare_profiles_file = open(os.path.join(save_path, 'rare_profiles.tab'), 'w')
    rare_profiles_file.write("Profile\tOccurence everywhere\tOccurrence in loci\tBaiticity\tDefinition\n")

    print "Writing to file:", out_file
    with open(out_file, 'w') as outf:

        outf.write("Profile\tOccurrence in loci(count)\tBaiticity(count)\tOccurrence in loci(weight)\tBaiticity(weight)\tDefinition\n")

        for profile in profile2orgs2obj:

            if profile=='arCOG14077':
                continue

            in_locus_count = 0
            everywhere_count = 0
            in_locus_weight = 0
            everywhere_weight = 0

            for org in profile2orgs2obj[profile]:

                if org in ['Nitrosoarchaeum_koreensis_MY1_MY1','Nitrosoarchaeum_limnia_SFB1']:
                    continue

                _org = profile2orgs2obj[profile][org]

                in_locus_count   += _org.in_locus
                everywhere_count += (_org.in_locus + _org.outside)

                in_locus_weight   += _org.in_locus * gnm2weight[org]
                everywhere_weight += (_org.in_locus + _org.outside) * gnm2weight[org]


            _baiticity_count  = 1.0 * in_locus_count / everywhere_count
            _baiticity_weight = in_locus_weight / everywhere_weight

            if everywhere_weight < filter_threshold:

                rare_profiles_file.write("%s\t%f\t%f\t%f\t%s\n"%(profile, everywhere_count, in_locus_count, _baiticity_count, profile2def[profile]))
                continue

            in_loci_count.append(in_locus_count)
            baiticity_count.append(_baiticity_count)
            in_loci_weight.append(in_locus_weight)
            baiticity_weight.append(_baiticity_weight)

            profiles.append(profile)
            outf.write("%s\t%f\t%f\t%f\t%f\t%s\n"%(profile,
                                                    in_locus_count,
                                                    _baiticity_count,
                                                    in_locus_weight,
                                                    _baiticity_weight,
                                                    profile2def[profile]))

    in_loci_weight = np.asarray(in_loci_weight)
    in_loci_weight = np.log10(in_loci_weight)
    baiticity_weight = np.asarray(baiticity_weight)

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(in_loci_weight, baiticity_weight, s=1)

    plt.xlabel("Effective orcurrence in loci (log10)")
    plt.ylabel("Baiticity")

    image_file = os.path.join(save_path, 'baiticity.png')
    plt.savefig(image_file)

    # for i, profile in enumerate(profiles_all):
    #     ax.annotate(profile, (in_loci_all[i], crispricity_all[i]))
    # fig.savefig('second.png')
    # plt.savefig('second.png')

    rare_profiles_file.close()