Beispiel #1
0
def merge_into_file_summaries(kplets, neighborhood_files_path, file2src_src2org_map, data_type='bacteria'):
    """Build one NeighborhoodFileSummary per neighborhood file and count profiles.

    Args:
        kplets: iterable of kplet objects; each exposes ``files``, the names of
            the neighborhood files it was found in.
        neighborhood_files_path: directory holding the neighborhood files.
        file2src_src2org_map: callable taking the file names and returning the
            pair ``(file2src, src2org)``.
        data_type: currently unused; it is only referenced by the disabled
            trimming step (``trim_file_summary_list``).

    Returns:
        Tuple ``(src2org, file_summaries, community, community_count,
        community_count_with_flanks, total_weight)``. ``file_summaries`` is
        sorted by weight in descending order and ``community`` is always an
        empty list. ``community_count`` is fed from genes whose tag is not
        ``'flank'``; ``community_count_with_flanks`` is fed from all genes.
    """
    # Loaded once and shared by both passes below.
    org2weight = t.map_genome2weight()

    # Group the kplets by the neighborhood file(s) they occur in.
    file2kplets = {}
    for kplet in kplets:
        for f in kplet.files:
            file2kplets.setdefault(f, []).append(kplet)

    kplet_files = list(file2kplets)
    _file2src, _src2org = file2src_src2org_map(kplet_files)

    file_summaries = []
    for f in kplet_files:
        neighborhood = Neighborhood(os.path.join(neighborhood_files_path, f))
        src = _file2src[f]
        org = _src2org[src]
        weight = org2weight[org]
        # NOTE(review): _gid2arcog_cdd is not defined in this function; it is
        # expected to exist at module level - confirm.
        neighborhood.extend_flanks(10, os.path.join(gv.pty_data_path, org, "%s.pty" % src), _gid2arcog_cdd)
        file_summaries.append(NeighborhoodFileSummary(f, file2kplets[f], neighborhood, org, src, weight))

    # NOTE: trimming of the summaries via trim_file_summary_list (and the
    # matching clean-up of file2src) is currently disabled.

    file_summaries.sort(key=lambda x: x.weight, reverse=True)

    community_count_with_flanks = {}
    community_count = {}
    total_weight = 0

    for summary in file_summaries:
        weight = org2weight[summary.org]
        total_weight += weight
        for gene in summary.neighborhood.genes:
            in_core = gene.tag != 'flank'
            for k in gene.cogid.split():
                # Every gene feeds the "with flanks" counter; only non-flank
                # genes feed the core counter.
                t.update_dictionary(community_count_with_flanks, k, weight)
                if in_core:
                    t.update_dictionary(community_count, k, weight)

    community = []
    return _src2org, file_summaries, community, community_count, community_count_with_flanks, total_weight
Beispiel #2
0
def get_flank_distributions(kplets_2d_list, neighborhood_path, target_profiles):
    """Collect per-group profile counts and gene ids from kplet neighborhoods.

    Args:
        kplets_2d_list: list of kplet lists; one result entry is produced per
            inner list.
        neighborhood_path: directory holding the neighborhood files named in
            each kplet's ``files``.
        target_profiles: not used at the moment (the filter on it is disabled).

    Returns:
        Tuple ``(flanking_genes_count, cog2gids, gid2weight)``: two lists with
        one dictionary per inner kplet list (filled through
        ``t.update_dictionary`` and ``t.update_dictionary_set``), and a single
        dictionary mapping ``int(gene.gid)`` to the genome weight of the
        gene's organism.
    """
    genome_weights = t.map_genome2weight()

    counts_per_group = []
    gids_per_group = []
    gid2weight = {}

    for kplet_group in kplets_2d_list:
        group_counts = {}
        group_cog2gids = {}

        for kplet in kplet_group:
            # All neighborhoods of a kplet are loaded before any is processed.
            loaded = [Neighborhood(os.path.join(neighborhood_path, name))
                      for name in kplet.files]

            for nbr in loaded:
                for gene in nbr.genes:
                    weight = genome_weights[gene.organism]
                    gid = int(gene.gid)
                    gid2weight[gid] = weight

                    for profile in gene.cogid.split():
                        t.update_dictionary(group_counts, profile, weight)
                        # A fresh one-element set is handed over on every call.
                        t.update_dictionary_set(group_cog2gids, profile, {gid})

        counts_per_group.append(group_counts)
        gids_per_group.append(group_cog2gids)

    return counts_per_group, gids_per_group, gid2weight
def count_profiles_in_neighborhoods(neighborhoods_path, save_path, limit_to, combination_size):
    """Rank profiles by weight and tabulate the heaviest profile combinations.

    Steps:
      1. Read the selected arCOG ids; these profiles are excluded from counting.
      2. For every neighborhood file, add the genome weight of its organism
         (looked up through the source of its first gene, 1 when the organism
         has no weight) to each remaining profile found on its genes.
      3. Write the ``limit_to`` heaviest profiles to
         ``files/profile_weights.tab``.
      4. Count combinations of those profiles with ``count_combinations``.
      5. Keep combinations seen more than once whose weight exceeds the pivot
         (the weight at which the descending cumulative weight share first
         reaches 0.9) and write them, sorted by weight, to
         ``<save_path>/<limit_to>_<combination_size>.tab``.

    Args:
        neighborhoods_path: directory with the neighborhood files.
        save_path: directory for the resulting combination table.
        limit_to: number of top profiles to combine.
        combination_size: size of the profile combinations to count.

    Returns:
        None. Results are written to disk.
    """
    selected_path = os.path.join(gv.project_data_path, 'Archea', 'arCOG/selected_arcogs.txt')
    # A set gives constant-time membership tests in the inner loop and the
    # context manager closes the file handle.
    with open(selected_path) as selected_file:
        target_profiles = set(l.strip() for l in selected_file)

    src2org = t.map_src2org()
    gnm2weight = t.map_genome2weight()
    neighborhoods = [cl.Neighborhood(os.path.join(neighborhoods_path, f)) for f in os.listdir(neighborhoods_path)]

    profile_stats = {}
    for nbr in neighborhoods:
        org_name = src2org[nbr.genes[0].src]
        org_weight = gnm2weight.get(org_name, 1)
        for g in nbr.genes:
            # An empty cogid splits into no items, so it needs no special case.
            for tmp_cog in g.cogid.split():
                if tmp_cog in target_profiles:
                    continue
                if tmp_cog in profile_stats:
                    profile_stats[tmp_cog].weight += org_weight
                    profile_stats[tmp_cog].count += 1
                else:
                    profile_stats[tmp_cog] = cl.ProfileCount(1, org_weight)

    profile_weights = sorted(((k, v.weight) for k, v in profile_stats.items()),
                             key=itemgetter(1), reverse=True)

    with open('files/profile_weights.tab', 'w') as f:
        for profile, weight in profile_weights[:limit_to]:
            f.write('%f\t%s\n' % (weight, profile))

    top_profiles = [k for (k, _) in profile_weights[:limit_to]]
    print('started counting')
    counted_combinations = count_combinations(neighborhoods, top_profiles, combination_size, src2org, gnm2weight)
    print('Done counting')

    # Weights in descending order; the pivot is the first weight at which the
    # cumulative share of the total reaches 90%.
    weight_values = np.array([v.weight for v in counted_combinations.values()])
    weight_values.sort()
    weight_values = weight_values[::-1]
    pivot_ind = np.where(np.cumsum(weight_values) / np.sum(weight_values) >= 0.9)[0][0]
    pivot_value = weight_values[pivot_ind]

    M = pd.DataFrame([], columns=['Comb', 'weight', 'count'])
    M['Comb'] = list(counted_combinations.keys())
    M['weight'] = [v.weight for v in counted_combinations.values()]
    M['count'] = [v.count for v in counted_combinations.values()]

    M = M[M['count'] > 1]
    M = M[M['weight'] > pivot_value]
    # DataFrame.sort was replaced by sort_values in newer pandas releases;
    # use whichever one this installation provides.
    sort_rows = getattr(M, 'sort_values', None) or M.sort
    M = sort_rows('weight', ascending=False)

    fname = os.path.join(save_path, '%d_%d.tab' % (limit_to, combination_size))
    # Close the handle explicitly so the table is flushed to disk.
    with open(fname, 'w') as fout:
        M.to_csv(fout, sep="\t", index=False)
Beispiel #4
0
    sys.path.append(os.path.join(os.path.expanduser('~'),'Projects/SystemFiles/'))
# Project-wide settings module.
import global_variables as gv

import matplotlib
# Select the non-interactive Agg backend before pyplot is imported.
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import shutil as sh
import xlsxwriter as x
from operator import itemgetter

import scores
import dendrogram

from lib.utils import tools as t
# Genome weight map, loaded once at import time and kept as a module global.
gnm2weight = t.map_genome2weight()
# file2org = {l.split()[0]:l.strip().split()[1] for l in open(os.path.join(gv.project_data_path,'cas1402/file2org.txt')).readlines()}
# file2crispr_type = {l.split('\t')[0]:l.strip().split('\t')[1].split(';') for l in open(os.path.join(gv.project_data_path,'cas1402/file2type.tab'))}

import lib.utils.reporting as r

from lib.db.generic import map_profiles_id2code_code2def
# Only the second element of the returned pair is kept; the call is made at
# import time with the 'cas' argument.
(_, profile_code2def) = map_profiles_id2code_code2def('cas')


def plot_block(block):

    _fname = block[0].strip()

    thresholds = []
    singles = []
# Load the configuration file; cfg and config_file are set up earlier in the
# script (presumably a ConfigParser instance - confirm).
cfg.read(config_file)

# Paths come from the 'NewSystems' section of the configuration.
code_path = cfg.get('NewSystems', 'code_path')
data_path = cfg.get('NewSystems', 'data_path')
# Appended before the project imports below (presumably so that the lib.*
# packages resolve - confirm).
sys.path.append(code_path)
###############################################################

import lib.utils.tools as t
from lib.utils.classes import Kplet
import lib.db.prok1402.db_tools as dt
from lib.db.prok1402.duplets import extract_baited_duplet_aggregates
import numpy as np
import matplotlib.pyplot as plt

# Module-level lookup tables, built once at import time.
file_name2file_id, file_id2file_name = dt.map_baited_file2id()
genome2weight = t.map_genome2weight()


def load_adjacent_duplets_graph():

    duplet_rows = extract_baited_duplet_aggregates()

    duplets = []
    graph = nx.DiGraph()

    for row in duplet_rows:
        kplet_id = row[0]
        profiles = (row[1], row[2])
        number_of_loci = int(row[3])
        weight_of_loci = float(row[4])
        locus_file_ids = row[5].split(",")
Beispiel #6
0
def calculate_profile_based_crispricity_old(cas1402_loci, cas1402_gis,
                                            cas1402_organisms,
                                            prok1402_path_file):
    """Older, genome-weighted crispricity calculation.

    For each comma-separated id in a gene's ``cogid`` and each organism, one
    ProfileInOrganismCount is created the first time the pair is seen. Its
    ``outside`` value is the number of organisms that, for any profile listed
    for that gene id in the global map, hold at least one gi that is not in
    ``cas1402_gis``. ``in_crispr`` is a presence flag (always set to 1).

    Both quantities are multiplied by the genome weight and summed per
    profile; crispricity is ``in_crispr / (in_crispr + outside)`` of those
    sums.

    Output files, written to the current working directory:
        crispricity_gis.tab       rows whose id consists of digits only
        crispricity_profiles.tab  all other rows
        crispricity.png           scatter plot of log10(in_crispr) vs crispricity

    Args:
        cas1402_loci: iterable of loci, each an iterable of genes exposing
            ``cogid``, ``organism`` and ``gid``.
        cas1402_gis: gi collection passed to ``load_maps`` and used as the
            argument of ``set.difference``.
        cas1402_organisms: passed through to ``load_maps``.
        prok1402_path_file: passed through to ``load_maps``.
    """
    print("Loding global maps")
    gid2profiles, profile2orgs2gis = load_maps(
        prok1402_path_file, cas1402_gis, cas1402_organisms)
    print("Loading weights")
    genome_weights = t.map_genome2weight()

    print("Counting in CRISPR loci")

    counts = {}

    for locus in cas1402_loci:
        for gene in locus:
            for profile_id in gene.cogid.split(','):
                per_org = counts.setdefault(profile_id, {})

                if gene.organism not in per_org:
                    entry = ProfileInOrganismCount(gene.organism, profile_id)

                    outside = 0
                    if gene.gid in gid2profiles:
                        for hit in gid2profiles[gene.gid]:
                            for org_gis in profile2orgs2gis[hit].values():
                                # Organism counts once if it holds any gi
                                # that is not part of the CRISPR loci.
                                if org_gis.difference(cas1402_gis):
                                    outside += 1

                    entry.outside = outside
                    per_org[gene.organism] = entry

                # Presence flag, not an occurrence counter.
                per_org[gene.organism].in_crispr = 1

    in_crispr_all = []
    crispricity_all = []
    profiles_all = []

    print("Writing to files")
    with open('crispricity_profiles.tab', 'w') as outf_profiles, \
            open('crispricity_gis.tab', 'w') as outf_gis:

        for profile, per_org in counts.items():
            in_crispr = 0
            everywhere = 0

            for org, entry in per_org.items():
                weight = genome_weights[org]
                in_crispr += entry.in_crispr * weight
                everywhere += (entry.in_crispr + entry.outside) * weight

            crispricity = in_crispr / everywhere

            in_crispr_all.append(in_crispr)
            crispricity_all.append(crispricity)
            profiles_all.append(profile)

            # Ids made of digits only go to the gi table.
            target = outf_gis if profile.isdigit() else outf_profiles
            target.write("%s\t%f\t%f\n" % (profile, in_crispr, crispricity))

    x_values = np.log10(np.asarray(in_crispr_all))
    y_values = np.asarray(crispricity_all)

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(x_values, y_values, s=1)

    plt.xlabel("Effective orcurrence in CRISPR loci (log10)")
    plt.ylabel("X-axis / Effective occurrences")

    plt.savefig('crispricity.png')
Beispiel #7
0
def calculate_profile_based_crispricity(cas1402_loci, cas1402_gis,
                                        cas1402_organisms, prok1402_path_file):
    """Compute count-based crispricity per profile; write a table and a plot.

    For every profile annotated on a gene of the CRISPR loci, occurrences
    inside the loci are counted per organism. ``outside`` is the number of
    gis recorded for that profile and organism in the map returned by
    ``load_maps_simple``. Crispricity is

        in_crispr / (in_crispr + outside)

    summed over all organisms as raw, unweighted counts.

    Args:
        cas1402_loci: iterable of loci, each an iterable of genes exposing
            ``cogid`` (whitespace-separated profile ids) and ``organism``.
        cas1402_gis: passed through to ``load_maps_simple``.
        cas1402_organisms: organisms for which a counter is created per
            profile; every ``gene.organism`` must be among them.
        prok1402_path_file: passed through to ``load_maps_simple``.

    Outputs:
        ``<gv.project_data_path>/cas1402/crispricity_count.tab`` and
        ``first_count.png`` in the current working directory.

    Raises:
        KeyError: if a gene's organism is not in ``cas1402_organisms`` or a
            profile has no entry in the CDD definitions.
    """
    print("Loding global maps")
    global_profile2orgs2gis = load_maps_simple(prok1402_path_file, cas1402_gis)
    # Genome weights are not loaded: this variant works on raw counts only.
    print("Loading CDD definitions")
    profile2def = t.map_cdd_profile2def()

    print("Counting in CRISPR loci")

    profile2orgs2obj = {}

    for locus in cas1402_loci:
        for gene in locus:
            for _cogid in gene.cogid.split():

                if _cogid not in profile2orgs2obj:
                    # First sighting of the profile: create a counter for
                    # every organism, seeded with its outside-of-loci count.
                    if _cogid in global_profile2orgs2gis:
                        orgs2gis = global_profile2orgs2gis[_cogid]
                    else:
                        orgs2gis = {}

                    per_org = {}
                    for _org in cas1402_organisms:
                        org_obj = ProfileInOrganismCount(_org, _cogid)
                        org_obj.outside = len(orgs2gis[_org]) if _org in orgs2gis else 0
                        per_org[_org] = org_obj
                    profile2orgs2obj[_cogid] = per_org

                profile2orgs2obj[_cogid][gene.organism].in_crispr += 1

    out_file = os.path.join(gv.project_data_path,
                            'cas1402/crispricity_count.tab')

    in_crispr_all = []
    crispricity_all = []

    print("Writing to file: %s" % out_file)
    with open(out_file, 'w') as outf:

        outf.write(
            "Profile\tOccurrence in CRISPR loci\tCrispricity\tDefinition\n")

        for profile, per_org in profile2orgs2obj.items():
            in_crispr = 0
            everywhere = 0

            for org_obj in per_org.values():
                in_crispr += org_obj.in_crispr
                everywhere += org_obj.in_crispr + org_obj.outside

            # Both operands are integer counts; float() forces true division
            # so the ratio is not floored to 0 or 1 under Python 2.
            crispricity = float(in_crispr) / everywhere

            in_crispr_all.append(in_crispr)
            crispricity_all.append(crispricity)
            outf.write("%s\t%f\t%f\t%s\n" %
                       (profile, in_crispr, crispricity, profile2def[profile]))

    in_crispr_all = np.log10(np.asarray(in_crispr_all))
    crispricity_all = np.asarray(crispricity_all)

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(in_crispr_all, crispricity_all, s=1)

    plt.xlabel("Effective orcurrence in CRISPR loci (log10)")
    plt.ylabel("X-axis / Effective occurrences")

    plt.savefig('first_count.png')
Beispiel #8
0
def calculate_profile_based_baiticity(bacteria_loci, loci_gis, loci_organisms,
                                      arcog_path_file, bait_profiles,
                                      filter_threshold, save_path):
    """Compute per-profile baiticity and write the tables and a scatter plot.

    For every profile annotated on a gene of ``bacteria_loci`` (bait profiles
    are skipped and every gene id is processed only once), occurrences inside
    the loci are compared with occurrences everywhere (inside plus the
    ``outside`` count taken from ``load_maps_simple``). The ratio is computed
    both on raw counts and on counts multiplied by the genome weight.

    Args:
        bacteria_loci: iterable of loci, each an iterable of genes exposing
            ``gid``, ``cogid`` (whitespace-separated ids) and ``organism``.
        loci_gis: passed through to ``load_maps_simple``.
        loci_organisms: organisms for which a counter is created per profile.
        arcog_path_file: passed through to ``load_maps_simple``.
        bait_profiles: profiles to leave out of the statistics.
        filter_threshold: profiles whose weighted total occurrence is below
            this value are written to ``rare_profiles.tab`` instead.
        save_path: directory receiving ``baiticity.tab``,
            ``rare_profiles.tab`` and ``baiticity.png``.

    Returns:
        None. Results are written to ``save_path``.
    """
    # Hard-coded exclusions carried over unchanged; the reason for them is not
    # recorded in the code.
    excluded_profiles = ('arCOG14077',)
    excluded_organisms = ('Nitrosoarchaeum_koreensis_MY1_MY1',
                          'Nitrosoarchaeum_limnia_SFB1')

    print("Loding global maps")
    global_profile2orgs2gis = load_maps_simple(arcog_path_file, loci_gis)
    print("Loading weights")
    gnm2weight = t.map_genome2weight()
    print("Loading CDD definitions")
    profile2def = t.map_cdd_profile2def()

    profile2def.update(t.map_profile2def())

    print("Counting in loci")

    profile2orgs2obj = {}
    gi_checklist = set()

    for locus in bacteria_loci:
        for gene in locus:

            # A gene shared by several loci is counted once.
            if gene.gid in gi_checklist:
                continue

            for _cogid in gene.cogid.split():

                if _cogid in bait_profiles:
                    continue

                if _cogid not in profile2orgs2obj:
                    if _cogid in global_profile2orgs2gis:
                        orgs2gis = global_profile2orgs2gis[_cogid]
                    else:
                        orgs2gis = {}

                    per_org = {}
                    for _org in loci_organisms:
                        org_obj = ProfileInOrganismCount(_org, _cogid)
                        org_obj.outside = len(orgs2gis[_org]) if _org in orgs2gis else 0
                        per_org[_org] = org_obj
                    profile2orgs2obj[_cogid] = per_org

                profile2orgs2obj[_cogid][gene.organism].in_locus += 1

            gi_checklist.add(gene.gid)

    # The debugging output for one hard-coded profile and the unconditional
    # sys.exit() that used to follow here were removed: they stopped the
    # process before any result was written.

    out_file = os.path.join(save_path, 'baiticity.tab')
    rare_file = os.path.join(save_path, 'rare_profiles.tab')

    in_loci_weight = []
    baiticity_weight = []

    print("Writing to file: %s" % out_file)
    # Context managers close both tables even if a lookup below raises.
    with open(rare_file, 'w') as rare_profiles_file, open(out_file, 'w') as outf:

        rare_profiles_file.write(
            "Profile\tOccurence everywhere\tOccurrence in loci\tBaiticity\tDefinition\n"
        )
        outf.write(
            "Profile\tOccurrence in loci(count)\tBaiticity(count)\tOccurrence in loci(weight)\tBaiticity(weight)\tDefinition\n"
        )

        for profile, per_org in profile2orgs2obj.items():

            if profile in excluded_profiles:
                continue

            in_locus_count = 0
            everywhere_count = 0
            in_locus_weight = 0
            everywhere_weight = 0

            for org, org_obj in per_org.items():

                if org in excluded_organisms:
                    continue

                total = org_obj.in_locus + org_obj.outside
                weight = gnm2weight[org]

                in_locus_count += org_obj.in_locus
                everywhere_count += total

                in_locus_weight += org_obj.in_locus * weight
                everywhere_weight += total * weight

            if not everywhere_count:
                # The profile occurs only in excluded organisms; there is
                # nothing to normalise by, so it is left out.
                continue

            _baiticity_count = 1.0 * in_locus_count / everywhere_count

            # A zero weighted total cannot be used as a denominator; such a
            # profile is reported with the rare ones.
            if everywhere_weight < filter_threshold or not everywhere_weight:
                rare_profiles_file.write(
                    "%s\t%f\t%f\t%f\t%s\n" %
                    (profile, everywhere_count, in_locus_count,
                     _baiticity_count, profile2def[profile]))
                continue

            _baiticity_weight = in_locus_weight / everywhere_weight

            in_loci_weight.append(in_locus_weight)
            baiticity_weight.append(_baiticity_weight)

            outf.write(
                "%s\t%f\t%f\t%f\t%f\t%s\n" %
                (profile, in_locus_count, _baiticity_count, in_locus_weight,
                 _baiticity_weight, profile2def[profile]))

    in_loci_weight = np.log10(np.asarray(in_loci_weight))
    baiticity_weight = np.asarray(baiticity_weight)

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(in_loci_weight, baiticity_weight, s=1)

    plt.xlabel("Effective orcurrence in loci (log10)")
    plt.ylabel("Baiticity")

    image_file = os.path.join(save_path, 'baiticity.png')
    plt.savefig(image_file)
        "pfam13011": 5.0,
        "pfam13276": 4.0,
        "pfam13481": 2.9,
        "pfam13542": 5.0,
        "pfam13551": 5.0,
        "pfam13592": 4.0,
        "pfam13700": 4.0,
        "pfam13817": 4.0,
        "pfam14261": 5.0,
        "pfam14294": 5.0
    }

    profile2def = t.map_cdd_profile2def()
    # gid2arcog_cdd = t.map_gid2arcog_cdd()
    neighborhood_files_path = merged_neighborhoods_path()
    org2weight = t.map_genome2weight()

    pan_data_path = '/panfs/pan1/patternquest/Projects/NewSystems/data/Bacteria/'

    pickle_file = os.path.join(pan_data_path, 'pickle/10000/profile2merged_files.p.bz2')
    profile2files = t.load_compressed_pickle(pickle_file)

    baiticity_file = os.path.join(gv.project_data_path, 'baiticity/bacteria/baiticity.tab')
    profile2baiticity = {l.split()[0]: l.split()[4] for l in open(baiticity_file).readlines()[1:] if l.strip()}

    i = 1

    for highlight_profile in additional_profiles:

        _profile_containing_files = profile2files[highlight_profile]
        file_summaries = merging.get_file_summaries(_profile_containing_files, neighborhood_files_path, org2weight)
Beispiel #10
0
def calculate_profile_based_baiticity(bacteria_loci, loci_gis,
                                      loci_organisms,
                                      prok1402_path_file,
                                      bait_profiles,
                                      filter_threshold,
                                      save_path):
    """Compute per-profile baiticity and write the tables and a scatter plot.

    For every profile annotated on a gene of ``bacteria_loci`` (bait profiles
    are skipped and every gene id is processed only once), occurrences inside
    the loci are compared with occurrences everywhere (inside plus the
    ``outside`` count taken from ``load_maps_simple``). The ratio is computed
    both on raw counts and on counts multiplied by the genome weight.

    Args:
        bacteria_loci: iterable of loci, each an iterable of genes exposing
            ``gid``, ``cogid`` (whitespace-separated ids) and ``organism``.
        loci_gis: passed through to ``load_maps_simple``.
        loci_organisms: organisms for which a counter is created per profile.
        prok1402_path_file: passed through to ``load_maps_simple``.
        bait_profiles: profiles to leave out of the statistics.
        filter_threshold: profiles whose weighted total occurrence is below
            this value are written to ``rare_profiles.tab`` instead.
        save_path: directory receiving ``baiticity.tab``,
            ``rare_profiles.tab`` and ``baiticity.png``.

    Returns:
        None. Results are written to ``save_path``.
    """
    print("Loding global maps")
    global_profile2orgs2gis = load_maps_simple(prok1402_path_file, loci_gis)
    print("Loading weights")
    gnm2weight = t.map_genome2weight()
    print("Loading CDD definitions")
    profile2def = t.map_cdd_profile2def()

    print("Counting in loci")

    profile2orgs2obj = {}
    gi_checklist = set()

    for locus in bacteria_loci:
        for gene in locus:

            # A gene shared by several loci is counted once.
            if gene.gid in gi_checklist:
                continue

            for _cogid in gene.cogid.split():

                if _cogid in bait_profiles:
                    continue

                if _cogid not in profile2orgs2obj:
                    if _cogid in global_profile2orgs2gis:
                        orgs2gis = global_profile2orgs2gis[_cogid]
                    else:
                        orgs2gis = {}

                    per_org = {}
                    for _org in loci_organisms:
                        org_obj = ProfileInOrganismCount(_org, _cogid)
                        org_obj.outside = len(orgs2gis[_org]) if _org in orgs2gis else 0
                        per_org[_org] = org_obj
                    profile2orgs2obj[_cogid] = per_org

                profile2orgs2obj[_cogid][gene.organism].in_locus += 1

            gi_checklist.add(gene.gid)

    out_file = os.path.join(save_path, 'baiticity.tab')
    rare_file = os.path.join(save_path, 'rare_profiles.tab')

    in_loci_weight = []
    baiticity_weight = []

    print("Writing to file: %s" % out_file)
    # Context managers close both tables even if a lookup below raises.
    with open(rare_file, 'w') as rare_profiles_file, open(out_file, 'w') as outf:

        rare_profiles_file.write("Profile\tOccurence everywhere\tOccurrence in loci\tBaiticity\tDefinition\n")
        outf.write("Profile\tOccurrence in loci(count)\tBaiticity(count)\tOccurrence in loci(weight)\tBaiticity(weight)\tDefinition\n")

        for profile, per_org in profile2orgs2obj.items():

            in_locus_count = 0
            everywhere_count = 0
            in_locus_weight = 0
            everywhere_weight = 0

            for org, org_obj in per_org.items():
                total = org_obj.in_locus + org_obj.outside
                weight = gnm2weight[org]

                in_locus_count += org_obj.in_locus
                everywhere_count += total

                in_locus_weight += org_obj.in_locus * weight
                everywhere_weight += total * weight

            _baiticity_count = 1.0 * in_locus_count / everywhere_count
            _baiticity_weight = in_locus_weight / everywhere_weight

            if everywhere_weight < filter_threshold:
                # Too rare for the main table and the plot.
                rare_profiles_file.write("%s\t%f\t%f\t%f\t%s\n" % (profile,
                                                                    everywhere_count,
                                                                    in_locus_count,
                                                                    _baiticity_count,
                                                                    profile2def[profile]))
                continue

            in_loci_weight.append(in_locus_weight)
            baiticity_weight.append(_baiticity_weight)

            outf.write("%s\t%f\t%f\t%f\t%f\t%s\n" % (profile,
                                                      in_locus_count,
                                                      _baiticity_count,
                                                      in_locus_weight,
                                                      _baiticity_weight,
                                                      profile2def[profile]))

    in_loci_weight = np.log10(np.asarray(in_loci_weight))
    baiticity_weight = np.asarray(baiticity_weight)

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(in_loci_weight, baiticity_weight, s=1)

    plt.xlabel("Effective orcurrence in loci (log10)")
    plt.ylabel("Baiticity")

    image_file = os.path.join(save_path, 'baiticity.png')
    plt.savefig(image_file)
Beispiel #11
0
def calculate_profile_based_crispricity_old(cas1402_loci, cas1402_gis, cas1402_organisms, prok1402_path_file):
    """Older, genome-weighted crispricity calculation.

    For each comma-separated id in a gene's ``cogid`` and each organism, one
    ProfileInOrganismCount is created the first time the pair is seen. Its
    ``outside`` value is the number of organisms that, for any profile listed
    for that gene id in the global map, hold at least one gi that is not in
    ``cas1402_gis``. ``in_crispr`` is a presence flag (always set to 1).

    Both quantities are multiplied by the genome weight and summed per
    profile; crispricity is ``in_crispr / (in_crispr + outside)`` of those
    sums.

    Output files, written to the current working directory:
        crispricity_gis.tab       rows whose id consists of digits only
        crispricity_profiles.tab  all other rows
        crispricity.png           scatter plot of log10(in_crispr) vs crispricity

    Args:
        cas1402_loci: iterable of loci, each an iterable of genes exposing
            ``cogid``, ``organism`` and ``gid``.
        cas1402_gis: gi collection passed to ``load_maps`` and used as the
            argument of ``set.difference``.
        cas1402_organisms: passed through to ``load_maps``.
        prok1402_path_file: passed through to ``load_maps``.
    """
    print("Loding global maps")
    gid2profiles, profile2orgs2gis = load_maps(prok1402_path_file, cas1402_gis, cas1402_organisms)
    print("Loading weights")
    genome_weights = t.map_genome2weight()

    print("Counting in CRISPR loci")

    counts = {}

    for locus in cas1402_loci:
        for gene in locus:
            for profile_id in gene.cogid.split(','):
                per_org = counts.setdefault(profile_id, {})

                if gene.organism not in per_org:
                    entry = ProfileInOrganismCount(gene.organism, profile_id)

                    outside = 0
                    if gene.gid in gid2profiles:
                        for hit in gid2profiles[gene.gid]:
                            for org_gis in profile2orgs2gis[hit].values():
                                # Organism counts once if it holds any gi
                                # that is not part of the CRISPR loci.
                                if org_gis.difference(cas1402_gis):
                                    outside += 1

                    entry.outside = outside
                    per_org[gene.organism] = entry

                # Presence flag, not an occurrence counter.
                per_org[gene.organism].in_crispr = 1

    in_crispr_all = []
    crispricity_all = []
    profiles_all = []

    print("Writing to files")
    with open('crispricity_profiles.tab', 'w') as outf_profiles, \
            open('crispricity_gis.tab', 'w') as outf_gis:

        for profile, per_org in counts.items():
            in_crispr = 0
            everywhere = 0

            for org, entry in per_org.items():
                weight = genome_weights[org]
                in_crispr += entry.in_crispr * weight
                everywhere += (entry.in_crispr + entry.outside) * weight

            crispricity = in_crispr / everywhere

            in_crispr_all.append(in_crispr)
            crispricity_all.append(crispricity)
            profiles_all.append(profile)

            # Ids made of digits only go to the gi table.
            target = outf_gis if profile.isdigit() else outf_profiles
            target.write("%s\t%f\t%f\n" % (profile, in_crispr, crispricity))

    x_values = np.log10(np.asarray(in_crispr_all))
    y_values = np.asarray(crispricity_all)

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(x_values, y_values, s=1)

    plt.xlabel("Effective orcurrence in CRISPR loci (log10)")
    plt.ylabel("X-axis / Effective occurrences")

    plt.savefig('crispricity.png')
Beispiel #12
0
def calculate_profile_based_crispricity(cas1402_loci, cas1402_gis, cas1402_organisms, prok1402_path_file):
    """Compute count-based crispricity per profile; write a table and a plot.

    For every profile annotated on a gene of the CRISPR loci, occurrences
    inside the loci are counted per organism. ``outside`` is the number of
    gis recorded for that profile and organism in the map returned by
    ``load_maps_simple``. Crispricity is

        in_crispr / (in_crispr + outside)

    summed over all organisms as raw, unweighted counts.

    Args:
        cas1402_loci: iterable of loci, each an iterable of genes exposing
            ``cogid`` (whitespace-separated profile ids) and ``organism``.
        cas1402_gis: passed through to ``load_maps_simple``.
        cas1402_organisms: organisms for which a counter is created per
            profile; every ``gene.organism`` must be among them.
        prok1402_path_file: passed through to ``load_maps_simple``.

    Outputs:
        ``<gv.project_data_path>/cas1402/crispricity_count.tab`` and
        ``first_count.png`` in the current working directory.

    Raises:
        KeyError: if a gene's organism is not in ``cas1402_organisms`` or a
            profile has no entry in the CDD definitions.
    """
    print("Loding global maps")
    global_profile2orgs2gis = load_maps_simple(prok1402_path_file, cas1402_gis)
    # Genome weights are not loaded: this variant works on raw counts only.
    print("Loading CDD definitions")
    profile2def = t.map_cdd_profile2def()

    print("Counting in CRISPR loci")

    profile2orgs2obj = {}

    for locus in cas1402_loci:
        for gene in locus:
            for _cogid in gene.cogid.split():

                if _cogid not in profile2orgs2obj:
                    # First sighting of the profile: create a counter for
                    # every organism, seeded with its outside-of-loci count.
                    if _cogid in global_profile2orgs2gis:
                        orgs2gis = global_profile2orgs2gis[_cogid]
                    else:
                        orgs2gis = {}

                    per_org = {}
                    for _org in cas1402_organisms:
                        org_obj = ProfileInOrganismCount(_org, _cogid)
                        org_obj.outside = len(orgs2gis[_org]) if _org in orgs2gis else 0
                        per_org[_org] = org_obj
                    profile2orgs2obj[_cogid] = per_org

                profile2orgs2obj[_cogid][gene.organism].in_crispr += 1

    out_file = os.path.join(gv.project_data_path, 'cas1402/crispricity_count.tab')

    in_crispr_all = []
    crispricity_all = []

    print("Writing to file: %s" % out_file)
    with open(out_file, 'w') as outf:

        outf.write("Profile\tOccurrence in CRISPR loci\tCrispricity\tDefinition\n")

        for profile, per_org in profile2orgs2obj.items():
            in_crispr = 0
            everywhere = 0

            for org_obj in per_org.values():
                in_crispr += org_obj.in_crispr
                everywhere += org_obj.in_crispr + org_obj.outside

            # Both operands are integer counts; float() forces true division
            # so the ratio is not floored to 0 or 1 under Python 2.
            crispricity = float(in_crispr) / everywhere

            in_crispr_all.append(in_crispr)
            crispricity_all.append(crispricity)
            outf.write("%s\t%f\t%f\t%s\n" % (profile, in_crispr, crispricity, profile2def[profile]))

    in_crispr_all = np.log10(np.asarray(in_crispr_all))
    crispricity_all = np.asarray(crispricity_all)

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(in_crispr_all, crispricity_all, s=1)

    plt.xlabel("Effective orcurrence in CRISPR loci (log10)")
    plt.ylabel("X-axis / Effective occurrences")

    plt.savefig('first_count.png')
Beispiel #13
0
def count_profiles_in_neighborhoods(neighborhoods_path, save_path, limit_to,
                                    combination_size):
    """Rank profiles by weight and tabulate the heaviest profile combinations.

    Steps:
      1. Read the selected arCOG ids; these profiles are excluded from counting.
      2. For every neighborhood file, add the genome weight of its organism
         (looked up through the source of its first gene) to each remaining
         profile found on its genes.
      3. Count combinations of the ``limit_to`` heaviest profiles with
         ``count_combinations``.
      4. Keep combinations seen more than once whose weight exceeds the pivot
         (the weight at which the descending cumulative weight share first
         reaches 0.9) and write them, sorted by weight, to
         ``<save_path>/<limit_to>_<combination_size>.tab``.

    Args:
        neighborhoods_path: directory with the neighborhood files.
        save_path: directory for the resulting combination table.
        limit_to: number of top profiles to combine.
        combination_size: size of the profile combinations to count.

    Returns:
        None. The table is written to disk and its path is printed.

    Raises:
        KeyError: if an organism with at least one counted profile has no
            entry in the genome weight map.
    """
    selected_path = os.path.join(gv.project_data_path, 'Archea',
                                 'arCOG/selected_arcogs.txt')
    # A set gives constant-time membership tests in the inner loop and the
    # context manager closes the file handle.
    with open(selected_path) as selected_file:
        target_profiles = set(l.strip() for l in selected_file)

    src2org = t.map_src2org()
    gnm2weight = t.map_genome2weight()
    neighborhoods = [
        cl.Neighborhood(os.path.join(neighborhoods_path, f))
        for f in os.listdir(neighborhoods_path)
    ]

    profile_stats = {}
    for nbr in neighborhoods:
        org_name = src2org[nbr.genes[0].src]
        for g in nbr.genes:
            # An empty cogid splits into no items, so it needs no special case.
            for tmp_cog in g.cogid.split():
                if tmp_cog in target_profiles:
                    continue
                # The weight is looked up only when a profile is counted, so
                # an organism without a weight fails only in that case.
                if tmp_cog in profile_stats:
                    profile_stats[tmp_cog].weight += gnm2weight[org_name]
                    profile_stats[tmp_cog].count += 1
                else:
                    profile_stats[tmp_cog] = cl.ProfileCount(
                        1, gnm2weight[org_name])

    profile_weights = sorted(((k, v.weight) for k, v in profile_stats.items()),
                             key=itemgetter(1), reverse=True)

    top_profiles = [k for (k, _) in profile_weights[:limit_to]]
    print('started counting')
    counted_combinations = count_combinations(neighborhoods, top_profiles,
                                              combination_size, src2org,
                                              gnm2weight)
    print('Done counting')

    # Weights in descending order; the pivot is the first weight at which the
    # cumulative share of the total reaches 90%.
    weight_values = np.array([v.weight for v in counted_combinations.values()])
    weight_values.sort()
    weight_values = weight_values[::-1]
    pivot_ind = np.where(
        np.cumsum(weight_values) / np.sum(weight_values) >= 0.9)[0][0]
    pivot_value = weight_values[pivot_ind]

    M = pd.DataFrame([], columns=['Comb', 'weight', 'count'])
    M['Comb'] = list(counted_combinations.keys())
    M['weight'] = [v.weight for v in counted_combinations.values()]
    M['count'] = [v.count for v in counted_combinations.values()]

    M = M[M['count'] > 1]
    M = M[M['weight'] > pivot_value]
    # DataFrame.sort was replaced by sort_values in newer pandas releases;
    # use whichever one this installation provides.
    sort_rows = getattr(M, 'sort_values', None) or M.sort
    M = sort_rows('weight', ascending=False)

    fname = os.path.join(save_path, '%d_%d.tab' % (limit_to, combination_size))
    print(fname)
    # Close the handle explicitly so the table is flushed to disk.
    with open(fname, 'w') as fout:
        M.to_csv(fout, sep="\t", index=False)
def extract_all_duplets_from_prok1402():
    """
    Extract profile duplets from every .pty file found under the Prok1402 tree.

    Two kinds of duplets are accumulated in pair2weight / pair2count:
      * intra-gene duplets: every pair of profiles assigned to the same gene;
      * adjacent duplets: every pair made of one profile from a gene and one
        profile from the gene that immediately follows it in the file.

    A gene without profiles breaks adjacency: its neighbours are not paired
    with each other across it.

    The overall abundance of each profile is accumulated in profile2weight.
    All weights are the weight of the genome, where the genome name is taken
    from the name of the directory that holds the .pty file.

    Results are written as tab-separated files into work_dir, sorted by
    descending weight:
      * prok1402_adj_duplets_weights.txt: profile_1, profile_2, weight
      * prok1402_profile_abundance.txt:   profile, weight
    """
    pty_path = "/panfs/pan1/patternquest/data/Pty/genomes/"
    work_dir = os.path.join(data_path, 'prok1402/graph/graph_files/')

    print("Loading dictionaries")
    gi2profiles = t.map_gi2profiles()
    genome2weight = t.map_genome2weight()
    pair2weight = defaultdict(float)
    pair2count = defaultdict(int)
    profile2weight = defaultdict(float)

    def _record_duplets(duplets, genome):
        # Register each duplet under an order-independent "a-b" key.
        # The genome weight is looked up lazily, so a genome that is missing
        # from genome2weight only matters if it actually contributes a duplet.
        for duplet in duplets:
            [kplet_1, kplet_2] = sorted(duplet)
            key = "%s-%s" % (kplet_1, kplet_2)
            t.update_dictionary(pair2weight, key, genome2weight[genome])
            t.update_dictionary(pair2count, key, 1)

    print("Reading Prok1402")
    for root, dirs, files in os.walk(pty_path):
        for f in files:

            if not f.endswith(".pty"):
                continue

            file_name = os.path.join(root, f)
            genome = os.path.basename(root)

            genes = t.parse_pty_file(file_name)

            # An empty gene list simply contributes nothing (the loop below
            # does not run); the original indexed genes[0] unconditionally.
            previous_profiles = []

            for gene in genes:
                gene.profiles = gi2profiles[gene.gid]
                cur_profiles = gene.profiles

                for profile in cur_profiles:
                    t.update_dictionary(profile2weight, profile, genome2weight[genome])

                # Intra-gene duplets of the *current* gene. combinations()
                # yields nothing when there are fewer than two profiles.
                _record_duplets(combinations(cur_profiles, 2), genome)

                # Adjacent duplets, only when the preceding gene had profiles.
                if previous_profiles:
                    _record_duplets(product(previous_profiles, cur_profiles), genome)

                previous_profiles = cur_profiles

    print("Writing to files")
    with open(os.path.join(work_dir, "prok1402_adj_duplets_weights.txt"), "w") as outf:

        for (key, weight) in sorted(pair2weight.items(), key=lambda x: x[1], reverse=True):
            # NOTE(review): assumes profile names never contain "-"; otherwise
            # this unpacking fails -- confirm against the profile naming scheme.
            [kplet_1, kplet_2] = key.split("-")
            outf.write("%s\t%s\t%f\n" % (kplet_1, kplet_2, weight))

    with open(os.path.join(work_dir, "prok1402_profile_abundance.txt"), "w") as outf:
        for (profile, weight) in sorted(profile2weight.items(), key=lambda x: x[1], reverse=True):
            outf.write("%s\t%f\n" % (profile, weight))