def merge_into_file_summaries(kplets, neighborhood_files_path, file2src_src2org_map, data_type='bacteria'):
    """Build one weighted summary per neighborhood file referenced by ``kplets``.

    Every file name found in ``kplet.files`` is loaded as a ``Neighborhood``,
    its flanks are extended, and it is wrapped in a ``NeighborhoodFileSummary``
    together with the kplets that reference it. The summaries are sorted by
    ``weight`` (descending) and the genome weight of each summary is then
    accumulated for every profile id in its genes' ``cogid`` field.

    Args:
        kplets: iterable of objects exposing a ``files`` iterable.
        neighborhood_files_path: directory the file names are joined to.
        file2src_src2org_map: callable that receives the file names and
            returns the pair ``(file -> source, source -> organism)``.
        data_type: not used by the active code (only the disabled trimming
            step referred to it); kept for interface compatibility.

    Returns:
        ``(src2org, file_summaries, community, community_count,
        community_count_with_flanks, total_weight)``. ``community`` is always
        an empty list. ``community_count`` ignores genes tagged ``'flank'``;
        ``community_count_with_flanks`` includes every gene.
    """
    # Loaded once and reused for both passes below.
    org2weight = t.map_genome2weight()

    # Group the kplets by the neighborhood file they occur in.
    file2kplets = dict()
    for kplet in kplets:
        for file_name in kplet.files:
            file2kplets.setdefault(file_name, []).append(kplet)

    kplet_files = list(file2kplets.keys())
    file2src, src2org = file2src_src2org_map(kplet_files)

    file_summaries = []
    for file_name in kplet_files:
        neighborhood = Neighborhood(os.path.join(neighborhood_files_path, file_name))
        src = file2src[file_name]
        org = src2org[src]
        weight = org2weight[org]
        # NOTE(review): _gid2arcog_cdd is not defined in this function;
        # presumably a module-level map - confirm.
        neighborhood.extend_flanks(10,
                                   os.path.join(gv.pty_data_path, org, "%s.pty" % src),
                                   _gid2arcog_cdd)
        file_summaries.append(NeighborhoodFileSummary(file_name,
                                                      file2kplets[file_name],
                                                      neighborhood,
                                                      org,
                                                      src,
                                                      weight))

    # The trimming step (trim_file_summary_list), the matching clean-up of the
    # file -> source map and the "fewer than two summaries" early exit are
    # currently disabled.

    file_summaries.sort(key=lambda summary: summary.weight, reverse=True)

    community_count_with_flanks = {}
    community_count = {}
    total_weight = 0

    for summary in file_summaries:
        weight = org2weight[summary.org]
        total_weight += weight
        for gene in summary.neighborhood.genes:
            is_flank = gene.tag == 'flank'
            for profile in gene.cogid.split():
                t.update_dictionary(community_count_with_flanks, profile, weight)
                if not is_flank:
                    t.update_dictionary(community_count, profile, weight)

    community = []

    return src2org, file_summaries, community, community_count, community_count_with_flanks, total_weight
def get_flank_distributions(kplets_2d_list, neighborhood_path, target_profiles):
    """Tally weighted profile occurrences for each group of kplets.

    For every inner list of ``kplets_2d_list`` the neighborhood files of all
    its kplets are loaded, and each gene contributes the genome weight of its
    organism to every profile id in its ``cogid`` field.

    ``target_profiles`` is accepted but not used: the filter that consulted
    it is disabled.

    Returns:
        ``(flanking_genes_count, cog2gids, gid2weight)``: two lists holding
        one dict per kplet group (filled through ``t.update_dictionary`` and
        ``t.update_dictionary_set`` respectively) and one dict mapping
        ``int(gene.gid)`` to the weight of the gene's organism.
    """
    genome_weights = t.map_genome2weight()

    counts_per_group = []
    gids_per_group = []
    gid2weight = {}

    for kplet_group in kplets_2d_list:
        group_counts = {}
        group_gids = {}

        for kplet in kplet_group:
            loaded = [Neighborhood(os.path.join(neighborhood_path, name))
                      for name in kplet.files]
            for nbr in loaded:
                for gene in nbr.genes:
                    gid = int(gene.gid)
                    weight = genome_weights[gene.organism]
                    gid2weight[gid] = weight
                    for profile in gene.cogid.split():
                        t.update_dictionary(group_counts, profile, weight)
                        t.update_dictionary_set(group_gids, profile, set([gid]))

        counts_per_group.append(group_counts)
        gids_per_group.append(group_gids)

    return counts_per_group, gids_per_group, gid2weight
def count_profiles_in_neighborhoods(neighborhoods_path, save_path, limit_to, combination_size):
    """Rank profiles by weighted abundance and tabulate their top combinations.

    Steps:
      1. Load every neighborhood file found in ``neighborhoods_path``.
      2. Accumulate, per profile id, an occurrence count and the summed genome
         weight, ignoring the profiles listed in ``selected_arcogs.txt``.
      3. Write the ``limit_to`` heaviest profiles to
         ``files/profile_weights.tab`` (relative to the working directory).
      4. Count combinations of those profiles with ``count_combinations``.
      5. Keep combinations seen more than once whose weight is above the
         pivot weight (the weight at which the descending cumulative sum
         reaches 90% of the total) and write them, sorted by weight, to
         ``<save_path>/<limit_to>_<combination_size>.tab``.

    Args:
        neighborhoods_path: directory containing the neighborhood files.
        save_path: directory receiving the combinations table.
        limit_to: number of top profiles to keep.
        combination_size: forwarded to ``count_combinations``.
    """
    selected_path = os.path.join(gv.project_data_path, 'Archea', 'arCOG/selected_arcogs.txt')
    # A set gives constant-time membership tests in the inner loop; the file
    # is closed as soon as it has been read.
    with open(selected_path) as selected_file:
        target_profiles = set(line.strip() for line in selected_file)

    src2org = t.map_src2org()
    gnm2weight = t.map_genome2weight()
    neighborhoods = [cl.Neighborhood(os.path.join(neighborhoods_path, f))
                     for f in os.listdir(neighborhoods_path)]

    profile_stats = {}
    for nbr in neighborhoods:
        # The organism is taken from the first gene of the neighborhood.
        org_name = src2org[nbr.genes[0].src]
        # Genomes without a known weight count with weight 1.
        org_weight = gnm2weight[org_name] if org_name in gnm2weight else 1
        for gene in nbr.genes:
            for profile in gene.cogid.split():
                if profile in target_profiles:
                    continue
                if profile in profile_stats:
                    profile_stats[profile].weight += org_weight
                    profile_stats[profile].count += 1
                else:
                    profile_stats[profile] = cl.ProfileCount(1, org_weight)

    profile_weights = sorted(((k, v.weight) for k, v in profile_stats.items()),
                             key=itemgetter(1), reverse=True)
    top_weights = profile_weights[:limit_to]

    with open('files/profile_weights.tab', 'w') as f:
        for profile, weight in top_weights:
            f.write('%f\t%s\n' % (weight, profile))

    top_profiles = [profile for profile, _ in top_weights]

    print('started counting')
    counted_combinations = count_combinations(neighborhoods, top_profiles, combination_size, src2org, gnm2weight)
    print('Done counting')

    combos = list(counted_combinations.items())

    weight_values = np.sort(np.array([v.weight for _, v in combos]))[::-1]
    total = np.sum(weight_values)
    if total > 0:
        pivot_ind = np.where(np.cumsum(weight_values) / total >= 0.9)[0][0]
        pivot_value = weight_values[pivot_ind]
    else:
        # No combinations (or no weight at all): nothing can pass the filter,
        # so only the header row is written instead of failing on an empty
        # index lookup.
        pivot_value = 0

    M = pd.DataFrame([], columns=['Comb', 'weight', 'count'])
    M['Comb'] = [k for k, _ in combos]
    M['weight'] = [v.weight for _, v in combos]
    M['count'] = [v.count for _, v in combos]

    M = M[M['count'] > 1]
    M = M[M['weight'] > pivot_value]
    # DataFrame.sort was replaced by sort_values in newer pandas; support both.
    if hasattr(M, 'sort_values'):
        M = M.sort_values('weight', ascending=False)
    else:
        M = M.sort('weight', ascending=False)

    fname = os.path.join(save_path, '%d_%d.tab' % (limit_to, combination_size))
    with open(fname, 'w') as fout:
        M.to_csv(fout, sep="\t", index=False)
sys.path.append(os.path.join(os.path.expanduser('~'),'Projects/SystemFiles/')) import global_variables as gv import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt import numpy as np import shutil as sh import xlsxwriter as x from operator import itemgetter import scores import dendrogram from lib.utils import tools as t gnm2weight = t.map_genome2weight() # file2org = {l.split()[0]:l.strip().split()[1] for l in open(os.path.join(gv.project_data_path,'cas1402/file2org.txt')).readlines()} # file2crispr_type = {l.split('\t')[0]:l.strip().split('\t')[1].split(';') for l in open(os.path.join(gv.project_data_path,'cas1402/file2type.tab'))} import lib.utils.reporting as r from lib.db.generic import map_profiles_id2code_code2def (_, profile_code2def) = map_profiles_id2code_code2def('cas') def plot_block(block): _fname = block[0].strip() thresholds = [] singles = []
cfg.read(config_file) code_path = cfg.get('NewSystems', 'code_path') data_path = cfg.get('NewSystems', 'data_path') sys.path.append(code_path) ############################################################### import lib.utils.tools as t from lib.utils.classes import Kplet import lib.db.prok1402.db_tools as dt from lib.db.prok1402.duplets import extract_baited_duplet_aggregates import numpy as np import matplotlib.pyplot as plt file_name2file_id, file_id2file_name = dt.map_baited_file2id() genome2weight = t.map_genome2weight() def load_adjacent_duplets_graph(): duplet_rows = extract_baited_duplet_aggregates() duplets = [] graph = nx.DiGraph() for row in duplet_rows: kplet_id = row[0] profiles = (row[1], row[2]) number_of_loci = int(row[3]) weight_of_loci = float(row[4]) locus_file_ids = row[5].split(",")
def calculate_profile_based_crispricity_old(cas1402_loci, cas1402_gis, cas1402_organisms, prok1402_path_file):
    """Legacy, weight-based crispricity report.

    For each comma-separated id in a gene's ``cogid`` and each organism it is
    seen in, presence inside the loci is recorded (``in_crispr`` is set to 1,
    not incremented). The ``outside`` value is computed once per
    (id, organism) pair from the maps returned by ``load_maps``: it counts
    the per-organism entries whose ``difference(cas1402_gis)`` is non-empty
    (presumably gi sets - confirm).

    Both quantities are multiplied by the genome weight of the organism and
    summed per id; ``crispricity`` is their ratio. Ids consisting only of
    digits are written to ``crispricity_gis.tab``, all others to
    ``crispricity_profiles.tab``, and a scatter plot is saved as
    ``crispricity.png``. All three files go to the working directory.
    """
    print("Loding global maps")
    gid2profiles, profile2orgs2gis = load_maps(prok1402_path_file, cas1402_gis, cas1402_organisms)

    print("Loading weights")
    genome_weights = t.map_genome2weight()

    print("Counting in CRISPR loci")
    tallies = {}
    for locus in cas1402_loci:
        for gene in locus:
            for profile_id in gene.cogid.split(','):
                per_org = tallies.setdefault(profile_id, {})

                if gene.organism not in per_org:
                    tally = ProfileInOrganismCount(gene.organism, profile_id)
                    outside = 0
                    if gene.gid in gid2profiles:
                        for other_profile in gid2profiles[gene.gid]:
                            gis_by_org = profile2orgs2gis[other_profile]
                            for other_org in gis_by_org:
                                if gis_by_org[other_org].difference(cas1402_gis):
                                    outside += 1
                    tally.outside = outside
                    per_org[gene.organism] = tally

                per_org[gene.organism].in_crispr = 1

    # One (id, weighted occurrence, crispricity) record per id.
    records = []

    print("Writing to files")
    with open('crispricity_profiles.tab', 'w') as profiles_out, \
            open('crispricity_gis.tab', 'w') as gis_out:
        for profile_id in tallies:
            weighted_inside = 0
            weighted_total = 0
            for organism, tally in tallies[profile_id].items():
                weighted_inside += tally.in_crispr * genome_weights[organism]
                weighted_total += (tally.in_crispr + tally.outside) * genome_weights[organism]

            ratio = weighted_inside / weighted_total
            records.append((profile_id, weighted_inside, ratio))

            target = gis_out if profile_id.isdigit() else profiles_out
            target.write("%s\t%f\t%f\n" % (profile_id, weighted_inside, ratio))

    x_values = np.log10(np.asarray([record[1] for record in records]))
    y_values = np.asarray([record[2] for record in records])

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(x_values, y_values, s=1)
    plt.xlabel("Effective orcurrence in CRISPR loci (log10)")
    plt.ylabel("X-axis / Effective occurrences")
    plt.savefig('crispricity.png')
def calculate_profile_based_crispricity(cas1402_loci, cas1402_gis, cas1402_organisms, prok1402_path_file):
    """Write the count-based crispricity of every profile found in the loci.

    For each profile id (whitespace-separated in ``gene.cogid``) a
    ``ProfileInOrganismCount`` is kept for every organism in
    ``cas1402_organisms``. ``outside`` is the number of entries
    ``load_maps_simple`` reports for that profile and organism; ``in_crispr``
    is incremented for each occurrence inside the loci.

    ``crispricity = in_crispr / (in_crispr + outside)``, summed over all
    organisms and computed as a true (floating point) ratio. Genome weights
    do not enter this count-based statistic, so they are not loaded.

    Output:
        ``<gv.project_data_path>/cas1402/crispricity_count.tab`` with columns
        profile, occurrence in loci, crispricity, definition; and a scatter
        plot ``first_count.png`` in the working directory.

    Raises:
        KeyError: if a gene's organism is missing from ``cas1402_organisms``
            or a profile has no entry in the CDD definition map.
    """
    print("Loding global maps")
    global_profile2orgs2gis = load_maps_simple(prok1402_path_file, cas1402_gis)
    print("Loading CDD definitions")
    profile2def = t.map_cdd_profile2def()

    print("Counting in CRISPR loci")
    profile2orgs2obj = {}
    for locus in cas1402_loci:
        for gene in locus:
            for _cogid in gene.cogid.split():
                if _cogid not in profile2orgs2obj:
                    # First sighting: create a counter for every organism and
                    # seed it with the number of occurrences outside the loci.
                    known_outside = _cogid in global_profile2orgs2gis
                    per_org = {}
                    for _org in cas1402_organisms:
                        _orgObj = ProfileInOrganismCount(_org, _cogid)
                        if known_outside and _org in global_profile2orgs2gis[_cogid]:
                            _orgObj.outside = len(global_profile2orgs2gis[_cogid][_org])
                        else:
                            _orgObj.outside = 0
                        per_org[_org] = _orgObj
                    profile2orgs2obj[_cogid] = per_org

                profile2orgs2obj[_cogid][gene.organism].in_crispr += 1

    out_file = os.path.join(gv.project_data_path, 'cas1402/crispricity_count.tab')

    in_crispr_all = []
    crispricity_all = []

    print("Writing to file: %s" % out_file)
    with open(out_file, 'w') as outf:
        outf.write("Profile\tOccurrence in CRISPR loci\tCrispricity\tDefinition\n")

        for profile in profile2orgs2obj:
            in_crispr = 0
            everywhere = 0
            for _org in profile2orgs2obj[profile].values():
                in_crispr += _org.in_crispr
                everywhere += _org.in_crispr + _org.outside

            # Both operands are integer counts: force float division, otherwise
            # Python 2 floors the ratio to 0 or 1.
            crispricity = float(in_crispr) / everywhere

            in_crispr_all.append(in_crispr)
            crispricity_all.append(crispricity)

            outf.write("%s\t%f\t%f\t%s\n" % (profile, in_crispr, crispricity, profile2def[profile]))

    in_crispr_all = np.log10(np.asarray(in_crispr_all))
    crispricity_all = np.asarray(crispricity_all)

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(in_crispr_all, crispricity_all, s=1)
    plt.xlabel("Effective orcurrence in CRISPR loci (log10)")
    plt.ylabel("X-axis / Effective occurrences")
    plt.savefig('first_count.png')
def calculate_profile_based_baiticity(bacteria_loci, loci_gis, loci_organisms, arcog_path_file, bait_profiles, filter_threshold, save_path):
    """Compute the baiticity of every non-bait profile found in the loci.

    Each gene (identified by ``gene.gid``) is counted once. For every profile
    id in its ``cogid`` that is not in ``bait_profiles`` a
    ``ProfileInOrganismCount`` is kept for every organism in
    ``loci_organisms``: ``outside`` is the number of entries
    ``load_maps_simple`` reports for that profile and organism, ``in_locus``
    is incremented per occurrence inside the loci.

    Baiticity is ``in_locus / (in_locus + outside)`` summed over organisms,
    reported both as raw counts and weighted by genome weight.

    Args:
        bacteria_loci: iterable of loci, each an iterable of genes exposing
            ``gid``, ``cogid`` and ``organism``.
        loci_gis: forwarded to ``load_maps_simple``.
        loci_organisms: organisms for which counters are created.
        arcog_path_file: forwarded to ``load_maps_simple``.
        bait_profiles: profile ids to ignore.
        filter_threshold: profiles whose weighted total occurrence is below
            this value go to ``rare_profiles.tab`` instead of the main table.
        save_path: directory receiving ``baiticity.tab``,
            ``rare_profiles.tab`` and ``baiticity.png``.

    Raises:
        KeyError: if a gene's organism is missing from ``loci_organisms``, a
            counted organism has no genome weight, or a profile has no
            definition.
    """
    # Hard-coded exclusions carried over unchanged.
    # NOTE(review): the reason for excluding these is not visible here.
    skipped_profiles = ('arCOG14077',)
    skipped_organisms = ('Nitrosoarchaeum_koreensis_MY1_MY1',
                         'Nitrosoarchaeum_limnia_SFB1')

    print("Loding global maps")
    global_profile2orgs2gis = load_maps_simple(arcog_path_file, loci_gis)
    print("Loading weights")
    gnm2weight = t.map_genome2weight()
    print("Loading CDD definitions")
    profile2def = t.map_cdd_profile2def()
    profile2def.update(t.map_profile2def())

    print("Counting in loci")
    profile2orgs2obj = {}
    gi_checklist = set()

    for locus in bacteria_loci:
        for gene in locus:
            # A gene shared by several loci is counted only once.
            if gene.gid in gi_checklist:
                continue

            for _cogid in gene.cogid.split():
                if _cogid in bait_profiles:
                    continue

                if _cogid not in profile2orgs2obj:
                    known_outside = _cogid in global_profile2orgs2gis
                    per_org = {}
                    for _org in loci_organisms:
                        _orgObj = ProfileInOrganismCount(_org, _cogid)
                        if known_outside and _org in global_profile2orgs2gis[_cogid]:
                            _orgObj.outside = len(global_profile2orgs2gis[_cogid][_org])
                        else:
                            _orgObj.outside = 0
                        per_org[_org] = _orgObj
                    profile2orgs2obj[_cogid] = per_org

                profile2orgs2obj[_cogid][gene.organism].in_locus += 1

            gi_checklist.add(gene.gid)

    # The debugging probe for one specific profile and the sys.exit() that
    # followed it were removed: they ended the run before any result was
    # written.

    out_file = os.path.join(save_path, 'baiticity.tab')
    rare_file = os.path.join(save_path, 'rare_profiles.tab')

    in_loci_weight = []
    baiticity_weight = []

    print("Writing to file: %s" % out_file)
    with open(rare_file, 'w') as rare_profiles_file, open(out_file, 'w') as outf:
        rare_profiles_file.write(
            "Profile\tOccurence everywhere\tOccurrence in loci\tBaiticity\tDefinition\n")
        outf.write(
            "Profile\tOccurrence in loci(count)\tBaiticity(count)\tOccurrence in loci(weight)\tBaiticity(weight)\tDefinition\n")

        for profile in profile2orgs2obj:
            if profile in skipped_profiles:
                continue

            in_locus_count = 0
            everywhere_count = 0
            in_locus_weight = 0
            everywhere_weight = 0

            for org in profile2orgs2obj[profile]:
                if org in skipped_organisms:
                    continue
                _org = profile2orgs2obj[profile][org]
                in_locus_count += _org.in_locus
                everywhere_count += _org.in_locus + _org.outside
                in_locus_weight += _org.in_locus * gnm2weight[org]
                everywhere_weight += (_org.in_locus + _org.outside) * gnm2weight[org]

            # The profile occurs only in excluded organisms: there is nothing
            # to report and the ratio would be undefined.
            if not everywhere_count:
                continue

            _baiticity_count = 1.0 * in_locus_count / everywhere_count

            if everywhere_weight < filter_threshold:
                rare_profiles_file.write(
                    "%s\t%f\t%f\t%f\t%s\n" % (profile, everywhere_count, in_locus_count,
                                               _baiticity_count, profile2def[profile]))
                continue

            if everywhere_weight:
                _baiticity_weight = 1.0 * in_locus_weight / everywhere_weight
            else:
                _baiticity_weight = 0.0

            in_loci_weight.append(in_locus_weight)
            baiticity_weight.append(_baiticity_weight)

            outf.write(
                "%s\t%f\t%f\t%f\t%f\t%s\n" % (profile, in_locus_count, _baiticity_count,
                                               in_locus_weight, _baiticity_weight,
                                               profile2def[profile]))

    in_loci_weight = np.log10(np.asarray(in_loci_weight))
    baiticity_weight = np.asarray(baiticity_weight)

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(in_loci_weight, baiticity_weight, s=1)
    plt.xlabel("Effective orcurrence in loci (log10)")
    plt.ylabel("Baiticity")

    image_file = os.path.join(save_path, 'baiticity.png')
    plt.savefig(image_file)
"pfam13011": 5.0, "pfam13276": 4.0, "pfam13481": 2.9, "pfam13542": 5.0, "pfam13551": 5.0, "pfam13592": 4.0, "pfam13700": 4.0, "pfam13817": 4.0, "pfam14261": 5.0, "pfam14294": 5.0 } profile2def = t.map_cdd_profile2def() # gid2arcog_cdd = t.map_gid2arcog_cdd() neighborhood_files_path = merged_neighborhoods_path() org2weight = t.map_genome2weight() pan_data_path = '/panfs/pan1/patternquest/Projects/NewSystems/data/Bacteria/' pickle_file = os.path.join(pan_data_path, 'pickle/10000/profile2merged_files.p.bz2') profile2files = t.load_compressed_pickle(pickle_file) baiticity_file = os.path.join(gv.project_data_path, 'baiticity/bacteria/baiticity.tab') profile2baiticity = {l.split()[0]: l.split()[4] for l in open(baiticity_file).readlines()[1:] if l.strip()} i = 1 for highlight_profile in additional_profiles: _profile_containing_files = profile2files[highlight_profile] file_summaries = merging.get_file_summaries(_profile_containing_files, neighborhood_files_path, org2weight)
def calculate_profile_based_baiticity(bacteria_loci, loci_gis, loci_organisms, prok1402_path_file, bait_profiles, filter_threshold, save_path):
    """Compute the baiticity of every non-bait profile found in the loci.

    Each gene (identified by ``gene.gid``) is counted once. For every profile
    id in its ``cogid`` that is not in ``bait_profiles`` a
    ``ProfileInOrganismCount`` is kept for every organism in
    ``loci_organisms``: ``outside`` is the number of entries
    ``load_maps_simple`` reports for that profile and organism, ``in_locus``
    is incremented per occurrence inside the loci.

    Baiticity is ``in_locus / (in_locus + outside)`` summed over organisms,
    reported both as raw counts and weighted by genome weight.

    Args:
        bacteria_loci: iterable of loci, each an iterable of genes exposing
            ``gid``, ``cogid`` and ``organism``.
        loci_gis: forwarded to ``load_maps_simple``.
        loci_organisms: organisms for which counters are created.
        prok1402_path_file: forwarded to ``load_maps_simple``.
        bait_profiles: profile ids to ignore.
        filter_threshold: profiles whose weighted total occurrence is below
            this value go to ``rare_profiles.tab`` instead of the main table.
        save_path: directory receiving ``baiticity.tab``,
            ``rare_profiles.tab`` and ``baiticity.png``.

    Raises:
        KeyError: if a gene's organism is missing from ``loci_organisms``, an
            organism has no genome weight, or a profile has no definition.
    """
    print("Loding global maps")
    global_profile2orgs2gis = load_maps_simple(prok1402_path_file, loci_gis)
    print("Loading weights")
    gnm2weight = t.map_genome2weight()
    print("Loading CDD definitions")
    profile2def = t.map_cdd_profile2def()

    print("Counting in loci")
    profile2orgs2obj = {}
    gi_checklist = set()

    for locus in bacteria_loci:
        for gene in locus:
            # A gene shared by several loci is counted only once.
            if gene.gid in gi_checklist:
                continue

            for _cogid in gene.cogid.split():
                if _cogid in bait_profiles:
                    continue

                if _cogid not in profile2orgs2obj:
                    known_outside = _cogid in global_profile2orgs2gis
                    per_org = {}
                    for _org in loci_organisms:
                        _orgObj = ProfileInOrganismCount(_org, _cogid)
                        if known_outside and _org in global_profile2orgs2gis[_cogid]:
                            _orgObj.outside = len(global_profile2orgs2gis[_cogid][_org])
                        else:
                            _orgObj.outside = 0
                        per_org[_org] = _orgObj
                    profile2orgs2obj[_cogid] = per_org

                profile2orgs2obj[_cogid][gene.organism].in_locus += 1

            gi_checklist.add(gene.gid)

    out_file = os.path.join(save_path, 'baiticity.tab')
    rare_file = os.path.join(save_path, 'rare_profiles.tab')

    in_loci_weight = []
    baiticity_weight = []

    print("Writing to file: %s" % out_file)
    # Both tables are managed by the same with-statement so that neither
    # handle is left open if a lookup fails half-way through.
    with open(rare_file, 'w') as rare_profiles_file, open(out_file, 'w') as outf:
        rare_profiles_file.write("Profile\tOccurence everywhere\tOccurrence in loci\tBaiticity\tDefinition\n")
        outf.write("Profile\tOccurrence in loci(count)\tBaiticity(count)\tOccurrence in loci(weight)\tBaiticity(weight)\tDefinition\n")

        for profile in profile2orgs2obj:
            in_locus_count = 0
            everywhere_count = 0
            in_locus_weight = 0
            everywhere_weight = 0

            for org, _org in profile2orgs2obj[profile].items():
                in_locus_count += _org.in_locus
                everywhere_count += _org.in_locus + _org.outside
                in_locus_weight += _org.in_locus * gnm2weight[org]
                everywhere_weight += (_org.in_locus + _org.outside) * gnm2weight[org]

            _baiticity_count = 1.0 * in_locus_count / everywhere_count
            _baiticity_weight = 1.0 * in_locus_weight / everywhere_weight

            if everywhere_weight < filter_threshold:
                rare_profiles_file.write("%s\t%f\t%f\t%f\t%s\n" % (profile, everywhere_count, in_locus_count, _baiticity_count, profile2def[profile]))
                continue

            in_loci_weight.append(in_locus_weight)
            baiticity_weight.append(_baiticity_weight)

            outf.write("%s\t%f\t%f\t%f\t%f\t%s\n" % (profile, in_locus_count, _baiticity_count, in_locus_weight, _baiticity_weight, profile2def[profile]))

    in_loci_weight = np.log10(np.asarray(in_loci_weight))
    baiticity_weight = np.asarray(baiticity_weight)

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(in_loci_weight, baiticity_weight, s=1)
    plt.xlabel("Effective orcurrence in loci (log10)")
    plt.ylabel("Baiticity")

    image_file = os.path.join(save_path, 'baiticity.png')
    plt.savefig(image_file)
def calculate_profile_based_crispricity_old(cas1402_loci, cas1402_gis, cas1402_organisms, prok1402_path_file):
    """Legacy, weight-based crispricity report.

    Every comma-separated id in a gene's ``cogid`` is tracked per organism.
    Presence inside the loci is a flag (``in_crispr`` is set to 1, never
    incremented). When an (id, organism) pair is first seen, ``outside`` is
    derived from the maps returned by ``load_maps``: one is added for each
    per-organism entry whose ``difference(cas1402_gis)`` is non-empty
    (presumably gi sets - confirm).

    Per id, both quantities are scaled by the organism's genome weight and
    summed; their ratio is the crispricity. All-digit ids are written to
    ``crispricity_gis.tab``, the rest to ``crispricity_profiles.tab``; the
    scatter plot is saved as ``crispricity.png``. Files are created in the
    working directory.
    """
    print("Loding global maps")
    gid_to_profiles, profile_to_org_gis = load_maps(prok1402_path_file, cas1402_gis, cas1402_organisms)

    print("Loading weights")
    weights = t.map_genome2weight()

    print("Counting in CRISPR loci")
    counters = {}
    for locus in cas1402_loci:
        for gene in locus:
            for cog in gene.cogid.split(','):
                by_org = counters.setdefault(cog, {})

                if gene.organism not in by_org:
                    counter = ProfileInOrganismCount(gene.organism, cog)
                    n_outside = 0
                    if gene.gid in gid_to_profiles:
                        for linked in gid_to_profiles[gene.gid]:
                            for linked_org in profile_to_org_gis[linked]:
                                leftover = profile_to_org_gis[linked][linked_org].difference(cas1402_gis)
                                if leftover:
                                    n_outside += 1
                    counter.outside = n_outside
                    by_org[gene.organism] = counter

                by_org[gene.organism].in_crispr = 1

    occurrences = []
    ratios = []
    seen_ids = []

    print("Writing to files")
    with open('crispricity_profiles.tab', 'w') as profiles_out, \
            open('crispricity_gis.tab', 'w') as gis_out:
        for cog, by_org in counters.items():
            inside = 0
            total = 0
            for org in by_org:
                counter = by_org[org]
                inside += counter.in_crispr * weights[org]
                total += (counter.in_crispr + counter.outside) * weights[org]

            ratio = inside / total

            occurrences.append(inside)
            ratios.append(ratio)
            seen_ids.append(cog)

            line = "%s\t%f\t%f\n" % (cog, inside, ratio)
            if cog.isdigit():
                gis_out.write(line)
            else:
                profiles_out.write(line)

    occurrences = np.log10(np.asarray(occurrences))
    ratios = np.asarray(ratios)

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(occurrences, ratios, s=1)
    plt.xlabel("Effective orcurrence in CRISPR loci (log10)")
    plt.ylabel("X-axis / Effective occurrences")
    plt.savefig('crispricity.png')
def calculate_profile_based_crispricity(cas1402_loci, cas1402_gis, cas1402_organisms, prok1402_path_file):
    """Write the count-based crispricity of every profile found in the loci.

    For each profile id (whitespace-separated in ``gene.cogid``) a
    ``ProfileInOrganismCount`` is kept for every organism in
    ``cas1402_organisms``. ``outside`` is the number of entries
    ``load_maps_simple`` reports for that profile and organism; ``in_crispr``
    is incremented for each occurrence inside the loci.

    ``crispricity = in_crispr / (in_crispr + outside)``, summed over all
    organisms and computed as a true (floating point) ratio. Genome weights
    do not enter this count-based statistic, so they are not loaded.

    Output:
        ``<gv.project_data_path>/cas1402/crispricity_count.tab`` with columns
        profile, occurrence in loci, crispricity, definition; and a scatter
        plot ``first_count.png`` in the working directory.

    Raises:
        KeyError: if a gene's organism is missing from ``cas1402_organisms``
            or a profile has no entry in the CDD definition map.
    """
    print("Loding global maps")
    global_profile2orgs2gis = load_maps_simple(prok1402_path_file, cas1402_gis)
    print("Loading CDD definitions")
    profile2def = t.map_cdd_profile2def()

    print("Counting in CRISPR loci")
    profile2orgs2obj = {}
    for locus in cas1402_loci:
        for gene in locus:
            for _cogid in gene.cogid.split():
                if _cogid not in profile2orgs2obj:
                    # First sighting: create a counter for every organism and
                    # seed it with the number of occurrences outside the loci.
                    known_outside = _cogid in global_profile2orgs2gis
                    per_org = {}
                    for _org in cas1402_organisms:
                        _orgObj = ProfileInOrganismCount(_org, _cogid)
                        if known_outside and _org in global_profile2orgs2gis[_cogid]:
                            _orgObj.outside = len(global_profile2orgs2gis[_cogid][_org])
                        else:
                            _orgObj.outside = 0
                        per_org[_org] = _orgObj
                    profile2orgs2obj[_cogid] = per_org

                profile2orgs2obj[_cogid][gene.organism].in_crispr += 1

    out_file = os.path.join(gv.project_data_path, 'cas1402/crispricity_count.tab')

    in_crispr_all = []
    crispricity_all = []

    print("Writing to file: %s" % out_file)
    with open(out_file, 'w') as outf:
        outf.write("Profile\tOccurrence in CRISPR loci\tCrispricity\tDefinition\n")

        for profile in profile2orgs2obj:
            in_crispr = 0
            everywhere = 0
            for _org in profile2orgs2obj[profile].values():
                in_crispr += _org.in_crispr
                everywhere += _org.in_crispr + _org.outside

            # Both operands are integer counts: force float division, otherwise
            # Python 2 floors the ratio to 0 or 1.
            crispricity = float(in_crispr) / everywhere

            in_crispr_all.append(in_crispr)
            crispricity_all.append(crispricity)

            outf.write("%s\t%f\t%f\t%s\n" % (profile, in_crispr, crispricity, profile2def[profile]))

    in_crispr_all = np.log10(np.asarray(in_crispr_all))
    crispricity_all = np.asarray(crispricity_all)

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(in_crispr_all, crispricity_all, s=1)
    plt.xlabel("Effective orcurrence in CRISPR loci (log10)")
    plt.ylabel("X-axis / Effective occurrences")
    plt.savefig('first_count.png')
def count_profiles_in_neighborhoods(neighborhoods_path, save_path, limit_to, combination_size):
    """Rank profiles by weighted abundance and tabulate their top combinations.

    Steps:
      1. Load every neighborhood file found in ``neighborhoods_path``.
      2. Accumulate, per profile id, an occurrence count and the summed genome
         weight, ignoring the profiles listed in ``selected_arcogs.txt``.
      3. Count combinations of the ``limit_to`` heaviest profiles with
         ``count_combinations``.
      4. Keep combinations seen more than once whose weight is above the
         pivot weight (the weight at which the descending cumulative sum
         reaches 90% of the total) and write them, sorted by weight, to
         ``<save_path>/<limit_to>_<combination_size>.tab``.

    Args:
        neighborhoods_path: directory containing the neighborhood files.
        save_path: directory receiving the combinations table.
        limit_to: number of top profiles to keep.
        combination_size: forwarded to ``count_combinations``.

    Raises:
        KeyError: if the organism of a neighborhood has no genome weight.
    """
    selected_path = os.path.join(gv.project_data_path, 'Archea', 'arCOG/selected_arcogs.txt')
    # A set gives constant-time membership tests in the inner loop; the file
    # is closed as soon as it has been read.
    with open(selected_path) as selected_file:
        target_profiles = set(line.strip() for line in selected_file)

    src2org = t.map_src2org()
    gnm2weight = t.map_genome2weight()
    neighborhoods = [cl.Neighborhood(os.path.join(neighborhoods_path, f))
                     for f in os.listdir(neighborhoods_path)]

    profile_stats = {}
    for nbr in neighborhoods:
        # The organism is taken from the first gene of the neighborhood.
        org_name = src2org[nbr.genes[0].src]
        for gene in nbr.genes:
            for profile in gene.cogid.split():
                if profile in target_profiles:
                    continue
                if profile in profile_stats:
                    profile_stats[profile].weight += gnm2weight[org_name]
                    profile_stats[profile].count += 1
                else:
                    profile_stats[profile] = cl.ProfileCount(1, gnm2weight[org_name])

    profile_weights = sorted(((k, v.weight) for k, v in profile_stats.items()),
                             key=itemgetter(1), reverse=True)
    top_profiles = [profile for profile, _ in profile_weights[:limit_to]]

    print('started counting')
    counted_combinations = count_combinations(neighborhoods, top_profiles, combination_size, src2org, gnm2weight)
    print('Done counting')

    combos = list(counted_combinations.items())

    weight_values = np.sort(np.array([v.weight for _, v in combos]))[::-1]
    total = np.sum(weight_values)
    if total > 0:
        pivot_ind = np.where(np.cumsum(weight_values) / total >= 0.9)[0][0]
        pivot_value = weight_values[pivot_ind]
    else:
        # No combinations (or no weight at all): nothing can pass the filter,
        # so only the header row is written instead of failing on an empty
        # index lookup.
        pivot_value = 0

    M = pd.DataFrame([], columns=['Comb', 'weight', 'count'])
    M['Comb'] = [k for k, _ in combos]
    M['weight'] = [v.weight for _, v in combos]
    M['count'] = [v.count for _, v in combos]

    M = M[M['count'] > 1]
    M = M[M['weight'] > pivot_value]
    # DataFrame.sort was replaced by sort_values in newer pandas; support both.
    if hasattr(M, 'sort_values'):
        M = M.sort_values('weight', ascending=False)
    else:
        M = M.sort('weight', ascending=False)

    fname = os.path.join(save_path, '%d_%d.tab' % (limit_to, combination_size))
    print(fname)
    with open(fname, 'w') as fout:
        M.to_csv(fout, sep="\t", index=False)
def extract_all_duplets_from_prok1402():
    """Accumulate weighted profile duplets over all ``.pty`` genome files.

    Two kinds of duplets are recorded in ``pair2weight``, each adding the
    weight of the genome it was found in:

    * domain duplets - every pair of profiles assigned to the same gene;
    * adjacent duplets - every pair made of one profile of a gene and one
      profile of the gene that immediately follows it in the file.

    The members of a duplet are stored in sorted order, so (a, b) and (b, a)
    share one entry. The overall abundance of each profile is recorded in
    ``profile2weight``.

    Output (in ``<data_path>/prok1402/graph/graph_files/``):
        ``prok1402_adj_duplets_weights.txt``: profile, profile, weight.
        ``prok1402_profile_abundance.txt``: profile, weight.
        Both are sorted by descending weight.

    Raises:
        KeyError: if a genome directory name has no entry in the genome
            weight map (or, unless ``t.map_gi2profiles`` returns a
            defaulting mapping, if a gene id is missing from it).
    """
    pty_path = "/panfs/pan1/patternquest/data/Pty/genomes/"
    work_dir = os.path.join(data_path, 'prok1402/graph/graph_files/')

    print("Loading dictionaries")
    gi2profiles = t.map_gi2profiles()
    genome2weight = t.map_genome2weight()

    # Keyed by the sorted (profile, profile) tuple; tuples avoid having to
    # split a joined string again when writing the result.
    pair2weight = defaultdict(float)
    profile2weight = defaultdict(float)

    print("Reading Prok1402")
    for root, dirs, files in os.walk(pty_path):
        # The genome name is the directory holding the .pty file.
        genome = os.path.basename(root)

        for f in files:
            if not f.endswith(".pty"):
                continue

            genes = t.parse_pty_file(os.path.join(root, f))

            previous_profiles = []
            for gene in genes:
                gene.profiles = gi2profiles[gene.gid]
                cur_profiles = gene.profiles

                for profile in cur_profiles:
                    t.update_dictionary(profile2weight, profile, genome2weight[genome])

                # Pairs of profiles within the current gene. They are built
                # from the current gene's own profiles, and do not depend on
                # whether the preceding gene was annotated.
                if len(cur_profiles) > 1:
                    for duplet in combinations(cur_profiles, 2):
                        t.update_dictionary(pair2weight, tuple(sorted(duplet)), genome2weight[genome])

                # Pairs spanning the preceding gene and the current one; the
                # product is empty when either gene has no profiles.
                for duplet in product(previous_profiles, cur_profiles):
                    t.update_dictionary(pair2weight, tuple(sorted(duplet)), genome2weight[genome])

                previous_profiles = cur_profiles

    print("Writing to files")

    with open(os.path.join(work_dir, "prok1402_adj_duplets_weights.txt"), "w") as outf:
        for (kplet_1, kplet_2), weight in sorted(pair2weight.items(), key=lambda x: x[1], reverse=True):
            outf.write("%s\t%s\t%f\n" % (kplet_1, kplet_2, weight))

    with open(os.path.join(work_dir, "prok1402_profile_abundance.txt"), "w") as outf:
        for (profile, weight) in sorted(profile2weight.items(), key=lambda x: x[1], reverse=True):
            outf.write("%s\t%f\n" % (profile, weight))