def generate_reports_for_experiment(weight_cutoff, n_clusters, neighborhoods=None, clustered_profiles=None, target_profiles=None, profile2def=None): if not target_profiles: target_profiles = t.target_profiles() if not profile2def: profile2def = t.map_profile2def() if not neighborhoods: neighborhoods = pickle.load(open('files/neighborhoods.p')) if not clustered_profiles: clustered_profiles_file = os.path.join( gv.project_data_path, 'clustering/10/cluster_profiles_cutoff_%1.1f.txt' % weight_cutoff) clustered_profiles = [ l.strip().split() for l in open(clustered_profiles_file).readlines() ] reports_file_dir = os.path.join('reports', str(n_clusters)) if not os.path.exists(reports_file_dir): os.mkdir(reports_file_dir) reports_file_dir = os.path.join(reports_file_dir, 'weight_cutoff_' + str(weight_cutoff)) if not os.path.exists(reports_file_dir): os.mkdir(reports_file_dir) print 'Data ready. Starting report generation.' cnt = 1 for cl in clustered_profiles: print 'Cluster no:', cnt tmp_neighborhoods = [] for nbr in neighborhoods: tmp_profiles = [] for g in nbr.genes: if g.cogid not in ["", None]: tmp_profiles += g.cogid.split() prof_cnt = 0 for prof in set(tmp_profiles): if prof in cl: prof_cnt += 1 if prof_cnt >= 7: tmp_neighborhoods.append(nbr) xls_file = os.path.join(reports_file_dir, "cl_no_%d.xls" % cnt) write_to_xls(xls_file, tmp_neighborhoods, cl, target_profiles, profile2def) cnt += 1
def generate_reports_for_experiment(weight_cutoff, n_clusters, neighborhoods=None, clustered_profiles=None, target_profiles=None, profile2def=None): if not target_profiles: target_profiles = t.target_profiles() if not profile2def: profile2def = t.map_profile2def() if not neighborhoods: neighborhoods = pickle.load(open('files/neighborhoods.p')) if not clustered_profiles: clustered_profiles_file = os.path.join(gv.project_data_path, 'clustering/10/cluster_profiles_cutoff_%1.1f.txt' % weight_cutoff) clustered_profiles = [l.strip().split() for l in open(clustered_profiles_file).readlines()] reports_file_dir = os.path.join('reports', str(n_clusters)) if not os.path.exists(reports_file_dir): os.mkdir(reports_file_dir) reports_file_dir = os.path.join(reports_file_dir, 'weight_cutoff_'+str(weight_cutoff)) if not os.path.exists(reports_file_dir): os.mkdir(reports_file_dir) print 'Data ready. Starting report generation.' cnt = 1 for cl in clustered_profiles: print 'Cluster no:', cnt tmp_neighborhoods = [] for nbr in neighborhoods: tmp_profiles = [] for g in nbr.genes: if g.cogid not in ["", None]: tmp_profiles += g.cogid.split() prof_cnt = 0 for prof in set(tmp_profiles): if prof in cl: prof_cnt += 1 if prof_cnt >= 7: tmp_neighborhoods.append(nbr) xls_file = os.path.join(reports_file_dir, "cl_no_%d.xls" % cnt) write_to_xls(xls_file, tmp_neighborhoods, cl, target_profiles, profile2def) cnt += 1
elif sys.platform=='linux2': sys.path.append('/home/hudaiber/Projects/lib/BioPy/') sys.path.append('/home/hudaiber/Projects/SystemFiles/') import global_variables as gv sys.path.append(gv.project_code_path) from lib.db.archea import db_tools, neighborhoods_path from lib.utils import tools as t import os target_profiles = t.target_profiles() target_profiles = [l.strip() for l in open('/Volumes/pan1/patternquest/Projects/NewSystems/data/Archea/arCOG/selected_arcogs.txt').readlines()] profile2def = t.map_profile2def() gid2arcog_cdd = t.map_gid2arcog_cdd() neighborhood_files_path = neighborhoods_path() neighborhood_files_path = '/Volumes/pan1/patternquest/Projects/NewSystems/data/Archea/genes_and_flanks/win_10/pty/' def write_to_xls(xls_file, kplets): community = set() [community.update(kplet.codes) for kplet in kplets] _file2kplets = {} for kplet in kplets: for f in kplet.files: if f in _file2kplets: _file2kplets[f].append(kplet)
from lib.db.archea import triplets as tr from lib.db.archea import duplets as d from lib.db.archea.db_tools import file2src_src2org_map from lib.db.archea import neighborhoods_path # import report_generation as r from lib.utils import reporting as r import lib.utils.merging as merging import lib.utils.tools as t import pickle if __name__ == '__main__': print 'Pre-Loading dictionaries' target_profiles = t.target_profiles() profile2def = t.map_profile2def() gid2arcog_cdd = t.map_gid2arcog_cdd() neighborhood_files_path = neighborhoods_path() print "\n" limit_to = 1000000 data_path = os.path.join(gv.project_data_path, 'Archea/pickle/') fname = os.path.join(data_path, str(limit_to), 'pentaplets_merged_across.p.bz2') pentaplets = t.load_compressed_pickle(fname) report_dir = 'all' report_files_dir = os.path.join(gv.project_data_path, 'Archea/reports/merged_across_orders/', report_dir)
def calculate_profile_based_baiticity(bacteria_loci, loci_gis, loci_organisms, arcog_path_file, bait_profiles, filter_threshold, save_path): print "Loding global maps" global_profile2orgs2gis = load_maps_simple(arcog_path_file, loci_gis) print "Loading weights" gnm2weight = t.map_genome2weight() print "Loading CDD definitions" profile2def = t.map_cdd_profile2def() profile2def.update(t.map_profile2def()) print "Counting in loci" profile2orgs2obj = {} gi_checklist = set() for locus in bacteria_loci: for gene in locus: if gene.gid in gi_checklist: continue for _cogid in gene.cogid.split(): if _cogid in bait_profiles: continue if _cogid not in profile2orgs2obj: profile2orgs2obj[_cogid] = {} for _org in loci_organisms: _orgObj = ProfileInOrganismCount(_org, _cogid) if _cogid in global_profile2orgs2gis: _orgObj.outside = len(global_profile2orgs2gis[_cogid][_org]) \ if _org in global_profile2orgs2gis[_cogid] \ else 0 else: _orgObj.outside = 0 profile2orgs2obj[_cogid][_org] = _orgObj profile2orgs2obj[_cogid][gene.organism].in_locus += 1 gi_checklist.update([gene.gid]) print len(profile2orgs2obj['arCOG08578']) # print profile2orgs2obj['arCOG08578'].keys() for org, obj in profile2orgs2obj['arCOG08578'].items(): if obj.in_locus + obj.outside > 0: print org, obj.in_locus, obj.outside sys.exit() out_file = os.path.join(save_path, 'baiticity.tab') profiles = [] in_loci_count = [] baiticity_count = [] in_loci_weight = [] baiticity_weight = [] rare_profiles_file = open(os.path.join(save_path, 'rare_profiles.tab'), 'w') rare_profiles_file.write( "Profile\tOccurence everywhere\tOccurrence in loci\tBaiticity\tDefinition\n" ) print "Writing to file:", out_file with open(out_file, 'w') as outf: outf.write( "Profile\tOccurrence in loci(count)\tBaiticity(count)\tOccurrence in loci(weight)\tBaiticity(weight)\tDefinition\n" ) for profile in profile2orgs2obj: if profile == 'arCOG14077': continue in_locus_count = 0 everywhere_count = 0 in_locus_weight = 0 everywhere_weight = 0 for org in 
profile2orgs2obj[profile]: if org in [ 'Nitrosoarchaeum_koreensis_MY1_MY1', 'Nitrosoarchaeum_limnia_SFB1' ]: continue _org = profile2orgs2obj[profile][org] in_locus_count += _org.in_locus everywhere_count += (_org.in_locus + _org.outside) in_locus_weight += _org.in_locus * gnm2weight[org] everywhere_weight += (_org.in_locus + _org.outside) * gnm2weight[org] _baiticity_count = 1.0 * in_locus_count / everywhere_count _baiticity_weight = in_locus_weight / everywhere_weight if everywhere_weight < filter_threshold: rare_profiles_file.write( "%s\t%f\t%f\t%f\t%s\n" % (profile, everywhere_count, in_locus_count, _baiticity_count, profile2def[profile])) continue in_loci_count.append(in_locus_count) baiticity_count.append(_baiticity_count) in_loci_weight.append(in_locus_weight) baiticity_weight.append(_baiticity_weight) profiles.append(profile) outf.write( "%s\t%f\t%f\t%f\t%f\t%s\n" % (profile, in_locus_count, _baiticity_count, in_locus_weight, _baiticity_weight, profile2def[profile])) in_loci_weight = np.asarray(in_loci_weight) in_loci_weight = np.log10(in_loci_weight) baiticity_weight = np.asarray(baiticity_weight) plt.ioff() fig, ax = plt.subplots() ax.scatter(in_loci_weight, baiticity_weight, s=1) plt.xlabel("Effective orcurrence in loci (log10)") plt.ylabel("Baiticity") image_file = os.path.join(save_path, 'baiticity.png') plt.savefig(image_file) # for i, profile in enumerate(profiles_all): # ax.annotate(profile, (in_loci_all[i], crispricity_all[i])) # fig.savefig('second.png') # plt.savefig('second.png') rare_profiles_file.close()
def calculate_profile_based_baiticity(bacteria_loci, loci_gis, loci_organisms, arcog_path_file, bait_profiles, filter_threshold, save_path):
    """Compute per-profile "baiticity" (in-locus vs genome-wide occurrence).

    Intended outputs under save_path: baiticity.tab, rare_profiles.tab and
    baiticity.png. NOTE(review): a debugging block below calls sys.exit()
    before any of those files are written -- see the note at that block.
    """
    print "Loding global maps"
    global_profile2orgs2gis = load_maps_simple(arcog_path_file, loci_gis)
    print "Loading weights"
    gnm2weight = t.map_genome2weight()
    print "Loading CDD definitions"
    profile2def = t.map_cdd_profile2def()
    profile2def.update(t.map_profile2def())
    print "Counting in loci"
    # profile -> organism -> ProfileInOrganismCount (in_locus / outside tallies)
    profile2orgs2obj = {}
    gi_checklist = set()  # GIs already tallied, so each gene is counted once
    for locus in bacteria_loci:
        for gene in locus:
            if gene.gid in gi_checklist:
                continue
            for _cogid in gene.cogid.split():
                if _cogid in bait_profiles:
                    continue  # bait profiles themselves are excluded
                if _cogid not in profile2orgs2obj:
                    # First sighting: pre-create counters for every organism,
                    # seeding "outside" counts from the genome-wide map.
                    profile2orgs2obj[_cogid] = {}
                    for _org in loci_organisms:
                        _orgObj = ProfileInOrganismCount(_org, _cogid)
                        if _cogid in global_profile2orgs2gis:
                            _orgObj.outside = len(global_profile2orgs2gis[_cogid][_org]) \
                                if _org in global_profile2orgs2gis[_cogid] \
                                else 0
                        else:
                            _orgObj.outside = 0
                        profile2orgs2obj[_cogid][_org] = _orgObj
                profile2orgs2obj[_cogid][gene.organism].in_locus += 1
            gi_checklist.update([gene.gid])
    # NOTE(review): debugging block -- prints tallies for one hard-coded
    # profile and then terminates the whole process via sys.exit(), so
    # everything after this point is unreachable and no output files are
    # written. Confirm whether this is intentional before relying on it.
    print len(profile2orgs2obj['arCOG08578'])
    # print profile2orgs2obj['arCOG08578'].keys()
    for org, obj in profile2orgs2obj['arCOG08578'].items():
        if obj.in_locus + obj.outside > 0:
            print org, obj.in_locus, obj.outside
    sys.exit()
    out_file = os.path.join(save_path, 'baiticity.tab')
    profiles = []
    in_loci_count = []
    baiticity_count = []
    in_loci_weight = []
    baiticity_weight = []
    rare_profiles_file = open(os.path.join(save_path, 'rare_profiles.tab'), 'w')
    rare_profiles_file.write("Profile\tOccurence everywhere\tOccurrence in loci\tBaiticity\tDefinition\n")
    print "Writing to file:", out_file
    with open(out_file, 'w') as outf:
        outf.write("Profile\tOccurrence in loci(count)\tBaiticity(count)\tOccurrence in loci(weight)\tBaiticity(weight)\tDefinition\n")
        for profile in profile2orgs2obj:
            if profile == 'arCOG14077':
                continue  # explicitly excluded profile -- reason not visible here
            in_locus_count = 0
            everywhere_count = 0
            in_locus_weight = 0
            everywhere_weight = 0
            for org in profile2orgs2obj[profile]:
                # two organisms deliberately skipped in the tallies
                if org in ['Nitrosoarchaeum_koreensis_MY1_MY1', 'Nitrosoarchaeum_limnia_SFB1']:
                    continue
                _org = profile2orgs2obj[profile][org]
                in_locus_count += _org.in_locus
                everywhere_count += (_org.in_locus + _org.outside)
                in_locus_weight += _org.in_locus * gnm2weight[org]
                everywhere_weight += (_org.in_locus + _org.outside) * gnm2weight[org]
            # 1.0 * forces float division under Python 2 integer counts
            _baiticity_count = 1.0 * in_locus_count / everywhere_count
            _baiticity_weight = in_locus_weight / everywhere_weight
            if everywhere_weight < filter_threshold:
                # Too rare to report: log to the side table instead.
                rare_profiles_file.write("%s\t%f\t%f\t%f\t%s\n" % (profile, everywhere_count, in_locus_count, _baiticity_count, profile2def[profile]))
                continue
            in_loci_count.append(in_locus_count)
            baiticity_count.append(_baiticity_count)
            in_loci_weight.append(in_locus_weight)
            baiticity_weight.append(_baiticity_weight)
            profiles.append(profile)
            outf.write("%s\t%f\t%f\t%f\t%f\t%s\n" % (profile, in_locus_count, _baiticity_count, in_locus_weight, _baiticity_weight, profile2def[profile]))
    # Scatter plot: log10 weighted in-locus occurrence vs weighted baiticity.
    in_loci_weight = np.asarray(in_loci_weight)
    in_loci_weight = np.log10(in_loci_weight)
    baiticity_weight = np.asarray(baiticity_weight)
    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(in_loci_weight, baiticity_weight, s=1)
    plt.xlabel("Effective orcurrence in loci (log10)")
    plt.ylabel("Baiticity")
    image_file = os.path.join(save_path, 'baiticity.png')
    plt.savefig(image_file)
    # for i, profile in enumerate(profiles_all):
    #     ax.annotate(profile, (in_loci_all[i], crispricity_all[i]))
    # fig.savefig('second.png')
    # plt.savefig('second.png')
    rare_profiles_file.close()