def generate_plots_from_pickle(limit_to, report_dir, target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path):
    """Write one XLS report per merged kplet list from the Archea pentaplet pickle.

    Only 'pentaplets_merged_within.p.bz2' is loaded; that same pool is then
    reported under each of the order folders 5, 4, 3 and 2.

    Args:
        limit_to: Converted with ``str()`` and used as the pickle sub-directory.
        report_dir: Sub-directory of 'Archea/reports/merged_within_orders/'
            that receives the reports.
        target_profiles: Passed to ``dist.get_flank_distributions`` and
            forwarded to the XLS writer.
        profile2def: Forwarded to the XLS writer.
        gid2arcog_cdd: Forwarded to the XLS writer.
        neighborhood_files_path: Passed to the flank-distribution and
            summary-merging helpers.
    """
    pickle_root = os.path.join(gv.project_data_path, 'Archea/pickle/')
    pentaplets = t.load_compressed_pickle(
        os.path.join(pickle_root, str(limit_to), 'pentaplets_merged_within.p.bz2'))

    print('Generationg reports for within orders merged lists')

    report_files_dir = os.path.join(
        gv.project_data_path, 'Archea/reports/merged_within_orders/', report_dir)
    if not os.path.exists(report_files_dir):
        os.mkdir(report_files_dir)

    # NOTE(review): the quadruplet/triplet/duplet pickles are not loaded here,
    # so every order is reported from the pentaplet pool.
    for order in (5, 4, 3, 2):
        kplet_pool = pentaplets
        print(' '.join(('Starting for', str(order))))

        written = 0  # reports emitted so far for this order; drives the file name
        profile2counts_pool = dist.get_flank_distributions(
            kplet_pool, neighborhood_files_path, target_profiles)

        for position, kplet_sublist in enumerate(kplet_pool):
            order_folder = os.path.join(report_files_dir, str(order))
            if not os.path.exists(order_folder):
                os.mkdir(order_folder)

            summary = merging.merge_into_file_summaries(
                kplet_sublist, neighborhood_files_path, file2src_src2org_map, 'archaea')
            src2org, file_summaries, community, community_count, community_count_with_flanks = summary

            # Lists with an empty src2org get no report and consume no number.
            if not src2org:
                continue

            target_xls = os.path.join(order_folder, "%d_%d.xls" % (written + 1, order))
            class2counts, class2profiles = \
                merging.arcog_profile_count_into_class_count(community_count)
            class2counts_flank, class2profiles_flank = \
                merging.arcog_profile_count_into_class_count(community_count_with_flanks)
            profile2counts = profile2counts_pool[position]
            written += 1

            r.write_to_xls({
                'xls_file_name': target_xls,
                'src2org': src2org,
                'file_summaries': file_summaries,
                'community': community,
                'target_profiles': target_profiles,
                'profile2def': profile2def,
                'gid2arcog_cdd': gid2arcog_cdd,
                'class2counts': class2counts,
                'class2profiles': class2profiles,
                'class2counts_flank': class2counts_flank,
                'profile2counts': profile2counts,
            })
def _write_kplet_pool_reports(report_files_dir, pools_by_order, target_profiles, profile2def,
                              gid2arcog_cdd, neighborhood_files_path):
    """Write '<index>_<order>.xls' for every merged kplet list of every order.

    Args:
        report_files_dir: Root output directory; created when missing.
        pools_by_order: Iterable of ``(order, kplet_pool)`` pairs.
        target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path:
            Forwarded unchanged to ``r.write_to_xls``.
    """
    if not os.path.exists(report_files_dir):
        # makedirs also creates missing parent directories.
        os.makedirs(report_files_dir)

    for order, kplet_pool in pools_by_order:
        cur_reports_folder = os.path.join(report_files_dir, str(order))
        for j, kplet_sublist in enumerate(kplet_pool):
            # Created lazily so an empty pool leaves no empty folder behind.
            if not os.path.exists(cur_reports_folder):
                os.mkdir(cur_reports_folder)
            xls_file_name = os.path.join(cur_reports_folder, "%d_%d.xls" % (j + 1, order))
            r.write_to_xls(xls_file_name, kplet_sublist, target_profiles, profile2def,
                           gid2arcog_cdd, neighborhood_files_path, file2src_src2org_map)


def generate_plots(limit_to, report_dir, target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path):
    """Load kplets from the database, merge them, and write XLS reports.

    Reports are written twice: once after merging within each order (under
    'Archea/reports/merged_within_orders/') and once more after additionally
    merging across neighbouring orders (under
    'Archea/reports/merged_across_orders/').

    Args:
        limit_to: Forwarded as ``limit_to`` to each ``get_report_kplets`` call.
        report_dir: Sub-directory name used under both report roots.
        target_profiles: Passed to ``merging.merge_kplets_within_orders`` and
            to the XLS writer.
        profile2def: Forwarded to the XLS writer.
        gid2arcog_cdd: Forwarded to the XLS writer.
        neighborhood_files_path: Forwarded to the XLS writer.
    """
    print('Reading kplets from database')
    pentaplets = p.get_report_kplets(limit_to=limit_to, load_locations=True)
    quadruplets = q.get_report_kplets(limit_to=limit_to, load_locations=True)
    triplets = tr.get_report_kplets(limit_to=limit_to, load_locations=True)
    duplets = d.get_report_kplets(limit_to=limit_to, load_locations=True)

    print('Merging within order')
    pentaplets = merging.merge_kplets_within_orders(pentaplets, target_profiles)
    quadruplets = merging.merge_kplets_within_orders(quadruplets, target_profiles)
    triplets = merging.merge_kplets_within_orders(triplets, target_profiles)
    duplets = merging.merge_kplets_within_orders(duplets, target_profiles)

    print('Generationg reports for within orders merged lists')
    report_files_dir = os.path.join(
        gv.project_data_path, 'Archea/reports/merged_within_orders/', report_dir)
    _write_kplet_pool_reports(
        report_files_dir,
        zip([5, 4, 3, 2], [pentaplets, quadruplets, triplets, duplets]),
        target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path)

    print('Merging across order')
    # Merged pairwise from the lowest orders upwards; each call returns both
    # updated pools, and the later calls consume the earlier results.
    triplets, duplets = merging.merge_kplets_across_order(triplets, duplets)
    quadruplets, triplets = merging.merge_kplets_across_order(quadruplets, triplets)
    pentaplets, quadruplets = merging.merge_kplets_across_order(pentaplets, quadruplets)

    print('Generationg reports for across orders merged lists')
    report_files_dir = os.path.join(
        gv.project_data_path, 'Archea/reports/merged_across_orders/', report_dir)
    _write_kplet_pool_reports(
        report_files_dir,
        zip([5, 4, 3, 2], [pentaplets, quadruplets, triplets, duplets]),
        target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path)
def generate_plots_from_pickle(limit_to, report_dir, target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path):
    """Write flank-count workbooks and per-kplet XLS reports from the Archea pickles.

    Loads the four '*_merged_within.p.bz2' pickles, writes one flank-count
    workbook per order into the report directory, then writes one report per
    merged kplet list under '<report directory>/<order>/'. Within an order the
    reports are numbered in descending order of element 5 of the tuple
    returned by ``merging.merge_into_file_summaries`` (unpacked as ``weight``).

    Args:
        limit_to: Converted with ``str()`` and used as the pickle sub-directory.
        report_dir: Sub-directory of 'Archea/reports/merged_within_orders/'
            that receives all output.
        target_profiles: Passed to ``dist.get_flank_distributions`` and
            forwarded to both XLS writers.
        profile2def: Forwarded to both XLS writers.
        gid2arcog_cdd: Forwarded to the per-kplet XLS writer.
        neighborhood_files_path: Passed to the flank-distribution and
            summary-merging helpers.
    """
    data_path = os.path.join(gv.project_data_path, 'Archea/pickle/')
    pickle_dir = os.path.join(data_path, str(limit_to))

    pentaplets = t.load_compressed_pickle(os.path.join(pickle_dir, 'pentaplets_merged_within.p.bz2'))
    quadruplets = t.load_compressed_pickle(os.path.join(pickle_dir, 'quadruplets_merged_within.p.bz2'))
    triplets = t.load_compressed_pickle(os.path.join(pickle_dir, 'triplets_merged_within.p.bz2'))
    duplets = t.load_compressed_pickle(os.path.join(pickle_dir, 'duplets_merged_within.p.bz2'))

    print('Generationg reports for within orders merged lists')

    report_files_dir = os.path.join(
        gv.project_data_path, 'Archea/reports/merged_within_orders/', report_dir)

    # The flank-count workbooks below are written straight into this
    # directory, so it has to exist before the first write (previously it was
    # only created afterwards).
    if not os.path.exists(report_files_dir):
        os.makedirs(report_files_dir)

    flank_reports = [
        ('pentaplets_flank_counts.xls', 'Pentaplets', pentaplets),
        ('quadruplets_flank_counts.xls', 'Quadruplets', quadruplets),
        ('triplets_flank_counts.xls', 'Triplets', triplets),
        ('duplets_flank_counts.xls', 'Duplets', duplets),
    ]

    for _fname, _title, _kplet_list in flank_reports:
        # The first returned element is not used by this report.
        _, cog2gids, gid2weight = dist.get_flank_distributions(
            _kplet_list, neighborhood_files_path, target_profiles)
        universal_flank_counts = t.merge_dict_set_list(cog2gids, gid2weight)

        params = dict()
        params['xls_file_name'] = os.path.join(report_files_dir, _fname)
        params['profile2def'] = profile2def
        params['flank_counts'] = universal_flank_counts
        params['title'] = _title
        params['target_profiles'] = target_profiles
        r.write_flanking_count_xls(params)

    for i, kplet_pool in zip([2, 3, 4, 5], [duplets, triplets, quadruplets, pentaplets]):
        # All summaries are collected first because they are sorted before
        # the report numbers are assigned.
        file_summaries_pool = [
            merging.merge_into_file_summaries(
                kplet_list, neighborhood_files_path, file2src_src2org_map, 'archaea')
            for kplet_list in kplet_pool
        ]

        cur_reports_folder = os.path.join(report_files_dir, str(i))
        j = 0  # number of reports written so far for this order

        for summary_terms in sorted(file_summaries_pool, key=itemgetter(5), reverse=True):
            src2org, file_summaries, community, community_count, community_count_with_flanks, _weight = summary_terms

            # Summaries with an empty src2org get no report and no number.
            if not src2org:
                continue

            if not os.path.exists(cur_reports_folder):
                os.mkdir(cur_reports_folder)

            xls_file_name = os.path.join(cur_reports_folder, "%d_%d.xls" % (j + 1, i))
            class2counts, class2profiles = \
                merging.arcog_profile_count_into_class_count(community_count)
            class2counts_flank, _ = \
                merging.arcog_profile_count_into_class_count(community_count_with_flanks)
            j += 1

            params = dict()
            params['xls_file_name'] = xls_file_name
            params['src2org'] = src2org
            params['file_summaries'] = file_summaries
            params['community'] = community
            params['target_profiles'] = target_profiles
            params['profile2def'] = profile2def
            params['gid2arcog_cdd'] = gid2arcog_cdd
            params['class2counts'] = class2counts
            params['class2profiles'] = class2profiles
            params['class2counts_flank'] = class2counts_flank
            r.write_to_xls(params)
def generate_plots_from_pickle(limit_to, report_dir, target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path):
    """Write flank-count workbooks from the Bacteria pickles, then exit.

    Loads the four '*_merged_within.p.bz2' pickles and writes one flank-count
    workbook per order into the report directory. The process is then
    terminated with ``sys.exit()``, so the per-kplet report section at the
    end of this function is currently never executed.

    Args:
        limit_to: Converted with ``str()`` and used as the pickle sub-directory.
        report_dir: Sub-directory of 'Bacteria/reports/merged_within_orders/'
            that receives all output.
        target_profiles: Passed to ``dist.get_flank_distributions`` and
            forwarded to the XLS writers.
        profile2def: Forwarded to the XLS writers.
        gid2arcog_cdd: Forwarded to the per-kplet XLS writer.
        neighborhood_files_path: Passed to the flank-distribution and
            summary-merging helpers.

    Raises:
        SystemExit: Always, once the flank-count workbooks are written.
    """
    data_path = os.path.join(gv.project_data_path, 'Bacteria/pickle/')
    pickle_dir = os.path.join(data_path, str(limit_to))

    pentaplets = t.load_compressed_pickle(os.path.join(pickle_dir, 'pentaplets_merged_within.p.bz2'))
    quadruplets = t.load_compressed_pickle(os.path.join(pickle_dir, 'quadruplets_merged_within.p.bz2'))
    triplets = t.load_compressed_pickle(os.path.join(pickle_dir, 'triplets_merged_within.p.bz2'))
    duplets = t.load_compressed_pickle(os.path.join(pickle_dir, 'duplets_merged_within.p.bz2'))

    # NOTE(review): the message says "across orders" although both the
    # pickles and the output folder are the "merged_within" ones - confirm
    # the intended wording.
    print('Generationg reports for across orders merged lists')

    report_files_dir = os.path.join(
        gv.project_data_path, 'Bacteria/reports/merged_within_orders/', report_dir)

    # The flank-count workbooks are written into this directory, so create it
    # before the first write. The only creation used to sit after sys.exit()
    # and therefore never ran.
    if not os.path.exists(report_files_dir):
        os.makedirs(report_files_dir)

    flank_reports = [
        ('pentaplets_flank_counts.xls', 'Pentaplets', pentaplets),
        ('quadruplets_flank_counts.xls', 'Quadruplets', quadruplets),
        ('triplets_flank_counts.xls', 'Triplets', triplets),
        ('duplets_flank_counts.xls', 'Duplets', duplets),
    ]

    for _fname, _title, _kplet_list in flank_reports:
        # The first returned element is not used by this report.
        _, cog2gids, gid2weight = dist.get_flank_distributions(
            _kplet_list, neighborhood_files_path, target_profiles)
        universal_flank_counts = t.merge_dict_set_list(cog2gids, gid2weight)

        params = dict()
        params['xls_file_name'] = os.path.join(report_files_dir, _fname)
        params['profile2def'] = profile2def
        params['flank_counts'] = universal_flank_counts
        params['title'] = _title
        params['target_profiles'] = target_profiles
        r.write_flanking_count_xls(params)

    # NOTE(review): hard stop after the flank-count workbooks; everything
    # below is unreachable until this call is removed.
    sys.exit()

    for i, kplet_pool in zip([5, 4, 3, 2], [pentaplets, quadruplets, triplets, duplets]):
        print(i)
        cur_reports_folder = os.path.join(report_files_dir, str(i))
        j = 0  # number of reports written so far for this order

        for kplet_sublist in kplet_pool:
            if not os.path.exists(cur_reports_folder):
                os.mkdir(cur_reports_folder)

            src2org, file_summaries, community, community_count, community_count_with_flanks = \
                merging.merge_into_file_summaries(
                    kplet_sublist, neighborhood_files_path, file2src_src2org_map)

            # Lists with an empty src2org get no report and no number.
            if not src2org:
                continue

            xls_file_name = os.path.join(cur_reports_folder, "%d_%d.xls" % (j + 1, i))
            class2counts, class2profiles = \
                merging.cdd_profile_count_into_class_count(community_count)
            class2counts_flank, _ = \
                merging.cdd_profile_count_into_class_count(community_count_with_flanks)
            j += 1

            params = dict()
            params['xls_file_name'] = xls_file_name
            params['src2org'] = src2org
            params['file_summaries'] = file_summaries
            params['community'] = community
            params['target_profiles'] = target_profiles
            params['profile2def'] = profile2def
            params['gid2arcog_cdd'] = gid2arcog_cdd
            params['class2counts'] = class2counts
            params['class2profiles'] = class2profiles
            params['class2counts_flank'] = class2counts_flank
            r.write_to_xls(params)
# Writes the report for the first pentaplet list that yields a non-empty
# src2org, then stops (the loop ends with an unconditional break).
# `j` and `report_files_dir` must already be defined by the surrounding code.
for kplet_sublist in pentaplets:
    cur_reports_folder = os.path.join(report_files_dir, str(5))

    merged_summary = merging.merge_into_file_summaries(
        kplet_sublist, neighborhood_files_path, file2src_src2org_map, 'archaea')
    src2org, file_summaries, community, community_count, community_count_with_flanks = merged_summary

    # Nothing to report for this list; try the next one.
    if not src2org:
        continue

    community_classes = merging.arcog_profile_count_into_class_count(community_count)
    community_flank_classes = merging.arcog_profile_count_into_class_count(
        community_count_with_flanks)

    xls_file_name = os.path.join(cur_reports_folder, "%d_%d.xls" % (j + 1, 5))
    print(xls_file_name)
    j += 1

    params = {
        'xls_file_name': xls_file_name,
        'src2org': src2org,
        'file_summaries': file_summaries,
        'community': community,
        'target_profiles': target_profiles,
        'profile2def': profile2def,
        'gid2arcog_cdd': gid2arcog_cdd,
        'class_counts': community_classes,
        'class_flank_counts': community_flank_classes,
    }
    r.write_to_xls(params)
    break
src2org, file_summaries, community, community_count, community_count_with_flanks = \ merging.merge_into_file_summaries(kplet_sublist, neighborhood_files_path, file2src_src2org_map, 'archaea') if not src2org: continue class2counts, class2profiles = merging.arcog_profile_count_into_class_count(community_count) class2counts_flank, class2profiles_flank = merging.arcog_profile_count_into_class_count(community_count_with_flanks) profile2counts = profile2counts_pool[k] xls_file_name = os.path.join(cur_reports_folder, "%d_%d.xls" % (j+1, 5)) print xls_file_name j += 1 params = dict() params['xls_file_name'] = xls_file_name params['src2org'] = src2org params['file_summaries'] = file_summaries params['community'] = community params['target_profiles'] = target_profiles params['profile2def'] = profile2def params['gid2arcog_cdd'] = gid2arcog_cdd params['class2counts'] = class2counts params['class2profiles'] = class2profiles params['class2counts_flank'] = class2counts_flank params['profile2counts'] = profile2counts r.write_to_xls(params) break
def generate_plots_from_pickle(limit_to, report_dir, target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path):
    """Write flank-count workbooks from the Bacteria pickles, then exit.

    Loads the four '*_merged_within.p.bz2' pickles and writes one flank-count
    workbook per order into the report directory. The process is then
    terminated with ``sys.exit()``, so the per-kplet report section at the
    end of this function is currently never executed.

    Args:
        limit_to: Converted with ``str()`` and used as the pickle sub-directory.
        report_dir: Sub-directory of 'Bacteria/reports/merged_within_orders/'
            that receives all output.
        target_profiles: Passed to ``dist.get_flank_distributions`` and
            forwarded to the XLS writers.
        profile2def: Forwarded to the XLS writers.
        gid2arcog_cdd: Forwarded to the per-kplet XLS writer.
        neighborhood_files_path: Passed to the flank-distribution and
            summary-merging helpers.

    Raises:
        SystemExit: Always, once the flank-count workbooks are written.
    """
    pickle_dir = os.path.join(gv.project_data_path, 'Bacteria/pickle/', str(limit_to))

    def _load(pickle_name):
        # Every kplet pool lives in the same limit_to-specific directory.
        return t.load_compressed_pickle(os.path.join(pickle_dir, pickle_name))

    pentaplets = _load('pentaplets_merged_within.p.bz2')
    quadruplets = _load('quadruplets_merged_within.p.bz2')
    triplets = _load('triplets_merged_within.p.bz2')
    duplets = _load('duplets_merged_within.p.bz2')

    # NOTE(review): the message says "across orders" although both the
    # pickles and the output folder are the "merged_within" ones - confirm
    # the intended wording.
    print('Generationg reports for across orders merged lists')

    report_files_dir = os.path.join(
        gv.project_data_path, 'Bacteria/reports/merged_within_orders/', report_dir)

    # Create the output directory before anything is written into it. The
    # only creation used to sit after sys.exit() and therefore never ran.
    if not os.path.exists(report_files_dir):
        os.makedirs(report_files_dir)

    flank_report_fnames = ['pentaplets_flank_counts.xls', 'quadruplets_flank_counts.xls',
                           'triplets_flank_counts.xls', 'duplets_flank_counts.xls']
    kplets = [pentaplets, quadruplets, triplets, duplets]
    titles = ['Pentaplets', 'Quadruplets', 'Triplets', 'Duplets']

    for _fname, _kplet_list, _title in zip(flank_report_fnames, kplets, titles):
        # The first returned element is not used by this report.
        _, cog2gids, gid2weight = dist.get_flank_distributions(
            _kplet_list, neighborhood_files_path, target_profiles)
        universal_flank_counts = t.merge_dict_set_list(cog2gids, gid2weight)

        params = dict()
        params['xls_file_name'] = os.path.join(report_files_dir, _fname)
        params['profile2def'] = profile2def
        params['flank_counts'] = universal_flank_counts
        params['title'] = _title
        params['target_profiles'] = target_profiles
        r.write_flanking_count_xls(params)

    # NOTE(review): hard stop after the flank-count workbooks; everything
    # below is unreachable until this call is removed.
    sys.exit()

    for i, kplet_pool in zip([5, 4, 3, 2], kplets):
        print(i)
        cur_reports_folder = os.path.join(report_files_dir, str(i))
        j = 0  # number of reports written so far for this order

        for kplet_sublist in kplet_pool:
            if not os.path.exists(cur_reports_folder):
                os.mkdir(cur_reports_folder)

            src2org, file_summaries, community, community_count, community_count_with_flanks = \
                merging.merge_into_file_summaries(
                    kplet_sublist, neighborhood_files_path, file2src_src2org_map)

            # Lists with an empty src2org get no report and no number.
            if not src2org:
                continue

            xls_file_name = os.path.join(cur_reports_folder, "%d_%d.xls" % (j + 1, i))
            class2counts, class2profiles = \
                merging.cdd_profile_count_into_class_count(community_count)
            class2counts_flank, _ = \
                merging.cdd_profile_count_into_class_count(community_count_with_flanks)
            j += 1

            params = dict()
            params['xls_file_name'] = xls_file_name
            params['src2org'] = src2org
            params['file_summaries'] = file_summaries
            params['community'] = community
            params['target_profiles'] = target_profiles
            params['profile2def'] = profile2def
            params['gid2arcog_cdd'] = gid2arcog_cdd
            params['class2counts'] = class2counts
            params['class2profiles'] = class2profiles
            params['class2counts_flank'] = class2counts_flank
            r.write_to_xls(params)
def generate_plots_from_pickle(limit_to, report_dir, target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path):
    """Write flank-count workbooks and per-kplet XLS reports from the Archea pickles.

    Loads the four '*_merged_within.p.bz2' pickles, writes one flank-count
    workbook per order into the report directory, then writes one report per
    merged kplet list under '<report directory>/<order>/'. Within an order the
    reports are numbered in descending order of element 5 of the tuple
    returned by ``merging.merge_into_file_summaries`` (unpacked as ``weight``).

    Args:
        limit_to: Converted with ``str()`` and used as the pickle sub-directory.
        report_dir: Sub-directory of 'Archea/reports/merged_within_orders/'
            that receives all output.
        target_profiles: Passed to ``dist.get_flank_distributions`` and
            forwarded to both XLS writers.
        profile2def: Forwarded to both XLS writers.
        gid2arcog_cdd: Forwarded to the per-kplet XLS writer.
        neighborhood_files_path: Passed to the flank-distribution and
            summary-merging helpers.
    """
    pickle_dir = os.path.join(gv.project_data_path, 'Archea/pickle/', str(limit_to))

    def _load(pickle_name):
        # Every kplet pool lives in the same limit_to-specific directory.
        return t.load_compressed_pickle(os.path.join(pickle_dir, pickle_name))

    pentaplets = _load('pentaplets_merged_within.p.bz2')
    quadruplets = _load('quadruplets_merged_within.p.bz2')
    triplets = _load('triplets_merged_within.p.bz2')
    duplets = _load('duplets_merged_within.p.bz2')

    print('Generationg reports for within orders merged lists')

    report_files_dir = os.path.join(
        gv.project_data_path, 'Archea/reports/merged_within_orders/', report_dir)

    # Create the output directory before the flank-count workbooks are
    # written into it; it used to be created only after those writes.
    if not os.path.exists(report_files_dir):
        os.makedirs(report_files_dir)

    flank_report_fnames = ['pentaplets_flank_counts.xls', 'quadruplets_flank_counts.xls',
                           'triplets_flank_counts.xls', 'duplets_flank_counts.xls']
    kplets = [pentaplets, quadruplets, triplets, duplets]
    titles = ['Pentaplets', 'Quadruplets', 'Triplets', 'Duplets']

    for _fname, _kplet_list, _title in zip(flank_report_fnames, kplets, titles):
        # The first returned element is not used by this report.
        _, cog2gids, gid2weight = dist.get_flank_distributions(
            _kplet_list, neighborhood_files_path, target_profiles)
        universal_flank_counts = t.merge_dict_set_list(cog2gids, gid2weight)

        params = dict()
        params['xls_file_name'] = os.path.join(report_files_dir, _fname)
        params['profile2def'] = profile2def
        params['flank_counts'] = universal_flank_counts
        params['title'] = _title
        params['target_profiles'] = target_profiles
        r.write_flanking_count_xls(params)

    for i, kplet_pool in zip([2, 3, 4, 5], [duplets, triplets, quadruplets, pentaplets]):
        # Summaries are gathered up front because they are sorted before the
        # report numbers are handed out.
        file_summaries_pool = []
        for kplet_list in kplet_pool:
            file_summaries_pool.append(merging.merge_into_file_summaries(
                kplet_list, neighborhood_files_path, file2src_src2org_map, 'archaea'))

        cur_reports_folder = os.path.join(report_files_dir, str(i))
        j = 0  # number of reports written so far for this order

        for summary_terms in sorted(file_summaries_pool, key=itemgetter(5), reverse=True):
            src2org, file_summaries, community, community_count, community_count_with_flanks, _weight = summary_terms

            # Summaries with an empty src2org get no report and no number.
            if not src2org:
                continue

            if not os.path.exists(cur_reports_folder):
                os.mkdir(cur_reports_folder)

            xls_file_name = os.path.join(cur_reports_folder, "%d_%d.xls" % (j + 1, i))
            class2counts, class2profiles = \
                merging.arcog_profile_count_into_class_count(community_count)
            class2counts_flank, _ = \
                merging.arcog_profile_count_into_class_count(community_count_with_flanks)
            j += 1

            params = dict()
            params['xls_file_name'] = xls_file_name
            params['src2org'] = src2org
            params['file_summaries'] = file_summaries
            params['community'] = community
            params['target_profiles'] = target_profiles
            params['profile2def'] = profile2def
            params['gid2arcog_cdd'] = gid2arcog_cdd
            params['class2counts'] = class2counts
            params['class2profiles'] = class2profiles
            params['class2counts_flank'] = class2counts_flank
            r.write_to_xls(params)