def merging_pipeline_for_order(order, load_from_db=False):

    limit_to = 1000000000
    print "starting for ", order

    if load_from_db:
        print "Loading kplets from DB"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
            #kplets = t.load_compressed_pickle(os.path.join(data_path, kplet_file))
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
            #kplets = t.load_compressed_pickle(os.path.join(data_path, kplet_file))
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
            kplets = q.get_report_kplets(profile_id2code, limit_to=limit_to)
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'
            kplets = p.get_report_kplets(profile_id2code, limit_to=limit_to)
    else:
        print "Loading kplets from pickle file"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'

        kplet_file_full = os.path.join(data_path, kplet_file)
        print kplet_file_full
        kplets = t.load_compressed_pickle(kplet_file_full)

    # print "Starting for", kplet_file
    # print "Loading kplets"
    # kplets = t.load_compressed_pickle(os.path.join(data_path, kplet_file))

    print "No of kplets:", len(kplets)

    #print "Loading file2genes"
    #_file2genes = {}
    #for _f in os.listdir(neighborhood_files_path):
    #    _file2genes[_f] = dt.get_wgs_file(os.path.join(neighborhood_files_path, _f))

    # print 'Filtering'
    # kplets = filter_seed(kplets, _file2genes)
    # print "No of kplets:", len(kplets)
    # fname = os.path.join(data_path, kplet_file.split('.')[0]+'_seed.p.bz2')
    # print 'Dumping', fname
    # t.dump_compressed_pickle(fname, kplets)

    print "Basic merging"
    merged_lists = merging.basic_merge_within_orders(kplets)
    fname = os.path.join(data_path, "basic_merged_" + kplet_file)
    print "Dumping basic merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)

    print "Iterative merging"
    merged_lists = merging.merge_kplets_within_orders_iterative(merged_lists)
    fname = os.path.join(data_path, "iterative_merged_" + kplet_file)
    print "Dumping Iterative merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)
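# Hypothetical driver sketch for the pipeline above; the loop over orders is an
# assumption, not part of the original source. Assumes the module-level
# data_path, t, merging, p and q objects used above are already configured.
if __name__ == '__main__':
    for order in [2, 3, 4, 5]:
        merging_pipeline_for_order(order, load_from_db=False)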
def generate_plots_from_pickle(limit_to, report_dir, target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path):

    data_path = os.path.join(gv.project_data_path, 'Archea/pickle/')
    fname = os.path.join(data_path, str(limit_to), 'pentaplets_merged_within.p.bz2')
    pentaplets = t.load_compressed_pickle(fname)
    # fname = os.path.join(data_path, str(limit_to), 'quadruplets_merged_within.p.bz2')
    # quadruplets = t.load_compressed_pickle(fname)
    # fname = os.path.join(data_path, str(limit_to), 'triplets_merged_within.p.bz2')
    # triplets = t.load_compressed_pickle(fname)
    # fname = os.path.join(data_path, str(limit_to), 'duplets_merged_within.p.bz2')
    # duplets = t.load_compressed_pickle(fname)

    print 'Generating reports for within orders merged lists'

    report_files_dir = os.path.join(gv.project_data_path, 'Archea/reports/merged_within_orders/', report_dir)
    if not os.path.exists(report_files_dir):
        os.mkdir(report_files_dir)

    # Only the pentaplet pool is loaded above, so it is reused for all four orders.
    for i, kplet_pool in zip([5, 4, 3, 2], [pentaplets, pentaplets, pentaplets, pentaplets]):
        print 'Starting for', i
        j = 0
        profile2counts_pool = dist.get_flank_distributions(kplet_pool, neighborhood_files_path, target_profiles)

        for k in range(len(kplet_pool)):
            kplet_sublist = kplet_pool[k]
            cur_reports_folder = os.path.join(report_files_dir, str(i))
            if not os.path.exists(cur_reports_folder):
                os.mkdir(cur_reports_folder)

            src2org, file_summaries, community, community_count, community_count_with_flanks = \
                merging.merge_into_file_summaries(kplet_sublist,
                                                  neighborhood_files_path,
                                                  file2src_src2org_map,
                                                  'archaea')
            if not src2org:
                continue

            xls_file_name = os.path.join(cur_reports_folder, "%d_%d.xls" % (j+1, i))
            class2counts, class2profiles = merging.arcog_profile_count_into_class_count(community_count)
            class2counts_flank, class2profiles_flank = merging.arcog_profile_count_into_class_count(community_count_with_flanks)
            profile2counts = profile2counts_pool[k]

            j += 1
            params = dict()
            params['xls_file_name'] = xls_file_name
            params['src2org'] = src2org
            params['file_summaries'] = file_summaries
            params['community'] = community
            params['target_profiles'] = target_profiles
            params['profile2def'] = profile2def
            params['gid2arcog_cdd'] = gid2arcog_cdd
            params['class2counts'] = class2counts
            params['class2profiles'] = class2profiles
            params['class2counts_flank'] = class2counts_flank
            params['profile2counts'] = profile2counts
            r.write_to_xls(params)
def process_reporting_single_order(order, threshold):

    if order == 2:
        prefix = 'duplets'
    elif order == 3:
        prefix = 'triplets'
    elif order == 4:
        prefix = 'quadruplets'
    elif order == 5:
        prefix = 'pentaplets'

    pickle_file = os.path.join(data_path, "%s.p.bz2" % prefix)
    print "Loading kplets:", pickle_file
    kplets = t.load_compressed_pickle(pickle_file)
    kplets = {kplet.id: kplet for kplet in kplets}

    merged_data_file = os.path.join(data_path, "%s.merged_%s.txt" % (prefix, threshold.split('.')[1]))
    print "Loading merged file:", merged_data_file

    merged_lists = [[kplets[int(id)] for id in cur_list.split(',')[0].split()]
                    for cur_list in open(merged_data_file).readlines()
                    if cur_list.split(',')[0]]

    reports_dir = os.path.join(gv.project_data_path, 'cas1402/reports/thr_%s/' % threshold)
    if not os.path.exists(reports_dir):
        os.mkdir(reports_dir)
    reports_dir = os.path.join(reports_dir, str(order))

    print "Generating reports to:", reports_dir
    generate_reports(merged_lists, reports_dir, neighborhood_files_path)
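# Hypothetical driver sketch for process_reporting_single_order. The threshold
# string is split on '.', so '0.7' selects merged files named *.merged_7.txt;
# the loop over orders is an assumption, mirroring the order codes above.
if __name__ == '__main__':
    threshold = '0.7'
    for order in [2, 3, 4, 5]:
        process_reporting_single_order(order, threshold)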
def prok1603_architecture_frequencies():

    work_dir = os.path.join(gv.project_data_path, 'UvrD/')
    map_file = os.path.join(work_dir, 'prok1603/prok1603_weights.txt')
    locus2weight = {l.split()[0]: float(l.split()[1]) for l in open(map_file)}

    def_file = os.path.join(gv.project_data_path, 'cas4/profiles/defenseProfiles.tab')
    profile2gene = {}
    profile2def = {}

    for l in open(def_file):
        terms = l.strip().split('\t')
        profile = terms[0]
        gene_names = terms[3].split(',')
        if len(gene_names) > 1:
            profile2gene[profile] = gene_names[1]
        else:
            profile2gene[profile] = gene_names[0]
        profile2def[profile] = terms[4]

    cdd_profile2gene = t.map_cdd_profile2gene_name()
    cdd_profile2gene.update(profile2gene)
    cdd_profile2def = t.map_cdd_profile2def()
    cdd_profile2def.update(profile2def)

    prok1603_loci_file = os.path.join(work_dir, 'prok1603/prok1603_loci.p.bz2')
    loci = t.load_compressed_pickle(prok1603_loci_file)

    profile2loci = {}
    for locus in loci:
        for _profile in locus.profiles:
            if _profile in profile2loci:
                profile2loci[_profile].append(locus)
            else:
                profile2loci[_profile] = [locus]

    # _loci avoids shadowing the loci list loaded above.
    for (profile, _loci) in sorted(profile2loci.items(), key=lambda x: len(x[1]), reverse=True):
        _weight = sum([locus2weight[locus.base_file_name] for locus in _loci])
        print "%s\t%s\t%d\t%f\t%s" % (profile,
                                      cdd_profile2gene[profile] if profile in cdd_profile2gene else "",
                                      len(_loci),
                                      _weight,
                                      cdd_profile2def[profile] if profile in cdd_profile2def else "")
def process_reporting_single_order_old(order):

    if order == 2:
        data_file = 'iterative_merged_duplets.p.bz2'
    elif order == 3:
        data_file = 'iterative_merged_triplets.p.bz2'
    elif order == 4:
        data_file = 'iterative_merged_quadruplets.p.bz2'
    elif order == 5:
        data_file = 'iterative_merged_pentaplets.p.bz2'

    data_file = os.path.join(data_path, data_file)
    print "Loading file", data_file
    merged_lists = t.load_compressed_pickle(data_file)

    reports_dir = os.path.join(reports_path, 'merged', str(order))
    print "Generating reports to:", reports_dir
    generate_reports(merged_lists, reports_dir, neighborhood_files_path)
def process_reporting_single_order(order):

    if order == 2:
        data_file = 'iterative_merged_duplets.p.bz2'
    elif order == 3:
        data_file = 'iterative_merged_triplets.p.bz2'
    elif order == 4:
        data_file = 'iterative_merged_quadruplets.p.bz2'
    elif order == 5:
        data_file = 'iterative_merged_pentaplets.p.bz2'

    data_file = os.path.join(gv.project_data_path, 'CRISPR/pickle/', dataset_code, data_file)
    print "Loading file", data_file
    merged_lists = t.load_compressed_pickle(data_file)

    reports_dir = os.path.join(gv.project_data_path, 'CRISPR/reports', dataset_code, 'merged', str(order))
    print "Generating reports to:", reports_dir

    neighborhood_files_path = os.path.join(gv.project_data_path, 'CRISPR/datasets/', dataset_code, 'wgs')
    generate_reports(merged_lists, reports_dir, neighborhood_files_path)
def generate_pickles(save_path, limit_to):

    if not os.path.exists(save_path):
        os.mkdir(save_path)

    # print "Loading kplets from DB"
    # pentaplets = p.get_report_kplets(limit_to=limit_to, load_locations=True)
    # quadruplets = q.get_report_kplets(limit_to=limit_to, load_locations=True)
    # triplets = tr.get_report_kplets(limit_to=limit_to, load_locations=True)
    # duplets = d.get_report_kplets(limit_to=limit_to, load_locations=True)

    # print "Dumping raw kplet data to files"
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'duplets_raw.p.bz2'), 'w')
    # pickle.dump(duplets, dump_file)
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'triplets_raw.p.bz2'), 'w')
    # pickle.dump(triplets, dump_file)
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'quadruplets_raw.p.bz2'), 'w')
    # pickle.dump(quadruplets, dump_file)
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'pentaplets_raw.p.bz2'), 'w')
    # pickle.dump(pentaplets, dump_file)

    print "Loading raw kplets from pickles"
    dump_file = os.path.join(save_path, 'duplets_raw.p.bz2')
    duplets = t.load_compressed_pickle(dump_file)
    dump_file = os.path.join(save_path, 'triplets_raw.p.bz2')
    triplets = t.load_compressed_pickle(dump_file)
    dump_file = os.path.join(save_path, 'quadruplets_raw.p.bz2')
    quadruplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'pentaplets_raw.p.bz2')
    # pentaplets = t.load_compressed_pickle(dump_file)

    print "Basic within merging"
    # pentaplets = merging.basic_merge_within_orders(pentaplets)
    quadruplets = merging.basic_merge_within_orders(quadruplets)
    triplets = merging.basic_merge_within_orders(triplets)
    duplets = merging.basic_merge_within_orders(duplets)

    print "Dumping basic merges"
    # dump_file = os.path.join(save_path, 'pentaplets_basic_merged.p.bz2')
    # t.dump_compressed_pickle(dump_file, pentaplets)
    dump_file = os.path.join(save_path, 'quadruplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, quadruplets)
    dump_file = os.path.join(save_path, 'triplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, triplets)
    dump_file = os.path.join(save_path, 'duplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, duplets)

    # print "Loading basic merges"
    # dump_file = os.path.join(save_path, 'pentaplets_basic_merged.p.bz2')
    # pentaplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'quadruplets_basic_merged.p.bz2')
    # quadruplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'triplets_basic_merged.p.bz2')
    # triplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'duplets_basic_merged.p.bz2')
    # duplets = t.load_compressed_pickle(dump_file)

    # Everything below this exit is retained scratch code from earlier runs.
    sys.exit()

    print 'Starting iterative within mergings'
    pentaplets = merging.merge_kplets_within_orders_iterative_2(pentaplets)
    quadruplets = merging.merge_kplets_within_orders_iterative_2(quadruplets)
    triplets = merging.merge_kplets_within_orders_iterative_2(triplets)
    duplets = merging.merge_kplets_within_orders_iterative_2(duplets)

    print "Dumping iterative merges"
    dump_file = os.path.join(save_path, 'pentaplets_iterative_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, pentaplets)
    dump_file = os.path.join(save_path, 'quadruplets_iterative_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, quadruplets)
    dump_file = os.path.join(save_path, 'triplets_iterative_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, triplets)
    dump_file = os.path.join(save_path, 'duplets_iterative_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, duplets)
    sys.exit()

    print 'Dumping merged kplet lists to files'
    dump_file = bz2.BZ2File(os.path.join(save_path, 'pentaplets_merged_within.p.bz2'), 'w')
    pickle.dump(pentaplets, dump_file)
    dump_file = bz2.BZ2File(os.path.join(save_path, 'quadruplets_merged_within.p.bz2'), 'w')
    pickle.dump(quadruplets, dump_file)
    dump_file = bz2.BZ2File(os.path.join(save_path, 'triplets_merged_within.p.bz2'), 'w')
    pickle.dump(triplets, dump_file)
    dump_file = bz2.BZ2File(os.path.join(save_path, 'duplets_merged_within.p.bz2'), 'w')
    pickle.dump(duplets, dump_file)

    print 'Starting across mergings'
    triplets, duplets = merging.merge_kplets_across_orders(triplets, duplets)
    quadruplets, triplets = merging.merge_kplets_across_orders(quadruplets, triplets)
    pentaplets, quadruplets = merging.merge_kplets_across_orders(pentaplets, quadruplets)

    print 'Dumping to files'
    dump_file = bz2.BZ2File(os.path.join(save_path, 'pentaplets_merged_across.p.bz2'), 'w')
    pickle.dump(pentaplets, dump_file)
    dump_file = bz2.BZ2File(os.path.join(save_path, 'quadruplets_merged_across.p.bz2'), 'w')
    pickle.dump(quadruplets, dump_file)
    dump_file = bz2.BZ2File(os.path.join(save_path, 'triplets_merged_across.p.bz2'), 'w')
    pickle.dump(triplets, dump_file)
    dump_file = bz2.BZ2File(os.path.join(save_path, 'duplets_merged_across.p.bz2'), 'w')
    pickle.dump(duplets, dump_file)

    print 'Done for limit_to:', limit_to
    print
    print
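# Hypothetical driver sketch for generate_pickles, sweeping kplet count limits.
# The limit values mirror the commented-out driver that appears later in this
# collection; the Archea pickle directory is an assumption.
if __name__ == '__main__':
    data_path = os.path.join(gv.project_data_path, 'Archea/pickle/')
    for limit_to in [10000, 1000000]:
        print "Limit_to:", limit_to
        generate_pickles(os.path.join(data_path, str(limit_to)), limit_to)
        print 'Done'
        print "------------------------"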
def generate_plots_from_pickle(limit_to, report_dir, target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path):

    data_path = os.path.join(gv.project_data_path, 'Bacteria/pickle/')
    fname = os.path.join(data_path, str(limit_to), 'pentaplets_merged_within.p.bz2')
    pentaplets = t.load_compressed_pickle(fname)
    fname = os.path.join(data_path, str(limit_to), 'quadruplets_merged_within.p.bz2')
    quadruplets = t.load_compressed_pickle(fname)
    fname = os.path.join(data_path, str(limit_to), 'triplets_merged_within.p.bz2')
    triplets = t.load_compressed_pickle(fname)
    fname = os.path.join(data_path, str(limit_to), 'duplets_merged_within.p.bz2')
    duplets = t.load_compressed_pickle(fname)

    print 'Generating reports for across orders merged lists'

    flank_report_fnames = ['pentaplets_flank_counts.xls', 'quadruplets_flank_counts.xls',
                           'triplets_flank_counts.xls', 'duplets_flank_counts.xls']
    kplets = [pentaplets, quadruplets, triplets, duplets]
    titles = ['Pentaplets', 'Quadruplets', 'Triplets', 'Duplets']

    report_files_dir = os.path.join(gv.project_data_path, 'Bacteria/reports/merged_within_orders/', report_dir)

    for i in range(len(flank_report_fnames)):
        _fname = flank_report_fnames[i]
        _kplet_list = kplets[i]
        _title = titles[i]

        flank_counts, cog2gids, gid2weight = dist.get_flank_distributions(_kplet_list, neighborhood_files_path, target_profiles)
        universal_flank_counts = t.merge_dict_set_list(cog2gids, gid2weight)

        params = dict()
        params['xls_file_name'] = os.path.join(report_files_dir, _fname)
        params['profile2def'] = profile2def
        params['flank_counts'] = universal_flank_counts
        params['title'] = _title
        params['target_profiles'] = target_profiles
        r.write_flanking_count_xls(params)

    # Everything below this exit is retained scratch code from earlier runs.
    sys.exit()

    if not os.path.exists(report_files_dir):
        os.mkdir(report_files_dir)

    for i, kplet_pool in zip([5, 4, 3, 2], [pentaplets, quadruplets, triplets, duplets]):
        print i
        j = 0
        # flank_counts_pool = dist.get_flank_distributions(kplet_pool, neighborhood_files_path, target_profiles)

        for k in range(len(kplet_pool)):
            kplet_sublist = kplet_pool[k]
            cur_reports_folder = os.path.join(report_files_dir, str(i))
            if not os.path.exists(cur_reports_folder):
                os.mkdir(cur_reports_folder)

            src2org, file_summaries, community, community_count, community_count_with_flanks \
                = merging.merge_into_file_summaries(kplet_sublist,
                                                    neighborhood_files_path,
                                                    file2src_src2org_map)
            if not src2org:
                continue

            xls_file_name = os.path.join(cur_reports_folder, "%d_%d.xls" % (j + 1, i))
            class2counts, class2profiles = merging.cdd_profile_count_into_class_count(community_count)
            class2counts_flank, class2profiles_flank = merging.cdd_profile_count_into_class_count(community_count_with_flanks)
            # profile2counts = flank_counts_pool[k]

            j += 1
            params = dict()
            params['xls_file_name'] = xls_file_name
            params['src2org'] = src2org
            params['file_summaries'] = file_summaries
            params['community'] = community
            params['target_profiles'] = target_profiles
            params['profile2def'] = profile2def
            params['gid2arcog_cdd'] = gid2arcog_cdd
            params['class2counts'] = class2counts
            params['class2profiles'] = class2profiles
            params['class2counts_flank'] = class2counts_flank
            # params['profile2counts'] = profile2counts
            r.write_to_xls(params)
        params['xls_file_name'] = xls_file_name
        params['src2org'] = src2org
        params['file_summaries'] = file_summaries
        params['community'] = community
        params['target_profiles'] = target_profiles
        params['profile2def'] = profile2def
        params['gid2arcog_cdd'] = gid2arcog_cdd
        r.write_to_xls(params)

        if cnt == 200:
            break

    filtered_merged_kplets = []

    fname = '/Users/hudaiber/Projects/NewSystems/data/Bacteria/pickle/100000/pentaplets_community_count.p.bz2'
    pp_com_cnt = t.load_compressed_pickle(fname)

    # Flag merged pentaplet lists whose neighborhood files overlap the pivot
    # list by more than 30% of the smaller file set.
    pivot_files = set()
    for kplet in pentaplets[1].kplets:
        pivot_files.update(set(kplet.files))

    for i in range(2, len(pentaplets)):
        _files = set()
        for kplet in pentaplets[i].kplets:
            _files.update(set(kplet.files))
        min_len = min(len(pivot_files), len(_files))
        if len(pivot_files.intersection(_files)) / float(min_len) > 0.3:
            print i, min_len, len(pivot_files.intersection(_files))
pan_data_path = '/panfs/pan1/patternquest/Projects/NewSystems/data/Archea/'

fpath = '/panfs/pan1/patternquest/Projects/NewSystems/data/Archea/arCOG/arcogs_integrase.txt'
integrases = [l.strip() for l in open(fpath).readlines()]
fpath = '/panfs/pan1/patternquest/Projects/NewSystems/data/Archea/arCOG/arcogs_recombinase.txt'
recombinases = [l.strip() for l in open(fpath).readlines()]
fpath = '/panfs/pan1/patternquest/Projects/NewSystems/data/Archea/arCOG/arcogs_transposase.txt'
transposases = [l.strip() for l in open(fpath).readlines()]

pty_files_path = pan_data_path + 'genes_and_flanks/win_10/pty/'

print "Loading loci"
pickle_file = os.path.join(pan_data_path, 'pickle/10000/loci.p.bz2')
loci = t.load_compressed_pickle(pickle_file)

# all_bait_profiles = integrases + recombinases + resolvases + transposases
# loci = []
#
# cnt = 0
# for f in os.listdir(pty_files_path):
#     loci.append(dt.get_pty_file(pty_files_path + f))
#     if cnt % 1000 == 0:
#         print cnt
#     cnt += 1
#
# print "Dumping loci"
# t.dump_compressed_pickle(pickle_file, loci)
# print "Finished dumping"
    if root_node.count < size_limit:
        return [root_node]
    else:
        return break_down(root_node.left, size_limit) + break_down(root_node.right, size_limit)


if __name__ == '__main__':

    files_path = os.path.join(gv.project_data_path, 'CRISPR/Cas1/genes_and_flanks/wgs/')
    pickle_path = os.path.join(gv.project_data_path, 'CRISPR/pickle/')

    print "Loading data"
    Z = np.load('/Users/hudaiber/Projects/NewSystems/data/CRISPR/pickle/upgma_distance_array.npz').items()[0][1]
    jwd = np.load('/Users/hudaiber/Projects/NewSystems/data/CRISPR/pickle/jw_distances.npz').items()[0][1]

    fname = os.path.join(pickle_path, 'loci.p.bz2')
    loci = t.load_compressed_pickle(fname)

    print "Starting"
    root = to_tree(Z)
    root = clone_graph(root)

    nodes = get_nodes(root)
    id2node = {node.id: node for node in nodes}
    leaf_ids = leaves_list(Z)

    cnt = 0
    i = 0
    threshold = 0.5
    clusters = []
    total_count = 0
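    # Illustrative sketch of break_down usage: split the dendrogram into
    # subtrees of at most size_limit leaves. Assumes the cloned nodes keep the
    # count/left/right attributes of scipy's ClusterNode; 500 is an arbitrary
    # example value, not taken from the original run.
    subtrees = break_down(root, 500)
    print "No of subtrees:", len(subtrees)
    print "Leaves covered:", sum([node.count for node in subtrees])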
def generate_plots_from_pickle(limit_to, report_dir, target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path):

    data_path = os.path.join(gv.project_data_path, 'Archea/pickle/')
    fname = os.path.join(data_path, str(limit_to), 'pentaplets_merged_within.p.bz2')
    pentaplets = t.load_compressed_pickle(fname)
    fname = os.path.join(data_path, str(limit_to), 'quadruplets_merged_within.p.bz2')
    quadruplets = t.load_compressed_pickle(fname)
    fname = os.path.join(data_path, str(limit_to), 'triplets_merged_within.p.bz2')
    triplets = t.load_compressed_pickle(fname)
    fname = os.path.join(data_path, str(limit_to), 'duplets_merged_within.p.bz2')
    duplets = t.load_compressed_pickle(fname)
    # pentaplets, quadruplets, triplets = None, None, None

    print 'Generating reports for within orders merged lists'

    flank_report_fnames = ['pentaplets_flank_counts.xls', 'quadruplets_flank_counts.xls',
                           'triplets_flank_counts.xls', 'duplets_flank_counts.xls']
    kplets = [pentaplets, quadruplets, triplets, duplets]
    titles = ['Pentaplets', 'Quadruplets', 'Triplets', 'Duplets']

    report_files_dir = os.path.join(gv.project_data_path, 'Archea/reports/merged_within_orders/', report_dir)

    for i in range(len(flank_report_fnames)):
        _fname = flank_report_fnames[i]
        _kplet_list = kplets[i]
        _title = titles[i]

        flank_counts, cog2gids, gid2weight = dist.get_flank_distributions(_kplet_list, neighborhood_files_path, target_profiles)

        # universal_flank_counts = t.merge_dict_list(flank_counts)
        # universal_cog2gids = t.merge_dict_set_list(cog2gids, gid2weight)
        universal_flank_counts = t.merge_dict_set_list(cog2gids, gid2weight)

        params = dict()
        params['xls_file_name'] = os.path.join(report_files_dir, _fname)
        params['profile2def'] = profile2def
        params['flank_counts'] = universal_flank_counts
        params['title'] = _title
        params['target_profiles'] = target_profiles
        r.write_flanking_count_xls(params)

    if not os.path.exists(report_files_dir):
        os.mkdir(report_files_dir)

    for i, kplet_pool in zip([2, 3, 4, 5], [duplets, triplets, quadruplets, pentaplets]):
        j = 0
        # profile2counts_pool = dist.get_flank_distributions(kplet_pool, neighborhood_files_path, target_profiles)

        file_summaries_pool = []
        for kplet_list in kplet_pool:
            summary_terms = merging.merge_into_file_summaries(kplet_list,
                                                              neighborhood_files_path,
                                                              file2src_src2org_map,
                                                              'archaea')
            file_summaries_pool.append(summary_terms)

        for summary_terms in sorted(file_summaries_pool, key=itemgetter(5), reverse=True):
            src2org, file_summaries, community, community_count, community_count_with_flanks, weight = summary_terms
            if not src2org:
                continue

            cur_reports_folder = os.path.join(report_files_dir, str(i))
            if not os.path.exists(cur_reports_folder):
                os.mkdir(cur_reports_folder)

            xls_file_name = os.path.join(cur_reports_folder, "%d_%d.xls" % (j+1, i))
            class2counts, class2profiles = merging.arcog_profile_count_into_class_count(community_count)
            class2counts_flank, class2profiles_flank = merging.arcog_profile_count_into_class_count(community_count_with_flanks)

            j += 1
            params = dict()
            params['xls_file_name'] = xls_file_name
            params['src2org'] = src2org
            params['file_summaries'] = file_summaries
            params['community'] = community
            params['target_profiles'] = target_profiles
            params['profile2def'] = profile2def
            params['gid2arcog_cdd'] = gid2arcog_cdd
            params['class2counts'] = class2counts
            params['class2profiles'] = class2profiles
            params['class2counts_flank'] = class2counts_flank
            # params['profile2counts'] = profile2counts
            r.write_to_xls(params)
            t.update_dictionary(gene2cnt, _gene_name, _weight)

        # Normalize the type counts into a probability distribution; the cast
        # to a float array is needed for the in-place division to work.
        _values = np.array(type2cnt.values(), dtype=np.float)
        _values /= np.sum(_values)
        entropy = -1 * np.sum(_values * np.log(_values))
        entropies.append(entropy)
        to_collapse_retval.append((cluster, type2cnt, entropy, gene2cnt))

    entropies = np.array(entropies)
    average_entropy = np.average(entropies)

    return singles, to_collapse_retval, average_entropy


if __name__ == '__main__':

    fname = '/Volumes/hudaiber/Projects/NewSystems/data/cas4/dendrograms_10.p.bz2'
    data = t.load_compressed_pickle(fname)

    loci = data['loci']
    tree = data['root']
    leaf_names = [os.path.basename(l.file_name) for l in loci]

    newick = tree_to_newick(tree, "", tree.dist, leaf_names)

    fname = '/Volumes/hudaiber/Projects/NewSystems/data/cas4/tmp.nw'
    outf = open(fname, 'w')
    outf.write(newick)
    outf.close()
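# Self-contained illustration of the entropy step above, with hypothetical
# counts: the normalized type counts form a probability distribution whose
# Shannon entropy (in nats) measures how mixed a cluster is.
import numpy as np

_type2cnt = {'cas1': 6.0, 'cas2': 3.0, 'transposase': 1.0}  # hypothetical
_values = np.array(_type2cnt.values(), dtype=np.float)
_values /= np.sum(_values)                         # -> 0.6, 0.3, 0.1
_entropy = -1 * np.sum(_values * np.log(_values))  # ~0.898 nats
print "example entropy:", _entropy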
def merging_pipeline_for_order(order, data_path, load_from_db=False):

    limit_to = 1000000000
    print "starting for ", order

    if load_from_db:
        print "Loading kplets from DB"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
            kplets = q.get_report_kplets(profile_id2code, limit_to=limit_to)
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'
            kplets = p.get_report_kplets(profile_id2code, limit_to=limit_to)
    else:
        print "Loading kplets from pickle file"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'

        kplet_file_full = os.path.join(data_path, kplet_file)
        print "Loading :", kplet_file_full
        kplets = t.load_compressed_pickle(kplet_file_full)

    print "No of kplets:", len(kplets)

    # The same two-stage merge, repeated for each loci threshold.
    for loci_threshold in [0.7, 0.8, 0.9]:

        print "Starting to merge with loci_threshold:", loci_threshold
        #print "Loading file2genes"
        tic = time.time()

        print "Basic merging"
        merged_lists = merging.basic_merge_within_orders(kplets, loci_threshold)
        print "Basic merging done. Merged lists:", len(merged_lists)

        fname = os.path.join(data_path, "basic_merged_" + "%f_" % loci_threshold + kplet_file)
        print "Dumping basic merging: ", fname
        t.dump_compressed_pickle(fname, merged_lists)

        print "Iterative merging"
        merged_lists = merging.merge_kplets_within_orders_iterative(merged_lists, loci_threshold)
        print "Iterative merging done. Merged lists:", len(merged_lists)

        fname = os.path.join(data_path, "iterative_merged_" + "%f_" % loci_threshold + kplet_file)
        print "Dumping Iterative merging: ", fname
        t.dump_compressed_pickle(fname, merged_lists)

        print "Completed in:", time.time() - tic, "(s)"
sys.path.append(gv.project_code_path)

import lib.utils.tools as t
import csv
import os
import gzip
import bz2


if __name__ == '__main__':

    pickle_file = sys.argv[1]
    csv_file = sys.argv[2]

    print "Loading objects from file:", pickle_file
    kplets = t.load_compressed_pickle(pickle_file)
    print "No of kplets:", len(kplets)

    # print "Writing into CSV file:", csv_file
    # if compression == 'True':
    #     # fout = bz2.BZ2File(csv_file, "w")
    #     fout = gzip.open(csv_file, "w")
    # else:
    #     fout = open(csv_file, "w")

    print "Writing into file:", csv_file
    fout = bz2.BZ2File(csv_file, "w")

    csv_writer = csv.writer(fout)
    csv_writer.writerow(('Id', 'K', 'Count', 'Codes', 'Files'))

    for kplet in kplets:
        row = []
files_path = os.path.join(gv.project_data_path, 'cas1402/files/')
pickle_path = os.path.join(gv.project_data_path, 'cas1402/pickle/')
report_path = os.path.join(gv.project_data_path, 'cas1402/reports/')

# print "Loading CDD"
# gi2annotation = t.map_gid2cdd()
#
# print "Loading loci"
# loci = [Locus(os.path.join(files_path, f), gi2annotation) for f in os.listdir(files_path)]
# loci = [locus for locus in loci if len(locus.genes) > 2]
#
# t.dump_compressed_pickle('loci.p.bz2', loci)

print "Loading loci"
loci = t.load_compressed_pickle('loci.p.bz2')

# reporting_dot_product_run(loci)
# process_jackard_distances(loci)
reporting_jackard_distance(loci)

# method = 'union'
# thresholds = [(0, 0.01), (1, 0.4), (1, 0.6), (5, 0.5), (10, 0.6), (10, 0.4)]
# process_dot_product(thresholds, method, loci)

# results_data_file_name = os.path.join(gv.project_data_path, 'cas1402/results_%s_1.txt' % method)
# results_image_file_name = os.path.join(gv.project_data_path, 'cas1402/clustering_%s_1.png' % method)
#
# reporting.plot_results(results_data_file_name, results_image_file_name)
# r.write_flanking_count_xls(params)

# data_path = os.path.join(gv.project_data_path, 'Archea/pickle/tmp/')
#
# # for limit_to in [10000, 1000000]:
# for limit_to in [1000000]:
#     print "Limit_to:", limit_to
#     print
#     cur_path = os.path.join(data_path, str(limit_to))
#     generate_pickles(cur_path, limit_to)
#     print 'Done'
#     print "------------------------"
# sys.exit()

fname = '/Users/hudaiber/Projects/NewSystems/data/Archea/pickle/1000000/pentaplets_merged_within.p.bz2'
pentaplets = t.load_compressed_pickle(fname)

# fname = '/Users/hudaiber/Projects/NewSystems/data/Archea/pickle/1000000/5_community_count.p.bz2'
# community = t.load_compressed_pickle(fname)
#
# fname = '/Users/hudaiber/Projects/NewSystems/data/Archea/pickle/1000000/5_community_count_with_flanks.p.bz2'
# community_flank = t.load_compressed_pickle(fname)
#
# fname = '/Users/hudaiber/Projects/NewSystems/data/Archea/pickle/1000000/5_community_count_classes.p.bz2'
# community_classes = t.load_compressed_pickle(fname)
#
# fname = '/Users/hudaiber/Projects/NewSystems/data/Archea/pickle/1000000/5_community_count_with_flanks_classes.p.bz2'
# community_flank_classes = t.load_compressed_pickle(fname)
#
# fname = '/Users/hudaiber/Projects/NewSystems/data/Archea/pickle/1000000/duplets_merged_within.p.bz2'
# community_flank_classes = t.load_compressed_pickle(fname)
def merging_pipeline_for_order(order, data_path, load_from_db=False):

    limit_to = 1000000000
    print "starting for ", order

    if load_from_db:
        print "Loading kplets from DB"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
            #kplets = t.load_compressed_pickle(os.path.join(data_path, kplet_file))
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
            #kplets = t.load_compressed_pickle(os.path.join(data_path, kplet_file))
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
            kplets = q.get_report_kplets(profile_id2code, limit_to=limit_to)
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'
            kplets = p.get_report_kplets(profile_id2code, limit_to=limit_to)
    else:
        print "Loading kplets from pickle file"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'

        kplet_file_full = os.path.join(data_path, kplet_file)
        print "Loading :", kplet_file_full
        kplets = t.load_compressed_pickle(kplet_file_full)

    print "No of kplets:", len(kplets)

    #print "Loading file2genes"

    # tic = time.time()
    # print "Basic merging"
    # merged_lists = merging.basic_merge_within_orders(kplets)
    # print "Basic merging done. Merged lists:", len(merged_lists)
    # # fname = os.path.join(data_path, "basic_merged_"+kplet_file)
    # # print "Dumping basic merging: ", fname
    # # t.dump_compressed_pickle(fname, merged_lists)
    #
    # print "Iterative merging"
    # merged_lists = merging.merge_kplets_within_orders_iterative(merged_lists)
    # print "Iterative merging done. Merged lists:", len(merged_lists)
    # # fname = os.path.join(data_path, "iterative_merged_"+kplet_file)
    # # print "Dumping Iterative merging: ",fname
    # # t.dump_compressed_pickle(fname, merged_lists)
    # print "Completed in:", time.time()-tic, "(s)"

    csv_kplet_file = os.path.join(data_path, kplet_file.split('.')[0] + ".csv")
    csv_merged_lists_file = os.path.join(data_path, "iterative_merged_" + kplet_file.split('.')[0] + ".csv")

    print "Writing kplets to csv file:"
    print csv_kplet_file
    t.write_kplets_to_csv(kplets, csv_kplet_file)

    tic = time.time()
    print "Starting kpletmerger with params:"
    print "kpletmerger", csv_kplet_file, csv_merged_lists_file
    print "\n\n"

    sp.call(["kpletmerger", csv_kplet_file, csv_merged_lists_file])

    print "\n\n"
    print "Completed in:", time.time() - tic, "(s)"
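# Optional hardening sketch (an assumption, not part of the pipeline above):
# kpletmerger is an external binary, so subprocess.check_call would surface a
# non-zero exit status instead of silently continuing past a failed merge.
import subprocess as sp

def run_kpletmerger_checked(csv_kplet_file, csv_merged_lists_file):
    try:
        sp.check_call(["kpletmerger", csv_kplet_file, csv_merged_lists_file])
    except sp.CalledProcessError as e:
        print "kpletmerger failed with exit code:", e.returncode
        raise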
if __name__ == '__main__':

    work_path = os.path.join(gv.project_data_path, 'Bacteria/cases')
    pty_path = gv.pty_data_path

    kplet_id = 306123
    id2cdd = map_id2cdd_clusters()
    kplet = p.get_report_kplet(kplet_id, id2cdd, load_locations=True)

    target_profiles = set(t.bacteria_target_profiles())

    dump_file = os.path.join(work_path, 'kplet.p.bz2')
    t.dump_compressed_pickle(dump_file, kplet)
    kplet = t.load_compressed_pickle(dump_file)

    kplet_codes = kplet.codes.difference(target_profiles)
    org2src, src2blocks = sig.search_kplet_in_genomes(kplet_codes, target_profiles, max_dist=4)

    # dump_file = os.path.join(work_path, 'org2src_global.p.bz2')
    # t.dump_compressed_pickle(dump_file, org2src)
    # dump_file = os.path.join(work_path, 'src2blocks_global.p.bz2')
    # t.dump_compressed_pickle(dump_file, src2blocks)
    #
    # dump_file = os.path.join(work_path, 'org2src_global.p.bz2')
    # org2src = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(work_path, 'src2blocks_global.p.bz2')
    # src2blocks = t.load_compressed_pickle(dump_file)

    xls_file = os.path.join(work_path, 'kplet_%d_tight.xlw' % kplet_id)
import pickle


if __name__ == '__main__':

    print 'Pre-Loading dictionaries'
    target_profiles = t.target_profiles()
    profile2def = t.map_profile2def()
    gid2arcog_cdd = t.map_gid2arcog_cdd()
    neighborhood_files_path = neighborhoods_path()
    print "\n"

    limit_to = 1000000
    data_path = os.path.join(gv.project_data_path, 'Archea/pickle/')
    fname = os.path.join(data_path, str(limit_to), 'pentaplets_merged_across.p.bz2')
    pentaplets = t.load_compressed_pickle(fname)

    report_dir = 'all'
    report_files_dir = os.path.join(gv.project_data_path, 'Archea/reports/merged_across_orders/', report_dir)

    j = 0
    for kplet_sublist in pentaplets:
        cur_reports_folder = os.path.join(report_files_dir, str(5))
        src2org, file_summaries, community, community_count, community_count_with_flanks = \
            merging.merge_into_file_summaries(kplet_sublist,
                                              neighborhood_files_path,
                                              file2src_src2org_map,
                                              'archaea')
fpath = '/panfs/pan1/patternquest/Projects/NewSystems/data/Bacteria/CDD/cdd_integrase.txt'
integrases = [l.split()[1] for l in open(fpath).readlines()]
fpath = '/panfs/pan1/patternquest/Projects/NewSystems/data/Bacteria/CDD/cdd_recombinase.txt'
recombinases = [l.split()[1] for l in open(fpath).readlines()]
fpath = '/panfs/pan1/patternquest/Projects/NewSystems/data/Bacteria/CDD/cdd_resolvase.txt'
resolvases = [l.split()[1] for l in open(fpath).readlines()]
fpath = '/panfs/pan1/patternquest/Projects/NewSystems/data/Bacteria/CDD/cdd_transpos.txt'
transposases = [l.split()[1] for l in open(fpath).readlines()]

pty_files_path = pan_data_path + 'genes_and_flanks/win_10/raw_nbr_files/'

print "Loading loci"
pickle_file = os.path.join(pan_data_path, 'pickle/10000/loci.p.bz2')
bacteria_loci = t.load_compressed_pickle(pickle_file)

# all_bait_profiles = integrases + recombinases + resolvases + transposases
# bacteria_loci = []
#
# cnt = 0
# for f in os.listdir(pty_files_path):
#     bacteria_loci.append(dt.get_pty_file(pty_files_path + f))
#     if cnt % 1000 == 0:
#         print cnt
#     cnt += 1
#
# print "Dumping loci"
# t.dump_compressed_pickle(pickle_file, bacteria_loci)
# print "Finished dumping"
print "\n" data_path = os.path.join(gv.project_data_path, 'Archea/pickle/1000000/') # files = ['pentaplets_basic_merged.p.bz2', 'quadruplets_raw.p.bz2', 'triplets_raw.p.bz2', 'duplets_raw.p.bz2'] files = ['duplets_basic_merged.p.bz2', 'triplets_basic_merged.p.bz2'] reports_dir_base = os.path.join( gv.project_data_path, 'Archea/reports/merged_within_orders/raw_ranks/') if not os.path.exists(reports_dir_base): os.mkdir(reports_dir_base) # for k, kplet_file_name in zip([5, 4, 3, 2], files): for k, kplet_file_name in zip([2, 3], files): print "Starting for", k kplet_lists = t.load_compressed_pickle( os.path.join(data_path, kplet_file_name)) reports_dir = os.path.join(reports_dir_base, str(k)) if not os.path.exists(reports_dir): os.mkdir(reports_dir) for ind, kplet_list in enumerate(kplet_lists): src2org, file_summaries, community, community_count, community_count_with_flanks, weight = \ merging.merge_into_file_summaries(kplet_list.kplets, neighborhood_files_path, file2src_src2org_map, 'archaea') # codes = list(kplet.codes) # suffix = ['s' if code in target_profiles else 'n' for code in codes] #
if __name__ == '__main__':

    files_path = os.path.join(gv.project_data_path, 'CRISPR/datasets/crispr/wgs/')
    pickle_path = os.path.join(gv.project_data_path, 'CRISPR/pickle/crispr/')
    report_path = os.path.join(gv.project_data_path, 'CRISPR/reports/crispr/dendrograms/')
    if not os.path.exists(report_path):
        os.mkdir(report_path)

    # generate_data()

    score_file = os.path.join(pickle_path, 'jw_scores.npz')
    distance_file = os.path.join(pickle_path, 'jw_distances.npz')
    linkage_file = os.path.join(pickle_path, 'upgma_linkage.npz')
    locus_file = os.path.join(pickle_path, 'loci.p.bz2')

    loci = t.load_compressed_pickle(locus_file)
    # NOTE: M is only defined when the score matrix is loaded; the line below
    # is left commented, so build_linkage is expected to use its cached file.
    # M = np.load(score_file).items()[0][1]

    print("Generating/Loading linkage file")
    Z = build_linkage(M, distance_file, linkage_file)
    Z = np.load(linkage_file).items()[0][1]

    print("Starting to form the clusters")
    root = to_tree(Z)
    root = clone_graph(root)
    nodes = get_nodes(root)
    id2node = {node.id: node for node in nodes}
    leaf_ids = leaves_list(Z)
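    # Alternative sketch: flat clusters can also be cut directly from the
    # UPGMA linkage with scipy's fcluster, without walking the tree manually.
    # Assumes Z is the linkage loaded above; the 0.5 threshold mirrors the
    # manual threshold used in the related snippets and is illustrative.
    from scipy.cluster.hierarchy import fcluster

    labels = fcluster(Z, 0.5, criterion='distance')  # one label per locus
    print "No of flat clusters at threshold 0.5:", len(set(labels))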
"pfam13592": 4.0, "pfam13700": 4.0, "pfam13817": 4.0, "pfam14261": 5.0, "pfam14294": 5.0 } profile2def = t.map_cdd_profile2def() # gid2arcog_cdd = t.map_gid2arcog_cdd() neighborhood_files_path = merged_neighborhoods_path() org2weight = t.map_genome2weight() pan_data_path = '/panfs/pan1/patternquest/Projects/NewSystems/data/Bacteria/' pickle_file = os.path.join(pan_data_path, 'pickle/10000/profile2merged_files.p.bz2') profile2files = t.load_compressed_pickle(pickle_file) baiticity_file = os.path.join(gv.project_data_path, 'baiticity/bacteria/baiticity.tab') profile2baiticity = {l.split()[0]: l.split()[4] for l in open(baiticity_file).readlines()[1:] if l.strip()} i = 1 for highlight_profile in additional_profiles: _profile_containing_files = profile2files[highlight_profile] file_summaries = merging.get_file_summaries(_profile_containing_files, neighborhood_files_path, org2weight) print i, highlight_profile, len(file_summaries) report_file = os.path.join(gv.project_data_path,'baiticity/bacteria/reports/%s.xlsx' % highlight_profile)
def merging_pipeline_for_order(order, data_path, load_from_db=False):
    limit_to = 1000000000
    print "starting for ", order

    if load_from_db:
        print "Loading kplets from DB"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
            #kplets = t.load_compressed_pickle(os.path.join(data_path, kplet_file))
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
            #kplets = t.load_compressed_pickle(os.path.join(data_path, kplet_file))
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
            kplets = q.get_report_kplets(profile_id2code, limit_to=limit_to)
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'
            kplets = p.get_report_kplets(profile_id2code, limit_to=limit_to)
    else:
        print "Loading kplets from pickle file"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'

        kplet_file_full = os.path.join(data_path, kplet_file)
        print "Loading :", kplet_file_full
        kplets = t.load_compressed_pickle(kplet_file_full)

    print "No of kplets:", len(kplets)
    #print "Loading file2genes"

    # In-process merging, kept for reference; superseded by the external
    # kpletmerger call below.
    # tic = time.time()
    # print "Basic merging"
    # merged_lists = merging.basic_merge_within_orders(kplets)
    # print "Basic merging done. Merged lists:", len(merged_lists)
    # # fname = os.path.join(data_path, "basic_merged_" + kplet_file)
    # # print "Dumping basic merging: ", fname
    # # t.dump_compressed_pickle(fname, merged_lists)
    # print "Iterative merging"
    # merged_lists = merging.merge_kplets_within_orders_iterative(merged_lists)
    # print "Iterative merging done. Merged lists:", len(merged_lists)
    # # fname = os.path.join(data_path, "iterative_merged_" + kplet_file)
    # # print "Dumping Iterative merging: ", fname
    # # t.dump_compressed_pickle(fname, merged_lists)
    # print "Completed in:", time.time() - tic, "(s)"

    csv_kplet_file = os.path.join(data_path, kplet_file.split('.')[0] + ".csv")
    csv_merged_lists_file = os.path.join(data_path,
                                         "iterative_merged_" + kplet_file.split('.')[0] + ".csv")

    print "Writing kplets to csv file:"
    print csv_kplet_file
    t.write_kplets_to_csv(kplets, csv_kplet_file)

    tic = time.time()
    print "Starting kpletmerger with params:"
    print "kpletmerger", csv_kplet_file, csv_merged_lists_file
    print "\n\n"
    sp.call(["kpletmerger", csv_kplet_file, csv_merged_lists_file])
    print "\n\n"
    print "Completed in:", time.time() - tic, "(s)"
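# sp.call() above discards the exit status of the external kpletmerger binary,
# so a failed merge would go unnoticed. A sketch of a stricter wrapper follows;
# it assumes kpletmerger's CLI is exactly "kpletmerger <in.csv> <out.csv>",
# as invoked above.
import subprocess as sp


def run_kpletmerger(csv_in, csv_out):
    ret = sp.call(["kpletmerger", csv_in, csv_out])
    if ret != 0:
        raise RuntimeError("kpletmerger failed with exit code %d" % ret)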
baiticity_file = os.path.join(gv.project_data_path, 'baiticity/archaea/baiticity.tab')
# Guard against blank trailing lines, as the bacteria variant of this script does.
profile2baiticity = {l.split()[0]: l.split()[4]
                     for l in open(baiticity_file).readlines()[1:]
                     if l.strip()}

# gid2arcog_cdd = t.map_gid2arcog_cdd()
neighborhood_files_path = merged_neighborhoods_path()
org2weight = t.map_archaea_genome2weight()

pan_data_path = '/panfs/pan1/patternquest/Projects/NewSystems/data/Archea/'
pickle_file = os.path.join(pan_data_path, 'pickle/10000/profile2merged_files.p.bz2')
profile2files = t.load_compressed_pickle(pickle_file)

i = 1
for highlight_profile in additional_profiles:
    _profile_containing_files = profile2files[highlight_profile]
    file_summaries = merging.get_file_summaries(_profile_containing_files,
                                                neighborhood_files_path,
                                                org2weight)
    print i, highlight_profile, len(file_summaries)

    if len(file_summaries) == 1:
        clusters = []
import os
import sys

if sys.platform == 'darwin':
    sys.path.append('/Users/hudaiber/Projects/SystemFiles/')
elif sys.platform == 'linux2':
    sys.path.append('/home/hudaiber/Projects/SystemFiles/')

import global_variables as gv
sys.path.append(gv.project_code_path)
import lib.utils.tools as t

# print "Bacteria"
# files = ['duplets_merged_within.p.bz2', 'triplets_merged_within.p.bz2',
#          'quadruplets_merged_within.p.bz2', 'pentaplets_merged_within.p.bz2']
# for f in files:
#     fpath = os.path.join(gv.project_data_path, 'Bacteria/pickle/100000/', f)
#     # print fpath
#     kplets = t.load_compressed_pickle(fpath)
#     print f, len(kplets)

print "CRISPR"
files = ['duplets_seed.p.bz2', 'triplets_seed.p.bz2',
         'quadruplets_seed.p.bz2', 'pentaplets_seed.p.bz2']

for f in files:
    fpath = os.path.join(gv.project_data_path, 'CRISPR/pickle/cas1_2', f)
    kplets = t.load_compressed_pickle(fpath)
    print f, len(kplets)
neighborhood_files_path = neighborhoods_path()

print "\n"
data_path = os.path.join(gv.project_data_path, 'Archea/pickle/1000000/')
# files = ['pentaplets_basic_merged.p.bz2', 'quadruplets_raw.p.bz2',
#          'triplets_raw.p.bz2', 'duplets_raw.p.bz2']
files = ['duplets_basic_merged.p.bz2', 'triplets_basic_merged.p.bz2']

reports_dir_base = os.path.join(gv.project_data_path,
                                'Archea/reports/merged_within_orders/raw_ranks/')
if not os.path.exists(reports_dir_base):
    os.mkdir(reports_dir_base)

# for k, kplet_file_name in zip([5, 4, 3, 2], files):
for k, kplet_file_name in zip([2, 3], files):
    print "Starting for", k
    kplet_lists = t.load_compressed_pickle(os.path.join(data_path, kplet_file_name))

    reports_dir = os.path.join(reports_dir_base, str(k))
    if not os.path.exists(reports_dir):
        os.mkdir(reports_dir)

    for ind, kplet_list in enumerate(kplet_lists):
        src2org, file_summaries, community, community_count, community_count_with_flanks, weight = \
            merging.merge_into_file_summaries(kplet_list.kplets,
                                              neighborhood_files_path,
                                              file2src_src2org_map,
                                              'archaea')
        # codes = list(kplet.codes)
        # suffix = ['s' if code in target_profiles else 'n' for code in codes]
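# The "if not os.path.exists(...): os.mkdir(...)" pattern recurs throughout
# these scripts and is racy when runs overlap. A small helper sketch using
# only the stdlib (the name ensure_dir is ours, not from the project):
import errno
import os


def ensure_dir(path):
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:    # already existing is fine; re-raise the rest
            raise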
def generate_plots_from_pickle(limit_to, report_dir, target_profiles, profile2def,
                               gid2arcog_cdd, neighborhood_files_path):
    data_path = os.path.join(gv.project_data_path, 'Archea/pickle/')

    fname = os.path.join(data_path, str(limit_to), 'pentaplets_merged_within.p.bz2')
    pentaplets = t.load_compressed_pickle(fname)
    fname = os.path.join(data_path, str(limit_to), 'quadruplets_merged_within.p.bz2')
    quadruplets = t.load_compressed_pickle(fname)
    fname = os.path.join(data_path, str(limit_to), 'triplets_merged_within.p.bz2')
    triplets = t.load_compressed_pickle(fname)
    fname = os.path.join(data_path, str(limit_to), 'duplets_merged_within.p.bz2')
    duplets = t.load_compressed_pickle(fname)
    # pentaplets, quadruplets, triplets = None, None, None

    print 'Generating reports for within orders merged lists'

    flank_report_fnames = ['pentaplets_flank_counts.xls', 'quadruplets_flank_counts.xls',
                           'triplets_flank_counts.xls', 'duplets_flank_counts.xls']
    kplets = [pentaplets, quadruplets, triplets, duplets]
    titles = ['Pentaplets', 'Quadruplets', 'Triplets', 'Duplets']

    report_files_dir = os.path.join(gv.project_data_path,
                                    'Archea/reports/merged_within_orders/', report_dir)
    # Create the report directory before the flank-count loop writes into it;
    # previously it was only created after that loop had already run.
    if not os.path.exists(report_files_dir):
        os.mkdir(report_files_dir)

    for i in range(len(flank_report_fnames)):
        _fname = flank_report_fnames[i]
        _kplet_list = kplets[i]
        _title = titles[i]

        flank_counts, cog2gids, gid2weight = dist.get_flank_distributions(
            _kplet_list, neighborhood_files_path, target_profiles)

        # universal_flank_counts = t.merge_dict_list(flank_counts)
        # universal_cog2gids = t.merge_dict_set_list(cog2gids, gid2weight)
        universal_flank_counts = t.merge_dict_set_list(cog2gids, gid2weight)

        params = dict()
        params['xls_file_name'] = os.path.join(report_files_dir, _fname)
        params['profile2def'] = profile2def
        params['flank_counts'] = universal_flank_counts
        params['title'] = _title
        params['target_profiles'] = target_profiles

        r.write_flanking_count_xls(params)

    for i, kplet_pool in zip([2, 3, 4, 5], [duplets, triplets, quadruplets, pentaplets]):
        j = 0
        # profile2counts_pool = dist.get_flank_distributions(kplet_pool, neighborhood_files_path, target_profiles)

        file_summaries_pool = []
        for kplet_list in kplet_pool:
            summary_terms = merging.merge_into_file_summaries(kplet_list,
                                                              neighborhood_files_path,
                                                              file2src_src2org_map,
                                                              'archaea')
            file_summaries_pool.append(summary_terms)

        # Rank merged neighborhoods by weight (field 5 of the summary tuple), heaviest first.
        for summary_terms in sorted(file_summaries_pool, key=itemgetter(5), reverse=True):
            src2org, file_summaries, community, community_count, community_count_with_flanks, weight = summary_terms
            if not src2org:
                continue

            cur_reports_folder = os.path.join(report_files_dir, str(i))
            if not os.path.exists(cur_reports_folder):
                os.mkdir(cur_reports_folder)

            xls_file_name = os.path.join(cur_reports_folder, "%d_%d.xls" % (j + 1, i))

            class2counts, class2profiles = merging.arcog_profile_count_into_class_count(community_count)
            class2counts_flank, class2profiles_flank = merging.arcog_profile_count_into_class_count(
                community_count_with_flanks)

            j += 1
            params = dict()
            params['xls_file_name'] = xls_file_name
            params['src2org'] = src2org
            params['file_summaries'] = file_summaries
            params['community'] = community
            params['target_profiles'] = target_profiles
            params['profile2def'] = profile2def
            params['gid2arcog_cdd'] = gid2arcog_cdd
            params['class2counts'] = class2counts
            params['class2profiles'] = class2profiles
            params['class2counts_flank'] = class2counts_flank
            # params['profile2counts'] = profile2counts
            r.write_to_xls(params)
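# The ranking step above sorts the 6-tuples returned by
# merge_into_file_summaries() on their last field (weight) with
# operator.itemgetter(5). A self-contained illustration of the same pattern,
# using made-up tuples in place of real summaries:
from operator import itemgetter

summaries = [('srcA', None, None, None, None, 0.7),
             ('srcB', None, None, None, None, 2.1),
             ('srcC', None, None, None, None, 1.3)]
for s in sorted(summaries, key=itemgetter(5), reverse=True):
    print s[0], s[5]    # prints srcB 2.1, then srcC 1.3, then srcA 0.7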