def merging_pipeline_for_order(order, load_from_db=False):
    limit_to = 1000000000
    print "Starting for order:", order

    if load_from_db:
        print "Loading kplets from DB"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
            # kplets = t.load_compressed_pickle(os.path.join(data_path, kplet_file))
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
            # kplets = t.load_compressed_pickle(os.path.join(data_path, kplet_file))
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
            kplets = q.get_report_kplets(profile_id2code, limit_to=limit_to)
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'
            kplets = p.get_report_kplets(profile_id2code, limit_to=limit_to)
    else:
        print "Loading kplets from pickle file"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'

        kplet_file_full = os.path.join(data_path, kplet_file)
        print "Loading:", kplet_file_full
        kplets = t.load_compressed_pickle(kplet_file_full)

    print "No of kplets:", len(kplets)

    # print "Loading file2genes"
    # _file2genes = {}
    # for _f in os.listdir(neighborhood_files_path):
    #     _file2genes[_f] = dt.get_wgs_file(os.path.join(neighborhood_files_path, _f))
    # print 'Filtering'
    # kplets = filter_seed(kplets, _file2genes)
    # print "No of kplets:", len(kplets)
    # fname = os.path.join(data_path, kplet_file.split('.')[0] + '_seed.p.bz2')
    # print 'Dumping', fname
    # t.dump_compressed_pickle(fname, kplets)

    print "Basic merging"
    merged_lists = merging.basic_merge_within_orders(kplets)
    fname = os.path.join(data_path, "basic_merged_" + kplet_file)
    print "Dumping basic merging:", fname
    t.dump_compressed_pickle(fname, merged_lists)

    print "Iterative merging"
    merged_lists = merging.merge_kplets_within_orders_iterative(merged_lists)
    fname = os.path.join(data_path, "iterative_merged_" + kplet_file)
    print "Dumping iterative merging:", fname
    t.dump_compressed_pickle(fname, merged_lists)
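# For reference: a minimal sketch of what the t.dump_compressed_pickle /
# t.load_compressed_pickle helpers used throughout these scripts are assumed
# to look like (bz2-wrapped cPickle, consistent with the bz2.BZ2File +
# pickle.dump calls elsewhere in this file). The real implementations live in
# the project's tools module; this is only an illustration.

import bz2
import cPickle


def _dump_compressed_pickle_sketch(fname, data):
    """Pickle `data` and write it bz2-compressed to `fname`."""
    f = bz2.BZ2File(fname, 'w')
    cPickle.dump(data, f, protocol=cPickle.HIGHEST_PROTOCOL)
    f.close()


def _load_compressed_pickle_sketch(fname):
    """Read back an object written by the dump helper above."""
    f = bz2.BZ2File(fname, 'r')
    data = cPickle.load(f)
    f.close()
    return data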
def get_profiles_counts(data_path):
    # file_names = ['pentaplets', 'quadruplets', 'triplets', 'duplets']
    file_names = ['duplets', 'triplets', 'quadruplets', 'pentaplets']
    print 'Reading merged kplet files'
    for file_name in file_names:
        print 'Loading the file:', file_name
        dump_file = bz2.BZ2File(os.path.join(data_path, '%s_merged_across.p.bz2' % file_name))
        kplets_pool = pickle.load(dump_file)

        print 'Counting community'
        community_count_pool = []
        community_count_pool_with_flanks = []
        for kplets in kplets_pool:
            _src2org, _, _, community_count, community_count_with_flanks = \
                merging.merge_into_file_summaries(kplets, neighborhood_files_path, file2src_src2org_map)
            if not _src2org:
                continue
            community_count_pool.append(community_count)
            community_count_pool_with_flanks.append(community_count_with_flanks)

        dump_file_name = os.path.join(data_path, '%s_community_count.p.bz2' % file_name)
        print 'Dumping into', dump_file_name
        t.dump_compressed_pickle(dump_file_name, community_count_pool)

        dump_file_name = os.path.join(data_path, '%s_community_count_with_flanks.p.bz2' % file_name)
        print 'Dumping into', dump_file_name
        t.dump_compressed_pickle(dump_file_name, community_count_pool_with_flanks)
        print
        print
def generate_reports(merged_lists, reports_dir, neighborhood_files_path):
    if not os.path.exists(reports_dir):
        os.mkdir(reports_dir)

    summary_file = os.path.join(reports_dir, 'summary.xls')
    workbook = x.Workbook(summary_file)
    worksheet = workbook.add_worksheet()
    header_format = workbook.add_format()
    header_format.set_font_size(12)
    header_format.set_bold()
    header_format.set_align('center')
    worksheet.set_column(3, 3, 50)
    worksheet.write_row(0, 0, ["File name", "Weight", "Loci", "CRISPR/Cas systems"], header_format)

    file_summary_list = []
    filter_weak_hits = False
    for kplet_list in merged_lists:
        ret = merging.kplet_list_to_file_summaries(kplet_list, neighborhood_files_path, filter_weak_hits)
        if not ret or not ret.file_summaries:
            continue
        file_summary_list.append(ret)

    file_summaries_list = sorted(file_summary_list, key=lambda x: x.weight, reverse=True)

    ind = 0
    for file_summaries_wrapper in file_summaries_list:
        ind += 1
        t.dump_compressed_pickle('file_summaries_wrapper.p', file_summaries_wrapper)
        xls_file_name = os.path.join(reports_dir, '%d.xls' % ind)

        args = GenericReportingInput()
        args.xls_file_name = xls_file_name
        args.file_summaries = file_summaries_wrapper.file_summaries
        args.organisms = file_summaries_wrapper.organisms
        args.weight = file_summaries_wrapper.weight
        args.profile_code2def = profile_code2def
        args.local_af_kplet2count = file_summaries_wrapper.kplet2count_af
        args.local_bf_kplet2count = file_summaries_wrapper.kplet2count_bf
        args.local_profile2count_bf = file_summaries_wrapper.profile2count_bf
        args.local_profile2count_af = file_summaries_wrapper.profile2count_af
        args.cas_type2count = file_summaries_wrapper.cas_type2count

        r.write_to_xls_generic_kplets(args)

        cas_type_summary = ""
        for (cas_type, count) in sorted(file_summaries_wrapper.cas_type2count.items(),
                                        key=itemgetter(1), reverse=True):
            cas_type_summary += "%s : %d ; " % (cas_type, count)

        worksheet.write_row(ind + 1, 0, ['%d.xls' % ind,
                                         len(file_summaries_wrapper.file_summaries),
                                         cas_type_summary])

    # The workbook must be closed, otherwise summary.xls is never written out.
    workbook.close()
def generate_data():
    print "Loading loci"
    loci = [Locus(os.path.join(files_path, f)) for f in os.listdir(files_path)]
    loci = [locus for locus in loci if len(locus.genes) > 2]

    fname = os.path.join(pickle_path, 'loci.p.bz2')
    t.dump_compressed_pickle(fname, loci)
    # loci = t.load_compressed_pickle(fname)

    out_file = os.path.join(pickle_path, 'jw_scores.npz')
    jackard_weighted_scores(loci, out_file)
def generate_pickles(save_path, limit_to):
    if not os.path.exists(save_path):
        os.mkdir(save_path)

    duplets = d.get_report_kplets(limit_to=limit_to, load_locations=True)

    print 'Dumping to files'
    dump_file = os.path.join(save_path, 'duplets_raw.p.bz2')
    t.dump_compressed_pickle(dump_file, duplets)
    print 'Done for limit_to:', limit_to
    print
    print
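# Hypothetical driver for the function above: the 'Done for limit_to:' message
# suggests generate_pickles is invoked repeatedly with different limits. The
# limit values and save-path layout here are illustrative assumptions only.

if __name__ == '__main__':
    for limit_to in [10000, 100000, 1000000]:
        save_path = os.path.join(gv.project_data_path, 'pickles_%d' % limit_to)
        generate_pickles(save_path, limit_to)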
def generate_pickles(save_path, limit_to):
    if not os.path.exists(save_path):
        os.mkdir(save_path)

    print "Loading from DB"
    print "pentaplets"
    pentaplets = p.get_report_kplets(profile_id2code, limit_to=limit_to)
    print "quadruplets"
    quadruplets = q.get_report_kplets(profile_id2code, limit_to=limit_to)
    print "triplets"
    triplets = tr.get_report_kplets(profile_id2code, limit_to=limit_to)
    print "duplets"
    duplets = d.get_report_kplets(profile_id2code, limit_to=limit_to)

    print "Dumping to files"
    dump_file = os.path.join(save_path, 'duplets.p.bz2')
    print dump_file
    t.dump_compressed_pickle(dump_file, duplets)
    dump_file = os.path.join(save_path, 'triplets.p.bz2')
    print dump_file
    t.dump_compressed_pickle(dump_file, triplets)
    dump_file = os.path.join(save_path, 'quadruplets.p.bz2')
    print dump_file
    t.dump_compressed_pickle(dump_file, quadruplets)
    dump_file = os.path.join(save_path, 'pentaplets.p.bz2')
    print dump_file
    t.dump_compressed_pickle(dump_file, pentaplets)
def generate_pickle_order(prefix, order, save_path, limit_to):
    print "Loading from DB"
    if order == 2:
        print 'duplets'
        data_file = 'duplets.p.bz2'
        kplets = d.get_report_kplets(prefix, profile_id2code, limit_to=limit_to)
    elif order == 3:
        print 'triplets'
        data_file = 'triplets.p.bz2'
        kplets = tr.get_report_kplets(prefix, profile_id2code, limit_to=limit_to)
    elif order == 4:
        print 'quadruplets'
        data_file = 'quadruplets.p.bz2'
        kplets = q.get_report_kplets(prefix, profile_id2code, limit_to=limit_to)
    elif order == 5:
        print 'pentaplets'
        data_file = 'pentaplets.p.bz2'
        kplets = p.get_report_kplets(prefix, profile_id2code, limit_to=limit_to)

    # # Workaround block for the too-big pentaplets
    # print 'Loading file2genes'
    # neighborhood_files_path = os.path.join(gv.project_data_path, 'CRISPR/datasets/crispr/wgs')
    # _file2genes = {}
    # for _f in os.listdir(neighborhood_files_path):
    #     _file2genes[_f] = dt.get_wgs_file(os.path.join(neighborhood_files_path, _f))
    #
    # kplets = filter_seed(kplets, _file2genes)
    # dump_file = os.path.join(save_path, data_file)
    # print "Dumping to file", dump_file
    # t.dump_compressed_pickle(dump_file, kplets)  # argument order fixed (file name first)
    # print "Finished"
    # sys.exit()

    dump_file = os.path.join(save_path, data_file)
    print "Dumping to file", dump_file
    t.dump_compressed_pickle(dump_file, kplets)
    print "Finished"
def cas4_extract_dendrogram():
    work_dir = os.path.join(gv.project_data_path, 'cas4')

    print "Loading loci"
    def_file = os.path.join(gv.project_data_path, 'cas4/profiles/defenseProfiles.tab')
    profile2gene = {}
    for l in open(def_file):
        terms = l.split('\t')
        profile = terms[0]
        gene_names = terms[3].split(',')
        if len(gene_names) > 1:
            profile2gene[profile] = gene_names[1]
        else:
            profile2gene[profile] = gene_names[0]

    cdd_profile2gene = t.map_cdd_profile2gene_name()
    cdd_profile2gene.update(profile2gene)

    files_path = os.path.join(work_dir, 'files')
    loci = [Locus(os.path.join(files_path, f), file_format='generic', profile2gene=cdd_profile2gene)
            for f in os.listdir(files_path)]

    tic = time.time()
    print "Generating score matrix"
    M = scores.generate_jackard_score_matrix(loci)
    tac = time.time() - tic
    print "Elapsed time:", float(tac) / 60 / 60, float(tac) / 60, float(tac)

    tic = time.time()
    jw_file = os.path.join(work_dir, 'pickle/jw_scores.p.bz2')
    print "Dumping JW scores to:", jw_file
    t.dump_compressed_pickle(jw_file, M)
    tac = time.time() - tic
    print "Elapsed time:", float(tac) / 60 / 60, float(tac) / 60, float(tac)

    # print "Loading JW scores from:", prok1603_jw_file
    # M = t.load_compressed_pickle(prok1603_jw_file)

    tree_file = os.path.join(work_dir, 'jw_upgma.tre')
    print "Generating tree:", tree_file
    dnd.convert_score_to_newick(M, [os.path.basename(l.file_name) for l in loci], tree_file)
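# A minimal sketch of the pairwise score that scores.generate_jackard_score_matrix
# presumably computes: a Jaccard-style similarity between the profile sets of two
# loci, filled into a symmetric matrix. Attribute names (locus.genes, gene.profiles)
# follow their usage elsewhere in this file; the exact weighting in the real
# implementation may differ.

import numpy as np


def _jaccard_score_matrix_sketch(loci):
    profile_sets = [set(p for g in locus.genes for p in g.profiles) for locus in loci]
    n = len(loci)
    M = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            union = profile_sets[i] | profile_sets[j]
            if not union:
                continue
            score = len(profile_sets[i] & profile_sets[j]) / float(len(union))
            M[i, j] = M[j, i] = score
    return M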
def generate_pickle_order(order, save_path, limit_to):
    print "Loading from DB"
    if order == 2:
        print 'duplets'
        data_file = 'duplets.p.bz2'
        kplets = d.get_report_kplets(profile_id2code, limit_to=limit_to)
    elif order == 3:
        print 'triplets'
        data_file = 'triplets.p.bz2'
        kplets = tr.get_report_kplets(profile_id2code, limit_to=limit_to)
    elif order == 4:
        print 'quadruplets'
        data_file = 'quadruplets.p.bz2'
        kplets = q.get_report_kplets(profile_id2code, limit_to=limit_to)
    elif order == 5:
        print 'pentaplets'
        data_file = 'pentaplets.p.bz2'
        kplets = p.get_report_kplets(profile_id2code, limit_to=limit_to)

    # # Workaround block for the too-big pentaplets
    # print 'Loading file2genes'
    # neighborhood_files_path = os.path.join(gv.project_data_path, 'CRISPR/datasets/crispr/wgs')
    # _file2genes = {}
    # for _f in os.listdir(neighborhood_files_path):
    #     _file2genes[_f] = dt.get_wgs_file(os.path.join(neighborhood_files_path, _f))
    #
    # kplets = filter_seed(kplets, _file2genes)
    # dump_file = os.path.join(save_path, data_file)
    # print "Dumping to file", dump_file
    # t.dump_compressed_pickle(dump_file, kplets)  # argument order fixed (file name first)
    # print "Finished"
    # sys.exit()

    dump_file = os.path.join(save_path, data_file)
    print "Dumping to file", dump_file
    t.dump_compressed_pickle(dump_file, kplets)
    print "Finished"
def graph_from_prok1603():
    genome2weight = parser.map_org2weight()
    pty_path = "/panfs/pan1.be-md.ncbi.nlm.nih.gov/patternquest/data/Prok1603/pty"
    work_dir = os.path.join(data_path, 'prok1603/graph/')
    crisprs = parser.get_crispr_annotations()

    pair2weight = defaultdict(float)
    pair2count = defaultdict(int)
    profile2weight = defaultdict(float)

    cnt = 1
    weighted_sum = 0
    total_weight = 0
    for dir in os.listdir(pty_path):
        if dir not in genome2weight:
            continue
        print cnt, dir
        _weight = genome2weight[dir]
        source_files = [f for f in os.listdir(os.path.join(pty_path, dir)) if f.endswith(".pty2")]
        for source_file in source_files:
            source = os.path.splitext(source_file)[0]
            genes = t.parse_pty_file(os.path.join(pty_path, dir, source_file))
            if not genes:
                continue
            genes += crisprs[source]
            genes = sorted(genes)
            add_to_graph(pair2weight, pair2count, profile2weight, genes, _weight)

            profiles_num = 0
            for gene in genes:
                _p = gene.profiles
                if _p:
                    profiles_num += len(_p)
                else:
                    profiles_num += 1
            if profiles_num > 0:
                weighted_sum += 2 * _weight / profiles_num
                total_weight += _weight
        cnt += 1
        # if cnt == 10:
        #     break

    print "Weighted average of 2/N", weighted_sum / total_weight
    # Result of the above is: 0.0451314274706

    G = nx.Graph()
    for k, v in pair2weight.items():
        p1, p2 = k.split("-")
        # One add_edge call attaches both attributes to the same edge (the
        # original set weight and count in two consecutive calls, which in
        # networkx updates the same edge either way).
        G.add_edge(p1, p2, weight=v, count=pair2count[k])

    graph_file = os.path.join(work_dir, "adj_graph.p")
    print "Writing to file:", graph_file
    nx.write_gpickle(G, graph_file)

    graph_file = os.path.join(work_dir, "adj_graph.p.bz2")
    print "Writing to file:", graph_file
    t.dump_compressed_pickle(graph_file, G)
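# Usage sketch: reading the dumped adjacency graph back and inspecting an edge.
# Since weight and count are attached to the same edge, both attributes come
# back together. The profile IDs passed in are placeholders.

import networkx as nx


def _inspect_edge_sketch(graph_file, p1, p2):
    G = nx.read_gpickle(graph_file)
    data = G.get_edge_data(p1, p2)
    if data:
        print p1, p2, data.get("weight"), data.get("count")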
def merging_pipeline_for_order(order, data_path, load_from_db=False):
    limit_to = 1000000000
    print "Starting for order:", order

    if load_from_db:
        print "Loading kplets from DB"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
            kplets = q.get_report_kplets(profile_id2code, limit_to=limit_to)
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'
            kplets = p.get_report_kplets(profile_id2code, limit_to=limit_to)
    else:
        print "Loading kplets from pickle file"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'

        kplet_file_full = os.path.join(data_path, kplet_file)
        print "Loading:", kplet_file_full
        kplets = t.load_compressed_pickle(kplet_file_full)

    print "No of kplets:", len(kplets)

    # The same basic + iterative merging pass, repeated for each loci_threshold
    # (the original spelled out three identical blocks for 0.7, 0.8 and 0.9).
    for loci_threshold in (0.7, 0.8, 0.9):
        print "Starting to merge with loci_threshold:", loci_threshold
        tic = time.time()

        print "Basic merging"
        merged_lists = merging.basic_merge_within_orders(kplets, loci_threshold)
        print "Basic merging done. Merged lists:", len(merged_lists)
        fname = os.path.join(data_path, "basic_merged_" + "%f_" % loci_threshold + kplet_file)
        print "Dumping basic merging:", fname
        t.dump_compressed_pickle(fname, merged_lists)

        print "Iterative merging"
        merged_lists = merging.merge_kplets_within_orders_iterative(merged_lists, loci_threshold)
        print "Iterative merging done. Merged lists:", len(merged_lists)
        fname = os.path.join(data_path, "iterative_merged_" + "%f_" % loci_threshold + kplet_file)
        print "Dumping iterative merging:", fname
        t.dump_compressed_pickle(fname, merged_lists)

        print "Completed in:", time.time() - tic, "(s)"
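# Hypothetical usage of the pipeline above: run the within-order merging for
# every kplet order from pickled inputs. The data_path value is illustrative.

if __name__ == '__main__':
    data_path = os.path.join(gv.project_data_path, 'CRISPR/pickle')
    for order in [2, 3, 4, 5]:
        merging_pipeline_for_order(order, data_path, load_from_db=False)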
def generate_cas4_gi_summary_file(singles, cluster_packs, loci, reports_dir, cluster2summary_file_name):
    cluster2summary = {int(l.split('\t')[0]): l.split('\t')[1].strip()
                       for l in open(cluster2summary_file_name)}
    summary_file = open(os.path.join(reports_dir, 'cas4_gi_summary.tab'), 'w')
    cas_filter = set(["cas3", "cas5", "cas8c", "cas7", "cas4", "cas1", "cas2"])

    gi2crispr_type = {}
    ind = 1
    for outer_i in range(len(cluster_packs)):
        (cluster, type2count, entropy, gene2count) = cluster_packs[outer_i]
        sorted_gene2count = sorted(gene2count.items(), key=lambda x: x[1], reverse=True)
        top_genes = set(k for k, v in sorted_gene2count[:5])
        cl_loci = [loci[_i] for _i in cluster]
        cl_summary = cluster2summary[ind]

        for locus in cl_loci:
            cas4_genes = [g for g in locus.genes if g.is_seed]
            cl_genes = set(gene_name for gene in locus.genes
                           for gene_name in gene.gene_name.split(','))
            # cas_genes = ",".join(cas_filter.intersection(cl_genes))

            cas_genes_summary = []
            buffer = []
            for gene in locus.genes:
                for gene_name in set(gene.gene_name.split(",")):
                    if gene_name in top_genes:
                        buffer.append(gene_name)
                        cas_genes_summary += buffer
                        buffer = []
                        continue
                    buffer.append(gene_name if gene_name else "?")
            cas_genes_summary = "+".join(cas_genes_summary)

            for gene in cas4_genes:
                summary_file.write("%s\t%s\t%s\t%s\t%s\n" % (gene.gid, locus.organism, cl_summary,
                                                             cas_genes_summary, "%d.xlsx" % ind))
                gi2crispr_type[gene.gid] = locus.crispr_type
        ind += 1

    for single_ind in singles:
        locus = loci[single_ind]
        cas4_genes = [g for g in locus.genes if g.is_seed]

        cas_genes_summary = []
        buffer = []
        for gene in locus.genes:
            for gene_name in gene.gene_name.split(","):
                if gene_name in cas_filter:
                    buffer.append(gene_name)
                    cas_genes_summary += buffer
                    buffer = []
                    # continue mirrors the cluster branch above; without it a
                    # cas gene would be appended to the fresh buffer again and
                    # end up duplicated in the summary.
                    continue
                buffer.append(gene_name if gene_name else "?")
        cas_genes_summary = "+".join(cas_genes_summary)

        for gene in cas4_genes:
            summary_file.write("%s\t%s\t%s\t%s\n"
                               % (gene.gid, locus.organism, "singleton", cas_genes_summary))
            gi2crispr_type[gene.gid] = locus.crispr_type

    summary_file.close()

    fname = os.path.join(gv.project_data_path, 'cas4/pickle/cas4_gi2crispr_type.p.bz2')
    t.dump_compressed_pickle(fname, gi2crispr_type)
    return gi2crispr_type
from lib.db import map_id2cdd_clusters
from lib.db.bacteria import pentaplets as p

if __name__ == '__main__':
    work_path = os.path.join(gv.project_data_path, 'Bacteria/cases')
    pty_path = gv.pty_data_path
    kplet_id = 306123

    id2cdd = map_id2cdd_clusters()
    kplet = p.get_report_kplet(kplet_id, id2cdd, load_locations=True)
    target_profiles = set(t.bacteria_target_profiles())

    dump_file = os.path.join(work_path, 'kplet.p.bz2')
    t.dump_compressed_pickle(dump_file, kplet)
    kplet = t.load_compressed_pickle(dump_file)

    kplet_codes = kplet.codes.difference(target_profiles)
    org2src, src2blocks = sig.search_kplet_in_genomes(kplet_codes, target_profiles, max_dist=4)

    # dump_file = os.path.join(work_path, 'org2src_global.p.bz2')
    # t.dump_compressed_pickle(dump_file, org2src)
    # dump_file = os.path.join(work_path, 'src2blocks_global.p.bz2')
    # t.dump_compressed_pickle(dump_file, src2blocks)
    #
    # dump_file = os.path.join(work_path, 'org2src_global.p.bz2')
    # org2src = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(work_path, 'src2blocks_global.p.bz2')
    # src2blocks = t.load_compressed_pickle(dump_file)
def generate_pickles(save_path, limit_to):
    if not os.path.exists(save_path):
        os.mkdir(save_path)

    # print "Loading kplets from DB"
    # pentaplets = p.get_report_kplets(limit_to=limit_to, load_locations=True)
    # quadruplets = q.get_report_kplets(limit_to=limit_to, load_locations=True)
    # triplets = tr.get_report_kplets(limit_to=limit_to, load_locations=True)
    # duplets = d.get_report_kplets(limit_to=limit_to, load_locations=True)

    # print "Dumping raw kplet data to files"
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'duplets_raw.p.bz2'), 'w')
    # pickle.dump(duplets, dump_file)
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'triplets_raw.p.bz2'), 'w')
    # pickle.dump(triplets, dump_file)
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'quadruplets_raw.p.bz2'), 'w')
    # pickle.dump(quadruplets, dump_file)
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'pentaplets_raw.p.bz2'), 'w')
    # pickle.dump(pentaplets, dump_file)

    print "Loading raw kplets from pickles"
    dump_file = os.path.join(save_path, 'duplets_raw.p.bz2')
    duplets = t.load_compressed_pickle(dump_file)
    dump_file = os.path.join(save_path, 'triplets_raw.p.bz2')
    triplets = t.load_compressed_pickle(dump_file)
    dump_file = os.path.join(save_path, 'quadruplets_raw.p.bz2')
    quadruplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'pentaplets_raw.p.bz2')
    # pentaplets = t.load_compressed_pickle(dump_file)

    print "Basic within merging"
    # pentaplets = merging.basic_merge_within_orders(pentaplets)
    quadruplets = merging.basic_merge_within_orders(quadruplets)
    triplets = merging.basic_merge_within_orders(triplets)
    duplets = merging.basic_merge_within_orders(duplets)

    print "Dumping basic merges"
    # dump_file = os.path.join(save_path, 'pentaplets_basic_merged.p.bz2')
    # t.dump_compressed_pickle(dump_file, pentaplets)
    dump_file = os.path.join(save_path, 'quadruplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, quadruplets)
    dump_file = os.path.join(save_path, 'triplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, triplets)
    dump_file = os.path.join(save_path, 'duplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, duplets)

    # print "Loading basic merges"
    # dump_file = os.path.join(save_path, 'pentaplets_basic_merged.p.bz2')
    # pentaplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'quadruplets_basic_merged.p.bz2')
    # quadruplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'triplets_basic_merged.p.bz2')
    # triplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'duplets_basic_merged.p.bz2')
    # duplets = t.load_compressed_pickle(dump_file)

    # NOTE: execution stops here; the stages below are run manually by removing
    # this exit (pentaplets must be loaded above before they can be merged).
    sys.exit()

    print 'Starting iterative within mergings'
    pentaplets = merging.merge_kplets_within_orders_iterative_2(pentaplets)
    quadruplets = merging.merge_kplets_within_orders_iterative_2(quadruplets)
    triplets = merging.merge_kplets_within_orders_iterative_2(triplets)
    duplets = merging.merge_kplets_within_orders_iterative_2(duplets)

    print "Dumping iterative merges"
    dump_file = os.path.join(save_path, 'pentaplets_iterative_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, pentaplets)
    dump_file = os.path.join(save_path, 'quadruplets_iterative_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, quadruplets)
    dump_file = os.path.join(save_path, 'triplets_iterative_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, triplets)
    dump_file = os.path.join(save_path, 'duplets_iterative_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, duplets)
    sys.exit()

    print 'Dumping merged kplet lists to files'
    dump_file = bz2.BZ2File(os.path.join(save_path, 'pentaplets_merged_within.p.bz2'), 'w')
    pickle.dump(pentaplets, dump_file)
    dump_file = bz2.BZ2File(os.path.join(save_path, 'quadruplets_merged_within.p.bz2'), 'w')
    pickle.dump(quadruplets, dump_file)
    dump_file = bz2.BZ2File(os.path.join(save_path, 'triplets_merged_within.p.bz2'), 'w')
    pickle.dump(triplets, dump_file)
    dump_file = bz2.BZ2File(os.path.join(save_path, 'duplets_merged_within.p.bz2'), 'w')
    pickle.dump(duplets, dump_file)

    print 'Starting across mergings'
    triplets, duplets = merging.merge_kplets_across_orders(triplets, duplets)
    quadruplets, triplets = merging.merge_kplets_across_orders(quadruplets, triplets)
    pentaplets, quadruplets = merging.merge_kplets_across_orders(pentaplets, quadruplets)

    print 'Dumping to files'
    dump_file = bz2.BZ2File(os.path.join(save_path, 'pentaplets_merged_across.p.bz2'), 'w')
    pickle.dump(pentaplets, dump_file)
    dump_file = bz2.BZ2File(os.path.join(save_path, 'quadruplets_merged_across.p.bz2'), 'w')
    pickle.dump(quadruplets, dump_file)
    dump_file = bz2.BZ2File(os.path.join(save_path, 'triplets_merged_across.p.bz2'), 'w')
    pickle.dump(triplets, dump_file)
    dump_file = bz2.BZ2File(os.path.join(save_path, 'duplets_merged_across.p.bz2'), 'w')
    pickle.dump(duplets, dump_file)

    print 'Done for limit_to:', limit_to
    print
    print
def prok1603_extract_dendrogram():
    work_dir = os.path.join(gv.project_data_path, 'UvrD/')
    files_path = os.path.join(work_dir, 'prok1603/merged_files/')

    print "Loading loci"
    def_file = os.path.join(gv.project_data_path, 'cas4/profiles/defenseProfiles.tab')
    profile2gene = {}
    for l in open(def_file):
        terms = l.split('\t')
        profile = terms[0]
        gene_names = terms[3].split(',')
        if len(gene_names) > 1:
            profile2gene[profile] = gene_names[1]
        else:
            profile2gene[profile] = gene_names[0]

    cdd_profile2gene = t.map_cdd_profile2gene_name()
    cdd_profile2gene.update(profile2gene)

    loci = [BasicLocus(os.path.join(files_path, f), profile2gene=cdd_profile2gene)
            for f in os.listdir(files_path)]

    prok1603_loci_file = os.path.join(work_dir, 'prok1603/prok1603_loci.p.bz2')
    # loci = t.load_compressed_pickle(prok1603_loci_file)
    print "Loci:", len(loci)
    print "Dumping loci to:", prok1603_loci_file
    t.dump_compressed_pickle(prok1603_loci_file, loci)
    sys.exit()

    tic = time.time()
    print "Generating score matrix"
    M = scores.generate_jackard_score_matrix(loci)
    tac = time.time() - tic
    print "Elapsed time:", float(tac) / 60 / 60, float(tac) / 60, float(tac)

    tic = time.time()
    prok1603_jw_file = os.path.join(work_dir, 'prok1603_jw_scores.p')
    print "Dumping JW scores to:", prok1603_jw_file
    with open(prok1603_jw_file, 'wb') as outf:
        cPickle.dump(M, outf, protocol=cPickle.HIGHEST_PROTOCOL)
    tac = time.time() - tic
    print "Elapsed time:", float(tac) / 60 / 60, float(tac) / 60, float(tac)

    tic = time.time()
    prok1603_jw_file = os.path.join(work_dir, 'prok1603_jw_scores.p.bz2')
    print "Dumping JW scores to:", prok1603_jw_file
    t.dump_compressed_pickle(prok1603_jw_file, M)
    tac = time.time() - tic
    print "Elapsed time:", float(tac) / 60 / 60, float(tac) / 60, float(tac)

    # print "Loading JW scores from:", prok1603_jw_file
    # M = t.load_compressed_pickle(prok1603_jw_file)

    tic = time.time()
    prok1603_jw_file = os.path.join(work_dir, 'prok1603/prok1603_jw_scores.npz')
    # prok1603_jw_file = os.path.join('/Users/hudaiber/Projects/NewSystems/data/UvrD/prok1603/prok1603_jw_scores.npz')
    print "Dumping JW scores to:", prok1603_jw_file
    np.savez_compressed(prok1603_jw_file, M)
    tac = time.time() - tic
    print "Elapsed time:", float(tac) / 60 / 60, float(tac) / 60, float(tac)

    # print "Loading JW scores from:", prok1603_jw_file
    # M = t.load_compressed_pickle(prok1603_jw_file)

    prok1603_tree_file = os.path.join(work_dir, 'prok1603/prok1603_upgma.tre')
    print "Generating tree:", prok1603_tree_file
    dnd.convert_score_to_newick(M, [os.path.basename(l.file_name) for l in loci], prok1603_tree_file)
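# Note on reading the scores back: np.savez_compressed above is called with the
# matrix as a positional argument, so numpy stores it under the default key
# 'arr_0'. A minimal loading sketch:

import numpy as np


def _load_jw_scores_sketch(npz_file):
    return np.load(npz_file)['arr_0']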
def generate_pickles(save_path, limit_to):
    if not os.path.exists(save_path):
        os.mkdir(save_path)

    pentaplets = p.get_report_kplets(profile_id2code, limit_to=limit_to, load_locations=True)
    quadruplets = q.get_report_kplets(profile_id2code, limit_to=limit_to, load_locations=True)
    triplets = tr.get_report_kplets(profile_id2code, limit_to=limit_to, load_locations=True)
    duplets = d.get_report_kplets(profile_id2code, limit_to=limit_to, load_locations=True)

    print "Dumping raw kplet data to files"
    dump_file = os.path.join(save_path, 'duplets_raw.p.bz2')
    t.dump_compressed_pickle(dump_file, duplets)
    dump_file = os.path.join(save_path, 'triplets_raw.p.bz2')
    t.dump_compressed_pickle(dump_file, triplets)
    dump_file = os.path.join(save_path, 'quadruplets_raw.p.bz2')
    t.dump_compressed_pickle(dump_file, quadruplets)
    dump_file = os.path.join(save_path, 'pentaplets_raw.p.bz2')
    t.dump_compressed_pickle(dump_file, pentaplets)

    # print "Loading raw kplets from pickles"
    # dump_file = os.path.join(save_path, 'duplets_raw.p.bz2')
    # duplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'triplets_raw.p.bz2')
    # triplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'quadruplets_raw.p.bz2')
    # quadruplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'pentaplets_raw.p.bz2')
    # pentaplets = t.load_compressed_pickle(dump_file)

    print "Basic within merging"
    pentaplets = merging.basic_merge_within_orders(pentaplets)
    quadruplets = merging.basic_merge_within_orders(quadruplets)
    triplets = merging.basic_merge_within_orders(triplets)
    duplets = merging.basic_merge_within_orders(duplets)

    print "Dumping basic merges"
    dump_file = os.path.join(save_path, 'pentaplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, pentaplets)
    dump_file = os.path.join(save_path, 'quadruplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, quadruplets)
    dump_file = os.path.join(save_path, 'triplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, triplets)
    dump_file = os.path.join(save_path, 'duplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, duplets)