def cas4_dataset():
    # NOTE: work_dir is assumed to be defined at module level.
    print "Loading loci"

    def_file = os.path.join(gv.project_data_path, 'cas4/profiles/defenseProfiles.tab')
    profile2gene = {}
    for l in open(def_file):
        terms = l.split('\t')
        profile = terms[0]
        gene_names = terms[3].split(',')
        if len(gene_names) > 1:
            profile2gene[profile] = gene_names[1]
        else:
            profile2gene[profile] = gene_names[0]

    cdd_profile2gene = t.map_cdd_profile2gene_name()
    cdd_profile2gene.update(profile2gene)

    files_path = os.path.join(work_dir, 'files')
    loci = [Locus(os.path.join(files_path, f), file_format='generic', profile2gene=cdd_profile2gene)
            for f in os.listdir(files_path)]

    # dendrogram_file = os.path.join(work_dir, 'loci_dendrogram.pdf')
    singles, cluster_packs, _ = dnd.classify_loci_hierarchically(loci, threshold=2.6)

    print "Clusters: %d, singles: %d" % (len(cluster_packs), len(singles))

    reports_dir = os.path.join(work_dir, 'reports/cas4')
    feature_profiles = [l.strip() for l in open(os.path.join(work_dir, 'feature_profiles.txt'))]

    r.generate_cluster_reports_cas4(cluster_packs, loci, reports_dir, feature_profiles)

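# The defenseProfiles.tab parsing above recurs in several functions below. A
# minimal shared helper could look like the sketch below (hypothetical name;
# assumes the tab layout seen in the callers: profile id in column 1,
# comma-separated gene names in column 4, optional definition in column 5).
def load_defense_profile_maps(def_file):
    """Return (profile2gene, profile2def) parsed from defenseProfiles.tab."""
    profile2gene, profile2def = {}, {}
    for l in open(def_file):
        terms = l.strip().split('\t')
        profile = terms[0]
        gene_names = terms[3].split(',')
        # Prefer the second gene name when several are listed, as the callers do.
        profile2gene[profile] = gene_names[1] if len(gene_names) > 1 else gene_names[0]
        if len(terms) > 4:
            profile2def[profile] = terms[4]
    return profile2gene, profile2def
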
def prok1603_architecture_frequencies():
    work_dir = os.path.join(gv.project_data_path, 'UvrD/')

    map_file = os.path.join(work_dir, 'prok1603/prok1603_weights.txt')
    locus2weight = {l.split()[0]: float(l.split()[1]) for l in open(map_file)}

    def_file = os.path.join(gv.project_data_path, 'cas4/profiles/defenseProfiles.tab')
    profile2gene = {}
    profile2def = {}
    for l in open(def_file):
        terms = l.strip().split('\t')
        profile = terms[0]
        gene_names = terms[3].split(',')
        if len(gene_names) > 1:
            profile2gene[profile] = gene_names[1]
        else:
            profile2gene[profile] = gene_names[0]
        profile2def[profile] = terms[4]

    cdd_profile2gene = t.map_cdd_profile2gene_name()
    cdd_profile2gene.update(profile2gene)
    cdd_profile2def = t.map_cdd_profile2def()
    cdd_profile2def.update(profile2def)

    prok1603_loci_file = os.path.join(work_dir, 'prok1603/prok1603_loci.p.bz2')
    loci = t.load_compressed_pickle(prok1603_loci_file)

    profile2loci = {}
    for locus in loci:
        for _profile in locus.profiles:
            if _profile in profile2loci:
                profile2loci[_profile].append(locus)
            else:
                profile2loci[_profile] = [locus]

    # Output columns: profile, gene name, locus count, cumulative weight, definition.
    for (profile, _loci) in sorted(profile2loci.items(), key=lambda x: len(x[1]), reverse=True):
        _weight = sum(locus2weight[locus.base_file_name] for locus in _loci)
        print "%s\t%s\t%d\t%f\t%s" % (profile,
                                      cdd_profile2gene.get(profile, ""),
                                      len(_loci),
                                      _weight,
                                      cdd_profile2def.get(profile, ""))

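# The profile -> loci accumulation above can be expressed with the standard
# library's defaultdict; an equivalent sketch:
#
# from collections import defaultdict
# profile2loci = defaultdict(list)
# for locus in loci:
#     for _profile in locus.profiles:
#         profile2loci[_profile].append(locus)
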
def cas4_extract_dendrogram():
    work_dir = os.path.join(gv.project_data_path, 'cas4')

    print "Loading loci"
    def_file = os.path.join(gv.project_data_path, 'cas4/profiles/defenseProfiles.tab')
    profile2gene = {}
    for l in open(def_file):
        terms = l.split('\t')
        profile = terms[0]
        gene_names = terms[3].split(',')
        if len(gene_names) > 1:
            profile2gene[profile] = gene_names[1]
        else:
            profile2gene[profile] = gene_names[0]

    cdd_profile2gene = t.map_cdd_profile2gene_name()
    cdd_profile2gene.update(profile2gene)

    files_path = os.path.join(work_dir, 'files')
    loci = [Locus(os.path.join(files_path, f), file_format='generic', profile2gene=cdd_profile2gene)
            for f in os.listdir(files_path)]

    tic = time.time()
    print "Generating score matrix"
    M = scores.generate_jackard_score_matrix(loci)
    tac = time.time() - tic
    print "Elapsed time:", tac / 3600, tac / 60, tac  # hours, minutes, seconds

    tic = time.time()
    jw_file = os.path.join(work_dir, 'pickle/jw_scores.p.bz2')
    print "Dumping JW scores to:", jw_file
    t.dump_compressed_pickle(jw_file, M)
    tac = time.time() - tic
    print "Elapsed time:", tac / 3600, tac / 60, tac

    # print "Loading JW scores from:", jw_file
    # M = t.load_compressed_pickle(jw_file)

    tree_file = os.path.join(work_dir, 'jw_upgma.tre')
    print "Generating tree:", tree_file
    dnd.convert_score_to_newick(M, [os.path.basename(l.file_name) for l in loci], tree_file)

print "Loading loci" def_file = os.path.join(gv.project_data_path, 'cas4/profiles/defenseProfiles.tab') profile2gene = {} for l in open(def_file): terms = l.split('\t') profile = terms[0] gene_names = terms[3].split(',') if len(gene_names) > 1: profile2gene[profile] = gene_names[1] else: profile2gene[profile] = gene_names[0] cdd_profile2gene = t.map_cdd_profile2gene_name() cdd_profile2gene.update(profile2gene) loci = [Locus(os.path.join(files_path, f), file_format='generic', profile2gene=cdd_profile2gene) for f in os.listdir(files_path)] locus2id = {} cluster2id = {} feature_clusters = get_feature_labels() print "No of feature clusters", len(feature_clusters) vertices, edges = [], [] id = 1 for locus in loci:
print "Loading loci" def_file = os.path.join(gv.project_data_path, 'cas4/profiles/defenseProfiles.tab') profile2gene = {} for l in open(def_file): terms = l.split('\t') profile = terms[0] gene_names = terms[3].split(',') if len(gene_names) > 1: profile2gene[profile] = gene_names[1] else: profile2gene[profile] = gene_names[0] cdd_profile2gene = t.map_cdd_profile2gene_name() cdd_profile2gene.update(profile2gene) file2locus = {} for f in os.listdir(data_dir + 'files'): locus = Locus(os.path.join(data_dir, 'files', f), file_format='generic', profile2gene=cdd_profile2gene) file2locus[f] = locus locus2weight = { l.split()[0]: float(l.split()[1]) for l in open(weights_file)} community2nodes = parse_tree_file(infomap_file, locus2weight) sorted_clusters = sorted(community2nodes.values(), key=lambda x: sum(locus.weight for locus in x if locus.type==2),
def merge_loci(source_path, dest_path):
    profile2gene = t.map_cdd_profile2gene_name()

    print "Loading loci"
    loci = [BasicLocus(os.path.join(source_path, f), profile2gene=profile2gene)
            for f in os.listdir(source_path)]

    print "Merging loci"
    cnt = 1
    while True:
        merged_out = [0] * len(loci)
        new_loci = []
        for i in range(len(loci) - 1):
            if merged_out[i]:
                continue
            for j in range(i + 1, len(loci)):
                if merged_out[j]:  # skip loci already merged into an earlier locus
                    continue
                if loci[i].overlaps(loci[j]):
                    loci[i].merge(loci[j])
                    merged_out[j] = 1
            new_loci.append(loci[i])
        if not merged_out[len(loci) - 1]:
            new_loci.append(loci[-1])

        print "Iteration %d results. Old list: %d, New list: %d" % (cnt, len(loci), len(new_loci))
        cnt += 1

        # Stop once a full pass produces no merges.
        if len(loci) == len(new_loci):
            loci = new_loci
            break
        loci = new_loci

    # with open(work_dir+'merged_loci_name.txt', 'wb') as outf:
    #     [outf.write("%s\n" % _file) for locus in loci for _file in locus.merged_files]

    print "Writing merged loci to files"
    for locus in loci:
        fname = os.path.join(dest_path, locus.merged_base_files.pop())
        while os.path.exists(fname):
            print "Duplicate file name:", fname
            fname = os.path.join(dest_path, locus.merged_base_files.pop())

        header_lines = []
        if locus.merged_base_files:
            header_line = "Merged files: %s" % ",".join(f for f in locus.merged_base_files)
            header_lines = [header_line]

        t.write_genes_to_pty(locus.genes, fname, header_lines=header_lines)

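# Example invocation (paths are illustrative, not from this project's config;
# the destination matches the merged_files directory used below):
#
# merge_loci(os.path.join(gv.project_data_path, 'UvrD/prok1603/files'),
#            os.path.join(gv.project_data_path, 'UvrD/prok1603/merged_files'))
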
def prok1603_extract_dendrogram():
    work_dir = os.path.join(gv.project_data_path, 'UvrD/')
    files_path = os.path.join(work_dir, 'prok1603/merged_files/')

    print "Loading loci"
    def_file = os.path.join(gv.project_data_path, 'cas4/profiles/defenseProfiles.tab')
    profile2gene = {}
    for l in open(def_file):
        terms = l.split('\t')
        profile = terms[0]
        gene_names = terms[3].split(',')
        if len(gene_names) > 1:
            profile2gene[profile] = gene_names[1]
        else:
            profile2gene[profile] = gene_names[0]

    cdd_profile2gene = t.map_cdd_profile2gene_name()
    cdd_profile2gene.update(profile2gene)

    loci = [BasicLocus(os.path.join(files_path, f), profile2gene=cdd_profile2gene)
            for f in os.listdir(files_path)]

    prok1603_loci_file = os.path.join(work_dir, 'prok1603/prok1603_loci.p.bz2')
    # loci = t.load_compressed_pickle(prok1603_loci_file)

    print "Loci:", len(loci)
    print "Dumping loci to:", prok1603_loci_file
    t.dump_compressed_pickle(prok1603_loci_file, loci)
    sys.exit()  # NOTE: everything below is unreachable until this exit is removed

    tic = time.time()
    print "Generating score matrix"
    M = scores.generate_jackard_score_matrix(loci)
    tac = time.time() - tic
    print "Elapsed time:", tac / 3600, tac / 60, tac  # hours, minutes, seconds

    # Benchmark three serialization formats for the score matrix:
    # plain pickle, bz2-compressed pickle, and compressed npz.
    tic = time.time()
    prok1603_jw_file = os.path.join(work_dir, 'prok1603_jw_scores.p')
    print "Dumping JW scores to:", prok1603_jw_file
    with open(prok1603_jw_file, 'wb') as outf:
        cPickle.dump(M, outf, protocol=cPickle.HIGHEST_PROTOCOL)
    tac = time.time() - tic
    print "Elapsed time:", tac / 3600, tac / 60, tac

    tic = time.time()
    prok1603_jw_file = os.path.join(work_dir, 'prok1603_jw_scores.p.bz2')
    print "Dumping JW scores to:", prok1603_jw_file
    t.dump_compressed_pickle(prok1603_jw_file, M)
    tac = time.time() - tic
    print "Elapsed time:", tac / 3600, tac / 60, tac

    # print "Loading JW scores from:", prok1603_jw_file
    # M = t.load_compressed_pickle(prok1603_jw_file)

    tic = time.time()
    prok1603_jw_file = os.path.join(work_dir, 'prok1603/prok1603_jw_scores.npz')
    # prok1603_jw_file = os.path.join('/Users/hudaiber/Projects/NewSystems/data/UvrD/prok1603/prok1603_jw_scores.npz')
    print "Dumping JW scores to:", prok1603_jw_file
    np.savez_compressed(prok1603_jw_file, M)
    tac = time.time() - tic
    print "Elapsed time:", tac / 3600, tac / 60, tac

    prok1603_tree_file = os.path.join(work_dir, 'prok1603/prok1603_upgma.tre')
    print "Generating tree:", prok1603_tree_file
    dnd.convert_score_to_newick(M, [os.path.basename(l.file_name) for l in loci], prok1603_tree_file)

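# Reading the npz matrix back: np.savez_compressed stores unnamed arrays under
# the key 'arr_0', so the load differs from the pickled variants above:
#
# M = np.load(prok1603_jw_file)['arr_0']
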
def tree_leaves():
    work_dir = os.path.join(gv.project_data_path, 'UvrD/prok1603')
    tree_dir = os.path.join(work_dir, 'clust_tree/')
    files_dir = os.path.join(work_dir, 'merged_files')

    profile2gene = t.map_cdd_profile2gene_name()

    gi2org = {l.split()[0]: l.rstrip().split()[1] for l in open(work_dir + '/gi_org.txt')}
    gi2weight = {l.split()[0].split('.')[0]: float(l.split()[1]) for l in open(work_dir + '/prok1603_weights.txt')}

    cl2size, cl2gis, cl2weight = {}, {}, {}
    for l in open(tree_dir + 'uvrd.cls'):
        terms = l.rstrip().split()
        cl2size[terms[1]] = terms[0]
        cl2gis[terms[1]] = terms[2:]
        cl2weight[terms[1]] = sum(gi2weight.get(gi, 0) for gi in terms[2:])

    tree_string = open(tree_dir + 'uvrd.up.tre').readline()

    leave_file_names = [os.path.basename(l) for l in glob.glob(tree_dir + '*.sr')]
    for leave_file_name in leave_file_names:
        leave_file_gis = [l.split()[0] for l in open(os.path.join(tree_dir, leave_file_name))]

        # Collect the system-gene architectures found in this leaf's loci,
        # accumulated by genome weight.
        system_gene_pool = []
        sgp_count = {}
        for gi in leave_file_gis:
            system_genes = get_system_genes(gi, files_dir, profile2gene)
            if not system_genes:
                continue
            system_gene_pool.append(system_genes)
            t.update_dictionary(sgp_count, system_genes, gi2weight[gi])

        sorted_sgp_count = sorted(sgp_count.items(), key=lambda x: x[1], reverse=True)

        leaf_name = os.path.splitext(leave_file_name)[0]
        gene_names = sorted_sgp_count[0][0] if sorted_sgp_count else ""
        representative = gi2org[random.choice(leave_file_gis)]
        total_weight = sum(v for k, v in sgp_count.items())

        # Build the new leaf label: total weight, then per-gene weights, then a
        # representative organism and the original leaf id.
        leaf_prefix = "%s|" % (int(total_weight) if total_weight else "-")
        has_genes = False
        for _gene_name in ["Cas4", "UvrA", "UvrB", "UvrC", "SbcS", "SbcD"]:
        # for _gene_name in ["Cas4"]:
            _weight = sum(v for k, v in sgp_count.items() if _gene_name.lower() in k.lower())
            if _weight:
                leaf_prefix += "%s=%d|" % (_gene_name, _weight)
                has_genes = True

        if has_genes:
            new_leaf_name = leaf_prefix + representative + "|" + leaf_name.split('.')[1]
        else:
            new_leaf_name = leaf_prefix + leaf_name.split('.')[1]
        # new_leaf_name = "cas4=%s/%s|%s|%s" % (int(cas4_weight) if total_weight else "-",
        #                                       int(total_weight) if total_weight else "-",
        #                                       representative,
        #                                       leaf_name.split('.')[1])

        print leaf_name, new_leaf_name
        tree_string = tree_string.replace(leaf_name + ":", new_leaf_name + ":")

        # new_file_name = os.path.join(tree_dir, os.path.splitext(leave_file_name)[0] + '.def')
        # with open(new_file_name, 'w') as new_file:
        #     for k, v in sorted_sgp_count:
        #         new_file.write("#%s\t%f\n" % (k, v))
        #     new_file.write("\n")
        #     [new_file.write("%s\t%s\n" % (gi, gi2org[gi])) for gi in leave_file_gis]

    with open(tree_dir + 'uvrd.up_all_genes.tree', 'w') as outf:
        outf.write(tree_string)
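
# Illustrative effect of the relabeling above (values are made up): a leaf
# "uvrd.0042" whose system genes sum to weight 37, with UvrA=20 and Cas4=9,
# is renamed to "37|UvrA=20|Cas4=9|Escherichia_coli|0042" in the output tree.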