def get_flank_distributions(kplets_2d_list, neighborhood_path, target_profiles):
    """Accumulate genome-weighted profile counts for each group of kplets.

    For every inner list of ``kplets_2d_list`` the neighborhood files of all
    its kplets are loaded and each gene's profiles (whitespace-separated
    entries of ``gene.cogid``) are tallied.

    Args:
        kplets_2d_list: iterable of kplet lists; each kplet exposes ``files``.
        neighborhood_path: directory the kplet file names are joined onto.
        target_profiles: currently unused -- the filter that skipped these
            profiles is disabled.

    Returns:
        A 3-tuple ``(flanking_genes_count, cog2gids, gid2weight)``:
        the first two are lists with one dict per kplet group (profile ->
        value accumulated by ``t.update_dictionary`` / profile -> gene ids
        accumulated by ``t.update_dictionary_set``); the third maps
        ``int(gene.gid)`` to the weight of the gene's organism across all
        groups.
    """
    genome_weights = t.map_genome2weight()
    gid2weight = {}
    counts_per_group = []
    gids_per_group = []

    for group in kplets_2d_list:
        profile_weights = {}
        profile_gids = {}

        for kplet in group:
            # All neighborhoods of a kplet are loaded before any is scanned.
            loaded = [
                Neighborhood(os.path.join(neighborhood_path, file_name))
                for file_name in kplet.files
            ]
            for hood in loaded:
                for gene in hood.genes:
                    weight = genome_weights[gene.organism]
                    gid = int(gene.gid)
                    gid2weight[gid] = weight
                    for profile in gene.cogid.split():
                        t.update_dictionary(profile_weights, profile, weight)
                        t.update_dictionary_set(profile_gids, profile, set([gid]))

        counts_per_group.append(profile_weights)
        gids_per_group.append(profile_gids)

    return counts_per_group, gids_per_group, gid2weight
def search_kplet_in_genomes(kplet_codes, target_profiles, max_dist=20, block_size=4):
    """Scan every genome source for gene blocks that carry the kplet profiles.

    Each directory under ``gv.pty_data_path`` is treated as an organism and
    each entry inside it as a source read with ``dt.get_pty``. Every gene gets
    its ``cogid`` attribute overwritten with the set of profiles found for its
    ``gid`` in ``gid2cdd`` (empty set when absent). Genes hitting
    ``kplet_codes`` are chained into a block while the gap to the previous hit
    is below ``max_dist``; the genes lying between two chained hits are
    included in the block.

    Args:
        kplet_codes: set-like collection of profile codes to look for.
        target_profiles: profiles whose presence anywhere in a block
            disqualifies that block.
        max_dist: a hit extends the current block when its index minus the
            index following the previous hit is smaller than this value.
        block_size: minimum number of distinct ``kplet_codes`` a block must
            contain to be kept.

    Returns:
        ``(org2src, src2blocks)``: organisms mapped to their sources with at
        least one surviving block (via ``t.update_dictionary_set``), and
        source name mapped to its list of surviving blocks.
    """
    org2src = {}
    src2blocks = {}

    for org_name in os.listdir(gv.pty_data_path):
        org_dir = os.path.join(gv.pty_data_path, org_name)

        for src_name in os.listdir(org_dir):
            genes = dt.get_pty(os.path.join(org_dir, src_name))

            candidates = []
            current = []
            resume_at = None  # index right after the most recent hit
            for pos, gene in enumerate(genes):
                if gene.gid in gid2cdd:
                    profiles = set(gid2cdd[gene.gid].split())
                else:
                    profiles = set()
                gene.cogid = profiles

                if not profiles.intersection(kplet_codes):
                    continue

                if not resume_at:
                    # Very first hit in this source: it opens the first block.
                    current.append(gene)
                elif pos - resume_at < max_dist:
                    # Close enough: pull in the gap genes together with the hit.
                    current += genes[resume_at:pos + 1]
                else:
                    # Too far away: close the running block, open a new one.
                    candidates.append(current)
                    current = [gene]
                resume_at = pos + 1
            candidates.append(current)

            kept = []
            for block in candidates:
                matched = set()
                present = set()
                for gene in block:
                    matched.update(gene.cogid.intersection(kplet_codes))
                    present.update(gene.cogid)
                if len(matched) < block_size:
                    continue
                if present.intersection(target_profiles):
                    continue
                kept.append(block)

            if kept:
                t.update_dictionary_set(org2src, org_name, src_name)
                src2blocks[src_name] = kept

    return org2src, src2blocks
def kplet_to_file_summaries(kplet, neighborhood_files_path):
    """Build one file summary per neighborhood file of a single kplet.

    Args:
        kplet: object exposing ``files``, the neighborhood file names.
        neighborhood_files_path: directory the file names are joined onto
            before being read with ``dt.get_wgs_file``.

    Returns:
        ``(file_summaries, organisms, crispr_type2files)``:
        a list of ``WGSNeighborhoodFileSummary`` sorted by their ``org``
        attribute, the set of organisms seen, and a dict filled through
        ``t.update_dictionary_set`` keyed by CRISPR type with file names.

    Raises:
        IndexError: if a file yields no genes (the first gene supplies the
            source, organism and CRISPR type of the whole file).
    """
    file_summaries = []
    organisms = set()
    crispr_type2files = {}

    for file_name in kplet.files:
        genes = dt.get_wgs_file(os.path.join(neighborhood_files_path, file_name))
        first_gene = genes[0]
        src = first_gene.src
        org = first_gene.organism
        crispr_type = first_gene.crispr_type

        t.update_dictionary_set(crispr_type2files, crispr_type, file_name)
        file_summaries.append(
            WGSNeighborhoodFileSummary(file_name, [kplet], genes, org, src))
        # add() rather than update(): update() iterates its argument, which
        # for a string organism name would insert single characters instead
        # of the name itself.
        organisms.add(org)

    file_summaries.sort(key=lambda summary: summary.org)
    return file_summaries, organisms, crispr_type2files
def census_table():
    """Write the Cas4 census table to ``cas4_census.txt`` under ``data_path``.

    Loci are parsed from ``Islands_ID.ann_clust`` (also under ``data_path``)
    with ``t.parse_crispr_loci``. One tab-separated row is written per Cas4
    gene, ordered by gene ``gid``, preceded by a ``#``-prefixed header line.
    The kingdom column is looked up in the module-level ``genome2domain``.

    NOTE(review): ``census_table`` is defined a second time later in this
    file; the later definition is the one in effect after import -- confirm
    which copy should stay.

    Returns:
        None. The only effect is the written file.
    """
    loci = t.parse_crispr_loci(os.path.join(data_path, 'Islands_ID.ann_clust'))

    cas42locus = {}
    genome2cas4 = {}
    for locus in loci:
        for cas4_gene in locus.cas4_genes:
            cas42locus[cas4_gene] = locus
            t.update_dictionary_set(genome2cas4, locus.organism, cas4_gene.gid)

    columns = [
        "GI", "profiles", "Gene name(s)", "Status", "Complete", "Type",
        "Organism", "Total Cas4 in genome", "Kingdom"
    ]

    # The context manager guarantees the file is flushed and closed even when
    # a row fails to format.
    with open(os.path.join(data_path, 'cas4_census.txt'), "w") as census_outf:
        census_outf.write("#" + "\t".join(columns) + "\n")

        for cas4_gene, locus in sorted(cas42locus.items(),
                                       key=lambda item: item[0].gid):
            org = locus.organism
            terms = [
                cas4_gene.gid,
                cas4_gene.cogid,
                # Collapse repeated gene names.
                ",".join(set(cas4_gene.gene_name.split(','))),
                locus.status,
                "complete" if locus.complete else "partial",
                # Only the part before the first ';' is reported; split()
                # returns the whole string when there is no ';'.
                locus.type.split(";")[0],
                org,
                str(len(genome2cas4[org])),
                genome2domain[org],
            ]
            census_outf.write("\t".join(terms) + "\n")
def census_table():
    """Write the Cas4 census table to ``cas4_census.txt`` under ``data_path``.

    Loci come from ``t.parse_crispr_loci`` applied to
    ``Islands_ID.ann_clust`` in ``data_path``. The output starts with a
    ``#``-prefixed header and then holds one tab-separated row per Cas4 gene,
    sorted by the gene's ``gid``. The kingdom value is read from the
    module-level ``genome2domain`` mapping.

    NOTE(review): an earlier definition of ``census_table`` exists in this
    file; this later definition is the one that takes effect -- confirm the
    duplicate can be removed.

    Returns:
        None. The only effect is the written file.
    """
    loci_file = os.path.join(data_path, 'Islands_ID.ann_clust')
    census_path = os.path.join(data_path, 'cas4_census.txt')

    cas42locus = {}
    genome2cas4 = {}
    for locus in t.parse_crispr_loci(loci_file):
        for cas4_gene in locus.cas4_genes:
            cas42locus[cas4_gene] = locus
            t.update_dictionary_set(genome2cas4, locus.organism, cas4_gene.gid)

    columns = ["GI", "profiles", "Gene name(s)", "Status", "Complete", "Type",
               "Organism", "Total Cas4 in genome", "Kingdom"]
    rows = sorted(cas42locus.items(), key=lambda item: item[0].gid)

    # ``with`` closes the handle deterministically, including on errors.
    with open(census_path, "w") as census_outf:
        census_outf.write("#" + "\t".join(columns) + "\n")

        for cas4_gene, locus in rows:
            organism = locus.organism
            gene_names = ",".join(set(cas4_gene.gene_name.split(',')))
            completeness = "complete" if locus.complete else "partial"
            # split() yields the full string when no ';' is present, so no
            # separate membership check is needed.
            locus_type = locus.type.split(";")[0]

            terms = [cas4_gene.gid, cas4_gene.cogid, gene_names, locus.status,
                     completeness, locus_type, organism,
                     str(len(genome2cas4[organism])), genome2domain[organism]]
            census_outf.write("\t".join(terms) + "\n")
def kplet_list_to_file_summaries(kplets, neighborhood_files_path, filter_weak_hits=True, dataset=None):
    """Merge a list of kplets into per-file summaries, one per locus/cluster.

    Files are grouped by the kplets that reference them. Files found in the
    singleton loci get their own summary; files belonging to a clustered
    locus are represented by the cluster member (among those still pending)
    with the most genes. Files that are neither singleton nor in any cluster
    are dropped.

    Args:
        kplets: list of kplet objects exposing ``id`` and ``files``.
        neighborhood_files_path: directory the file names are joined onto
            before being read with ``dt.get_wgs_file``.
        filter_weak_hits: when true, keep only files referenced by more than
            5 kplets.
        dataset: forwarded unchanged to ``get_singleton_loci`` and
            ``get_clustered_loci``.

    Returns:
        ``None`` when fewer than two files remain after the optional
        filtering; otherwise a ``CrisprMergingKplets2FsOutput`` carrying the
        summaries (sorted by ``org``), the organisms, CRISPR type -> files,
        kplet and profile counts before (``*_bf``) and after (``*_af``)
        filtration, the initial number of files and the input ``kplets``.

    Raises:
        IndexError: if a selected file yields no genes (its first gene
            supplies source, organism and CRISPR type).
    """
    file_summaries = []
    organisms = set()
    crispr_type2files = {}
    file2kplets = {}
    kplet2count_af = {}    # kplet id -> count after filtration
    kplet2count_bf = {}    # kplet id -> count before filtration
    profile2count_bf = {}
    profile2count_af = {}
    filter_size = 5

    singletons = get_singleton_loci(dataset)
    clusters = get_clustered_loci(dataset)

    for kplet in kplets:
        for file_name in kplet.files:
            t.update_dictionary(file2kplets, file_name, [kplet])

    initial_length = len(file2kplets)

    # No `del` of loop variables here: deleting a name whose loop never ran
    # raises NameError on empty input.
    for file_kplets in file2kplets.values():
        for kplet in file_kplets:
            t.update_dictionary(kplet2count_bf, kplet.id, 1)

    if filter_weak_hits:
        file2kplets = {name: hits for name, hits in file2kplets.items()
                       if len(hits) > filter_size}

    if len(file2kplets) < 2:
        return None

    file2genes = {name: dt.get_wgs_file(os.path.join(neighborhood_files_path, name))
                  for name in file2kplets}

    for genes in file2genes.values():
        for gene in genes:
            for profile in gene.cogid.split(','):
                t.update_dictionary(profile2count_bf, profile, 1)

    pending = set(file2kplets)
    while pending:
        current = pending.pop()

        if current in singletons:
            representative = current
            locus_tag = 'singleton'
            cluster_members = None
        else:
            cluster = None
            for candidate in clusters:
                if current in candidate.files:
                    cluster = candidate
                    break
            if not cluster:
                # Neither singleton nor clustered: the file is skipped.
                continue

            # `current` was already popped, so it is not part of this set.
            cluster_members = cluster.files.intersection(pending)
            representative = current
            for member in cluster_members:
                if len(file2genes[member]) > len(file2genes[representative]):
                    representative = member
            locus_tag = cluster

        genes = file2genes[representative]
        first_gene = genes[0]
        org = first_gene.organism

        t.update_dictionary_set(crispr_type2files, first_gene.crispr_type, representative)
        summary = WGSNeighborhoodFileSummary(representative, file2kplets[representative],
                                             genes, org, first_gene.src, locus_tag)
        if cluster_members is not None:
            # +1 accounts for `current`, which is absent from cluster_members.
            summary.cluster_local_count = len(cluster_members) + 1
            # The whole cluster is represented now; drop its other members.
            pending = pending.difference(cluster_members)

        file_summaries.append(summary)
        organisms.add(org)

    file_summaries = [fs for fs in file_summaries if len(fs.kplets) > 1]

    for fs in file_summaries:
        for kplet in file2kplets[fs.file_name]:
            t.update_dictionary(kplet2count_af, kplet.id, 1)
        for gene in file2genes[fs.file_name]:
            for profile in gene.cogid.split(','):
                t.update_dictionary(profile2count_af, profile, 1)

    file_summaries.sort(key=lambda fs: fs.org)

    retval = CrisprMergingKplets2FsOutput()
    retval.file_summaries = file_summaries
    retval.organisms = organisms
    retval.crispr_type2files = crispr_type2files
    retval.kplet2count_af = kplet2count_af
    retval.kplet2count_bf = kplet2count_bf
    retval.initial_length = initial_length
    retval.kplets = kplets
    retval.profile2count_bf = profile2count_bf
    retval.profile2count_af = profile2count_af
    return retval
# if 'COG0210' in profiles: # print len(profiles), f, profiles if __name__ == "__main__": work_dir = '/home/hudaiber/Projects/NewSystems/data/UvrD/paralogs/' # prok1603_ccp = '/home/hudaiber/Projects/NewSystems/data/UvrD/paralogs/Prok1603.ccp.csv' prok1603_ccp = '/dev/shm/Prok1603.ccp.csv' prok1603_except_dir = '/panfs/pan1/patternquest/Projects/NewSystems/data/UvrD/except_prok1603/COG0210/hhpred/' pk_gi2pr = {} for l in open(prok1603_ccp): terms = l.strip().split(',') gi = terms[0] profile = terms[6] t.update_dictionary_set(pk_gi2pr, gi, profile) outfmt = "%s\t%s\t%s" print outfmt % ("#GI", "Genome", "Cas4 fusion") print "#--------------------------------------------" all_lines = open(os.path.join(work_dir, 'from_not_prok1603/uvrd_filtered.pty')).readlines() + \ open(os.path.join(work_dir, 'from_prok1603/uvrd.pty')).readlines() for pty_line in open(os.path.join(work_dir, 'from_prok1603/uvrd.pty')).readlines(): parts = pty_line.strip().split() genome = parts[3] gi = parts[-1] acc = parts[-2]