def __init__(self, file_name, annotation_map=None, file_format='pty', profile2gene=None):
    """Build the object from a single locus file.

    Genes are read with ``dt.get_pty_file`` when ``file_format`` is ``'pty'``
    and with ``dt.get_pty_file_generic`` otherwise.  From the loaded genes the
    constructor derives the profile and gene-name sets, the organism/source of
    the first gene, and two feature sets, ``forward_set`` and ``reverse_set``.
    A feature is either one profile id of a gene or an ``"a-b"`` string that
    joins a profile id of a gene with a profile id of the gene following it.

    ``self.genes`` is sorted in place with ``reverse=True`` before the reverse
    features are collected, and it stays in that order afterwards.  Empty
    profile ids are dropped from ``profiles``/``gene_names`` but not from the
    feature sets.

    :param file_name: path of the locus file.
    :param annotation_map: forwarded to ``dt.get_pty_file`` ('pty' format only).
    :param file_format: ``'pty'`` selects ``dt.get_pty_file``; any other value
        selects ``dt.get_pty_file_generic``.
    :param profile2gene: forwarded to whichever reader is used.
    :raises IndexError: if the reader returns no genes.
    """

    def _neighbour_features(ordered):
        # Single profile ids plus "left-right" ids of consecutive genes.
        collected = set()
        last_index = len(ordered) - 1
        for index, gene in enumerate(ordered):
            own_ids = gene.cogid.split(',')
            collected.update(own_ids)
            if index == last_index:
                continue
            following_ids = ordered[index + 1].cogid.split(',')
            collected.update("%s-%s" % (own, nxt)
                             for own in own_ids
                             for nxt in following_ids)
        return collected

    self.file_name = file_name
    self.base_file_name = os.path.basename(file_name)
    # Both "merged" sets start out containing only this locus itself.
    self.merged_files = {self.file_name}
    self.merged_base_files = {self.base_file_name}

    if file_format == 'pty':
        loaded = dt.get_pty_file(file_name,
                                 annotation_map=annotation_map,
                                 profile2gene=profile2gene)
    else:
        loaded = dt.get_pty_file_generic(file_name, profile2gene=profile2gene)

    self.genes = loaded
    self.profiles = {profile
                     for gene in loaded
                     for profile in gene.cogid.split(',')
                     if profile != ''}
    self.gene_names = {name
                       for gene in loaded
                       for name in gene.gene_name.split(',')
                       if name != ''}

    first_gene = loaded[0]
    self.organism = first_gene.organism
    self.source = first_gene.src

    forward_features = _neighbour_features(self.genes)
    # In-place sort: the reverse features are read off the reordered list.
    self.genes.sort(reverse=True)
    reverse_features = _neighbour_features(self.genes)

    self.forward_set = forward_features
    self.reverse_set = reverse_features

    self.feature_weights = None
    self.feature_labels = None
def __init__(self, file_name, annotation_map=None):
    """Build the object from a locus file read by ``dt.get_pty_file_generic``.

    The constructor stores the loaded genes, the set of their non-empty
    profile ids, the organism/source of the first gene, and two feature sets,
    ``forward_set`` and ``reverse_set``.  A feature is either one profile id
    of a gene or an ``"a-b"`` string joining a profile id of a gene with a
    profile id of the gene that follows it.

    The gene list (``self.genes``) is sorted in place with ``reverse=True``
    before the reverse features are collected and stays in that order.

    :param file_name: path of the locus file.
    :param annotation_map: forwarded to ``dt.get_pty_file_generic``.
    :raises IndexError: if the reader returns no genes.
    """

    def _adjacency_features(ordered_genes):
        # Single profile ids plus "left-right" ids of consecutive genes.
        features = set()
        for position, gene in enumerate(ordered_genes):
            cogids = gene.cogid.split(',')
            features.update(cogids)
            if position + 1 < len(ordered_genes):
                next_cogids = ordered_genes[position + 1].cogid.split(',')
                features.update("%s-%s" % (cogid, next_cogid)
                                for cogid in cogids
                                for next_cogid in next_cogids)
        return features

    self.file_name = file_name

    _genes = dt.get_pty_file_generic(file_name, annotation_map=annotation_map)
    self.genes = _genes

    # Profile ids are comma-separated (the feature sets below split on ','),
    # so split on ',' here as well; a bare split() would keep "a,b" fused.
    self.profiles = set(profile
                        for g in _genes
                        for profile in g.cogid.split(',')
                        if profile != '')
    self.organism = _genes[0].organism
    self.source = _genes[0].src

    _forward = _adjacency_features(_genes)
    # In-place sort: self.genes is the same list and is reordered too.
    _genes.sort(reverse=True)
    _reverse = _adjacency_features(_genes)

    self.forward_set = _forward
    self.reverse_set = _reverse

    self.feature_weights = None
    self.feature_labels = None
if __name__ == '__main__':
    # Script entry point: load the gid -> CDD annotation map and every locus
    # file under cas1402/files/, then run the profile-based crispricity
    # calculation against the Prok1402 table.

    # Alternative absolute location, kept for reference:
    # prok1402_path_file = '/mnt/storage/data/CDD/all_Prok1402.ccp.csv'
    home_dir = os.path.expanduser('~')
    prok1402_path_file = os.path.join(home_dir, 'data/CDD/all_Prok1402.ccp.csv')

    print("Loading map_gid2cdd")
    gi2annotation = t.map_gid2cdd()

    print("Loading CRISPR loci")
    cas1402_loci_path = os.path.join(gv.project_data_path, 'cas1402/files/')

    # One gene list per file found in the loci directory.
    cas1402_loci = []
    for f in os.listdir(cas1402_loci_path):
        locus_path = os.path.join(cas1402_loci_path, f)
        cas1402_loci.append(
            dt.get_pty_file_generic(locus_path, annotation_map=gi2annotation))

    # Every gene id seen in any locus, and the organism of each locus'
    # first gene.
    cas1402_gis = {gene.gid for locus in cas1402_loci for gene in locus}
    cas1402_organisms = {locus[0].organism for locus in cas1402_loci}

    calculate_profile_based_crispricity(cas1402_loci,
                                        cas1402_gis,
                                        cas1402_organisms,
                                        prok1402_path_file)

    # Disabled debugging snippet:
    # all_orgs = set()
    # for profile, org2gis in global_profile2orgs2gis.items():
    #     if 'Yersinia_pseudotuberculosis_PB1__uid59153' in org2gis:
    #         print "Yersinia_pseudotuberculosis_PB1__uid59153"
    #     for _org in org2gis.keys():
    #         all_orgs.update([_org])
# NOTE(review): the three plotting calls below look like the tail of a routine
# whose beginning lies above the visible part of the file (``work_dir`` is not
# defined here).  Their original indentation could not be recovered; they are
# reproduced unchanged -- confirm their placement against the full file.
plt.ylabel("Crispricity")
plt.legend(loc="upper left", fontsize=7)
plt.savefig(os.path.join(work_dir, 'crispricity_log.png'))


if __name__ == '__main__':
    # Script entry point: read the annotation map and all cas1402 locus
    # files, then hand them to calculate_profile_based_crispricity.

    # Alternative absolute location, kept for reference:
    # prok1402_path_file = '/mnt/storage/data/CDD/all_Prok1402.ccp.csv'
    prok1402_path_file = os.path.join(
        os.path.expanduser('~'),
        'data/CDD/all_Prok1402.ccp.csv',
    )

    print("Loading map_gid2cdd")
    gi2annotation = t.map_gid2cdd()

    print("Loading CRISPR loci")
    cas1402_loci_path = os.path.join(gv.project_data_path, 'cas1402/files/')
    locus_file_names = os.listdir(cas1402_loci_path)
    cas1402_loci = [
        dt.get_pty_file_generic(os.path.join(cas1402_loci_path, f),
                                annotation_map=gi2annotation)
        for f in locus_file_names
    ]

    # Gene ids of all loci, and the organism of the first gene of each locus.
    cas1402_gis = set()
    cas1402_organisms = set()
    for locus in cas1402_loci:
        cas1402_gis.update(gene.gid for gene in locus)
    for locus in cas1402_loci:
        cas1402_organisms.add(locus[0].organism)

    calculate_profile_based_crispricity(
        cas1402_loci, cas1402_gis, cas1402_organisms, prok1402_path_file)

    # Disabled debugging snippet:
    # all_orgs = set()
    # for profile, org2gis in global_profile2orgs2gis.items():
    #     if 'Yersinia_pseudotuberculosis_PB1__uid59153' in org2gis:
    #         print "Yersinia_pseudotuberculosis_PB1__uid59153"
    #     for _org in org2gis.keys():
    #         all_orgs.update([_org])
def kplet_list_to_file_summaries(kplets, neighborhood_files_path, filter_weak_hits=True):
    """Group k-plets by the neighborhood files they occur in and summarise each file.

    Every file referenced by at least one k-plet is read with
    ``dt.get_pty_file_generic`` and wrapped in a ``NeighborhoodFileSummary``.
    Alongside the summaries the function counts k-plet occurrences, profile
    occurrences and CAS types (looked up in the module-level ``_gi2castype``).

    :param kplets: iterable of k-plet objects exposing ``files`` and ``id``.
    :param neighborhood_files_path: directory that the file names in
        ``kplet.files`` are joined onto.
    :param filter_weak_hits: currently has no effect; the filtering step that
        used it is disabled.  Kept for interface compatibility.
    :return: ``None`` when fewer than two distinct files are referenced
        (including when ``kplets`` is empty); otherwise a
        ``GenericMergingKplets2FsOutput`` whose ``file_summaries`` are sorted
        by ``org``.  ``profile2count_bf`` is returned empty because nothing
        in this function fills it.
    :raises IndexError: if one of the files yields no genes.
    :raises KeyError: if an organism is missing from ``_org2weight``.
    """
    file_summaries = []
    organisms = set()

    _file2kplets = {}
    _kplet2count_af = {}    # kplet2count after filtration
    _kplet2count_bf = {}    # kplet2count before filtration
    _profile2count_bf = {}
    _profile2count_af = {}
    _cas_type2count = {}

    for kplet in kplets:
        for _file_name in kplet.files:
            t.update_dictionary(_file2kplets, _file_name, [kplet])

    for _file_kplets in _file2kplets.values():
        for kplet in _file_kplets:
            t.update_dictionary(_kplet2count_bf, kplet.id, 1)

    # Fewer than two files cannot be merged.  The original code ran `del f`
    # before this check, which raised UnboundLocalError whenever no k-plet
    # referenced a file; that statement is gone.
    if len(_file2kplets) < 2:
        return None

    # All files are read up front, before any summary is built.
    _file2genes = {
        _file_name: dt.get_pty_file_generic(
            os.path.join(neighborhood_files_path, _file_name))
        for _file_name in _file2kplets
    }

    for _f in set(_file2kplets):
        _genes = _file2genes[_f]
        _src = _genes[0].src
        _org = _genes[0].organism
        organisms.add(_org)

        _nfs = NeighborhoodFileSummary(_f, _file2kplets[_f], _genes, _org, _src,
                                       _org2weight[_org])

        # Only the first gene with a known CAS type determines the file's type.
        for _gene in _genes:
            if _gene.gid in _gi2castype:
                _cas_types = _gi2castype[_gene.gid]
                _nfs.cas_type = ";".join(_cas_types)
                for _cas_type in _cas_types:
                    t.update_dictionary(_cas_type2count, _cas_type, 1)
                break

        for kplet in _file2kplets[_f]:
            t.update_dictionary(_kplet2count_af, kplet.id, 1)

        for _gene in _genes:
            for _c in _gene.cogid.split(','):
                t.update_dictionary(_profile2count_af, _c, 1)

        file_summaries.append(_nfs)

    file_summaries.sort(key=lambda x: x.org)

    retval = GenericMergingKplets2FsOutput()
    retval.file_summaries = file_summaries
    retval.organisms = organisms
    retval.profile2count_bf = _profile2count_bf
    retval.profile2count_af = _profile2count_af
    retval.kplet2count_af = _kplet2count_af
    retval.kplet2count_bf = _kplet2count_bf
    retval.weight = sum(fs.weight for fs in file_summaries)
    retval.cas_type2count = _cas_type2count

    return retval
def __init__(self, file_name, annotation_map=None, file_format='pty', profile2gene=None):
    """Build the object from a locus file, including its header information.

    Genes are read with ``dt.get_pty_file`` when ``file_format`` is ``'pty'``
    and with ``dt.get_pty_file_generic`` otherwise.  The first two lines of
    the file are then read directly: if the first one starts with ``#type:``
    the text after the colon becomes ``crispr_type`` (otherwise ``None``), and
    the second one is stored as ``summary_line`` and must start with ``===``.

    From the genes the constructor derives the profile, gene-name and cluster
    sets, the organism/source of the first gene, and two feature sets,
    ``forward_set`` and ``reverse_set``.  A feature is either one profile id
    of a gene or an ``"a-b"`` string joining a profile id of a gene with a
    profile id of the gene that follows it.  ``self.genes`` is sorted in place
    with ``reverse=True`` before the reverse features are collected and stays
    in that order.

    :param file_name: path of the locus file.
    :param annotation_map: forwarded to ``dt.get_pty_file`` ('pty' format only).
    :param file_format: ``'pty'`` selects ``dt.get_pty_file``; any other value
        selects ``dt.get_pty_file_generic``.
    :param profile2gene: forwarded to ``dt.get_pty_file_generic`` only.
    :raises AssertionError: if the second line does not start with ``===``.
    :raises IndexError: if the reader returns no genes.
    """

    def _adjacency_features(ordered_genes):
        # Single profile ids plus "left-right" ids of consecutive genes.
        features = set()
        for position, gene in enumerate(ordered_genes):
            cogids = gene.cogid.split(',')
            features.update(cogids)
            if position + 1 < len(ordered_genes):
                next_cogids = ordered_genes[position + 1].cogid.split(',')
                features.update("%s-%s" % (cogid, next_cogid)
                                for cogid in cogids
                                for next_cogid in next_cogids)
        return features

    self.file_name = file_name
    self.base_file_name = os.path.basename(file_name)

    if file_format == 'pty':
        _genes = dt.get_pty_file(file_name, annotation_map=annotation_map)
    else:
        _genes = dt.get_pty_file_generic(file_name, profile2gene=profile2gene)

    # Context manager so the handle is closed even if a read fails.
    with open(file_name) as locus_file:
        type_line = locus_file.readline()
        summary_line = locus_file.readline()

    if type_line.startswith('#type:'):
        self.crispr_type = type_line.split(':')[1].strip()
    else:
        self.crispr_type = None

    self.summary_line = summary_line
    assert self.summary_line.startswith("==="), \
        "second line of %s does not start with '==='" % file_name

    self.genes = _genes
    # Profile ids are comma-separated (the feature sets below split on ','),
    # so split on ',' here as well; a bare split() would keep "a,b" fused.
    self.profiles = set(profile
                        for g in _genes
                        for profile in g.cogid.split(',')
                        if profile != '')
    self.gene_names = set(gene_name
                          for g in _genes
                          for gene_name in g.gene_name.split(',')
                          if gene_name != '')
    self.clusters = set(gene.cluster_id for gene in _genes)
    self.organism = _genes[0].organism
    self.source = _genes[0].src

    _forward = _adjacency_features(self.genes)
    # In-place sort: the reverse features are read off the reordered list.
    self.genes.sort(reverse=True)
    _reverse = _adjacency_features(self.genes)

    self.forward_set = _forward
    self.reverse_set = _reverse

    self.feature_weights = None
    self.feature_labels = None
def __init__(self, file_name, annotation_map=None, file_format='pty', profile2gene=None):
    """Load a locus file and precompute its profile and adjacency features.

    The reader is ``dt.get_pty_file`` for ``file_format == 'pty'`` and
    ``dt.get_pty_file_generic`` for any other value.  ``forward_set`` is
    collected from the genes in the order the reader returned them;
    ``reverse_set`` is collected after ``self.genes`` has been sorted in place
    with ``reverse=True`` (the list keeps that order).  Each set holds every
    profile id of every gene plus ``"a-b"`` strings pairing each profile id of
    a gene with each profile id of the next gene.

    :param file_name: path of the locus file.
    :param annotation_map: forwarded to ``dt.get_pty_file`` ('pty' format only).
    :param file_format: reader selector, see above.
    :param profile2gene: forwarded to whichever reader is used.
    :raises IndexError: if the reader returns no genes.
    """
    self.file_name = file_name
    self.base_file_name = os.path.basename(file_name)
    # A fresh locus is "merged" only with itself.
    self.merged_files = set((self.file_name,))
    self.merged_base_files = set((self.base_file_name,))

    use_pty_reader = file_format == 'pty'
    if use_pty_reader:
        gene_list = dt.get_pty_file(file_name,
                                    annotation_map=annotation_map,
                                    profile2gene=profile2gene)
    else:
        gene_list = dt.get_pty_file_generic(file_name, profile2gene=profile2gene)

    self.genes = gene_list

    # Non-empty comma-separated ids/names across all genes.
    self.profiles = set(
        cogid for gene in gene_list for cogid in gene.cogid.split(',') if cogid != '')
    self.gene_names = set(
        name for gene in gene_list for name in gene.gene_name.split(',') if name != '')

    self.organism = gene_list[0].organism
    self.source = gene_list[0].src

    per_orientation = []
    for flipped in (False, True):
        if flipped:
            # Second pass works on the reverse-sorted list (sorted in place).
            self.genes.sort(reverse=True)

        features = set()
        for gene in self.genes:
            features.update(gene.cogid.split(','))
        for left_gene, right_gene in zip(self.genes, self.genes[1:]):
            right_ids = right_gene.cogid.split(',')
            for left_id in left_gene.cogid.split(','):
                for right_id in right_ids:
                    features.add("%s-%s" % (left_id, right_id))
        per_orientation.append(features)

    self.forward_set, self.reverse_set = per_orientation

    self.feature_weights = None
    self.feature_labels = None
sys.path.append('/Users/hudaiber/Projects/SystemFiles/') elif sys.platform=='linux2': sys.path.append('/home/hudaiber/Projects/SystemFiles/') import global_variables as gv # sys.path.append(gv.project_code_path) import dm_tools as dt cas_type_file = os.path.join(gv.project_data_path, 'cas1402/cas1402.type.tab') _gi2castype = {l.strip().split('\t')[0]: l.strip().split('\t')[1].split(';') for l in open(cas_type_file).readlines()} files_path = os.path.join(gv.project_data_path, 'cas1402/files') file2type = {} for f in os.listdir(files_path): _genes = dt.get_pty_file_generic(os.path.join(files_path, f)) for _gene in _genes: if _gene.gid in _gi2castype: if f in file2type: file2type[f] += _gi2castype[_gene.gid] else: file2type[f] = _gi2castype[_gene.gid] print len(file2type) print outf = os.path.join(gv.project_data_path, 'cas1402/file2type.tab') with open(outf,'w') as of: for file, cas_type in file2type.items(): of.write("%s\t%s\n" % (file, ";".join(cas_type)))
def __init__(self, file_name, annotation_map=None, file_format='pty', profile2gene=None):
    """Build the object from a locus file, including its header information.

    Genes are read with ``dt.get_pty_file`` when ``file_format`` is ``'pty'``
    and with ``dt.get_pty_file_generic`` otherwise.  The first two lines of
    the file are then read directly: a first line starting with ``#type:``
    provides ``crispr_type`` (``None`` otherwise); the second line is kept as
    ``summary_line`` and must start with ``===``.

    The constructor also derives the profile, gene-name and cluster sets, the
    organism/source of the first gene, and the ``forward_set`` and
    ``reverse_set`` feature sets.  These hold single profile ids plus
    ``"a-b"`` pairs of profile ids from consecutive genes.  ``self.genes`` is
    sorted in place with ``reverse=True`` before the reverse features are
    collected and keeps that order.

    :param file_name: path of the locus file.
    :param annotation_map: forwarded to ``dt.get_pty_file`` ('pty' format only).
    :param file_format: ``'pty'`` selects ``dt.get_pty_file``; any other value
        selects ``dt.get_pty_file_generic``.
    :param profile2gene: forwarded to ``dt.get_pty_file_generic`` only.
    :raises AssertionError: if the second line does not start with ``===``.
    :raises IndexError: if the reader returns no genes.
    """

    def _collect_features(genes):
        # Every profile id of every gene, plus "id-next_id" for neighbours.
        found = set()
        for gene, next_gene in zip(genes, genes[1:] + [None]):
            cogids = gene.cogid.split(',')
            found.update(cogids)
            if next_gene is None:
                # Last gene: it has no successor to pair with.
                continue
            next_cogids = next_gene.cogid.split(',')
            found.update("%s-%s" % (cogid, next_cogid)
                         for cogid in cogids
                         for next_cogid in next_cogids)
        return found

    self.file_name = file_name
    self.base_file_name = os.path.basename(file_name)

    if file_format == 'pty':
        _genes = dt.get_pty_file(file_name, annotation_map=annotation_map)
    else:
        _genes = dt.get_pty_file_generic(file_name, profile2gene=profile2gene)

    # `with` closes the handle even if reading the header lines fails.
    with open(file_name) as locus_file:
        type_line = locus_file.readline()
        self.summary_line = locus_file.readline()

    if type_line.startswith('#type:'):
        self.crispr_type = type_line.split(':')[1].strip()
    else:
        self.crispr_type = None

    assert self.summary_line.startswith("==="), \
        "second line of %s does not start with '==='" % file_name

    self.genes = _genes
    # Split on ',' like the feature sets do; a bare split() splits on
    # whitespace and would leave comma-joined ids as one profile.
    self.profiles = set(profile
                        for g in _genes
                        for profile in g.cogid.split(',')
                        if profile != '')
    self.gene_names = set(gene_name
                          for g in _genes
                          for gene_name in g.gene_name.split(',')
                          if gene_name != '')
    self.clusters = set(gene.cluster_id for gene in _genes)
    self.organism = _genes[0].organism
    self.source = _genes[0].src

    _forward = _collect_features(self.genes)
    # In-place sort: the reverse features are read off the reordered list.
    self.genes.sort(reverse=True)
    _reverse = _collect_features(self.genes)

    self.forward_set = _forward
    self.reverse_set = _reverse

    self.feature_weights = None
    self.feature_labels = None