def genes2phens(p2entrez_file, e2z=None):
    """
    Map ENSG gene ids to phenotypes via entrez ids: load an ENSG->entrez
    mapping (unless e2z is supplied), strip the 'prefix:' from the entrez
    ids in the phenotype file, and compose the two mappings.
    """
    ensg2z = e2z if e2z else ut.load_dict_sets(
            ut.proj_path('convert', 'Hs2Hs_entrez.tab'))
    def dict_sets_rem_prefix(d, sep):
        return dict([(k, set([vi.split(sep)[1] for vi in v]))
                     for k, v in d.items()])
    p2z = dict_sets_rem_prefix(ut.load_dict_sets(p2entrez_file), ":")
    return ut.compose_dict_sets(ensg2z, ut.dict_inverse_sets(p2z))
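# Example usage, a minimal sketch rather than original source: build a
# gene->phenotype mapping. The file name 'p2entrez.tab' is hypothetical; any
# tab file of phenotype -> prefixed-entrez sets that ut.load_dict_sets can
# read should work.
def _example_genes2phens():
    g2p = genes2phens('p2entrez.tab')
    for gene, phens in g2p.items()[:3]:
        print gene, sorted(phens)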
def orth_count_ogroups(sp1, sp2):
    """
    Symmetric measure of orthology. Does not lend itself as well to only
    counting genes in a provided list.
    """
    key, _ = orth.orth_key(sp1, sp2)
    ogs = orth._load_ogroups(ut.proj_path('convert_orth', 'table.' + key))
    return len(ogs)
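# Example usage, a sketch assuming the project's convert_orth table for the
# species pair exists (the 'Hs'/'Ce' codes mirror those used elsewhere here).
def _example_orth_count():
    print 'Hs-Ce orthogroups:', orth_count_ogroups('Hs', 'Ce')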
def custom_conversion(fromtype, totype):
    """
    Check for a custom file in data/convert.
    Return None if not found.
    """
    fname = "%s2%s.tab" % (fromtype, totype)
    fpath = ut.proj_path('convert', fname)
    if os.path.exists(fpath):
        return ut.load_dict_sets(fpath)
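# Example usage, a sketch: look up a custom conversion, handling the
# absent-file case. The type names echo the Hs2Hs_entrez.tab convention
# above but are otherwise assumptions.
def _example_custom_conversion():
    conv = custom_conversion('Hs', 'Hs_entrez')
    if conv is None:
        print 'no custom conversion file found'
    else:
        print '%s ids with conversions' % len(conv)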
def load_havug_cxs(convert_ensg=True):
    """
    Load the havug complex set, optionally converting uniprot ids to ENSG.
    """
    fname = ut.proj_path('havug_cxs')
    u2e = ut.dict_inverse_sets(ut.load_dict_sets(
        '../../data/convert/Hs2Hs_uni.tab'))
    hcxs = ut.load_list_of_type(fname, set)
    if convert_ensg:
        hcxs = convert_complexes([(i, c) for i, c in enumerate(hcxs)], u2e,
                seqs.load_prots_from_fasta('../../data/sequences/canon/Hs.fasta'))
    return hcxs
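# Example usage, a sketch assuming the havug_cxs project path is configured;
# skips the id conversion so no fasta file is needed.
def _example_havug():
    hcxs = load_havug_cxs(convert_ensg=False)
    print '%s havug complexes loaded' % len(hcxs)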
def fnet_names(fnet_file):
    """
    Return the data column names for a functional-network file, or None if
    the file has only a single data column.
    """
    filename = ut.proj_path('fnet_path', fnet_file)
    first = ut.load_tab_file(filename).next()
    nfields = len(first) - 2
    if nfields > 1:
        return [l[0].strip() if l[0].find('=') == -1
                else l[0].split('=')[0].strip()
                for l in ut.load_tab_file(ut.pre_ext(filename, '_names'))]
    else:
        return None
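# Example usage, a sketch: report the score columns of a network file; the
# file name 'ext_example.tab' is hypothetical.
def _example_fnet_names():
    names = fnet_names('ext_example.tab')
    print names if names else 'single data column'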
def __init__(self, sp='Hs'):
    # Build lookup dicts from the gene description table: id <-> name,
    # and name/id -> description (description truncated at any '[').
    lines = ut.load_list_of_lists(ut.proj_path('gene_desc_' + sp))[1:]
    processed = [tuple([l[0], l[1].lower()] + (l[2:] if len(l) > 2 else ['']))
                 for l in lines if len(l) > 1]
    self.gnames = [(l[1], l[2].split('[')[0]) for l in processed]
    self.name2id = dict((l[1], l[0]) for l in processed)
    self.id2name = dict((l[0], l[1]) for l in processed)
    self.name2desc = dict(self.gnames)
    self.id2desc = dict((l[0], l[2].split('[')[0]) for l in processed)
    self.id2all = dict((l[0], l[1:]) for l in processed)
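# Example usage, a sketch of typical lookups on the object built by __init__
# above. 'GeneInfo' stands in for the enclosing class, whose name is not
# shown here, and the gene name is purely illustrative.
def _example_gene_desc():
    gd = GeneInfo('Hs')
    gid = gd.name2id.get('psma1') # names were lowercased on load
    if gid:
        print gid, gd.id2desc[gid]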
def orth_fname(from_sp, to_sp):
    """
    Return the orthogroup table filename for a species pair, and whether the
    pair is stored in the table in swapped order.
    """
    key = from_sp + '-' + to_sp
    if key in keys:
        swap_order = False
    else:
        key = to_sp + '-' + from_sp
        if key in keys:
            swap_order = True
        else:
            assert False, ("Orthogroup key not in keys list for %s-%s" %
                    (from_sp, to_sp))
    fname = ut.proj_path('convert_orth', 'table.' + key)
    return fname, swap_order
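# Example usage, a sketch: resolve the orthogroup table for a species pair
# and note whether ids must be swapped when reading it.
def _example_orth_fname():
    fname, swapped = orth_fname('Hs', 'Ce')
    print fname, '(swapped order)' if swapped else '(stored order)'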
def filter_location(cxs, go_location):
    """
    Return only those cxs matching compartment x (cyto/nuc), where a match
    means either:
    - more of the complex's proteins are annotated with x than with y
    - more than half of the complex's proteins are annotated with x
    """
    assert go_location in ['cyto', 'nuc'], 'location not supported'
    keys = ['cyto_prots', 'nuc_prots']
    yes_key, no_key = keys if go_location == 'cyto' else keys[::-1]
    yes_prots, no_prots = [go_assoc_prots(ut.proj_path(key))
                           for key in (yes_key, no_key)]
    # Float division: with integer division the fraction would round to 0.
    return [c for c in cxs
            if len(set.intersection(c[1], yes_prots)) / float(len(c[1])) > .5
            or (len(set.intersection(c[1], yes_prots)) -
                len(set.intersection(c[1], no_prots))) > 0]
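# Example usage, a sketch with placeholder uniprot ids, not real annotations:
# keep the predominantly nuclear complexes from a toy list.
def _example_filter_location():
    cxs = [('cx1', set(['P1', 'P2', 'P3']))]
    print filter_location(cxs, 'nuc')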
def score_arr_ext(arr, species, ext_key):
    """
    Score the pairs in arr with an external data set. ext_key: a string
    matching one of the keys for external data files in config.py, naming a
    file of (id1, id2, score1[, score2, ...]) rows.
    """
    ext_file = ut.config()[ext_key]
    conv_dict = convdict_from_fname(species, ext_file)
    filename = ut.proj_path('fnet_path', ext_file)
    stored_names = fnet_names(ext_file) # None if only one data column.
    names = stored_names if stored_names else [ext_key]
    data_dict = load_net(ut.load_tab_file(filename))
    print 'External data file: %s; size: %s; cols: %s' % (ext_file,
            len(data_dict), len(names))
    score_arr(arr, species, names, data_dict, conv_dict)
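# Example usage, a sketch: score a pair array against one external data set.
# The 'ext_example' config key is hypothetical; any external-file key from
# config.py applies.
def _example_score_ext(arr):
    score_arr_ext(arr, 'Hs', 'ext_example')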
def load_corum(fname, filter_methods, do_dedupe):
    """
    Returns a list of tuples: (name, set(uniprotIDs), species).
    Note: do_dedupe is accepted but not used here.
    """
    lines = [l[:7] for l in ut.load_tab_file(fname, sep=';')][1:]
    cxs = [(name, set(prots.split(',')), species, method)
           for _, name, _, species, prots, _, method in lines]
    if filter_methods:
        print "Filtering corum methods."
        # Keep complexes supported by at least one approved purification
        # method, per the flag column in the corum_methods table.
        keep_methods = set([x[0] for x in
                ut.load_tab_file(ut.proj_path('corum_methods'))
                if int(x[3]) == 1])
        cxs = [(n, p, s) for n, p, s, methods in cxs
               if len([m for m in methods.split('|')
                       if m.split('-')[0].strip() in keep_methods]) > 0]
    else:
        cxs = [(n, p, s) for n, p, s, m in cxs]
    return cxs
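# Example usage, a sketch reusing the same corum_cxs project path that
# load_ppi_cxs below relies on.
def _example_load_corum():
    cxs = load_corum(ut.proj_path('corum_cxs'), True, False)
    print '%s complexes after method filtering' % len(cxs)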
def load_ppi_cxs(minlen=2, maxlen=50, sp_match='Human', go_location=None,
        do_filter_methods=True, dedupe_names=True, remove_sps=True):
    """
    Returns a list of (name, set(uniprot ids)) tuples, or of
    (name, set(uniprot ids), species) tuples if remove_sps is False.
    No de-duplication is done here: the list may contain duplicates.
    """
    fname = ut.proj_path('corum_cxs')
    cxs = load_corum(fname, do_filter_methods, dedupe_names)
    print_rib_count(cxs, 'a')
    if sp_match:
        cxs = ut.list_filter_value(cxs, 2, sp_match)
    #print_rib_count(cxs, 'b')
    cxs = [c for c in cxs if len(c[1]) >= minlen and len(c[1]) <= maxlen]
    #print_rib_count(cxs, 'c')
    if remove_sps:
        cxs = [(name, ps) for name, ps, spec in cxs]
    #print len(cxs)
    if go_location:
        print "Filtering corum by go location"
        cxs = filter_location(cxs, go_location)
    return cxs
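# Example usage, a sketch: the standard filtered human set as
# (name, set(uniprot ids)) pairs, restricted to cytoplasmic complexes.
def _example_ppi_cxs():
    cxs = load_ppi_cxs(sp_match='Human', go_location='cyto')
    print '%s human cytoplasmic complexes' % len(cxs)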
def fasta_fname(sp):
    return ut.proj_path('fastadir', sp + '.fasta')