Exemple #1
0
def genes2phens(p2entrez_file, e2z=None):
    """
    Compose a first mapping with the inverse of a phenotype mapping.

    p2entrez_file: path handed to ut.load_dict_sets. Every value in the
        loaded sets must contain ":", because only the second
        ":"-separated field of each value is kept.
    e2z: optional pre-loaded first mapping. When it is falsy (None or
        empty), convert/Hs2Hs_entrez.tab is loaded through ut.proj_path.
    Returns whatever ut.compose_dict_sets yields for the first mapping and
    the inverted, prefix-stripped phenotype mapping.
    """
    def strip_prefix(mapping, sep):
        # Rebuild the dict, keeping only field 1 of every sep-split value.
        stripped = {}
        for key, values in mapping.items():
            stripped[key] = set([value.split(sep)[1] for value in values])
        return stripped

    first_map = e2z or ut.load_dict_sets(
        ut.proj_path('convert', 'Hs2Hs_entrez.tab'))
    phen_map = strip_prefix(ut.load_dict_sets(p2entrez_file), ":")
    inverted_phen_map = ut.dict_inverse_sets(phen_map)
    return ut.compose_dict_sets(first_map, inverted_phen_map)
def orth_count_ogroups(sp1, sp2):
    """
    Symmetric measure of orthology: the number of orthogroups loaded from
    the convert_orth table for the species pair.
    Not well suited to counting only the genes in a provided list.
    """
    # orth_key also returns a swap flag; it is not needed for a plain count.
    key, _ = orth.orth_key(sp1, sp2)
    table_path = ut.proj_path('convert_orth', 'table.' + key)
    return len(orth._load_ogroups(table_path))
Exemple #3
0
def custom_conversion(fromtype, totype):
    """
    Load the custom "<fromtype>2<totype>.tab" table from data/convert.

    The path is resolved with ut.proj_path('convert', ...).
    Returns the result of ut.load_dict_sets for that file, or None when
    the file does not exist.
    """
    fpath = ut.proj_path('convert', "%s2%s.tab" % (fromtype, totype))
    if not os.path.exists(fpath):
        return None
    return ut.load_dict_sets(fpath)
Exemple #4
0
def custom_conversion(fromtype, totype):
    """
    Look for a custom conversion file in data/convert and load it.

    The file name is "<fromtype>2<totype>.tab", resolved with
    ut.proj_path('convert', ...).
    Returns the ut.load_dict_sets result, or None if the file is missing.
    """
    table_name = "%s2%s.tab" % (fromtype, totype)
    table_path = ut.proj_path('convert', table_name)
    found = os.path.exists(table_path)
    return ut.load_dict_sets(table_path) if found else None
Exemple #5
0
def load_havug_cxs(convert_ensg=True):
    """
    Load the 'havug_cxs' complexes, optionally converting their ids.

    convert_ensg: when true, the complexes are numbered with enumerate and
        passed to convert_complexes together with the inverted
        Hs2Hs_uni.tab mapping and the proteins read from the canonical Hs
        fasta. When false, the result of ut.load_list_of_type(..., set) is
        returned unchanged.

    The conversion table is read only when a conversion is requested, so
    the unconverted path no longer pays for (or fails on) that file.
    NOTE(review): both '../../data/...' paths are relative to the current
    working directory, unlike the ut.proj_path lookups -- confirm callers
    always run from the expected directory.
    """
    hcxs = ut.load_list_of_type(ut.proj_path('havug_cxs'), set)
    if not convert_ensg:
        return hcxs
    u2e = ut.dict_inverse_sets(ut.load_dict_sets(
        '../../data/convert/Hs2Hs_uni.tab'))
    prots = seqs.load_prots_from_fasta('../../data/sequences/canon/Hs.fasta')
    return convert_complexes(list(enumerate(hcxs)), u2e, prots)
Exemple #6
0
def fnet_names(fnet_file):
    """
    Return the stored column names for a multi-column net file, else None.

    fnet_file: file name resolved with ut.proj_path('fnet_path', ...).
    The first row of the file is read and its field count minus two is
    taken as the number of data columns (presumably the first two fields
    are the pair of ids -- TODO confirm).
    With more than one data column, the names come from the companion file
    ut.pre_ext(filename, '_names'): the first field of each row, cut at the
    first '=' if there is one, and stripped of surrounding whitespace.
    Returns None when there is only one data column.
    Raises StopIteration if the net file has no rows.
    """
    filename = ut.proj_path('fnet_path', fnet_file)
    # Builtin next() works on Python 2.6+ and 3, unlike the .next() method.
    first = next(ut.load_tab_file(filename))
    nfields = len(first) - 2
    if nfields <= 1:
        return None  # means there is only one data column.
    names_file = ut.pre_ext(filename, '_names')
    # split('=')[0] returns the whole field when no '=' is present, so no
    # separate "no '='" branch is needed.
    return [l[0].split('=')[0].strip() for l in ut.load_tab_file(names_file)]
Exemple #7
0
 def __init__(self, sp='Hs'):
     """
     Build the gene name/id/description lookup tables for species sp.

     Rows come from ut.load_list_of_lists on the 'gene_desc_<sp>' path.
     The first row is skipped, as are rows with fewer than two fields.
     Field 0 is used as the id, field 1 (lower-cased) as the name, and
     field 2 up to its first '[' as the description ('' when absent).
     For duplicate keys the last row wins.
     """
     rows = ut.load_list_of_lists(ut.proj_path('gene_desc_'+sp))[1:]
     gnames = []
     name2id, id2name, id2desc, id2all = {}, {}, {}, {}
     for row in rows:
         if len(row) < 2:
             continue
         gene_id = row[0]
         name = row[1].lower()
         # Rows without a description get a single empty placeholder.
         extra = row[2:] if len(row) > 2 else ['']
         desc = extra[0].split('[')[0]
         gnames.append((name, desc))
         name2id[name] = gene_id
         id2name[gene_id] = name
         id2desc[gene_id] = desc
         id2all[gene_id] = tuple([name] + extra)
     self.gnames = gnames
     self.name2id = name2id
     self.id2name = id2name
     self.name2desc = dict(gnames)
     self.id2desc = id2desc
     self.id2all = id2all
Exemple #8
0
def orth_fname(from_sp, to_sp):
    """
    Return (fname, swap_order) for the orthogroup table of a species pair.

    The key "<from_sp>-<to_sp>" is looked up in the module-level keys; if
    it is absent the reversed key is tried and swap_order is True.
    fname is ut.proj_path('convert_orth', 'table.' + key).
    Raises AssertionError when neither ordering is a known key.
    """
    key = from_sp + '-' + to_sp
    swap_order = False
    if key not in keys:
        key = to_sp + '-' + from_sp
        swap_order = True
        if key not in keys:
            # Raised explicitly: an assert statement is removed under -O,
            # which would let execution continue with an unknown key.
            raise AssertionError("Orthogroup key %s not in keys list" % key)
    fname = ut.proj_path('convert_orth', 'table.'+key)
    return fname, swap_order
Exemple #9
0
def orth_fname(from_sp, to_sp):
    """
    Return (fname, swap_order) for the orthogroup table of a species pair.

    Tries the key "<from_sp>-<to_sp>" first (swap_order False), then the
    reversed key (swap_order True), against the module-level keys.
    fname is ut.proj_path('convert_orth', 'table.' + key).
    Raises AssertionError when neither ordering is a known key.
    """
    key = from_sp + '-' + to_sp
    swap_order = False
    if key not in keys:
        key = to_sp + '-' + from_sp
        swap_order = True
        if key not in keys:
            # Raised explicitly: an assert statement is removed under -O,
            # which would let execution continue with an unknown key.
            raise AssertionError("Orthogroup key %s not in keys list" % key)
    fname = ut.proj_path('convert_orth', 'table.' + key)
    return fname, swap_order
Exemple #10
0
def filter_location(cxs, go_location):
    """
    Return only those cxs for x (cyto/nuc) where either:
    - more than half of the complex's proteins are annotated with x, or
    - more proteins are annotated with x than with the other location y.

    cxs: sequence of items whose element [1] is the set of member proteins.
    go_location: 'cyto' or 'nuc'; anything else raises AssertionError.
    The annotated proteins for each location come from go_assoc_prots on
    the 'cyto_prots' / 'nuc_prots' project paths.
    A complex with no members satisfies neither rule and is dropped.
    """
    if go_location not in ['cyto', 'nuc']:
        # Explicit raise so the check survives python -O.
        raise AssertionError('location not supported')
    keys = ['cyto_prots', 'nuc_prots']
    yes_key, no_key = keys if go_location == 'cyto' else keys[::-1]
    yes_prots, no_prots = [go_assoc_prots(ut.proj_path(key))
            for key in (yes_key, no_key)]
    kept = []
    for c in cxs:
        members = c[1]
        n_yes = len(set.intersection(members, yes_prots))
        n_no = len(set.intersection(members, no_prots))
        # Compare 2*n_yes against the size rather than dividing: under
        # Python 2 integer division the ratio floors to 0 or 1, and an
        # empty complex would divide by zero.
        if 2 * n_yes > len(members) or n_yes > n_no:
            kept.append(c)
    return kept
Exemple #11
0
def score_arr_ext(arr, species, ext_key):
    """
    Score arr with the external data file registered under ext_key.

    arr, species: passed through unchanged to score_arr.
    ext_key: key into ut.config(); its value is the data file name, which
        is resolved under 'fnet_path'. ext_key also serves as the single
        column name when fnet_names returns nothing for the file.
    Prints a one-line summary (file, number of entries, number of columns)
    and returns nothing.
    """
    ext_file = ut.config()[ext_key]
    conv_dict = convdict_from_fname(species, ext_file)
    filename = ut.proj_path('fnet_path', ext_file)
    # fnet_names gives None if there is only one data column.
    names = fnet_names(ext_file) or [ext_key]
    data_dict = load_net(ut.load_tab_file(filename))
    summary = 'External data file: %s; size: %s; cols: %s' % (
            ext_file, len(data_dict), len(names))
    print(summary)
    score_arr(arr, species, names, data_dict, conv_dict)
Exemple #12
0
def load_corum(fname, filter_methods, do_dedupe):
    """
    Returns a list of tuples: (name, set(uniprotIDs), species)

    fname: ';'-separated file read with ut.load_tab_file. The first row is
        skipped and only the first seven fields of each row are used.
    filter_methods: when true, keep only complexes having at least one
        method whose id is flagged with 1 in the fourth column of the
        'corum_methods' table. Methods are '|'-separated and the id is the
        stripped text before the first '-'.
    do_dedupe: accepted but not used by this function.
    """
    rows = [row[:7] for row in ut.load_tab_file(fname, sep=';')][1:]
    cxs = [(name, set(prots.split(',')), species, method)
            for _, name, _, species, prots, _, method in rows]
    if not filter_methods:
        return [(n, p, s) for n, p, s, _ in cxs]

    print("Filtering corum methods.")
    keep_methods = set()
    for x in ut.load_tab_file(ut.proj_path('corum_methods')):
        if int(x[3]) == 1:
            keep_methods.add(x[0])

    def uses_kept_method(methods):
        # True when any listed method id is in the keep set.
        return any(m.split('-')[0].strip() in keep_methods
                for m in methods.split('|'))

    return [(n, p, s) for n, p, s, methods in cxs
            if uses_kept_method(methods)]
Exemple #13
0
def load_ppi_cxs(minlen=2, maxlen=50, sp_match='Human', go_location=None,
        do_filter_methods=True, dedupe_names=True, remove_sps=True):
    """
    Load the 'corum_cxs' complexes and apply the requested filters.

    Returns a list of (name, set of ids) tuples, or (name, set, species)
    tuples when remove_sps is false.
    No de-duplication is done here; the list may contain duplicates.

    minlen, maxlen: inclusive bounds on the member-set size.
    sp_match: if truthy, keep only entries whose field 2 matches it, via
        ut.list_filter_value.
    go_location: if truthy, additionally filter with filter_location.
    do_filter_methods, dedupe_names: forwarded to load_corum.
    """
    cxs = load_corum(ut.proj_path('corum_cxs'), do_filter_methods,
            dedupe_names)
    print_rib_count(cxs, 'a')
    if sp_match:
        cxs = ut.list_filter_value(cxs, 2, sp_match)
    cxs = [c for c in cxs if minlen <= len(c[1]) <= maxlen]
    if remove_sps:
        cxs = [(name, members) for name, members, _ in cxs]
    if go_location:
        print("Filtering corum by go location")
        cxs = filter_location(cxs, go_location)
    return cxs
Exemple #14
0
def fasta_fname(sp):
    """
    Return the project path of the "<sp>.fasta" file under 'fastadir'.
    """
    basename = sp + '.fasta'
    return ut.proj_path('fastadir', basename)