def ramilowski_interactions(putative = False):
    """
    Downloads and processes ligand-receptor interactions from
    Supplementary Table 2 of Ramilowski 2015.

    Args
        putative (bool): Include also the interactions annotated as
            "putative" in the source table.

    Returns
        List of lists with ligand and receptor gene symbols, references
        and resources as elements.
    """

    c = curl.Curl(urls.urls['rami']['url'], silent = False, large = True)
    xlsname = c.fname
    del(c)

    raw = inputs_common.read_xls(xlsname, 'All.Pairs')[1:]

    # Note: the original implementation had an unreachable `return raw`
    # after this return statement; that dead code has been removed.
    return [
        [
            r[1],  # ligand gene symbol
            r[3],  # receptor gene symbol
            r[13].replace(' ', ''),  # references
            # resources: columns F-K plus the annotation column
            ';'.join(filter(len, itertools.chain(r[5:11], [r[15]])))
        ]
        for r in raw
        # drop excluded rows; keep putative ones only on request
        if r[15] != 'EXCLUDED not ligand' and (
            putative or r[15] != 'putative'
        )
    ]
def cspa_cell_types(organism = 9606):
    """
    Expression levels of cell surface proteins across cell types from the
    Cell Surface Protein Atlas (CSPA).

    Args
        organism (int): NCBI Taxonomy ID (9606 for human, 10090 for mouse).

    Returns
        Nested dict: cell type -> UniProt ID -> expression value
        (float, or None where the spreadsheet cell is not numeric).
    """

    sheet_by_organism = {
        'Human': 'Table_E',
        'Mouse': 'Table_F',
    }

    organism_name = taxonomy.taxids[organism].capitalize()

    c = curl.Curl(urls.urls['cspa']['url_s1'], large = True, silent = False)
    xlsname = c.fname
    del(c)

    table = inputs_common.read_xls(xlsname, sheet_by_organism[organism_name])

    # first row holds the cell type labels (first column is the protein ID)
    labels = table[0][1:]
    expression = collections.defaultdict(lambda: collections.defaultdict(dict))

    for rec in table[1:]:

        for ac in mapping.map_name(rec[0], 'uniprot', 'uniprot'):

            for i, label in enumerate(labels):

                raw_value = rec[i + 1]
                expression[label][ac] = (
                    float(raw_value)
                    if common.is_float(raw_value) else
                    None
                )

    return expression
def almen2009_annotations():
    """
    Membrane protein annotations from Almen et al. 2009.

    Returns
        Dict of sets of Almen2009Annotation records keyed by UniProt ID.
    """

    resep = re.compile(r'[;/]')

    Almen2009Annotation = collections.namedtuple(
        'Almen2009Annotation',
        [
            'mainclass',
            'classes',
            'phobius_secreted',
            'phobius_transmembrane',
            'sosui_transmembrane',
            'tmhmm_transmembrane',
        ]
    )

    url = urls.urls['almen2009']['url']
    c = curl.Curl(url, silent = False, large = True)
    handle = c.fileobj
    path = handle.name
    handle.close()

    table = inputs_common.read_xls(path, sheet = 'Data')[1:]

    annotations = collections.defaultdict(set)

    for rec in table:

        # the SOSUI column may contain the literal string 'ERROR'
        sosui_tm = 0 if rec[8] == 'ERROR' else int(float(rec[8]))

        # fix a typo in the source data, then split multi-valued classes
        class_labels = tuple(sorted(
            resep.split(rec[3].replace('KInase', 'Kinase'))
        ))

        annotation = Almen2009Annotation(
            mainclass = rec[2],
            classes = class_labels,
            phobius_secreted = rec[6] == 'Y',
            phobius_transmembrane = int(float(rec[5])),
            sosui_transmembrane = sosui_tm,
            tmhmm_transmembrane = int(float(rec[10])),
        )

        for ac in mapping.map_name(rec[0], 'ipi', 'uniprot'):

            annotations[ac].add(annotation)

    return annotations
def get_li2012():
    """
    Reads supplementary data of Li 2012 from local file.

    Returns
        Table as a list of lists: the first seven columns of each data
        row whose seventh column is non-empty.
    """

    url = urls.urls['li2012']['url']
    c = curl.Curl(url, silent=False, large=True)
    xls = c.fileobj
    xlsfile = xls.name
    xls.close()
    tbl = inputs_common.read_xls(xlsfile, sheet='File S1')

    # Materialize a real list (as the docstring promises) instead of a
    # lazy `filter` object; the first two rows are headers.
    rows = (l[:7] for l in tbl[2:])

    return [row for row in rows if len(row[-1]) > 0]
def rolland_hi_ii_14():
    """
    Loads the HI-II-14 unbiased interactome from the large scale screening
    of from Rolland 2014.
    Yields interactions as lists of identifiers with isoform suffixes
    (anything after the first dot) removed.
    """

    url = urls.urls['hiii14']['url']
    c = curl.Curl(url, silent=False, large=True)
    xlsname = c.fileobj.name
    c.fileobj.close()

    rows = iter(inputs_common.read_xls(xlsname, sheet='2G'))
    next(rows, None)  # skip the header row

    for record in rows:

        yield [field.split('.')[0] for field in record]
def rolland_hi_ii_14():
    """
    Loads the HI-II-14 unbiased interactome from the large scale screening
    of from Rolland 2014.
    Yields interactions as lists of identifiers with isoform suffixes
    (anything after the first dot) removed.
    """

    # download the supplementary table via the Cell journal helper
    xlsname = cell.cell_supplementary(
        supp_url=urls.urls['hiii14']['url'],
        article_url=urls.urls['hiii14']['article_url'],
    )

    table = inputs_common.read_xls(xlsname, sheet='2G')

    # first row is the header
    for record in table[1:]:

        yield [identifier.split('.')[0] for identifier in record]
def cspa_annotations(organism = 9606):
    """
    Cell surface protein annotations from the Cell Surface Protein Atlas
    (CSPA).

    Args
        organism (int): NCBI Taxonomy ID (9606 for human, 10090 for mouse).

    Returns
        Dict of sets of CspaAnnotation records keyed by UniProt ID.
    """

    CspaAnnotation = collections.namedtuple(
        'CspaAnnotation',
        [
            'high_confidence',
            'n_cell_types',
            'tm',
            'gpi',
            'uniprot_cell_surface',
        ],
    )

    sheet_by_organism = {
        'Human': 'Table A',
        'Mouse': 'Table B',
    }

    organism_name = taxonomy.taxids[organism].capitalize()

    c = curl.Curl(urls.urls['cspa']['url_s2'], large = True, silent = False)
    xlsname = c.fname
    del(c)

    rows = inputs_common.read_xls(xlsname, sheet_by_organism[organism_name])[1:]

    annotations = collections.defaultdict(set)

    for rec in rows:

        record = CspaAnnotation(
            high_confidence = 'high confidence' in rec[2],
            n_cell_types = int(float(rec[9])),
            tm = int(float(rec[11])),
            gpi = int(float(rec[12])),
            uniprot_cell_surface = rec[13] == 'yes',
        )

        for ac in mapping.map_name(rec[1], 'uniprot', 'uniprot'):

            annotations[ac].add(record)

    return dict(annotations)
def matrisome_annotations(organism=9606):
    """
    Downloads MatrisomeDB 2.0, a database of extracellular matrix proteins.

    Args
        organism (int): NCBI Taxonomy ID (9606 for human, 10090 for mouse).

    Returns
        Dict where keys are UniProt IDs and values are sets of
        MatrisomeAnnotation tuples (class, subclass, sub-subclass).
    """

    MatrisomeAnnotation = collections.namedtuple(
        'MatrisomeAnnotation',
        ['mainclass', 'subclass', 'subsubclass'],
    )

    tax_names = {
        10090: ('Murine', 'mm'),
        9606: ('Human', 'hs'),
    }

    url = urls.urls['matrisome']['url_xls'] % tax_names[organism]
    c = curl.Curl(url, large=True, silent=False)
    xlsname = c.fname
    del(c)

    annotations = collections.defaultdict(set)

    # first row is the header
    for rec in inputs_common.read_xls(xlsname)[1:]:

        # column H: colon separated UniProt IDs, may contain empty items
        raw_uniprots = {u for u in rec[7].split(':') if u}

        if not raw_uniprots:

            continue

        for ac in mapping.map_names(raw_uniprots, 'uniprot', 'uniprot'):

            annotations[ac].add(
                MatrisomeAnnotation(
                    mainclass=rec[0].strip(),
                    subclass=rec[1].strip(),
                    subsubclass=rec[10].strip() or None,
                )
            )

    return dict(annotations)
def embrace_raw():
    """
    Returns Supplementary Table S11 from 10.1016/j.isci.2019.10.026
    (Sheikh et al. 2019) as a list of tuples.
    """

    path = cell_input.cell_supplementary(
        supp_url=urls.urls['embrace']['url'],
        article_url=urls.urls['embrace']['article'],
    )

    sheet = inputs_common.read_xls(path)
    header, rows = sheet[0], sheet[1:]

    # field names come straight from the table header
    EmbraceRawRecord = collections.namedtuple('EmbraceRawRecord', header)

    result = []

    for row in rows:

        # all columns past the first two hold integer counts,
        # exported by Excel as floats
        counts = [int(float(value)) for value in row[2:]]
        result.append(EmbraceRawRecord(*(row[:2] + counts)))

    return result
def kinasedotcom_annotations():
    """
    Downloads and processes kinase annotations from kinase.com.

    Returns
        Dict of sets of KinasedotcomAnnotation records keyed by
        UniProt ID.
    """

    KinasedotcomAnnotation = collections.namedtuple(
        'KinasedotcomAnnotation',
        ['group', 'family', 'subfamily'],
    )
    # subfamily is optional
    KinasedotcomAnnotation.__new__.__defaults__ = (None, )

    url = urls.urls['kinome']['url']
    c = curl.Curl(url, large=True, silent=False)
    xls_handle = c.fileobj
    xls_path = xls_handle.name
    xls_handle.close()

    table = inputs_common.read_xls(xls_path)

    annotations = collections.defaultdict(set)

    def _register(ac, row, start=2):
        # record one annotation from the three columns beginning at `start`
        if row[start].strip():
            annotations[ac].add(
                KinasedotcomAnnotation(
                    group=row[start].strip(),
                    family=row[start + 1].strip(),
                    subfamily=row[start + 2].strip() or None,
                )
            )

    for row in table:

        for ac in mapping.map_name(row[23].strip(), 'genesymbol', 'uniprot'):

            _register(ac, row)

            # a secondary classification may be present at column M
            if row[12].strip():
                _register(ac, row, start=12)

    return annotations
def surfaceome_annotations():
    """
    Downloads the "In silico human surfaceome".
    Dict with UniProt IDs as key and tuples of surface prediction score,
    class and subclass as values (columns B, N, S and T of table S3).
    """

    url = urls.urls['surfaceome']['url']
    c = curl.Curl(url, large=True, silent=False)
    xlsname = c.fname
    del(c)

    # first two rows are headers
    rows = inputs_common.read_xls(xlsname, 'in silico surfaceome only')[2:]

    return {
        ac: (
            float(rec[13]),  # score
            rec[18] or None,  # class
            (
                # fix a typo in the source data, split multi-valued cells
                set(rec[19].replace('KInase', 'Kinase').split(';'))
                if rec[19] else
                set()
            ),  # subclass
        )
        for rec in rows
        for ac in mapping.map_name(rec[1], 'uniprot', 'uniprot')
    }
def icellnet_interactions():
    """
    Ligand-receptor interactions from ICELLNET.

    Yields
        IcellnetRecord named tuples assembled from the downloaded
        spreadsheet.
    """

    url = urls.urls['icellnet']['url']
    c = curl.Curl(url, silent=False, large=True)
    handle = c.fileobj
    path = handle.name
    handle.close()

    table = inputs_common.read_xls(path)

    # first row is the header
    for row in table[1:]:

        references = _icellnet_get_references(row)
        resources = _icellnet_get_resources(row)

        if resources:

            # purely numeric "resource" labels are in fact PubMed IDs:
            # move them over to the references
            references.extend([r for r in resources if r.isdigit()])
            resources = [r for r in resources if not r.isdigit()]

        lig_components = _icellnet_get_components(row, (0, 1))
        rec_components = _icellnet_get_components(row, (2, 3, 4))
        ligand = _icellnet_get_entity(lig_components, references)
        receptor = _icellnet_get_entity(rec_components, references)

        classification = (
            [
                label.strip().replace('.', '').capitalize()
                for label in row[8].split('/')
            ]
            if row[8].strip() else
            None
        )

        yield IcellnetRecord(
            ligand=ligand,
            receptor=receptor,
            family=row[6].strip() or None,
            subfamily=row[7].strip() or None,
            classification=classification,
            resources=resources,
            references=references,
        )
def wojtowicz2020_raw():
    """
    Returns Supplementary Table S4 from 10.1016/j.cell.2020.07.025
    (Wojtowicz et al. 2020) as a list of tuples.
    """

    path = cell_input.cell_supplementary(
        supp_url=urls.urls['wojtowicz2020']['url'],
        article_url=urls.urls['wojtowicz2020']['article'],
    )

    table = inputs_common.read_xls(path)
    header = table.pop(0)

    # normalize header labels into valid identifiers
    field_names = [re.sub('[- ]', '_', label.lower()) for label in header]

    Wojtowicz2020RawRecord = collections.namedtuple(
        'Wojtowicz2020RawRecord',
        field_names,
    )

    records = []

    for row in table:

        # columns 6-16 carry numeric values
        values = (
            float(value) if 5 < idx < 17 else value
            for idx, value in enumerate(row)
        )
        records.append(Wojtowicz2020RawRecord(*values))

    return records
def baccin2019_interactions(ncbi_tax_id=9606):
    """
    Ligand-receptor interactions from Baccin et al. 2019 (SuppTable3).

    Mouse gene symbols from the table are translated to UniProt IDs;
    for any organism other than mouse (10090) they are further translated
    by homology. Multi-component partners (joined by `&` in the table)
    become `intera.Complex` objects.

    Args
        ncbi_tax_id (int): NCBI Taxonomy ID of the target organism.

    Returns
        List of Baccin2019Interaction named tuples.
    """

    recamel = re.compile(r'(.+?)([A-Z][a-z])')
    # NOTE(review): `recap` is compiled but never used in this function.
    recap = re.compile(r'(^[A-Z][a-z]|_[A-Z][a-z])(.+)')

    def camel_to_snake(value):
        # e.g. 'SolubleSecreted' -> 'soluble_secreted'
        return (recamel.sub(lambda m: m.group(1).lower() + '_' + m.group(2),
                            value.strip()).lower())

    def id_translate(mouse_gs):
        # mouse gene symbol -> mouse UniProt ID(s); translated further by
        # homology when the target organism is not mouse.
        # NOTE: `homology` below refers to the local variable assigned
        # later in the enclosing scope (only when ncbi_tax_id != 10090),
        # not to a module.
        uniprots = mapping.map_name(
            mouse_gs,
            'genesymbol',
            'uniprot',
            10090,
        )
        if ncbi_tax_id != 10090:
            uniprots = set(
                itertools.chain(*(homology.translate(uniprot)
                                  for uniprot in uniprots)))
        return uniprots

    def raw_to_uniprots(raw):
        # '&' joins components of a complex; the product enumerates all
        # combinations of the ambiguous ID translations
        components = raw.split('&')
        return set(
            itertools.product(*(id_translate(comp)
                                for comp in components)))

    def get_partners(components, sources, references):
        # single component -> plain UniProt ID;
        # multiple components -> intera.Complex
        return {(comp[0] if len(comp) == 1 else intera.Complex(
            components=comp,
            sources=sources,
            references=references,
        )) for comp in components}

    Baccin2019Interaction = collections.namedtuple('Baccin2019Interaction', [
        'ligand',
        'receptor',
        'correct',
        'ligand_location',
        'ligand_category',
        'resources',
        'references',
    ])

    # normalize resource labels used in the source table
    source_names = {
        'Baccin': 'Baccin2019',
        'Ramilowski': 'Ramilowski2015',
    }

    url = urls.urls['baccin2019']['url']
    c = curl.Curl(url, silent=False, large=True)
    data = inputs_common.read_xls(c.fileobj.name, sheet='SuppTable3')

    result = []

    if ncbi_tax_id != 10090:
        # homology translation mouse -> target organism
        homology = homology_mod.ProteinHomology(
            target=ncbi_tax_id,
            source=10090,
        )

    # first three rows are headers
    for rec in data[3:]:

        # skip interactions flagged as incorrect by the authors
        if rec[4].strip().lower() == 'incorrect':
            continue

        ligand_components = raw_to_uniprots(rec[1])

        if not ligand_components:
            continue

        receptor_components = raw_to_uniprots(rec[2])

        if not receptor_components:
            continue

        sources = {'Baccin2019', rec[3].strip()}
        sources = {
            source_names[s] if s in source_names else s
            for s in sources
        }
        # PubMed IDs; `.0` suffixes are Excel float export artifacts
        references = {
            _ref
            for _ref in (ref.strip().replace('.0', '')
                         for ref in rec[7].split(','))
            if _ref.isdigit()
        }

        ligands = get_partners(ligand_components, sources, references)
        receptors = get_partners(receptor_components, sources, references)

        for ligand, receptor in itertools.product(ligands, receptors):

            result.append(
                Baccin2019Interaction(
                    ligand=ligand,
                    receptor=receptor,
                    correct=rec[4].strip(),
                    ligand_location=camel_to_snake(rec[5]),
                    ligand_category=camel_to_snake(rec[6]),
                    resources=sources,
                    references=references,
                ))

    return result
def kirouac2010_interactions():
    """
    Returns tuples of ligand-receptor genesymbol pairs.
    """

    # gene symbol stems, e.g. 'CCL2', 'TNFSF13B'
    rename = re.compile(r'[A-Z]{2}[A-Z0-9][-A-Z0-9]*')
    # numeric ranges, e.g. '1-3'
    rerange = re.compile(r'([0-9])-([0-9])')
    # slash-separated alternatives, e.g. 'IL1/2'
    reslash = re.compile(r'.*?([A-Z0-9]{1,3}/[/A-Z0-9]+)')

    def get_names(s):
        # Expand a free-text table cell into a set of gene symbols:
        # plain symbols are kept as-is, slash patterns and numeric
        # ranges are combined with the stem matched in the same token.
        names = set()
        prev = None

        for n in s.split():

            m = rename.findall(n)

            if m:

                prev = m
                m = reslash.match(n)

                if m:
                    # e.g. stem + each slash-separated suffix
                    for post in m.groups()[0].split('/'):
                        for pre in prev:
                            names.add('%s%s' % (pre, post))

                else:
                    m = rerange.match(n)

                    if m:
                        # e.g. stem + each number of the range
                        intv = m.groups()
                        for post in range(int(intv[0]), int(intv[1]) + 1):
                            for pre in prev:
                                names.add('%s%u' % (pre, post))

                    else:
                        # no combination pattern: keep the plain symbols
                        names.update(prev)

            prev = None

        return names

    init_url = urls.urls['kirouac2010']['init_url']
    # browser-like user agent; the journal site rejects bare clients
    req_headers = [
        ('User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:68.0) '
         'Gecko/20100101 Firefox/68.0'),
    ]
    url = urls.urls['kirouac2010']['url']

    # probe the cache without downloading
    c00 = curl.Curl(url, call=False, process=False)

    if (
        not os.path.exists(c00.cache_file_name) or
        os.path.getsize(c00.cache_file_name) == 0
    ):
        # no usable cache: visit the landing page first to obtain the
        # session cookie the download URL requires
        _log('Kirouac 2010 download: requesting website cookie.')

        c0 = curl.Curl(
            init_url,
            silent=True,
            large=False,
            req_headers=req_headers,
            follow=False,
            cache=False,
        )

        cookies = []

        if hasattr(c0, 'resp_headers'):

            for hdr in c0.resp_headers:

                if hdr.lower().startswith(b'set-cookie'):

                    cookie = hdr.split(b':')[1].split(b';')[0].strip()

                    if cookie not in cookies:

                        cookies.append(cookie.decode('ascii'))

            cookies = '; '.join(cookies)
            req_headers.append('Cookie: %s' % cookies)
            _log('Response header: %s' % str(c0.resp_headers))
            _log('Cookies: %s' % str(cookies))
            _log('Request header: %s' % str(req_headers))

        # drop the empty/stale cache file so the next call re-downloads
        os.remove(c00.cache_file_name)

    c = curl.Curl(
        url,
        silent=False,
        large=True,
        req_headers=req_headers,
    )
    xlsname = c.fname
    del(c)

    tbl = inputs_common.read_xls(xlsname, sheet='S12')

    result = []

    # first two rows are headers
    for r in tbl[2:]:
        namesA = get_names(r[0])
        namesB = get_names(r[1])
        result.extend(list(itertools.product(namesA, namesB)))

    return result
def embrace_raw():
    """
    Returns Supplementary Table S11 from 10.1016/j.isci.2019.10.026
    (Sheikh et al. 2019) as a list of tuples.
    """

    url = urls.urls['embrace']['url']

    # probe the cache without performing any download
    c_nocall = curl.Curl(
        url,
        call=False,
        setup=False,
        process=False,
        silent=True,
    )
    c_nocall.get_cache_file_name()
    path = c_nocall.cache_file_name

    init_url = urls.urls['embrace']['article']
    req_headers = []

    if not os.path.exists(path):

        # No cache: walk the article page's redirect chain (at most 3
        # hops) to collect the session cookies the download requires.
        cookies = {}

        for step in range(3):

            c_init = curl.Curl(
                init_url,
                silent=True,
                large=True,
                cache=False,
                follow=False,
                req_headers=req_headers + ['user-agent: curl/7.69.1'],
                bypass_url_encoding=True,
                retries=1,
                empty_attempt_again=False,
            )

            # parse `Set-Cookie` headers into name -> value pairs
            new_cookies = dict(
                tuple(
                    h.decode().split(':')[1].\
                    split(';')[0].\
                    strip().split('=', maxsplit = 1)
                )
                for h in c_init.resp_headers
                if h.lower().startswith(b'set-cookie')
            )
            cookies.update(new_cookies)
            # drop the Cloudflare load balancer cookie
            _ = cookies.pop('__cflb', None)

            # follow a redirect target, if any
            for h in c_init.resp_headers:

                if h.lower().startswith(b'location'):

                    init_url = h.decode().split(':', maxsplit=1)[1].strip()

            req_headers = ([
                'Cookie: %s' % (
                    '; '.join(
                        '%s=%s' % cookie
                        for cookie in iteritems(cookies)
                    )
                )
            ] if cookies else [])

            _log('HTTP %u; location: `%s`, cookies: `%s`.' % (
                c_init.status,
                init_url,
                req_headers[0] if req_headers else '',
            ))

            # anything other than a redirect ends the handshake
            if c_init.status != 302:

                break

    # the actual download (or cache hit), with the collected cookies
    c_table = curl.Curl(
        url,
        silent=False,
        large=True,
        empty_attempt_again=False,
        req_headers=req_headers + ['user-agent: curl/7.69.1'],
    )
    path = c_table.cache_file_name
    c_table.fileobj.close()

    content = inputs_common.read_xls(path)

    # field names come straight from the table header
    EmbraceRawRecord = collections.namedtuple('EmbraceRawRecord', content[0])

    return [
        # columns past the first two are integer counts exported as floats
        EmbraceRawRecord(*(line[:2] + [int(float(n)) for n in line[2:]]))
        for line in content[1:]
    ]