Example 1
def netpath_pathway_annotations():

    NetpathPathway = collections.namedtuple(
        'NetpathPathway',
        ['pathway'],
    )

    result = collections.defaultdict(set)

    url_template = urls.urls['netpath_pw']['url']

    url_main = urls.urls['netpath_pw']['mainpage']
    c = curl.Curl(url_main, cache = False)
    cookie = [
        h.decode().split(':')[1].split(';')[0].strip()
        for h in c.resp_headers
        if h.startswith(b'Set-Cookie')
    ]
    cookie_hdr = ['Cookie: %s' % '; '.join(cookie)]

    pathway_ids = netpath_names()

    for _id, pathway in iteritems(pathway_ids):

        url = url_template % int(_id)
        c = curl.Curl(
            url,
            req_headers = cookie_hdr,
            silent = False,
            encoding = 'iso-8859-1',
        )

        soup = bs4.BeautifulSoup(c.result, 'html.parser')

        for tbl in soup.find_all('table'):
            hdr = tbl.find('td', {'class': 'barhead'})

            if not hdr or not hdr.text.strip().startswith('Molecules Invol'):
                continue

            for td in tbl.find_all('td'):
                genesymbol = td.text.strip()

                if not genesymbol:
                    continue

                uniprots = mapping.map_name(
                    genesymbol,
                    'genesymbol',
                    'uniprot',
                )

                for uniprot in uniprots:
                    result[uniprot].add(
                        NetpathPathway(
                            pathway = pathway
                        )
                    )

    return result
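
NetPath apparently requires the session cookie obtained from the main page, so the code above collects every Set-Cookie response header and replays the name=value pairs in a single Cookie request header. A minimal standalone sketch of that parsing, on hypothetical header bytes:

# Hypothetical raw response headers, only for illustration:
raw_headers = [
    b'Content-Type: text/html; charset=iso-8859-1',
    b'Set-Cookie: PHPSESSID=abc123; path=/',
    b'Set-Cookie: visited=1; expires=Wed, 01 Jan 2025 00:00:00 GMT',
]

cookie = [
    h.decode().split(':')[1].split(';')[0].strip()
    for h in raw_headers
    if h.startswith(b'Set-Cookie')
]

print('Cookie: %s' % '; '.join(cookie))
# Cookie: PHPSESSID=abc123; visited=1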
Example 2
def depod_enzyme_substrate(organism=9606):

    result = []

    reunip = re.compile(r'uniprotkb:([A-Z0-9]+)')
    reptm = re.compile(r'([A-Z][a-z]{2})-([0-9]+)')
    repmidsep = re.compile(r'[,|]\s?')

    url = urls.urls['depod']['urls'][0]
    c = curl.Curl(url, silent=False, encoding='ascii')
    data = c.result
    data = [x.split('\t') for x in data.split('\n')]
    del data[0]

    url_mitab = urls.urls['depod']['urls'][1]
    c_mitab = curl.Curl(url_mitab, silent=False, encoding='iso-8859-1')
    data_mitab = c_mitab.result
    data_mitab = [x.split('\t') for x in data_mitab.split('\n')]
    del data_mitab[0]

    for i, l in enumerate(data):

        if (len(l) > 6 and l[2] == 'protein substrate'
                and taxonomy.ensure_ncbi_tax_id(l[3].split('(')[0].strip())
                == organism and l[4].strip() != 'N/A'):

            enzyme_uniprot = reunip.search(data_mitab[i][0]).groups()[0]
            substrate_uniprot = reunip.search(data_mitab[i][1]).groups()[0]

            for enzyme_up, substrate_up in itertools.product(
                    mapping.map_name(enzyme_uniprot, 'uniprot', 'uniprot'),
                    mapping.map_name(substrate_uniprot, 'uniprot', 'uniprot'),
            ):

                for resaa, resnum in reptm.findall(l[4]):

                    resnum = int(resnum)
                    resaa = common.aminoa_3_to_1_letter.get(resaa, resaa)

                    result.append({
                        'instance': None,
                        'kinase': enzyme_up,
                        'resaa': resaa,
                        'resnum': resnum,
                        'references': repmidsep.split(l[6].strip()),
                        'substrate': substrate_up,
                        'start': None,
                        'end': None,
                        'typ': 'dephosphorylation',
                    })

    return result
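
The reptm pattern extracts residue annotations such as Tyr-527 from DEPOD's dephosphorylation site column, and the three letter residue code is translated via common.aminoa_3_to_1_letter. A minimal sketch with a tiny stand-in dictionary and a made-up column value:

import re

reptm = re.compile(r'([A-Z][a-z]{2})-([0-9]+)')

# Tiny stand-in for common.aminoa_3_to_1_letter:
aa3to1 = {'Tyr': 'Y', 'Ser': 'S', 'Thr': 'T'}

# Hypothetical content of the sites column:
sites = 'Tyr-527, Ser-17'

for resaa, resnum in reptm.findall(sites):
    print(aa3to1.get(resaa, resaa), int(resnum))
# Y 527
# S 17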
Example 3
def genecards_datasheet(gene):
    """
    Retrieves a gene (protein) datasheet from GeneCards.
    Returns HTML as string.
    
    :param str gene:
        A Gene Symbol or UniProt ID.
    """
    
    url = urls.urls['genecards']['url'] % gene
    
    c = curl.Curl(
        url,
        silent = True,
        large = False,
        connect_timeout = settings.get('genecards_datasheet_connect_timeout'),
        timeout = settings.get('genecards_datasheet_timeout'),
    )
    
    if c.status not in {0, 200}:
        
        _log('Failed to retrieve gene card for ID `%s`.' % gene)
        
        return None
    
    return c.result
Example 4
def get_isoforms(organism=9606):
    """
    Loads UniProt sequences for all isoforms.
    """

    if organism in taxonomy.phosphoelm_taxids:
        organism = taxonomy.phosphoelm_taxids[organism]

    reorg = re.compile(r'OS=([A-Z][a-z]+\s[a-z]+)')
    result = {}
    url = urls.urls['unip_iso']['url']
    c = curl.Curl(url, silent=False)
    data = c.result
    data = read_fasta(data)

    for header, seq in iteritems(data):

        org = reorg.findall(header)

        if len(org) > 0 and org[0] == organism:

            prot = header.split('|')[1].split('-')
            unip = prot[0]
            isof = int(prot[1])

            if unip not in result:
                result[unip] = {}

            result[unip][isof] = seq

    return result
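
The header parsing relies on the UniProt varsplic FASTA convention, where the second pipe-delimited field is ACCESSION-ISOFORM and the organism is read from the OS= tag. A sketch on a made-up header line:

import re

reorg = re.compile(r'OS=([A-Z][a-z]+\s[a-z]+)')

# Hypothetical varsplic style FASTA header:
header = 'sp|P00533-2|EGFR_HUMAN Isoform 2 of EGFR OS=Homo sapiens OX=9606'

prot = header.split('|')[1].split('-')
print(prot[0], int(prot[1]))  # P00533 2
print(reorg.findall(header))  # ['Homo sapiens']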
Example 5
def cspa_cell_types(organism = 9606):

    sheets = {
        'Human': 'Table_E',
        'Mouse': 'Table_F',
    }

    str_organism = taxonomy.taxids[organism].capitalize()

    url = urls.urls['cspa']['url_s1']
    c = curl.Curl(url, large = True, silent = False)
    xlsname = c.fname
    del c
    raw = inputs_common.read_xls(xlsname, sheets[str_organism])

    result = collections.defaultdict(lambda: collections.defaultdict(dict))

    cell_types = raw[0][1:]

    for row in raw[1:]:

        for uniprot in mapping.map_name(row[0], 'uniprot', 'uniprot'):

            for col, cell_type in enumerate(cell_types):

                value = row[col + 1]

                result[cell_type][uniprot] = (
                    float(value)
                        if common.is_float(value) else
                    None
                )

    return result
Example 6
def cellcellinteractions_annotations():
    
    
    CellcellinteractionsAnnotation = collections.namedtuple(
        'CellcellinteractionsAnnotation',
        [
            'mainclass',
        ]
    )
    
    
    url = urls.urls['cellcellinteractions']['url']
    
    c = curl.Curl(url, silent = False, large = True)
    
    _ = next(c.result)
    
    result = collections.defaultdict(set)
    
    for row in c.result:
        
        row = row.strip('\r\n').split('\t')
        
        uniprots = mapping.map_name(row[0], 'genesymbol', 'uniprot')
        classes = row[1].split('/')
        
        for uniprot in uniprots:
            
            for cls in classes:
                
                result[uniprot].add(
                    CellcellinteractionsAnnotation(mainclass = cls)
                )
    
    return dict(result)
Example 7
def get_dorothea_old(levels={'A', 'B'}, only_curated=False):
    """
    Retrieves TF-target interactions from DoRothEA.

    :param set levels:
        Confidence levels to be used.
    :param bool only_curated:
            Retrieve only literature curated interactions.

    Details
    -------
    DoRothEA is a comprehensive resource of TF-target interactions
    combining multiple lines of evidences: literature curated databases,
    ChIP-Seq data, PWM based prediction using HOCOMOCO and JASPAR matrices
    and prediction from GTEx expression data by ARACNe.

    For details see https://github.com/saezlab/DoRothEA.
    """

    url = urls.urls['dorothea']['url'] % (
        'all' if 'E' in levels else 'ABCD' if 'D' in levels else
        'ABC' if 'C' in levels else 'AB' if 'B' in levels else 'A')

    c = curl.Curl(url, silent=False, large=True)
    _ = next(c.result)

    return (
        list(itertools.chain(
            ll[:4],
            (s == 'TRUE' for s in ll[4:8]),
            ll[-4:],
            (
                [','.join(s for s in ll[-4:] if s)]
                    if not only_curated else
                # wrapped in a list: passing the bare string to chain
                # would append it character by character
                [ll[8]]
            ),
        ))
        for ll in (l.strip('\n\r').split('\t') for l in c.result)
        if ((ll[3] in levels and not only_curated) or ll[4] == 'TRUE')
    )
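
The chained conditional at the top selects the smallest DoRothEA file that still covers the requested confidence levels. The same logic as a standalone function (a hypothetical helper, written here only for illustration):

def dorothea_suffix(levels):

    return (
        'all' if 'E' in levels else
        'ABCD' if 'D' in levels else
        'ABC' if 'C' in levels else
        'AB' if 'B' in levels else
        'A'
    )

print(dorothea_suffix({'A', 'B'}))  # AB
print(dorothea_suffix({'B', 'D'}))  # ABCD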
Example 8
def ipi_uniprot():
    """
    Retrieves an IPI-UniProt mapping dictionary.
    """

    result = collections.defaultdict(set)

    url = urls.urls['ipi']['url']

    c = curl.Curl(url, large=True, silent=False)

    for row in c.result:

        row = row.strip('\n\r').split('\t')

        if len(row) < 3:

            continue

        ipi_id = row[2]

        uniprot, isoform = inputs_common._try_isoform(row[1])

        is_uniprot = (not any(
            uniprot.startswith(pref)
            for pref in ('NP_', 'OTTH', 'HIT', 'ENSP', 'XP_')))

        if is_uniprot:

            result[ipi_id].add(uniprot)

    return dict(result)
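
inputs_common._try_isoform is not shown in this collection; judging from the call site it splits an ACCESSION-ISOFORM identifier into a base ID and an isoform number. A hypothetical stand-in with that assumed behavior:

# Hypothetical stand-in for inputs_common._try_isoform; the behavior
# is inferred from the call site above, not from the actual source.
def try_isoform(name):

    parts = name.split('-')

    if len(parts) == 2 and parts[1].isdigit():

        return parts[0], int(parts[1])

    return name, None

print(try_isoform('P00533-2'))   # ('P00533', 2)
print(try_isoform('NP_003322'))  # ('NP_003322', None)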
Example 9
def lit_bm_interactions():
    """
    Literature-curated interactions (Lit-BM) from Luck et al. 2020.
    """

    LitBmInteraction = collections.namedtuple(
        'LitBmInteraction',
        ['uniprot_a', 'uniprot_b'],
    )

    url = urls.urls['hid']['lit-bm']
    c = curl.Curl(url, large=True, silent=False)

    for row in c.result:

        row = row.strip().split('\t')

        uniprots_a = mapping.map_name(row[0], 'ensembl', 'uniprot')
        uniprots_b = mapping.map_name(row[1], 'ensembl', 'uniprot')

        for uniprot_a, uniprot_b in itertools.product(uniprots_a, uniprots_b):

            yield LitBmInteraction(
                uniprot_a=uniprot_a,
                uniprot_b=uniprot_b,
            )
Example 10
def adhesome_annotations():

    AdhesomeAnnotation = collections.namedtuple(
        'AdhesomeAnnotation',
        ['mainclass', 'intrinsic'],
    )

    result = collections.defaultdict(set)

    url = urls.urls['adhesome']['components']
    c = curl.Curl(url, large = True, silent = False)

    data = csv.DictReader(c.result, delimiter = ',')

    for rec in data:
        uniprots = rec['Swiss-Prot ID']

        for uniprot in uniprots.split(','):
            uniprot = uniprot.strip()

            if uniprot == 'null':
                continue

            for _uniprot in mapping.map_name(uniprot, 'uniprot', 'uniprot'):
                result[_uniprot].add(AdhesomeAnnotation(
                    mainclass = (
                        common.upper0(rec['Functional Category'].strip())
                    ),
                    intrinsic = rec['FA'].strip() == 'Intrinsic Proteins',
                ))

    return result
Example 11
def get_pfam_names():

    c = curl.Curl(urls.urls['pfam_pdb']['url'], silent=False)
    data = c.result
    if data is None:
        return None, None
    dname_pfam = {}
    pfam_dname = {}
    data = data.replace('\r', '').split('\n')
    del data[0]

    for l in data:

        l = l.split('\t')
        if len(l) > 5:
            pfam = l[4].split('.')[0]
            name = l[5]
            if pfam not in pfam_dname:
                pfam_dname[pfam] = []
            if name not in dname_pfam:
                dname_pfam[name] = []
            pfam_dname[pfam].append(name)
            dname_pfam[name].append(pfam)

    for k, v in iteritems(pfam_dname):
        pfam_dname[k] = list(set(v))
    for k, v in iteritems(dname_pfam):
        dname_pfam[k] = list(set(v))

    return dname_pfam, pfam_dname
Example 12
def dbptm_enzyme_substrate(organism=9606):
    """
    Downloads enzyme-substrate interactions from dbPTM.
    Returns list of dicts.
    """

    if organism is None:
        _organism = None
    elif organism in taxonomy.dbptm_taxids:
        _organism = taxonomy.dbptm_taxids[organism]
    else:
        sys.stdout.write('\t:: Unknown organism: `%u`.\n' % organism)
        return []

    url = urls.urls['dbptm']['old_table']
    c = curl.Curl(url, silent=False, large=True)
    data = []

    hdr = next(c.result).strip().split('\t')

    for l in c.result:

        l = l.strip().split('\t')

        data.append(
            dict(
                (
                    key,
                    (
                        None if val == '' else
                        val.split(';') if key in {'references', 'kinase'} else
                        int(val) if val.isdigit() else
                        val
                    ),
                )
                for key, val in zip(hdr, l)
            )
        )

    return data
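
The comprehension converts each field while the record is built: empty strings become None, the references and kinase columns turn into lists, digit strings into integers. Unrolled on a hypothetical row (these column names are made up; the real dbPTM header differs):

hdr = ['substrate', 'resnum', 'kinase', 'references']
l = ['P00533', '1068', 'EGFR;SRC', '12345;67890']

record = {
    key: (
        None if val == '' else
        val.split(';') if key in {'references', 'kinase'} else
        int(val) if val.isdigit() else
        val
    )
    for key, val in zip(hdr, l)
}
# {'substrate': 'P00533', 'resnum': 1068,
#  'kinase': ['EGFR', 'SRC'], 'references': ['12345', '67890']}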
Example 13
    def query(self, api, param, silent=False, large=False):
        '''
        Retrieves data from the API.

        @api : str
            Should be one of the 10 API sections available.
        @param : tuple
            Tuple of the parameters according to the API.
        @large : bool
            Passed to the curl wrapper function. If True,
            the file will be written to disk, and a file
            object open for reading is returned; if False,
            the raw data will be returned; JSON is converted
            to a Python object, XML is returned as a string.
        '''
        url = self.urls[api] % param
        # long timeout is given, because huge files (hundreds MB) take time to
        # load
        c = curl.Curl(
            url,
            req_headers=self.auth,
            silent=silent,
            timeout=1200,
            large=large)
        
        self.tmp = c
        
        if self.output_format == 'json' and not large:
            self.result = self.get_json(c.result)
        else:
            self.result = c.fileobj
Example 14
def get_uniprot_sec(organism=9606):
    """
    Downloads and processes the mapping between secondary and
    primary UniProt IDs.

    Yields pairs of secondary and primary UniProt IDs.

    :param int organism:
        NCBI Taxonomy ID of the organism.
    """

    if organism is not None:
        proteome = all_uniprots(organism=organism)
        proteome = set(proteome)

    url = urls.urls['uniprot_sec']['url']
    c = curl.Curl(url, silent=False, large=True, timeout=2400)

    for i, line in enumerate(c.result):

        # the first 30 lines are a free text header
        if i < 30:
            continue

        line = line.split()

        if len(line) == 2 and (organism is None or line[1] in proteome):

            yield line
Example 15
def lit_bm_13_interactions():
    """
    Downloads and processes Lit-BM-13 dataset, the 2013 version of the
    high confidence literature curated interactions from CCSB.
    Returns list of interactions.
    """

    LitBm13Interaction = collections.namedtuple('LitBm13Interaction', [
        'entrez_a',
        'entrez_b',
        'genesymbol_a',
        'genesymbol_b',
    ])

    url = urls.urls['hid']['lit-bm-13']
    c = curl.Curl(url, silent=False, large=True)

    _ = next(c.result)

    for row in c.result:

        row = row.strip().split('\t')

        yield LitBm13Interaction(
            entrez_a=row[0],
            entrez_b=row[2],
            genesymbol_a=row[1],
            genesymbol_b=row[3],
        )
Example 16
def uniprot_history(identifier):
    """
    Retrieves the history of a record.
    Returns a generator iterating over the history from most recent to the
    oldest.
    """

    if valid_uniprot(identifier):

        url_history = urls.urls['uniprot_basic']['history'] % identifier
        c_history = curl.Curl(
            url_history,
            silent=True,
            large=True,
        )

        if c_history.result:

            line0 = next(c_history.result)

            if not line0.startswith('<!DOCTYPE'):

                for line in c_history.result:

                    if line:

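                        # UniprotRecordHistory is assumed to be a namedtuple
                        # defined elsewhere in this module, with one field
                        # per column of the history table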
                        yield UniprotRecordHistory(
                            *(field.strip() for field in line.split('\t')))
Example 17
def _uniprot_deleted(swissprot=True, confirm=True):

    if not swissprot and confirm:

        resp = input('Loading the list of deleted TrEMBL IDs requires '
                     '>5GB memory. Do you want to proceed [y/n] ')

        if not resp or resp[0].lower() != 'y':

            return set()

    key = 'deleted_%s' % ('sp' if swissprot else 'tr')
    url = urls.urls['uniprot_basic'][key]
    c = curl.Curl(url, silent=False, large=True)

    result = set()

    for line in c.result:

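        # `reac` is assumed to be a regex compiled at module level,
        # capturing one deleted accession number per line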
        m = reac.match(line.strip())

        if m:

            result.add(m.groups()[0])

    return result
Example 18
def _matrixdb_protein_list(category, organism=9606):
    """
    Returns a set of proteins annotated by MatrixDB.

    :arg str category:
        The protein annotation category. Possible values: `ecm`, `membrane`
        or `secreted`.
    """

    url = urls.urls['matrixdb']['%s_proteins' % category]
    c = curl.Curl(url, silent=False, large=True)

    proteins = set()

    # header row
    _ = next(c.result)

    for l in c.result:
        if not l:
            continue

        proteins.add(l.strip().replace('"', '').split('\t')[0])

    proteins = mapping.map_names(proteins, 'uniprot', 'uniprot')

    if organism:

        uniprots = uniprot_input.all_uniprots(
            organism=organism,
            swissprot=True,
        )
        proteins = proteins & set(uniprots)

    return proteins
Example 19
def uniprot_history_recent_datasheet(identifier):

    recent_version = uniprot_recent_version(identifier)

    if recent_version:

        if recent_version.replaced_by:

            new = recent_version.replaced_by.split(';')[0]
            url = urls.urls['uniprot_basic']['datasheet'] % new
            _logger._log('UniProt ID `%s` is obsolete, has been replaced by '
                         '`%s`: `%s`.' % (
                             identifier,
                             new,
                             url,
                         ))
            return protein_datasheet(new)

        else:

            version = int(recent_version.entry_version)
            url = '%s?version=%u' % (
                urls.urls['uniprot_basic']['datasheet'] % identifier,
                version,
            )
            _logger._log('UniProt ID `%s` is obsolete, downloading archived '
                         'version %u: `%s`.' % (
                             identifier,
                             version,
                             url,
                         ))
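            # presumably this call only primes the cache;
            # _protein_datasheet below retrieves the same URL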
            c = curl.Curl(url, silent=True, large=False)
            return _protein_datasheet(url)

    return []
Example 20
def uniprot_taxonomy():
    """
    Returns a dictionary with SwissProt IDs as keys and sets of various taxon
    names as values.
    """

    rename = re.compile(r'\(?(\w[\w\s\',/\.-]+\w)\)?')
    reac = re.compile(r'\s*\w+\s+\(([A-Z\d]+)\)\s*,')

    url = urls.urls['uniprot_basic']['speindex']
    c = curl.Curl(url, large=True, silent=False)

    result = collections.defaultdict(set)

    for line in c.result:

        if line[0] != ' ':

            names = set(rename.findall(line))

        else:

            for ac in reac.findall(line):

                result[ac].update(names)

    return result
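
The parser is stateful: a line starting at column zero carries taxon names (captured by rename), while the indented lines below it list entries whose accessions reac extracts; each accession inherits the most recently seen name set. A sketch on hypothetical lines in the layout the parser expects:

import re
import collections

rename = re.compile(r'\(?(\w[\w\s\',/\.-]+\w)\)?')
reac = re.compile(r'\s*\w+\s+\(([A-Z\d]+)\)\s*,')

# Hypothetical lines, only to show the two regexes at work:
sample = [
    'Homo sapiens (Human)',
    '   1433B_HUMAN (P31946), 1433E_HUMAN (P62258),',
]

result = collections.defaultdict(set)

for line in sample:

    if line[0] != ' ':
        names = set(rename.findall(line))
    else:
        for ac in reac.findall(line):
            result[ac].update(names)

print(dict(result))
# {'P31946': {'Homo sapiens', 'Human'},
#  'P62258': {'Homo sapiens', 'Human'}} (set order may vary)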
Example 21
def connectomedb_interactions():
    """
    Retrieves ligand-receptor interactions from connectomeDB2020
    https://asrhou.github.io/NATMI/
    """

    ConnectomedbInteraction = collections.namedtuple('ConnectomedbInteraction',
                                                     [
                                                         'ligand',
                                                         'ligand_location',
                                                         'receptor',
                                                         'references',
                                                     ])

    rea = re.compile(r'<a[^>]+>([^<]*)</a>')
    resemicol = re.compile(r'; ?')

    url = urls.urls['connectomedb2020']['url']
    c = curl.Curl(url, large=True, silent=False)
    tab = list(csv.DictReader(c.result))

    return [
        ConnectomedbInteraction(
            ligand=row['Ligand gene symbol'],
            ligand_location=resemicol.split(row['Ligand location']),
            receptor=row['Receptor gene symbol'],
            references=rea.findall(row['PMID support']),
        ) for row in tab
    ]
Example 22
    def smiles2chembl(self, smiles):

        self.result = {}
        prg = progress.Progress(total=len(smiles),
                                name='Translating SMILES',
                                interval=1)

        for sml in smiles:

            url = self.chembl_url.format(sml)
            c = curl.Curl(url, large=False)
            result = c.result
            self.result[sml] = []

            if result is not None:

                try:
                    data = json.loads(result)

                    for d in data['compounds']:
                        # the exact SMILES match filter of the original
                        # is disabled: all returned ChEMBL IDs are kept
                        this_chembl = d['chemblId']
                        self.result[sml].append(this_chembl)

                except ValueError:
                    # the response was not JSON: parse it as XML
                    soup = bs4.BeautifulSoup(result, 'html.parser')
                    # find_all returns a (possibly empty) list, never None
                    for compound in soup.find_all('compound'):
                        this_chembl = compound.find('chemblid').text
                        self.result[sml].append(this_chembl)

            prg.step()

        prg.terminate()
Example 23
def get_pfam_pdb():

    c = curl.Curl(urls.urls['pfam_pdb']['url'], silent=False)
    data = c.result

    if data is None:

        return None, None

    pdb_pfam = {}
    pfam_pdb = {}
    data = data.replace('\r', '').split('\n')
    del data[0]

    for l in data:

        l = l.split('\t')

        if len(l) > 4:

            pfam = l[4].split('.')[0]
            pdb = l[0].lower()
            chain = l[1]
            start = int(common.non_digit.sub('', l[2]))
            end = int(common.non_digit.sub('', l[3]))
            if pdb not in pdb_pfam:
                pdb_pfam[pdb] = {}
            if pfam not in pfam_pdb:
                pfam_pdb[pfam] = {}
            pdb_pfam[pdb][pfam] = [chain, start, end]
            pfam_pdb[pfam][pdb] = [chain, start, end]

    return pdb_pfam, pfam_pdb
Example 24
def uniprot_data(field, organism=9606, reviewed=True):
    """
    Retrieves a field from UniProt for all proteins of one organism, by
    default only the reviewed (SwissProt) proteins.
    For the available fields refer to the ``_uniprot_fields`` attribute of
    this module or the UniProt website.
    """

    rev = (
        ' AND reviewed: yes' if reviewed in (True, 'yes') else
        ' AND reviewed: no' if reviewed in (False, 'no') else
        ''
    )
    _field = _uniprot_fields[field] if field in _uniprot_fields else field
    url = urls.urls['uniprot_basic']['url']
    get = {
        'query': 'organism:%s%s' % (str(organism), rev),
        'format': 'tab',
        'columns': 'id,%s' % _field,
        'compress': 'yes',
    }

    c = curl.Curl(url, get=get, silent=False, large=True, compr='gz')
    _ = next(c.result)

    return dict(
        id_value
        for id_value in (
            line.strip('\n\r').split('\t')
            for line in c.result
            if line.strip('\n\r')
        )
        if id_value[1]
    )
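
The final expression builds the ID-to-value dictionary while dropping rows with an empty second column. The same comprehension on hypothetical tab separated lines:

lines = ['P00533\tEGFR', 'P31946\t', 'P62258\tYWHAE']

print(dict(
    id_value
    for id_value in (line.split('\t') for line in lines)
    if id_value[1]
))
# {'P00533': 'EGFR', 'P62258': 'YWHAE'}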
Example 25
def _protein_datasheet(url):

    cache = True

    for a in range(3):

        c = curl.Curl(
            url,
            silent=True,
            large=False,
            cache=cache,
            connect_timeout=(
                settings.get('uniprot_datasheet_connect_timeout')),
            timeout=settings.get('uniprot_datasheet_timeout'),
        )

        if not c.result or c.result.startswith('<!DOCTYPE'):

            cache = False

        else:

            break

    if not c.result:

        _logger._log('Could not retrieve UniProt datasheet by URL `%s`.' % url)

    return _redatasheet.findall(c.result) if c.result else []
Example 26
def adhesome_interactions():

    AdhesomeInteraction = collections.namedtuple(
        'AdhesomeInteraction',
        ['source', 'target', 'effect', 'type', 'pmid'],
    )

    url = urls.urls['adhesome']['interactions']

    c = curl.Curl(url, large = True, silent = False)

    data = csv.DictReader(c.result, delimiter = ',')

    result = []

    for rec in data:

        result.append(
            AdhesomeInteraction(
                source = rec['Source'],
                target = rec['Target'],
                effect = rec['Effect'],
                type   = common.upper0(rec['Type']),
                pmid   = rec['PMID'],
            )
        )

    return result
Example 27
def phobius_annotations():

    rewrongtab = re.compile(r'(\t[A-Z\d]+_[A-Z]+)\t([A-Z]+)\s+(\d)')

    PhobiusAnnotation = collections.namedtuple('PhobiusAnnotation', [
        'tm_helices',
        'signal_peptide',
        'cytoplasmic',
        'non_cytoplasmic',
    ])

    url = urls.urls['phobius']['url']

    c = curl.Curl(url, silent=False, large=True)

    _ = next(c.result)

    result = collections.defaultdict(set)

    for line in c.result:

        line = rewrongtab.sub(r'\1\2\t\3', line)
        line = line.strip().split('\t')

        result[line[1]].add(
            PhobiusAnnotation(
                tm_helices=int(line[3]),
                signal_peptide=line[4] == 'Y',
                cytoplasmic=line[5].count('i'),
                non_cytoplasmic=line[5].count('o'),
            ))

    return dict(result)
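
rewrongtab repairs rows where a stray tab splits the UniProt entry name and the next column is attached by spaces instead of a tab. A sketch on a hypothetical broken row:

import re

rewrongtab = re.compile(r'(\t[A-Z\d]+_[A-Z]+)\t([A-Z]+)\s+(\d)')

# Hypothetical broken row: 'A4_HUM\tAN' should be the single
# entry name column 'A4_HUMAN'.
line = 'APP\tA4_HUM\tAN   2\tY\tiiiooo'

line = rewrongtab.sub(r'\1\2\t\3', line)
print(line.split('\t'))  # ['APP', 'A4_HUMAN', '2', 'Y', 'iiiooo']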
Example 28
def ramilowski_interactions(putative = False):
    """
    Downloads and processes ligand-receptor interactions from
    Supplementary Table 2 of Ramilowski 2015.
    
    Returns list of lists with ligand and receptor gene symbols, reference
    and resources as elements.
    """

    c = curl.Curl(urls.urls['rami']['url'], silent = False, large = True)
    xlsname = c.fname
    del c
    raw = inputs_common.read_xls(xlsname, 'All.Pairs')[1:]

    return [
        [
            r[1],
            r[3],
            r[13].replace(' ', ''), # references
            ';'.join(filter(len, itertools.chain(r[5:11], [r[15]])))
        ]
        for r in raw
        if r[15] != 'EXCLUDED not ligand' and (
            putative or r[15] != 'putative'
        )
    ]
Example 29
def phosphoelm_enzyme_substrate(organism=9606, ltp_only=True):
    """
    Downloads kinase-substrate interactions from phosphoELM.
    Returns list of dicts.

    :param int organism: NCBI Taxonomy ID.
    :param bool ltp_only: Include only low-throughput interactions.
    """

    result = []
    non_digit = re.compile(r'[^\d.-]+')

    if organism is None:
        _organism = None
    elif organism in taxonomy.phosphoelm_taxids:
        _organism = taxonomy.phosphoelm_taxids[organism]
    else:
        sys.stdout.write('\t:: Unknown organism: `%u`.\n' % organism)
        return []

    url = urls.urls['p_elm']['url']
    c = curl.Curl(url, silent=False)
    data = c.result
    data = [
        n for d, n in iteritems(data)
        if d.startswith(urls.urls['p_elm']['psites'])
    ]
    data = data[0] if len(data) > 0 else ''
    data = [l.split('\t') for l in data.split('\n')]
    kinases = phosphoelm_kinases()
    del data[0]

    for l in data:

        if (len(l) == 9 and (l[7] == _organism or _organism is None)
                and (not ltp_only or l[6] == 'LTP')):

            l[1] = 1 if '-' not in l[0] else int(l[0].split('-')[1])
            l[0] = l[0].split('-')[0]
            del l[-1]

            if len(l[5]) > 0 and l[5] in kinases:
                kinase = kinases[l[5]]

                result.append({
                    'instance': None,
                    'isoform': l[1],
                    'resaa': l[3],
                    'resnum': int(non_digit.sub('', l[2])),
                    'start': None,
                    'end': None,
                    'substrate': l[0],
                    'kinase': kinase,
                    'references': l[4].split(';'),
                    'experiment': l[6],
                    'organism': l[7]
                })

    return result
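
The substrate ID handling turns ACCESSION-ISOFORM style IDs into a base accession plus an isoform number, defaulting to isoform 1. On hypothetical IDs:

for substrate in ('P00533', 'P00533-2'):

    isoform = 1 if '-' not in substrate else int(substrate.split('-')[1])
    base = substrate.split('-')[0]
    print(base, isoform)

# P00533 1
# P00533 2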
Example 30
def _huri_interactions(dataset):

    reuniprot = re.compile(r'[a-z]+:([\w\.]+)(?:-?([0-9]?))?')
    rescore = re.compile(r'author score: ([\.0-9]+)')

    HuriInteraction = collections.namedtuple('HuriInteraction', [
        'uniprot_a',
        'uniprot_b',
        'isoform_a',
        'isoform_b',
        'score',
    ])

    def _map_ids(_id):

        # compare case insensitively: the Ensembl IDs arrive in upper case
        id_type = _id[:4].lower()

        return mapping.map_name(
            _id,
            id_type if id_type in {'ensp', 'enst'} else 'uniprot',
            'uniprot',
        )

    url = dataset if dataset.startswith('http') else urls.urls['hid'][dataset]
    c = curl.Curl(url, large=True, silent=False)
    path = (c.fileobj.name
            if hasattr(c, 'fileobj') else c.cache_file_name or c.outfile)
    del c
    c = curl.FileOpener(path)

    for row in c.result:

        score = rescore.search(row)

        if score:

            score = float(score.groups()[0])

        row = row.split()

        if len(row) < 2:

            continue

        id_a, isoform_a = reuniprot.match(row[0]).groups()
        id_b, isoform_b = reuniprot.match(row[1]).groups()

        uniprots_a = _map_ids(id_a)
        uniprots_b = _map_ids(id_b)

        for uniprot_a, uniprot_b in itertools.product(uniprots_a, uniprots_b):

            yield HuriInteraction(
                uniprot_a=uniprot_a,
                uniprot_b=uniprot_b,
                isoform_a=int(isoform_a) if isoform_a else 1,
                isoform_b=int(isoform_b) if isoform_b else 1,
                score=score,
            )
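
reuniprot accepts both uniprotkb: and Ensembl style identifiers: the first group is the bare ID, the second the optional isoform number, which defaults to 1 when absent. A sketch on hypothetical MITAB style fields:

import re

reuniprot = re.compile(r'[a-z]+:([\w\.]+)(?:-?([0-9]?))?')

for raw in ('uniprotkb:P00533-2', 'ensembl:ENSP00000275493'):

    _id, isoform = reuniprot.match(raw).groups()
    print(_id, int(isoform) if isoform else 1)

# P00533 2
# ENSP00000275493 1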