Beispiel #1
0
def adhesome_annotations():
    """
    Downloads the Adhesome components table and collects annotations
    for each protein.

    The table is retrieved from ``urls.urls['adhesome']['components']``
    and read as comma separated values. From each record the columns
    ``Swiss-Prot ID``, ``Functional Category`` and ``FA`` are used.
    The ``Swiss-Prot ID`` field may hold several comma separated IDs;
    the literal value ``null`` is skipped.

    :return:
        A ``collections.defaultdict`` of sets. Keys are the identifiers
        yielded by ``mapping.map_name(id, 'uniprot', 'uniprot')``, values
        are sets of ``AdhesomeAnnotation`` named tuples with the fields
        ``mainclass`` (functional category passed through
        ``common.upper0``) and ``intrinsic`` (``True`` if the ``FA``
        column equals ``'Intrinsic Proteins'``).
    """

    AdhesomeAnnotation = collections.namedtuple(
        'AdhesomeAnnotation',
        ['mainclass', 'intrinsic'],
    )

    result = collections.defaultdict(set)

    url = urls.urls['adhesome']['components']
    c = curl.Curl(url, large = True, silent = False)

    data = csv.DictReader(c.result, delimiter = ',')

    for rec in data:
        uniprots = rec['Swiss-Prot ID']

        for uniprot in uniprots.split(','):
            uniprot = uniprot.strip()

            if uniprot == 'null':
                continue

            for _uniprot in mapping.map_name(uniprot, 'uniprot', 'uniprot'):
                # Store under the ID returned by the mapping, not under
                # the raw ID from the table: otherwise the translation
                # done by `map_name` has no effect on the result.
                result[_uniprot].add(AdhesomeAnnotation(
                    mainclass = (
                        common.upper0(rec['Functional Category'].strip())
                    ),
                    intrinsic = rec['FA'].strip() == 'Intrinsic Proteins',
                ))

    return result
Beispiel #2
0
def adhesome_interactions():
    """
    Downloads the Adhesome interactions table and returns its records.

    The table is retrieved from ``urls.urls['adhesome']['interactions']``
    and read as comma separated values.

    :return:
        A list of ``AdhesomeInteraction`` named tuples, one for each
        record, in the order of the table. The fields ``source``,
        ``target``, ``effect`` and ``pmid`` carry the ``Source``,
        ``Target``, ``Effect`` and ``PMID`` columns unchanged; ``type``
        is the ``Type`` column passed through ``common.upper0``.
    """

    AdhesomeInteraction = collections.namedtuple(
        'AdhesomeInteraction',
        ['source', 'target', 'effect', 'type', 'pmid'],
    )

    download = curl.Curl(
        urls.urls['adhesome']['interactions'],
        large = True,
        silent = False,
    )
    records = csv.DictReader(download.result, delimiter = ',')

    return [
        AdhesomeInteraction(
            source = row['Source'],
            target = row['Target'],
            effect = row['Effect'],
            type   = common.upper0(row['Type']),
            pmid   = row['PMID'],
        )
        for row in records
    ]
Beispiel #3
0
def uniprot_preprocess(field, organism=9606, reviewed=True):
    """
    Retrieves one UniProt field by ``uniprot_data`` and breaks its free
    text content into terms.

    For each value, everything from the first ``Note=`` is dropped, then
    upper case labels (``LABEL: ``), references in curly braces and
    bracketed prefixes (e.g. ``[Isoform 1]: ``) are removed. The remaining
    text is split at dots and commas; each piece is further split at
    semicolons, and the parts are stripped and passed through
    ``common.upper0``. Empty parts are discarded.

    :param field:
        Passed to ``uniprot_data`` as ``field``.
    :param organism:
        Passed to ``uniprot_data`` as ``organism``.
    :param reviewed:
        Passed to ``uniprot_data`` as ``reviewed``.

    :return:
        A ``collections.defaultdict`` of sets. Keys are the keys of the
        data returned by ``uniprot_data``, values are sets of non-empty
        tuples of the processed parts.
    """

    label_re = re.compile(r'[A-Z\s]+:\s')
    isoform_re = re.compile(r'\[[-\w\s]+\]:?\s?')
    termsep_re = re.compile(r'\s?[\.,]\s?')
    ref_re = re.compile(r'\{[-\w :\|,\.]*\}')

    # the order of the removals matters: labels, references, isoforms
    cleanup = (label_re, ref_re, isoform_re)

    result = collections.defaultdict(set)

    data = uniprot_data(
        field=field,
        organism=organism,
        reviewed=reviewed,
    )

    for uniprot, raw in iteritems(data):

        text = raw.split('Note=')[0]

        for pattern in cleanup:

            text = pattern.sub('', text)

        for item in termsep_re.split(text):

            if item.startswith('Note'):

                continue

            # anything after an unclosed curly brace is ignored
            head = item.split('{')[0]
            parts = (
                common.upper0(part.strip(' .;,'))
                for part in head.split(';')
            )
            elements = tuple(filter(None, parts))

            if elements:

                result[uniprot].add(elements)

    return result
Beispiel #4
0
def uniprot_tissues(organism=9606, reviewed=True):
    """
    Extracts tissue names and expression levels from the free text of the
    UniProt ``tissue_specificity`` field by keyword and regex heuristics.

    The text of each entry is cleaned from labels, references, PubMed
    citations, isoform names and a few indefinite phrases, then split to
    sentences at dots and to tokens at separator words (``in``, ``and``,
    ``such as``, etc). A token containing a level keyword sets the level
    for the subsequent tokens of the same sentence; other tokens are
    considered tissue names after removing leading fill words and
    applying a number of exclusion rules.

    :param organism:
        Passed to ``uniprot_data`` as ``organism``.
    :param reviewed:
        Passed to ``uniprot_data`` as ``reviewed``.

    :return:
        A ``collections.defaultdict`` of sets. Keys are the keys of the
        data returned by ``uniprot_data``, values are sets of
        ``UniprotTissue`` named tuples with fields ``tissue`` and
        ``level``. The level is one of ``low``, ``high``, ``basal``,
        ``none`` or ``undefined``; the tissue is either the processed
        token or ``Wide``/``Ubiquitous``-like labels derived from the
        wide expression keywords (after ``common.upper0``).
    """

    # NOTE(review): this pattern is greedy, if the text contains more than
    # one reference in curly braces, everything between the first opening
    # and the last closing brace is removed. Looks unintended, but changing
    # it would release text the exclusion rules below were never tuned
    # for -- confirm before fixing.
    reref = re.compile(r'\s?\{.*\}\s?')
    # token separators; the lookahead keeps separators followed by a digit
    # from splitting the phrase
    resep = re.compile(r',?(?:'
                       r' in almost all |'
                       r' but also in |'
                       r' but also at |'
                       r' within the |'
                       r', in |'
                       r' in |'
                       r' but |'
                       r', and |'
                       r' and |'
                       r' such as |'
                       r' \(both |'
                       r' as well as |'
                       r' as |'
                       r' or |'
                       r' at the |'
                       r' at |'
                       r' including |'
                       r' during |'
                       r' especially |'
                       r' to |'
                       r' into |'
                       r' = |'
                       r' > |'
                       r'; |'
                       r', '
                       r')(?=[^\d])')
    relabel = re.compile(r'^TISSUE SPECIFICITY: ')
    repubmed = re.compile(r'\(?PubMed:?\d+\)?')
    respeci = re.compile(r'(\w+)[-\s]specific')
    # one optional leading fill word, the rest of the token is captured
    rethe = re.compile(r'\s?(?:'
                       r'[Tt]he |'
                       r'[Ii]n |'
                       r'[Ss]ome|'
                       r'[Ii]n the|'
                       r'[Ww]ithin the|'
                       r'[Ww]ithin|'
                       r'[Ii]nto|'
                       r'[Ww]ith only|'
                       r'[Ww]ith the|'
                       r'[Ww]ith an|'
                       r'[Ww]ith |'
                       r'[Ii]s |'
                       # NOTE(review): the double space looks like a typo,
                       # left as it is because entries like `Many fetal`
                       # in `tissue_exclude` seem to rely on it -- confirm.
                       r'[Mm]any  |'
                       # The `|` at the end of the next line was missing,
                       # this merged it with the subsequent one into the
                       # single alternative `[Aa] variety of [Aa] `, so
                       # neither of the two was ever removed.
                       r'[Aa] variety of |'
                       r'[Aa] |'
                       r'[Ii]t |'
                       r'[Tt]o |'
                       r'[Oo]n |'
                       r'[Oo]f |'
                       r'[Tt]hose |'
                       r'[Ff]rom |'
                       r'[Aa]lso|'
                       r'[Bb]y |'
                       r'[Pp]articularly|'
                       r'[Pp]articular|'
                       r'[Pp]atients|'
                       r'[Aa]n |'
                       r'\'|'
                       r':|'
                       r'/'
                       r')?(.*)')
    reand = re.compile(r'(?: and| of| from| or| than)$')
    replevel = re.compile(r'\(at \w+ levels?\)')
    reiso = re.compile(r'[Ii]soform \w+')
    reindef = re.compile(r'\w'
                         r'(?:'
                         r'ifferent parts of |'
                         r'ariety of tissues |'
                         r' variety of tissues |'
                         r' number of |'
                         r'everal regions of '
                         r')')

    # keyword -> level; the first keyword found in a token wins,
    # hence the order of this tuple matters
    level_kw = (
        ('low', 'low'),
        ('weak', 'low'),
        ('lesser extent', 'low'),
        ('minimal level', 'low'),
        ('decrease', 'low'),
        ('moderate', 'low'),
        ('barely', 'low'),
        ('minor level', 'low'),
        ('reduced', 'low'),
        ('lesser', 'low'),
        ('down-regulated', 'low'),
        ('high', 'high'),
        ('elevated', 'high'),
        ('strong', 'high'),
        ('prominent', 'high'),
        ('greatest level', 'high'),
        ('concentrated', 'high'),
        ('predominant', 'high'),
        ('increase', 'high'),
        ('enrich', 'high'),
        ('abundant', 'high'),
        ('primarily', 'high'),
        ('induced', 'high'),
        ('up-regulated', 'high'),
        ('up regulated', 'high'),
        ('expression is restricted', 'high'),
        ('amplified', 'high'),
        ('basal l', 'basal'),
        ('not detected', 'none'),
        ('absent', 'none'),
        ('expressed', 'undefined'),
        ('detect', 'undefined'),
        ('found', 'undefined'),
        ('present', 'undefined'),
        ('expression', 'undefined'),
        ('localized', 'undefined'),
        ('produced', 'undefined'),
        ('confined', 'undefined'),
        ('transcribed', 'undefined'),
        ('xpressed', 'undefined'),
        ('synthesized', 'undefined'),
        ('secreted', 'undefined'),
        ('seen', 'undefined'),
        ('prevalent', 'undefined'),
        ('released', 'undefined'),
        ('appears', 'undefined'),
        ('varying levels', 'undefined'),
        ('various levels', 'undefined'),
        ('identified', 'undefined'),
        ('observed', 'undefined'),
        ('occurs', 'undefined'),
    )

    # keyword -> label for tokens stating broad expression; these are
    # checked only in tokens which also contain a level keyword
    wide_kw = (
        ('widely', 'wide'),
        ('wide tissue distribution', 'wide'),
        ('wide range of tissues', 'wide'),
        ('wide range of adult tissues', 'wide'),
        ('wide range of cells', 'wide'),
        ('wide variety of normal adult tissues', 'wide'),
        ('widespread', 'wide'),
        ('ubiquitous', 'ubiquitous'),
        ('variety of tissues', 'wide'),
        ('many tissues', 'wide'),
        ('many organs', 'wide'),
        ('various organs', 'wide'),
        ('various tissues', 'wide'),
    )

    # processed tokens which are not tissue names (exact match)
    tissue_exclude = {
        'Adult',
        'All',
        'Apparently not',
        'Areas',
        'Are likely',
        'Both',
        'By contrast',
        'Normal cells',
        'Not only',
        'A',
        '[]: Localized',
        'Early',
        'Change from a quiescent',
        'Central',
        'Beta',
        'This layer',
        'With little',
        'Preferential occurrence',
        'Stage III',
        'Take up',
        'Hardly',
        'Only seen',
        'Prevalent',
        'Inner segment',
        'Memory',
        'Many fetal',
        'Tissues',
        '0 kb',
        '9 kb',
        'A 2',
        'A 3',
        'A 5',
        'A 6',
        '1-7',
        '1b-1',
        '2 is widely',
        '8 and 4',
        'Often amplified',
        'Other',
        'Others',
        'Those',
        'Tissues examined',
        'Tissues with',
        'Tissues (e)',
        'Probably shed',
        'Reports that',
        'Primitive',
        'Prolactin',
        'Overlap',
        'A smaller 0',
        'A smaller form',
        'A smaltissues',
        'Different levels',
        'Different amounts',
        'Disappears',
        'Digestion',
        'Very similar',
        'Vivo',
        'Contrary',
        'Contrast',
        'Not',
        'Not all',
        'Has it',
        'Has little',
        'All stages',
        'Soon',
        'Specific',
        'Stage',
        'Stage I',
        'Stage II',
        'Stages II',
        'Ends',
        'A minor degree',
        'A much smaller extent',
        'Lost',
        'Varies',
        'Various',
        'Mostly restricted',
        'Mostly',
        'Most probably',
        'Much more stable',
        'Naive',
        'Neither',
        'Nor',
        'None',
        # The forms below are the `A ...` entries above without the
        # leading article: since `rethe` removes a leading `A `, these
        # phrases arrive here in the stripped form and must remain
        # excluded.
        'Smaller 0',
        'Smaller form',
        'Smaltissues',
        'Minor degree',
        'Much smaller extent',
    }

    # processed tokens starting with any of these are not tissue names
    exclude_startswith = (
        'Were',
        'Where',
        'Which',
        'While',
        'When',
        'There',
        'Their',
        'Then',
        'These',
        'Level',
        'This',
        'Almost',
        'If',
        'Control',
        'Be ',
        'Although',
        'Than',
        'Addition',
    )

    # raw tokens containing any of these are not tissue names
    exclude_in = ('kb transcript', 'compared', 'soform', 'concentration of')

    UniprotTissue = collections.namedtuple(
        'UniprotTissue',
        [
            'tissue',
            'level',
        ],
    )

    data = uniprot_data(
        'tissue_specificity',
        organism=organism,
        reviewed=reviewed,
    )

    result = collections.defaultdict(set)

    for uniprot, raw in iteritems(data):

        raw = relabel.sub('', raw)
        raw = reref.sub('', raw)
        raw = replevel.sub('', raw)
        raw = reiso.sub('', raw)
        raw = repubmed.sub('', raw)
        raw = reindef.sub('', raw)
        raw = raw.replace('adult and fetal', '')

        raw = raw.split('.')

        for phrase in raw:

            tokens = tuple(resep.split(phrase))
            # the level is carried over from token to token
            # within one phrase
            level = None

            for token in tokens:

                level_token = False
                wide_token = False
                tissue = None

                token_lower = token.lower()

                for kw, lev in level_kw:

                    if kw in token_lower:

                        level = lev
                        level_token = True
                        break

                if level_token:

                    for kw, wide in wide_kw:

                        if kw in token_lower:

                            tissue = wide
                            wide_token = True
                            break

                # a token is recorded if it states no level (then it is
                # assumed to name a tissue) or if it states a broad
                # expression
                if not level_token or wide_token:

                    if not wide_token:

                        specific = respeci.search(token)

                        # "<something>-specific": the tissue is the word
                        # preceding "specific"
                        tissue = (specific.group(1).lower()
                                  if specific else token)

                        if specific and not level:

                            level = 'high'

                    if tissue.strip():

                        if any(e in tissue for e in exclude_in):

                            continue

                        # up to three leading fill words are removed;
                        # `rethe` always matches as all of its parts
                        # are optional
                        for _ in range(3):

                            tissue = rethe.match(tissue).group(1)

                        if tissue.endswith('+'):

                            tissue = '%s cells' % tissue

                        tissue = tissue.strip(')(.,;- ')

                        # restore the closing parenthesis
                        # removed by the strip above
                        if '(' in tissue and ')' not in tissue:

                            tissue = '%s)' % tissue

                        tissue = reand.sub('', tissue)
                        tissue = common.upper0(tissue)
                        tissue = tissue.replace('  ', ' ')

                        if tissue.startswith(exclude_startswith):

                            continue

                        if tissue in tissue_exclude or len(tissue) < 3:

                            continue

                        result[uniprot].add(
                            UniprotTissue(
                                tissue=tissue,
                                level=level or 'undefined',
                            ))

    return result
Beispiel #5
0
    def name_label(self):
        """
        Returns a label created from ``self.name``: the name is passed
        through ``common.upper0`` and the underscores in the outcome are
        replaced by spaces.
        """

        label = common.upper0(self.name)

        return label.replace('_', ' ')