def adhesome_annotations():
    """
    Builds protein annotations from the Adhesome components table.

    The table is fetched from the URL registered under
    ``urls.urls['adhesome']['components']`` and read as comma separated
    values. The ``Swiss-Prot ID`` column may hold several comma separated
    identifiers; the literal value ``null`` is skipped. Each identifier is
    passed through ``mapping.map_name(..., 'uniprot', 'uniprot')`` and the
    annotation is stored under every identifier that call yields.

    Returns:
        A ``collections.defaultdict`` of sets: keys are the identifiers
        yielded by the mapping, values are sets of ``AdhesomeAnnotation``
        named tuples with the fields ``mainclass`` (the stripped
        ``Functional Category`` column passed through ``common.upper0``)
        and ``intrinsic`` (``True`` if the ``FA`` column equals
        ``'Intrinsic Proteins'``).
    """

    AdhesomeAnnotation = collections.namedtuple(
        'AdhesomeAnnotation',
        ['mainclass', 'intrinsic'],
    )

    result = collections.defaultdict(set)

    url = urls.urls['adhesome']['components']
    c = curl.Curl(url, large = True, silent = False)
    data = csv.DictReader(c.result, delimiter = ',')

    for rec in data:

        uniprots = rec['Swiss-Prot ID']

        for uniprot in uniprots.split(','):

            uniprot = uniprot.strip()

            if uniprot == 'null':

                continue

            for _uniprot in mapping.map_name(uniprot, 'uniprot', 'uniprot'):

                # Key by the mapped identifier: previously the raw ID from
                # the table was used here, which discarded the outcome of
                # the mapping (and kept IDs the mapping yields nothing for).
                result[_uniprot].add(AdhesomeAnnotation(
                    mainclass = (
                        common.upper0(rec['Functional Category'].strip())
                    ),
                    intrinsic = rec['FA'].strip() == 'Intrinsic Proteins',
                ))

    return result
def adhesome_interactions():
    """
    Reads the Adhesome interactions table.

    The table is fetched from the URL registered under
    ``urls.urls['adhesome']['interactions']`` and read as comma separated
    values.

    Returns:
        A list with one ``AdhesomeInteraction`` named tuple per row, with
        the fields ``source``, ``target``, ``effect``, ``type`` and
        ``pmid`` taken from the ``Source``, ``Target``, ``Effect``,
        ``Type`` and ``PMID`` columns; ``type`` is passed through
        ``common.upper0``.
    """

    AdhesomeInteraction = collections.namedtuple(
        'AdhesomeInteraction',
        ['source', 'target', 'effect', 'type', 'pmid'],
    )

    url = urls.urls['adhesome']['interactions']
    c = curl.Curl(url, large = True, silent = False)
    rows = csv.DictReader(c.result, delimiter = ',')

    return [
        AdhesomeInteraction(
            source = row['Source'],
            target = row['Target'],
            effect = row['Effect'],
            type = common.upper0(row['Type']),
            pmid = row['PMID'],
        )
        for row in rows
    ]
def uniprot_preprocess(field, organism=9606, reviewed=True):
    """
    Splits a free text UniProt field into tuples of terms.

    For each entry returned by ``uniprot_data`` everything from the first
    ``Note=`` on is dropped, then upper case labels, ``{...}`` references
    and ``[...]`` isoform tags are removed. The remainder is split at dots
    and commas; every resulting item is split further at semicolons into
    its elements.

    Args:
        field: Field name, handed over to ``uniprot_data``.
        organism: Organism identifier, handed over to ``uniprot_data``.
        reviewed: Handed over to ``uniprot_data``.

    Returns:
        A ``collections.defaultdict`` of sets: keys are the keys of the
        ``uniprot_data`` result, values are sets of tuples of non-empty
        strings (each passed through ``common.upper0``).
    """

    label_re = re.compile(r'[A-Z\s]+:\s')
    isoform_re = re.compile(r'\[[-\w\s]+\]:?\s?')
    termsep_re = re.compile(r'\s?[\.,]\s?')
    ref_re = re.compile(r'\{[-\w :\|,\.]*\}')

    result = collections.defaultdict(set)

    data = uniprot_data(
        field=field,
        organism=organism,
        reviewed=reviewed,
    )

    for uniprot, text in iteritems(data):

        # only the part before the first note is processed
        text = text.split('Note=')[0]

        # order matters: labels first, then references, then isoform tags
        for cleaner in (label_re, ref_re, isoform_re):

            text = cleaner.sub('', text)

        for chunk in termsep_re.split(text):

            if chunk.startswith('Note'):

                continue

            # cut off anything after an unclosed reference
            chunk = chunk.split('{')[0]

            terms = []

            for piece in chunk.split(';'):

                term = common.upper0(piece.strip(' .;,'))

                if term:

                    terms.append(term)

            if terms:

                result[uniprot].add(tuple(terms))

    return result
def uniprot_tissues(organism=9606, reviewed=True):
    """
    Extracts tissue names and expression levels from the free text
    ``tissue_specificity`` field of UniProt.

    The text of each entry is cleaned by a series of regular expressions
    (label, ``{...}`` references, "(at ... level)" remarks, isoform
    mentions, PubMed references, indefinite quantity phrases), split into
    phrases at dots and then into tokens at a list of connective words.
    Within a phrase, a token containing one of the level keywords sets the
    current level, which is kept for the following tokens of the same
    phrase. Tokens without a level keyword are treated as tissue names;
    tokens with both a level keyword and a "wide expression" keyword yield
    the tissue ``wide`` or ``ubiquitous``. A "<word>-specific" pattern
    yields ``<word>`` as tissue and, if no level is known yet, the level
    ``high``. Tissue names are stripped of leading articles and
    prepositions and of trailing conjunctions, passed through
    ``common.upper0`` and filtered by several exclusion lists; names
    shorter than 3 characters are discarded.

    This is a heuristic parser of natural language text, the exclusion
    lists below were collected empirically.

    Args:
        organism: Organism identifier, handed over to ``uniprot_data``.
        reviewed: Handed over to ``uniprot_data``.

    Returns:
        A ``collections.defaultdict`` of sets: keys are the keys of the
        ``uniprot_data`` result, values are sets of ``UniprotTissue``
        named tuples with the fields ``tissue`` and ``level``. The level
        is one of ``low``, ``high``, ``basal``, ``none`` or ``undefined``.
    """

    reref = re.compile(r'\s?\{.*\}\s?')
    # connectives where a phrase is split into tokens; a separator
    # followed by a digit is not a split point
    resep = re.compile(
        r',?(?:'
            r' in almost all |'
            r' but also in |'
            r' but also at |'
            r' within the |'
            r', in |'
            r' in |'
            r' but |'
            r', and |'
            r' and |'
            r' such as |'
            r' \(both |'
            r' as well as |'
            r' as |'
            r' or |'
            r' at the |'
            r' at |'
            r' including |'
            r' during |'
            r' especially |'
            r' to |'
            r' into |'
            r' = |'
            r' > |'
            r'; |'
            r', '
        r')(?=[^\d])'
    )
    relabel = re.compile(r'^TISSUE SPECIFICITY: ')
    repubmed = re.compile(r'\(?PubMed:?\d+\)?')
    respeci = re.compile(r'(\w+)[-\s]specific')
    # optional leading word(s) to be dropped from a tissue name; the
    # single group captures the remainder
    rethe = re.compile(
        r'\s?(?:'
            r'[Tt]he |'
            r'[Ii]n |'
            r'[Ss]ome|'
            r'[Ii]n the|'
            r'[Ww]ithin the|'
            r'[Ww]ithin|'
            r'[Ii]nto|'
            r'[Ww]ith only|'
            r'[Ww]ith the|'
            r'[Ww]ith an|'
            r'[Ww]ith |'
            r'[Ii]s |'
            r'[Mm]any |'
            # the `|` at the end of the next literal was missing, merging
            # it with the following one into a single alternative
            # `[Aa] variety of [Aa] `, hence neither "a variety of " nor
            # the article "a " was ever removed
            r'[Aa] variety of |'
            r'[Aa] |'
            r'[Ii]t |'
            r'[Tt]o |'
            r'[Oo]n |'
            r'[Oo]f |'
            r'[Tt]hose |'
            r'[Ff]rom |'
            r'[Aa]lso|'
            r'[Bb]y |'
            r'[Pp]articularly|'
            r'[Pp]articular|'
            r'[Pp]atients|'
            r'[Aa]n |'
            r'\'|'
            r':|'
            r'/'
        r')?(.*)'
    )
    reand = re.compile(r'(?: and| of| from| or| than)$')
    replevel = re.compile(r'\(at \w+ levels?\)')
    reiso = re.compile(r'[Ii]soform \w+')
    reindef = re.compile(
        r'\w'
        r'(?:'
            r'ifferent parts of |'
            r'ariety of tissues |'
            r' variety of tissues |'
            r' number of |'
            r'everal regions of '
        r')'
    )

    # (keyword, level) pairs; the first keyword found in the lower case
    # token wins, hence the order is significant
    level_kw = (
        ('low', 'low'),
        ('weak', 'low'),
        ('lesser extent', 'low'),
        ('minimal level', 'low'),
        ('decrease', 'low'),
        ('moderate', 'low'),
        ('barely', 'low'),
        ('minor level', 'low'),
        ('reduced', 'low'),
        ('lesser', 'low'),
        ('down-regulated', 'low'),

        ('high', 'high'),
        ('elevated', 'high'),
        ('strong', 'high'),
        ('prominent', 'high'),
        ('greatest level', 'high'),
        ('concentrated', 'high'),
        ('predominant', 'high'),
        ('increase', 'high'),
        ('enrich', 'high'),
        ('abundant', 'high'),
        ('primarily', 'high'),
        ('induced', 'high'),
        ('up-regulated', 'high'),
        ('up regulated', 'high'),
        ('expression is restricted', 'high'),
        ('amplified', 'high'),

        ('basal l', 'basal'),

        ('not detected', 'none'),
        ('absent', 'none'),

        ('expressed', 'undefined'),
        ('detect', 'undefined'),
        ('found', 'undefined'),
        ('present', 'undefined'),
        ('expression', 'undefined'),
        ('localized', 'undefined'),
        ('produced', 'undefined'),
        ('confined', 'undefined'),
        ('transcribed', 'undefined'),
        ('xpressed', 'undefined'),
        ('synthesized', 'undefined'),
        ('secreted', 'undefined'),
        ('seen', 'undefined'),
        ('prevalent', 'undefined'),
        ('released', 'undefined'),
        ('appears', 'undefined'),
        ('varying levels', 'undefined'),
        ('various levels', 'undefined'),
        ('identified', 'undefined'),
        ('observed', 'undefined'),
        ('occurs', 'undefined'),
    )

    # (keyword, tissue label) pairs, only looked up in tokens which
    # contain a level keyword; first match wins
    wide_kw = (
        ('widely', 'wide'),
        ('wide tissue distribution', 'wide'),
        ('wide range of tissues', 'wide'),
        ('wide range of adult tissues', 'wide'),
        ('wide range of cells', 'wide'),
        ('wide variety of normal adult tissues', 'wide'),
        ('widespread', 'wide'),
        ('ubiquitous', 'ubiquitous'),
        ('variety of tissues', 'wide'),
        ('many tissues', 'wide'),
        ('many organs', 'wide'),
        ('various organs', 'wide'),
        ('various tissues', 'wide'),
    )

    # processed tissue names which are to be discarded
    tissue_exclude = {
        'Adult',
        'All',
        'Apparently not',
        'Areas',
        'Are likely',
        'Both',
        'By contrast',
        'Normal cells',
        'Not only',
        'A',
        '[]: Localized',
        'Early',
        'Change from a quiescent',
        'Central',
        'Beta',
        'This layer',
        'With little',
        'Preferential occurrence',
        'Stage III',
        'Take up',
        'Hardly',
        'Only seen',
        'Prevalent',
        'Inner segment',
        'Memory',
        'Many fetal',
        'Tissues',
        '0 kb',
        '9 kb',
        'A 2',
        'A 3',
        'A 5',
        'A 6',
        '1-7',
        '1b-1',
        '2 is widely',
        '8 and 4',
        'Often amplified',
        'Other',
        'Others',
        'Those',
        'Tissues examined',
        'Tissues with',
        'Tissues (e)',
        'Probably shed',
        'Reports that',
        'Primitive',
        'Prolactin',
        'Overlap',
        'A smaller 0',
        'A smaller form',
        'A smaltissues',
        'Different levels',
        'Different amounts',
        'Disappears',
        'Digestion',
        'Very similar',
        'Vivo',
        'Contrary',
        'Contrast',
        'Not',
        'Not all',
        'Has it',
        'Has little',
        'All stages',
        'Soon',
        'Specific',
        'Stage',
        'Stage I',
        'Stage II',
        'Stages II',
        'Ends',
        'A minor degree',
        'A much smaller extent',
        'Lost',
        'Varies',
        'Various',
        'Mostly restricted',
        'Mostly',
        'Most probably',
        'Much more stable',
        'Naive',
        'Neither',
        'Nor',
        'None',
        # the same items as the "A ..." entries above, in the form they
        # take now that the leading article is removed by `rethe`
        'Smaller 0',
        'Smaller form',
        'Smaltissues',
        'Minor degree',
        'Much smaller extent',
    }

    exclude_startswith = (
        'Were',
        'Where',
        'Which',
        'While',
        'When',
        'There',
        'Their',
        'Then',
        'These',
        'Level',
        'This',
        'Almost',
        'If',
        'Control',
        'Be ',
        'Although',
        'Than',
        'Addition',
    )

    exclude_in = (
        'kb transcript',
        'compared',
        'soform',
        'concentration of',
    )

    UniprotTissue = collections.namedtuple(
        'UniprotTissue',
        [
            'tissue',
            'level',
        ],
    )

    data = uniprot_data(
        'tissue_specificity',
        organism=organism,
        reviewed=reviewed,
    )

    result = collections.defaultdict(set)

    for uniprot, raw in iteritems(data):

        raw = relabel.sub('', raw)
        raw = reref.sub('', raw)
        raw = replevel.sub('', raw)
        raw = reiso.sub('', raw)
        raw = repubmed.sub('', raw)
        raw = reindef.sub('', raw)
        raw = raw.replace('adult and fetal', '')

        for phrase in raw.split('.'):

            # the level is carried over from token to token within
            # one phrase
            level = None

            for token in resep.split(phrase):

                level_token = False
                wide_token = False
                tissue = None

                token_lower = token.lower()

                for kw, lev in level_kw:

                    if kw in token_lower:

                        level = lev
                        level_token = True
                        break

                if level_token:

                    for kw, wide in wide_kw:

                        if kw in token_lower:

                            tissue = wide
                            wide_token = True
                            break

                # a token which only tells the level carries no tissue
                if level_token and not wide_token:

                    continue

                if not wide_token:

                    specific = respeci.search(token)

                    tissue = (
                        specific.groups()[0].lower()
                            if specific else
                        token
                    )

                    if specific and not level:

                        level = 'high'

                if not tissue.strip():

                    continue

                if any(e in tissue for e in exclude_in):

                    continue

                # up to three leading words are removed,
                # e.g. "within the ..." or "also in the ..."
                for _ in range(3):

                    tissue = rethe.match(tissue).groups()[0]

                if tissue.endswith('+'):

                    tissue = '%s cells' % tissue

                tissue = tissue.strip(')(.,;- ')

                # restore a closing parenthesis removed by the strip above
                if '(' in tissue and ')' not in tissue:

                    tissue = '%s)' % tissue

                tissue = reand.sub('', tissue)
                tissue = common.upper0(tissue)
                # collapse double spaces left behind by the removals above;
                # this call used to replace a single space by a single
                # space, i.e. it had no effect
                tissue = tissue.replace('  ', ' ')

                if tissue.startswith(exclude_startswith):

                    continue

                if tissue in tissue_exclude or len(tissue) < 3:

                    continue

                result[uniprot].add(
                    UniprotTissue(
                        tissue=tissue,
                        level=level or 'undefined',
                    )
                )

    return result
def name_label(self):
    """
    Human readable label derived from ``self.name``: the name is passed
    through ``common.upper0`` and its underscores are replaced by spaces.
    """

    label = common.upper0(self.name)

    return label.replace('_', ' ')