def genecards_datasheet(gene):
    """
    Retrieves a gene (protein) datasheet from GeneCards.
    Returns HTML as string.

    :param str gene:
        A Gene Symbol or UniProt ID.
    """

    query_url = urls.urls['genecards']['url'] % gene

    response = curl.Curl(
        query_url,
        silent = True,
        large = False,
        connect_timeout = settings.get(
            'genecards_datasheet_connect_timeout'
        ),
        timeout = settings.get('genecards_datasheet_timeout'),
    )

    # 0 means the response was served from cache
    if response.status in {0, 200}:

        return response.result

    _log('Failed to retrieve gene card for ID `%s`.' % gene)

    return None
def _protein_datasheet(url):
    """
    Downloads a UniProt plain text datasheet and splits it into
    (key, line) records. Retries up to three times, bypassing the cache
    after an empty or HTML (error page) response.
    """

    use_cache = True

    for attempt in range(3):

        con = curl.Curl(
            url,
            silent = True,
            large = False,
            cache = use_cache,
            connect_timeout = settings.get(
                'uniprot_datasheet_connect_timeout'
            ),
            timeout = settings.get('uniprot_datasheet_timeout'),
        )

        # a datasheet never starts with an HTML doctype;
        # that means we got an error page instead
        if con.result and not con.result.startswith('<!DOCTYPE'):

            break

        use_cache = False

    if not con.result:

        _logger._log(
            'Could not retrieve UniProt datasheet by URL `%s`.' % url
        )

    return _redatasheet.findall(con.result) if con.result else []
def tf_target_resources():
    """
    Returns the resource set for building the TF-target network dataset.
    """

    # optionally expand DoRothEA into one pseudo-resource
    # per confidence level
    if settings.get('dorothea_expand_levels'):

        return netres.dorothea_expand_levels(
            resources = netres.transcription,
            levels = settings.get('tfregulons_levels'),
        )

    return netres.transcription
def features_table(uniprot_ids, *features, width=40, maxlen=None,
                   tablefmt='fancy_grid', **kwargs):
    """
    Returns a table with the requested features of a list of UniProt IDs.

    The underlying table formatting module is ``tabulate``, a versatile
    module to export various ascii tables as well as HTML or LaTeX --
    check the docs for formatting options:
    https://github.com/astanin/python-tabulate

    :param **kwargs:
        Passed to ``tabulate.tabulate``.

    :return:
        The table as a string.
    """

    if not maxlen:

        maxlen = settings.get('uniprot_info_maxlen')

    selected = features if features else default_features

    rows = collect(uniprot_ids, *selected)

    return common.table_format(
        rows,
        width = width,
        maxlen = maxlen,
        tablefmt = tablefmt,
        **kwargs
    )
def __init__(self, license_dir=None):
    """
    Manages the collection of resource licenses.

    :param str license_dir:
        Directory with the license definition files; falls back to the
        module settings if not provided.
    """

    session.Logger.__init__(self, name='licenses')

    self.license_dir = (
        license_dir
            if license_dir else
        settings.get('license_dir')
    )
    self.licenses = {}

    self.populate()
def __init__(
        self,
        members,
        name = None,
        parent = None,
        aspect = 'functional',
        source = 'resource_specific',
        scope = 'specific',
        resource = None,
        transmitter = None,
        receiver = None,
        limit = None,
        avoid = None,
        enabled = True,
    ):
    """
    A set-like group of annotated entities with metadata about its
    origin and role.
    """

    collections_abc.Set.__init__(self)

    self.members = set(members)
    self.name = name if name else 'unnamed'
    # a group without an explicit parent is its own parent
    self.parent = parent if parent else self.name
    self.aspect = aspect
    self.source = source
    self.scope = scope
    self.resource = (
        resource or
        settings.get('annot_composite_database_name') or
        'Unknown'
    )
    self.transmitter = transmitter
    self.receiver = receiver
    self.limit = common.to_set(limit)
    self.avoid = common.to_set(avoid)
    self.enabled = enabled
def new_logger(name=None, logdir=None, verbosity=None, **kwargs):
    """
    Returns a new logger with default settings (can be customized).

    Parameters
    ----------
    name : str
        Custom name for the log.
    logdir : str
        Path to the directory to store log files.
    verbosity : int
        Verbosity level, lowest is 0. Messages from levels above this
        won't be written to the log. ``None`` lets ``Logger`` fall back
        to the module settings default.

    Returns
    -------
    ``log.Logger`` instance.
    """

    name = name or settings.get('module_name')
    logdir = logdir or '%s_log' % name

    return Logger(
        fname='%s__%s.log' % (
            name,
            Logger.timestamp().replace(' ', '_').replace(':', '.'),
        ),
        # Bug fix: the `verbosity` argument used to be ignored and a
        # hard-coded 0 was passed instead; now the caller's value is
        # forwarded.
        verbosity=verbosity,
        logdir=logdir,
        **kwargs
    )
def webservice_interactions_df_legacy(self):
    """
    Builds the legacy web service interactions data frame by calling
    ``self.make_df`` with one boolean flag column per dataset
    (omnipath, kinaseextra, ligrecextra, pathwayextra, mirnatarget,
    dorothea) plus DoRothEA evidence columns and curation effort.
    """

    # names of the resources belonging to each dataset
    sources_omnipath = set(
        f.name for f in data_formats.omnipath.values()
    )
    # resources providing direction information only (no new interactions)
    sources_extra_directions = settings.get('network_extra_directions')
    sources_kinase_extra = set(
        f.name for f in data_formats.ptm_misc.values()
    )
    sources_ligrec_extra = set(
        f.name for f in data_formats.ligand_receptor.values()
    )
    sources_pathway_extra = set(
        f.name for f in data_formats.pathway_noref.values()
    )
    sources_mirna = set(
        f.name for f in data_formats.mirna_target.values()
    )

    self.make_df(
        unique_pairs = False,
        extra_node_attrs = {'ncbi_tax_id': 'ncbi_tax_id'},
        extra_edge_attrs = {
            # an edge belongs to the `omnipath` dataset if its direction
            # is supported by an omnipath resource, or it is undirected
            # in omnipath and an extra-direction resource provides the
            # direction
            'omnipath': lambda e, d: (
                (
                    bool(e['dirs'].sources[d] & sources_omnipath) or
                    (
                        bool(
                            e['dirs'].sources['undirected'] &
                            sources_omnipath
                        ) and
                        bool(
                            e['dirs'].sources[d] &
                            sources_extra_directions
                        )
                    )
                ) and
                'PPI' in e['type']
            ),
            'kinaseextra': lambda e, d: (
                bool(e['dirs'].sources[d] & sources_kinase_extra) and
                'PPI' in e['type']
            ),
            'ligrecextra': lambda e, d: (
                bool(e['dirs'].sources[d] & sources_ligrec_extra) and
                'PPI' in e['type']
            ),
            'pathwayextra': lambda e, d: (
                bool(e['dirs'].sources[d] & sources_pathway_extra) and
                'TF' in e['type']
            ),
            'mirnatarget': lambda e, d: (
                bool(e['dirs'].sources[d] & sources_mirna) and
                'MTI' in e['type']
            ),
            # TF evidence supporting this direction
            'dorothea': lambda e, d: (
                'TF' in e['sources_by_type'] and
                bool(
                    e['sources_by_type']['TF'] &
                    e['dirs'].sources[d]
                )
            ),
            'dorothea_curated': 'dorothea_curated',
            'dorothea_chipseq': 'dorothea_chipseq',
            'dorothea_tfbs': 'dorothea_tfbs',
            'dorothea_coexp': 'dorothea_coexp',
            'dorothea_level': lambda e, d: (
                ';'.join(sorted(e['dorothea_level']))
                    if 'dorothea_level' in e.attributes() and
                    'TF' in e['sources_by_type'] and
                    bool(
                        e['sources_by_type']['TF'] &
                        e['dirs'].sources[d]
                    ) else
                ''
            ),
            # quite wrong (taking only the first one):
            'type': lambda e: e['type'][0],
            # undirected curation effort is added once for the
            # undirected (tuple key) pseudo-direction
            'curation_effort': lambda e, d: (
                e.count_curation_effort(direction=d) + (
                    e.count_curation_effort(direction='undirected')
                        if isinstance(d, tuple) else
                    0
                )
            ),
        })
def __init__(
        self,
        fname,
        verbosity = None,
        console_level = None,
        logdir = None,
        max_width = 200,
    ):
    """
    Opens a log file and starts a periodic flush job.

    fname : str
        Log file name.
    logdir : str
        Path to the directory containing the log files.
    verbosity : int
        Messages at and below this level will be written into the
        logfile. All other messages will be dropped.
    console_level : int
        Messages below this log level will be printed not only into
        logfile but also to the console.
    max_width : int
        Maximum line width; longer messages are wrapped.
    """

    # register a periodic job so buffered messages reach the disk
    # even without explicit flush calls
    @_log_flush_timeloop.job(interval=datetime.timedelta(
        seconds=settings.get('log_flush_interval')))
    def _flush():

        self.flush()

    _log_flush_timeloop.start(block=False)

    # wraps long messages; continuation lines are indented to align
    # with the message text after the timestamp prefix
    self.wrapper = textwrap.TextWrapper(
        width=max_width,
        subsequent_indent=' ' * 22,
        break_long_words=False,
    )

    self.logdir = self.get_logdir(logdir)
    self.fname = os.path.join(self.logdir, fname)
    # explicit arguments win over the module settings defaults
    self.verbosity = (
        verbosity
            if verbosity is not None else
        settings.get('log_verbosity')
    )
    self.console_level = (
        console_level
            if console_level is not None else
        settings.get('console_verbosity')
    )
    self.open_logfile()

    # sending some greetings
    self.msg('Welcome!')
    self.msg('Logger started, logging into `%s`.' % self.fname)
def get_cachedir(cachedir=None):
    """
    Ensures the cache directory exists and returns its path.
    """

    path = cachedir or settings.get('cachedir')

    os.makedirs(path, exist_ok=True)

    return path
def get_param(self, key):
    """
    Retrieves a parameter from the :py:attr:`param` dict of the current
    object or from the module settings.
    """

    try:

        return self.param[key]

    except KeyError:

        # not overridden on this object: fall back to module settings
        return settings.get(key)
def __init__(self, rebuild=False, **kwargs):
    """
    Manages the building and loading of the OmniPath databases.

    :param bool rebuild:
        Rebuild the databases even if cached copies exist.
    :param **kwargs:
        Arbitrary parameters, override the module settings.
    """

    session_mod.Logger.__init__(self, name='omnipath.dbmanager')

    # timestamp used in the names of the files created in this session
    self.timestamp = time.strftime(settings.get('timestamp_format'))
    self.param = kwargs
    self.rebuild = rebuild
    self.datasets = self.get_param('datasets')
    self.ensure_dirs()
    self.network_dfs = {}

    self._log('The OmniPath database manager has been initialized.')
def __init__(self, type_, id_type_a, id_type_b, ncbi_tax_id=None):
    """
    Describes a pair of identifier types with an organism.

    :param str type_:
        The kind of the record.
    :param str id_type_a,id_type_b:
        Identifier types of the two sides.
    :param int ncbi_tax_id:
        NCBI Taxonomy ID; the default organism from the module settings
        if not provided.
    """

    self.type = type_
    self.id_type_a = id_type_a
    self.id_type_b = id_type_b
    self.ncbi_tax_id = (
        ncbi_tax_id
            if ncbi_tax_id else
        settings.get('default_organism')
    )
def get_logdir(self, dirname=None):
    """
    Returns the path to log directory.
    Also creates the directory if does not exist.
    """

    if not dirname:

        dirname = '%s_log' % settings.get('module_name')

    # create the directory only if nothing exists at the path yet
    if not os.path.exists(dirname):

        os.makedirs(dirname)

    return os.path.abspath(dirname)
def __init__(self, input_args=None, **kwargs):
    """
    Protein complexes from the CellChatDB database.

    :param dict input_args:
        Arguments for the input method; the organism defaults to the
        module settings if not provided.
    """

    input_args = input_args or {}

    # make sure an organism is always defined
    if 'organism' not in input_args:

        input_args['organism'] = settings.get('default_organism')

    AbstractComplexResource.__init__(
        self,
        name='CellChatDB',
        input_method='cellchatdb.cellchatdb_complexes',
        input_args=input_args,
    )
def __init__(self, input_args = None, **kwargs):
    """
    Protein complexes from the SIGNOR database.

    :param dict input_args:
        Arguments for the input method; the organism defaults to the
        module settings if not provided.
    """

    input_args = input_args or {}

    # make sure an organism is always defined
    if 'organism' not in input_args:

        input_args['organism'] = settings.get('default_organism')

    AbstractComplexResource.__init__(
        self,
        name = 'SIGNOR',
        input_method = 'signor.signor_complexes',
        input_args = input_args,
    )
def which_list(self, id_type, ncbi_tax_id=None):
    """
    Returns the identifier list for an ID type and organism, loading it
    on demand; ``None`` if it can not be loaded.
    """

    ncbi_tax_id = ncbi_tax_id or settings.get('default_organism')
    key = (id_type, ncbi_tax_id)

    # refresh the timestamp: this list counts as recently used
    self.expiry[key] = time.time()

    if key not in self.lists:

        self.load(key)

    return self.lists.get(key)
def __new__(
        cls,
        name,
        resource,
        parent=None,
        aspect='functional',
        scope='specific',
        source='resource_specific',
        args=None,
        exclude=None,
        transmitter=None,
        receiver=None,
        resource_name=None,
        limit=None,
        avoid=None,
        enabled=True,
    ):
    """
    Creates an annotation definition record with normalized
    resource name, parent, limit and avoid fields.
    """

    # if `resource` itself is a resource name, it wins;
    # otherwise fall back to the explicit name or the composite default
    if cls._is_resource_name(resource):

        resource_name = resource

    else:

        resource_name = (
            resource_name or
            settings.get('annot_composite_database_name') or
            'Unknown'
        )

    return super().__new__(
        cls,
        name=name,
        resource=resource,
        parent=parent or name,
        aspect=aspect,
        scope=scope,
        source=source,
        args=args,
        exclude=exclude,
        transmitter=transmitter,
        receiver=receiver,
        resource_name=resource_name,
        limit=cls._zero_one_or_more(limit),
        avoid=cls._zero_one_or_more(avoid),
        enabled=enabled,
    )
def __init__(
        self,
        total=None,
        name='Progress',
        interval=None,
        percent=True,
        status='initializing',
        done=0,
        init=True,
        unit='it',
        off=None,
        iterable=None,
    ):
    """
    Progress bar state; display is delegated to ``tqdm``.
    """

    # if not set explicitly, the global setting decides whether
    # progress bars are shown
    self.off = (
        (not settings.get('progressbars'))
            if off is None else
        off
    )
    self.name = name

    if interval is not None:

        self.interval = interval

    elif isinstance(total, (int, float)):

        # update roughly at each percent, but at least every item
        self.interval = max(int(total / 100), 1)

    else:

        self.interval = 1

    self.total = total
    self.iterable = iterable
    self.done = done
    self.status = status
    self.unit = unit
    self.start_time = time.time()
    self.min_update_interval = 0.1
    self.last_printed_value = 0

    if init and not self.off:

        self.init_tqdm()
def dorothea_expand_levels(resources=None, levels=None):
    """
    In a dictionary of resource definitions creates a separate
    ``NetworkResource`` object for each confidence levels of DoRothEA
    just like each level was a different resource.

    No matter ``resources`` is a ``NetworkResource`` or a dict of network
    resources, returns always a dict of network resources.
    """

    resources = resources or transcription
    levels = levels or settings.get('tfregulons_levels')

    # the resource definition to be cloned for each level
    if hasattr(resources, 'networkinput'):

        template = resources

    elif 'dorothea' in resources:

        template = resources['dorothea']

    else:

        template = transcription['dorothea']

    dorothea = {}

    for level in levels:

        key = 'dorothea_%s' % level
        clone = copy.deepcopy(template)
        clone.name = 'DoRothEA_%s' % level
        clone.networkinput.name = 'DoRothEA_%s' % level
        clone.networkinput.input_args = {'levels': {level}}
        dorothea[key] = clone

    if resources:

        # replace the single dorothea entry by the per-level clones
        resources = copy.deepcopy(resources)
        _ = resources.pop('dorothea', None)
        resources.update(dorothea)

        return resources

    return dorothea
def print_features(uniprot_ids, *features, fileobj=None, width=None,
                   maxlen=None, tablefmt='fancy_grid', **kwargs):
    """
    Prints a table with the requested features of a list of UniProt IDs.

    The underlying table formatting module is ``tabulate``, a versatile
    module to export various ascii tables as well as HTML or LaTeX --
    check the docs for formatting options:
    https://github.com/astanin/python-tabulate

    :param **kwargs:
        Passed to ``tabulate.tabulate``.

    :return:
        None.
    """

    maxlen = maxlen or settings.get('uniprot_info_maxlen')
    features = features or default_features

    term_width = (os.get_terminal_size().columns - 120) * 2 + 100
    # Bug fix: previously `width or int(...) if term_width else 40`
    # parsed as `(width or int(...)) if term_width else 40`, discarding
    # a caller-supplied width whenever `term_width` was falsy; the
    # explicit width must always take precedence.
    width = width or (
        int(term_width / len(features)) if term_width else 40
    )

    fileobj = fileobj or sys.stdout

    fileobj.write(
        features_table(
            uniprot_ids,
            *features,
            width=width,
            maxlen=maxlen,
            tablefmt=tablefmt,
            **kwargs
        )
    )
    fileobj.write(os.linesep)
    fileobj.flush()
def take_a_trip(cachefile=None):
    """
    Downloads TRIP data from webpage and preprocesses it. Saves
    preprocessed data into `cachefile` and next time loads from this
    file.

    :arg cachefile str:
        Path to pickle dump of preprocessed TRIP database. If does not
        exist the database will be downloaded and saved to this file. By
        default the path queried from the ``settings`` module.
    """

    cachefile = cachefile or settings.get('trip_preprocessed')

    # short-circuit: serve the previously preprocessed pickle if present
    if os.path.exists(cachefile):

        _log('Loading preprocessed TRIP database '
             'content from `%s`' % cachefile)
        result = pickle.load(open(cachefile, 'rb'))

        return result

    _log('No cache found, downloading and preprocessing TRIP database.')

    # evidence categories: screening, characterization,
    # in vivo/in vitro validation, functional consequence
    result = {'sc': {}, 'cc': {}, 'vvc': {}, 'vtc': {}, 'fc': {}}
    intrs = {}
    # maps the section headings on the page to the category keys above
    titles = {
        'Characterization': 'cc',
        'Screening': 'sc',
        'Validation: In vitro validation': 'vtc',
        'Validation: In vivo validation': 'vvc',
        'Functional consequence': 'fc',
    }
    interactors = {}
    base_url = urls.urls['trip']['base']
    show_url = urls.urls['trip']['show']
    c = curl.Curl(base_url)
    mainhtml = c.result
    mainsoup = bs4.BeautifulSoup(mainhtml, 'html.parser')
    # collect the links to the per-protein pages from the selector menu
    trppages = common.flat_list(
        [[a.attrs['href'] for a in ul.find_all('a')]
         for ul in mainsoup.find(
             'div', id='trp_selector').find('ul').find_all('ul')])

    for trpp in trppages:

        trp = trpp.split('/')[-1]
        trpurl = show_url % trp
        c = curl.Curl(trpurl, silent=False)
        trphtml = c.result
        trpsoup = bs4.BeautifulSoup(trphtml, 'html.parser')
        trp_uniprot = trip_find_uniprot(trpsoup)

        if trp_uniprot is None or len(trp_uniprot) < 6:

            # NOTE(review): only logs and falls through; the tables below
            # are still processed with the invalid `trp_uniprot` — a
            # `continue` may be intended here. Confirm before changing.
            _log('Could not find UniProt for %s' % trp)

        # each evidence table is identified by its header cell colspan
        for tab in trpsoup.find_all('th', colspan=['11', '13']):

            ttl = titles[tab.text.strip()]
            tab = tab.find_parent('table')
            trip_process_table(tab, result[ttl], intrs, trp_uniprot)

    _log('Saving processed TRIP database content to `%s`' % cachefile)
    pickle.dump(result, open(cachefile, 'wb'))

    return result
def get_pubmed_data(pp, cachefile=None, htp_threshold=20):
    """
    For one PyPath object, obtains metadata for all PubMed IDs through
    NCBI E-utils.

    :param pp:
        ``pypath.PyPath`` object
    :param htp_threshold:
        The number of interactions for one reference above the study
        considered to be high-throughput.
    """

    if cachefile is None:

        cachefile = settings.get('pubmed_cache')

    if htp_threshold is not None:

        pp.htp_stats()

    # all PubMed IDs referenced by any edge of the network
    pubmeds = common.uniq_list(
        common.flat_list([[r.pmid for r in e['references']]
                          for e in pp.graph.es]))

    if htp_threshold is not None:

        # drop references classified as high-throughput
        pubmeds = set(pubmeds) - pp.htp[htp_threshold]['htrefs']

    notpmid = [i for i in pubmeds if not i.isdigit()]

    sys.stdout.write('\t:: Number of non PubMed ID references: %u\n' %
                     len(notpmid))

    pmdata = {}

    # load previously downloaded PubMed metadata if available
    if os.path.exists(cachefile):

        sys.stdout.write('\t:: Loading data previously downloaded '
                         'from PubMed, from file `%s`\n' % cachefile)
        pmdata = pickle.load(open(cachefile, 'rb'))

    # download only what is not cached yet
    missing = list(set(pubmeds) - set(pmdata.keys()))
    sys.stdout.write('\t:: Downloading data from PubMed about %s papers\n' %
                     len(missing))
    cached_pubmeds_len = len(pmdata)
    pmdata_new = pubmed_input.get_pubmeds(missing)
    pmdata.update(pmdata_new)

    sys.stdout.write('\t:: Saving PubMed data to file `%s`\n' % cachefile)

    # rewrite the cache only if anything new has been added
    if len(pmdata) > cached_pubmeds_len:

        pickle.dump(pmdata, open(cachefile, 'wb'))

    # restrict to the references of the current network
    pmdata = dict(i for i in pmdata.items() if i[0] in pubmeds)

    points = []
    earliest = []

    for e in pp.graph.es:

        for s, rs in iteritems(e['refs_by_source']):

            # references with usable publication dates, optionally
            # excluding high-throughput studies
            pms = [
                r.pmid for r in rs
                if (htp_threshold is None or
                    r.pmid not in pp.htp[htp_threshold]['htrefs']) and
                r.pmid in pmdata and 'pubdate' in pmdata[r.pmid]
            ]

            if len(pms) > 0:

                yrs = [int(pmdata[pm]['pubdate'][:4]) for pm in pms]
                # one record per (source, edge): the earliest year
                earliest.append((s, 0, min(yrs), '', e.index))

                # one record per (source, reference, edge)
                for pm in pms:

                    points.append((s, pm, int(pmdata[pm]['pubdate'][:4]),
                                   pmdata[pm]['source'], e.index))

    points = common.uniq_list(points)
    earliest = common.uniq_list(earliest)
    points = pd.DataFrame.from_records(points)
    earliest = pd.DataFrame.from_records(earliest)
    points.columns = ['database', 'pmid', 'year', 'journal', 'eid']
    earliest.columns = ['database', 'none', 'year', 'none', 'eid']

    return points, earliest
def __init__(self, class_definitions=None, excludes=None,
             excludes_extra=None, cellphonedb_categories=None,
             baccin_categories=None, hpmr_categories=None,
             surfaceome_categories=None, gpcrdb_categories=None,
             icellnet_categories=None, build=True,
             composite_resource_name=None, **kwargs):
    """
    Builds a database about roles of proteins and complexes in
    intercellular communication. The built-in category definitions
    defining the default contents of this database can be found in the
    ``pypath.core.intercell_annot`` module.

    :param tuple class_definitions:
        A series of annotation class definitions, each represented by
        an instance of ``pypath.internals.annot_formats.AnnotDef``.
        These definitions carry the attributes and instructions to
        populate the classes.
    :param dict excludes:
        A dict with parent category names (strings) or category keys
        (tuples) as keys and sets of identifiers as values. The
        identifiers in this dict will be excluded from all the
        respective categories while building the database. E.g. if the
        UniProt ID `P00533` (EGFR) is in the set under the key of
        `adhesion` it will be excluded from the category `adhesion` and
        all its direct children.
    :param dict excludes_extra:
        Same kind of dict as `excludes` but it will be added to the
        built-in default. The built in and the provided extra sets will
        be merged. If you want to overwrite or modify the built-in sets
        provide your custom dict as `excludes`.
    :param bool build:
        Execute the build upon instantiation or set up an empty object
        the build can be executed on later.
    """

    # avoid re-initializing the logger on repeated __init__ calls
    if not hasattr(self, '_log_name'):

        session.Logger.__init__(self, name='intercell')

    class_definitions = (
        class_definitions or intercell_annot.annot_combined_classes)
    excludes = (excludes or intercell_annot.excludes)

    # NOTE: `locals()` is captured here to look up the per-resource
    # `<res>_categories` arguments by name; it must be taken before any
    # further local variables are created
    locals_ = locals()

    # for each resource: the explicit argument wins, otherwise the
    # corresponding `intercell_<res>_categories` setting applies
    self._resource_categories = dict(
        (res, locals_['%s_categories' % res]
              if locals_['%s_categories' % res] is not None else
              settings.get('intercell_%s_categories' % res))
        for res in (
            'baccin',
            'cellphonedb',
            'hpmr',
            'surfaceome',
            'gpcrdb',
            'icellnet',
        ))

    annot.CustomAnnotation.__init__(
        self,
        class_definitions=class_definitions,
        excludes=excludes,
        excludes_extra=excludes_extra,
        build=build,
        composite_resource_name=composite_resource_name,
        **kwargs)
import datetime import collections import itertools import timeloop timeloop.app.logging.disable(level=9999) import pypath.resources.urls as urls import pypath.share.curl as curl import pypath.share.common as common import pypath.share.session as session_mod import pypath.share.settings as settings _logger = session_mod.Logger(name='uniprot_input') db = {} _cleanup_period = settings.get('mapper_cleanup_interval') _lifetime = 300 _last_used = {} _redatasheet = re.compile(r'([A-Z\s]{2})\s*([^\n\r]+)[\n\r]+') # regex for matching UniProt AC format # from https://www.uniprot.org/help/accession_numbers reac = re.compile(r'[OPQ][0-9][A-Z0-9]{3}[0-9]|' r'[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}') _rename = re.compile(r'Name=([\w\(\)-]+)\W') _retaxid = re.compile(r'=(\d+)[^\d]') def _all_uniprots(organism=9606, swissprot=None):
def _set_pickle_path(self):
    """
    Sets the path of the pickle cache file unless it is already defined:
    the default file name (from settings, parametrized by the organism)
    inside the module cache directory.
    """

    if not self._pickle_file:

        self._pickle_file = os.path.join(
            cache.get_cachedir(),
            settings.get('go_pickle_cache_fname') % self.organism,
        )
class Entity(session_mod.Logger):
    """
    Represents a molecular entity such as protein, miRNA, lncRNA or small
    molecule.

    :arg str identifier:
        An identifier from the reference database e.g. UniProt ID for
        proteins.
    :arg str entity_type:
        The type of the molecular entity, defaults to ``'protein'``.
    :arg str id_type:
        The type of the identifier (the reference database), default is
        ``'uniprot'``.
    :arg int taxon:
        The NCBI Taxonomy Identifier of the molecular entity, e.g. ``9606``
        for human. Use ``0`` for non taxon specific molecules e.g.
        metabolites or drug compounds.
    :arg NoneType,dict attrs:
        A dictionary of additional attributes.
    """

    __slots__ = [
        'identifier',
        'entity_type',
        'id_type',
        'taxon',
        'attrs',
        'label',
        'key',
    ]

    # default identifier types from the module settings
    _default_id_types = settings.get('default_name_types')

    # identifier types which unambiguously imply an entity type
    _id_type_to_entity_type = {
        'uniprot': 'protein',
        'genesymbol': 'protein',
        'mir-name': 'mirna',
        'mir-mat-name': 'mirna',
        'mir-pre': 'mirna',
        'mir-mat': 'mirna',
        'lncrna-genesymbol': 'lncrna',
    }

    # identifier types which are human readable labels
    _label_types = set(mapping.Mapper.label_type_to_id_type.keys())


    def __init__(
            self,
            identifier,
            entity_type = None,
            id_type = None,
            taxon = 9606,
            attrs = None,
        ):

        # copy constructor: anything exposing an `identifier` attribute
        # (e.g. another Entity) provides all four core fields
        if (
            isinstance(identifier, Entity) or
            hasattr(identifier, 'identifier')
        ):

            (
                identifier,
                entity_type,
                id_type,
                taxon,
            ) = (
                identifier.identifier,
                identifier.entity_type,
                identifier.id_type,
                identifier.taxon,
            )

        self._bootstrap(identifier, id_type, entity_type, taxon)
        self.key = self._key

        self.attrs = attrs or {}

        self.set_label()


    def reload(self):
        """
        Reloads the class definition from its module (for development).
        """

        modname = self.__class__.__module__
        mod = __import__(modname, fromlist = [modname.split('.')[0]])
        import importlib as imp
        imp.reload(mod)
        new = getattr(mod, self.__class__.__name__)
        setattr(self, '__class__', new)


    def _bootstrap(self, identifier, id_type, entity_type, taxon):
        """
        Infers missing identifier type and entity type, and normalizes
        the identifier (translating labels to primary identifiers).
        """

        if self._is_complex(identifier):

            entity_type = 'complex'
            id_type = 'complex'
            # Complex objects may carry their own organism
            taxon = (
                identifier.ncbi_tax_id
                    if hasattr(identifier, 'ncbi_tax_id') else
                taxon
            )

        taxon = taxon or settings.get('default_organism')

        if not entity_type:

            if id_type and id_type in self._id_type_to_entity_type:

                entity_type = self._id_type_to_entity_type[id_type]

        if not id_type:

            id_type, entity_type = mapping.guess_type(
                identifier,
                entity_type = entity_type,
            )

        # last resort: assume a protein gene symbol
        if not id_type and (not entity_type or entity_type == 'protein'):

            id_type, entity_type = 'genesymbol', 'protein'

        if id_type in self._label_types:

            # try to translate the label to a primary identifier
            _identifier = mapping.id_from_label0(
                label = identifier,
                label_id_type = id_type,
                ncbi_tax_id = taxon,
            )

            if _identifier and _identifier != identifier:

                id_type = mapping.mapper.label_type_to_id_type[id_type]
                identifier = _identifier

        if id_type == 'mir-pre':

            # precursor miRNA names are translated to miRBase IDs
            _identifier = mapping.map_name0(
                identifier,
                id_type,
                'mirbase',
                ncbi_tax_id = taxon,
            )

            if _identifier and _identifier != identifier:

                identifier = _identifier
                id_type = 'mirbase'

        entity_type = entity_type or self._get_entity_type(identifier)

        self.identifier = identifier
        self.id_type = id_type
        self.entity_type = entity_type
        self.taxon = taxon


    @staticmethod
    def entity_name_str(entity):

        return (
            entity
                if isinstance(entity, common.basestring) else
            str(entity)
        )


    @classmethod
    def igraph_vertex_name(cls, igraph_v):

        return cls.entity_name_str(igraph_v['name'])


    @staticmethod
    def igraph_vertex_label(igraph_v):

        return igraph_v['label']


    @classmethod
    def igraph_vertex_name_label(cls, igraph_v):

        return (
            cls.igraph_vertex_name(igraph_v),
            cls.igraph_vertex_label(igraph_v),
        )


    @staticmethod
    def _is_protein(key):

        return (
            isinstance(key, common.basestring) and
            not key.startswith('MIMAT') and
            not key.startswith('COMPLEX')
        )


    @staticmethod
    def _is_mirna(key):

        return (
            isinstance(key, common.basestring) and
            key.startswith('MIMAT')
        )


    @staticmethod
    def _is_complex(key):

        return key.__class__.__name__ == 'Complex' or (
            isinstance(key, common.basestring) and
            key.startswith('COMPLEX')
        )


    @classmethod
    def _get_entity_type(cls, key):

        return (
            'complex'
                if cls._is_complex(key) else
            'mirna'
                if cls._is_mirna(key) else
            'protein'
        )


    def is_protein(self):

        return self._is_protein(self.identifier)


    def is_mirna(self):

        return self._is_mirna(self.identifier)


    def is_complex(self):

        return self._is_complex(self.identifier)


    def get_entity_type(self):

        return self._get_entity_type(self.identifier)


    @classmethod
    def filter_entity_type(cls, entities, entity_type):
        """
        Filters an iterable of entities or identifiers keeping only
        the ones of type(s) in ``entity_type``.

        :param iterable entities:
            A list, set, tuple or other iterable yielding entities or
            identifiers.
        :param str,set entity_type:
            One or more entity types e.g. ``{'protein', 'mirna'}``.
        :returns:
            Same type of object as ``entities`` if the type of the
            object is list, set or tuple, otherwise a generator.
        """

        if not entity_type or not entities:

            return entities

        entity_type = common.to_set(entity_type)
        obj_type = (
            type(entities)
                if isinstance(entities, common.list_like) else
            lambda x: x
        )

        return obj_type(
            e
            for e in entities
            if cls._get_entity_type(e) in entity_type
        )


    @classmethod
    def only_proteins(cls, entities):

        return cls.filter_entity_type(entities, entity_type = 'protein')


    @classmethod
    def only_complexes(cls, entities):

        return cls.filter_entity_type(entities, entity_type = 'complex')


    @classmethod
    def only_mirnas(cls, entities):

        return cls.filter_entity_type(entities, entity_type = 'mirna')


    @classmethod
    def count_entity_type(cls, entities, entity_type):
        """
        Counts elements in an iterable of entities or identifiers of
        type(s) in ``entity_type``.

        :param iterable entities:
            A list, set, tuple or other iterable yielding entities or
            identifiers.
        :param str,set entity_type:
            One or more entity types e.g. ``{'protein', 'mirna'}``.
        :returns:
            int
        """

        entities = (
            entities
                if isinstance(entities, common.list_like) else
            list(entities)
        )

        return len(
            cls.filter_entity_type(
                entities,
                entity_type = entity_type,
            )
        )


    @property
    def _key(self):

        return EntityKey(
            identifier = self.identifier,
            id_type = self.id_type,
            entity_type = self.entity_type,
            taxon = self.taxon,
        )


    def __hash__(self):

        return hash(self.key)


    def __eq__(self, other):

        return (
            self.__hash__() == other.__hash__()
                if hasattr(other, 'key') else
            self.identifier == other
        )


    def __lt__(self, other):

        return (
            self.key < other.key
                if hasattr(other, 'key') else
            self.identifier < other
        )


    def __gt__(self, other):

        # Bug fix: both branches previously used `<`, making __gt__
        # behave identically to __lt__.
        return (
            self.key > other.key
                if hasattr(other, 'key') else
            self.identifier > other
        )


    def set_label(self):

        self.label = mapping.label(
            name = self.identifier,
            id_type = self.id_type,
            ncbi_tax_id = self.taxon,
        ) or self.identifier


    def __repr__(self):

        return '<Entity: %s>' % (self.label or self.identifier)


    def __iadd__(self, other):

        # merging attributes makes sense only for the same entity
        if self == other:

            self.update_attrs(**other.attrs)

        return self


    def update_attrs(self, **kwargs):
        """
        Merges the provided attributes into :py:attr:`attrs`, combining
        values for keys which already exist.
        """

        for key, val in iteritems(kwargs):

            if key in self.attrs:

                self.attrs[key] = common.combine_attrs(
                    (self.attrs[key], val)
                )

            else:

                self.attrs[key] = val


    @classmethod
    def info(cls, identifier):
        """
        Prints a datasheet (currently only for proteins).
        """

        if cls._is_protein(identifier):

            # Bug fix: the module was imported as `uniprot` while the
            # call below referenced `utils_uniprot`, raising NameError;
            # the alias now matches the usage.
            import pypath.utils.uniprot as utils_uniprot

            return utils_uniprot.info(identifier)
def _bootstrap(self, identifier, id_type, entity_type, taxon):
    """
    Infers missing identifier type and entity type, and normalizes the
    identifier (translating labels to primary identifiers). Sets the
    ``identifier``, ``id_type``, ``entity_type`` and ``taxon``
    attributes.
    """

    if self._is_complex(identifier):

        entity_type = 'complex'
        id_type = 'complex'
        # Complex objects may carry their own organism
        taxon = (
            identifier.ncbi_tax_id
                if hasattr(identifier, 'ncbi_tax_id') else
            taxon
        )

    taxon = taxon or settings.get('default_organism')

    if not entity_type:

        if id_type and id_type in self._id_type_to_entity_type:

            entity_type = self._id_type_to_entity_type[id_type]

    if not id_type:

        id_type, entity_type = mapping.guess_type(
            identifier,
            entity_type = entity_type,
        )

    # last resort: assume a protein gene symbol
    if not id_type and (not entity_type or entity_type == 'protein'):

        id_type, entity_type = 'genesymbol', 'protein'

    if id_type in self._label_types:

        # try to translate a human readable label
        # to a primary identifier
        _identifier = mapping.id_from_label0(
            label = identifier,
            label_id_type = id_type,
            ncbi_tax_id = taxon,
        )

        if _identifier and _identifier != identifier:

            id_type = mapping.mapper.label_type_to_id_type[id_type]
            identifier = _identifier

    if id_type == 'mir-pre':

        # precursor miRNA names are translated to miRBase IDs
        _identifier = mapping.map_name0(
            identifier,
            id_type,
            'mirbase',
            ncbi_tax_id = taxon,
        )

        if _identifier and _identifier != identifier:

            identifier = _identifier
            id_type = 'mirbase'

    entity_type = entity_type or self._get_entity_type(identifier)

    self.identifier = identifier
    self.id_type = id_type
    self.entity_type = entity_type
    self.taxon = taxon
def msigdb_download(
        registered_email=None,
        collection='msigdb',
        id_type='symbols',
        force_download=False,
    ):
    """
    Downloads and preprocesses a collection of gmt format gene sets from
    MSigDB. Returns dict of sets with gene set names as keys and
    molecular identifiers as values.

    :arg str,NoneType registered_email:
        An email address registered at MSigDB. If `None` the
        `msigdb_email` from ``pypath.settings`` will be used.
    :arg str collection:
        The name of the gene set collection. For available collections
        (e.g. `h.all` or `c2.cpg`) refer to the MSigDB website:
        http://software.broadinstitute.org/gsea/downloads.jsp#msigdb
        The default value `msigdb` contains all the genesets however you
        won't be able to distinguish which geneset comes from which
        collection. For this you need to download the collections one by
        one.
    :arg str id_type:
        MSigDB provides Gene Symbols (`symbols`) and Entrez Gene IDs
        (`entrez`).
    :arg bool force_download:
        Download even if cache content is available.
    """

    registered_email = registered_email or settings.get('msigdb_email')

    if not registered_email:

        _log('To download MSigDB you must provide an email address '
             'you have previously registered at '
             '`http://software.broadinstitute.org/gsea/register.jsp`. '
             'Could not proceed, returning empty dict.')

        return {}

    url = urls.urls['msigdb']['url'] % (
        collection,
        id_type,
    )

    req_headers_1 = []

    # dry run only to learn the cache file path for this URL
    c_nocall = curl.Curl(
        url,
        call=False,
        process=False,
        bypass_url_encoding=True,
    )

    # log in only if we actually need to download
    if (not os.path.exists(c_nocall.cache_file_name) or
            os.path.getsize(c_nocall.cache_file_name) == 0 or
            force_download):

        # step 1: obtain a session cookie from the login page
        c_login_1 = curl.Curl(
            urls.urls['msigdb']['login1'],
            cache=False,
            write_cache=False,
            follow=False,
            large=False,
            silent=True,
        )

        jsessionid = ''

        if hasattr(c_login_1, 'resp_headers'):

            for hdr in c_login_1.resp_headers:

                if hdr.lower().startswith(b'set-cookie'):

                    jsessionid = hdr.split(b':')[1].split(b';')[0].strip()
                    jsessionid = jsessionid.decode('ascii')

                    _log('msigdb cookie obtained: `%s`.' % jsessionid)

                    break

        if not jsessionid:

            _log('msigdb: could not get cookie, returning empty list.')

            return {}

        req_headers = ['Cookie: %s' % jsessionid]

        # step 2: log in with the registered email; MSigDB only checks
        # the email, the password field is not used for authentication
        # NOTE(review): '******' looks like a redacted placeholder —
        # confirm this is the value actually sent upstream
        c_login_2 = curl.Curl(
            urls.urls['msigdb']['login2'],
            cache=False,
            write_cache=False,
            large=False,
            silent=True,
            req_headers=req_headers,
            post={
                'j_username': registered_email,
                'j_password': '******',
            },
            follow=False,
            empty_attempt_again=False,
        )

        jsessionid_1 = ''

        if hasattr(c_login_2, 'resp_headers'):

            # NOTE(review): unlike the loop above there is no `break`
            # here, so the last set-cookie header wins — confirm this
            # is intended
            for hdr in c_login_2.resp_headers:

                if hdr.lower().startswith(b'set-cookie'):

                    jsessionid_1 = hdr.split(b':')[1].split(b';')[0].strip()
                    jsessionid_1 = jsessionid_1.decode('ascii')

                    _log('msigdb: logged in with email `%s`, '
                         'new cookie obtained: `%s`.' % (
                             registered_email, jsessionid_1))

        if not jsessionid_1:

            _log('msigdb: could not log in with email `%s`, '
                 'returning empty dict.' % registered_email)

            return {}

        req_headers_1 = ['Cookie: %s' % jsessionid_1]

    # the actual download (or cache read), using the authenticated cookie
    c = curl.Curl(
        url,
        req_headers=req_headers_1,
        silent=False,
        large=True,
        bypass_url_encoding=True,
        cache=not force_download,
    )

    result = {}

    # gmt format: name <tab> description <tab> gene1 <tab> gene2 ...
    for gset in c.result:

        gset = gset.strip().split('\t')

        result[gset[0]] = set(gset[2:])

    return result