コード例 #1
0
ファイル: genecards.py プロジェクト: rfour92/pypath
def genecards_datasheet(gene):
    """
    Downloads the GeneCards datasheet page of one gene (protein).

    :param str gene:
        A Gene Symbol or UniProt ID.

    :return:
        The content of the response (HTML) or ``None`` if the request
        finished with a status other than 0 or 200.
    """

    request = curl.Curl(
        urls.urls['genecards']['url'] % gene,
        silent = True,
        large = False,
        connect_timeout = settings.get('genecards_datasheet_connect_timeout'),
        timeout = settings.get('genecards_datasheet_timeout'),
    )

    # statuses 0 and 200 are the ones accepted as success
    if request.status in {0, 200}:

        return request.result

    _log('Failed to retrieve gene card for ID `%s`.' % gene)

    return None
コード例 #2
0
def _protein_datasheet(url):
    """
    Downloads a UniProt datasheet and splits it into its fields.

    At most three download attempts are made. The first one is allowed
    to use the cache; once an attempt yields an empty result or one that
    starts with ``<!DOCTYPE``, the remaining attempts bypass the cache.

    :param str url:
        URL of the datasheet.

    :return:
        All matches of the ``_redatasheet`` pattern in the downloaded
        content, or an empty list if nothing could be retrieved.
    """

    use_cache = True

    for _attempt in range(3):

        response = curl.Curl(
            url,
            silent=True,
            large=False,
            cache=use_cache,
            connect_timeout=(
                settings.get('uniprot_datasheet_connect_timeout')),
            timeout=settings.get('uniprot_datasheet_timeout'),
        )
        content = response.result

        if content and not content.startswith('<!DOCTYPE'):

            break

        # do not trust the cache for any further attempt
        use_cache = False

    if content:

        return _redatasheet.findall(content)

    _logger._log('Could not retrieve UniProt datasheet by URL `%s`.' % url)

    return []
コード例 #3
0
def tf_target_resources():
    """
    Provides the resource set for building the TF-target network dataset.

    If the ``dorothea_expand_levels`` setting is enabled, the
    transcription resources are passed through
    ``netres.dorothea_expand_levels`` with the levels taken from the
    ``tfregulons_levels`` setting; otherwise ``netres.transcription``
    is returned unchanged.
    """

    if not settings.get('dorothea_expand_levels'):

        return netres.transcription

    return netres.dorothea_expand_levels(
        resources=netres.transcription,
        levels=settings.get('tfregulons_levels'),
    )
コード例 #4
0
ファイル: uniprot.py プロジェクト: rfour92/pypath
def features_table(uniprot_ids,
                   *features,
                   width=40,
                   maxlen=None,
                   tablefmt='fancy_grid',
                   **kwargs):
    """
    Builds a formatted table of the requested features for a list of
    UniProt IDs.

    The data is gathered by ``collect`` and rendered by
    ``common.table_format``. The existing documentation names
    ``tabulate`` as the underlying formatting module, see
    https://github.com/astanin/python-tabulate for its options.

    :param uniprot_ids:
        UniProt IDs, passed to ``collect``.
    :param *features:
        Names of the features; if none given, ``default_features`` is used.
    :param width:
        Passed to ``common.table_format``.
    :param maxlen:
        Passed to ``common.table_format``; if not provided, the
        ``uniprot_info_maxlen`` setting is used.
    :param tablefmt:
        Passed to ``common.table_format``.
    :param **kwargs:
        Passed to ``tabulate.tabulate``.

    :return:
        The table as a string.
    """

    if not maxlen:

        maxlen = settings.get('uniprot_info_maxlen')

    if not features:

        features = default_features

    return common.table_format(
        collect(uniprot_ids, *features),
        width=width,
        maxlen=maxlen,
        tablefmt=tablefmt,
        **kwargs
    )
コード例 #5
0
    def __init__(self, license_dir=None):
        """
        Sets up the logger, the license directory and an empty license
        registry, then calls ``populate``.

        :param license_dir:
            Directory of the licenses; if not provided, the
            ``license_dir`` setting is used.
        """

        session.Logger.__init__(self, name='licenses')

        if not license_dir:

            license_dir = settings.get('license_dir')

        self.license_dir = license_dir
        self.licenses = {}
        self.populate()
コード例 #6
0
    def __init__(
        self,
        members,
        name=None,
        parent=None,
        aspect='functional',
        source='resource_specific',
        scope='specific',
        resource=None,
        transmitter=None,
        receiver=None,
        limit=None,
        avoid=None,
        enabled=True,
    ):
        """
        Stores the members and the descriptive attributes of the group.

        Defaults applied here: ``name`` falls back to ``'unnamed'``,
        ``parent`` to the name, ``resource`` to the
        ``annot_composite_database_name`` setting and finally to
        ``'Unknown'``. ``limit`` and ``avoid`` are converted by
        ``common.to_set``; ``members`` is copied into a new ``set``.
        """

        collections_abc.Set.__init__(self)

        if not name:

            name = 'unnamed'

        if not resource:

            resource = (settings.get('annot_composite_database_name')
                        or 'Unknown')

        self.members = set(members)
        self.name = name
        self.parent = parent or name
        self.aspect = aspect
        self.source = source
        self.scope = scope
        self.resource = resource
        self.transmitter = transmitter
        self.receiver = receiver
        self.limit = common.to_set(limit)
        self.avoid = common.to_set(avoid)
        self.enabled = enabled
コード例 #7
0
ファイル: log.py プロジェクト: rfour92/pypath
def new_logger(name=None, logdir=None, verbosity=None, **kwargs):
    """
    Returns a new logger with default settings (can be customized).

    Parameters
    ----------
    name : str
        Custom name for the log. Defaults to the ``module_name`` setting.
    logdir : str
        Path to the directory to store log files. Defaults to
        ``<name>_log``.
    verbosity : int
        Verbosity level, lowest is 0. Messages from levels above this
        won't be written to the log. If not provided, 0 is used.
    **kwargs
        Passed to ``Logger``.

    Returns
    -------
    ``log.Logger`` instance.
    """

    name = name or settings.get('module_name')
    logdir = logdir or '%s_log' % name
    # the timestamp becomes part of the file name: no spaces or colons
    timestamp = Logger.timestamp().replace(' ', '_').replace(':', '.')

    return Logger(
        fname='%s__%s.log' % (name, timestamp),
        # previously the ``verbosity`` argument was silently ignored;
        # 0 stays the default to keep the behaviour of existing callers
        verbosity=0 if verbosity is None else verbosity,
        logdir=logdir,
        **kwargs
    )
コード例 #8
0
ファイル: export.py プロジェクト: rfour92/pypath
    def webservice_interactions_df_legacy(self):
        """
        Calls ``make_df`` with the node and edge attribute definitions of
        the legacy web service interactions table.

        Resource name sets are collected from the ``data_formats``
        resource groups and are used inside the edge attribute callbacks
        to decide which dataset flags apply to an edge.
        """

        # names of the resources belonging to each dataset
        sources_omnipath = set(f.name for f in data_formats.omnipath.values())
        # NOTE(review): used below as operand of `&` with a set, hence
        # presumably a set of resource names -- confirm in settings
        sources_extra_directions = settings.get('network_extra_directions')
        sources_kinase_extra = set(f.name
                                   for f in data_formats.ptm_misc.values())
        sources_ligrec_extra = set(
            f.name for f in data_formats.ligand_receptor.values())
        sources_pathway_extra = set(
            f.name for f in data_formats.pathway_noref.values())
        sources_mirna = set(f.name for f in data_formats.mirna_target.values())

        # The callbacks below receive `e` and, most of them, `d`;
        # presumably the edge and the direction key -- confirm in `make_df`.
        # String values are presumably names of edge attributes.
        self.make_df(
            unique_pairs=False,
            extra_node_attrs={'ncbi_tax_id': 'ncbi_tax_id'},
            extra_edge_attrs={
                # a PPI edge supported in this direction by an omnipath
                # resource, or undirected by an omnipath resource and in
                # this direction by an extra directions resource
                'omnipath':
                lambda e, d:
                ((bool(e['dirs'].sources[d] & sources_omnipath) or
                  (bool(e['dirs'].sources['undirected'] & sources_omnipath) and
                   bool(e['dirs'].sources[d] & sources_extra_directions))) and
                 'PPI' in e['type']),
                'kinaseextra':
                lambda e, d: (bool(e['dirs'].sources[d] & sources_kinase_extra)
                              and 'PPI' in e['type']),
                'ligrecextra':
                lambda e, d: (bool(e['dirs'].sources[d] & sources_ligrec_extra)
                              and 'PPI' in e['type']),
                # note: this one tests for the 'TF' type, not 'PPI'
                'pathwayextra':
                lambda e, d: (bool(e['dirs'].sources[d] & sources_pathway_extra
                                   ) and 'TF' in e['type']),
                'mirnatarget':
                lambda e, d: (bool(e['dirs'].sources[d] & sources_mirna) and
                              'MTI' in e['type']),
                # any 'TF' type source supports this direction
                'dorothea':
                lambda e, d: ('TF' in e['sources_by_type'] and bool(e[
                    'sources_by_type']['TF'] & e['dirs'].sources[d])),
                'dorothea_curated':
                'dorothea_curated',
                'dorothea_chipseq':
                'dorothea_chipseq',
                'dorothea_tfbs':
                'dorothea_tfbs',
                'dorothea_coexp':
                'dorothea_coexp',
                # sorted levels joined by semicolon, or empty string if
                # the edge has no levels or no 'TF' source in this direction
                'dorothea_level':
                lambda e, d:
                (';'.join(sorted(e['dorothea_level'])) if 'dorothea_level' in e
                 .attributes() and 'TF' in e['sources_by_type'] and bool(e[
                     'sources_by_type']['TF'] & e['dirs'].sources[d]) else ''),
                # quite wrong (taking only the first one):
                'type':
                lambda e: e['type'][0],
                # curation effort of the direction; for tuple direction
                # keys the undirected curation effort is added
                'curation_effort':
                lambda e, d: (e.count_curation_effort(direction=d) +
                              (e.count_curation_effort(direction='undirected')
                               if isinstance(d, tuple) else 0)),
            })
コード例 #9
0
ファイル: log.py プロジェクト: rfour92/pypath
    def __init__(
        self,
        fname,
        verbosity=None,
        console_level=None,
        logdir=None,
        max_width=200,
    ):
        """
        Sets up the logger: registers a periodic flush job, opens the log
        file and writes the welcome messages.

        fname : str
            Log file name.
        logdir : str
            Path to the directory containing the log files.
        verbosity : int
            Messages at and below this level will be written into the
            logfile. All other messages will be dropped. If not provided,
            the ``log_verbosity`` setting is used.
        console_level : int
            Messages below this log level will be printed not only into
            logfile but also to the console. If not provided, the
            ``console_verbosity`` setting is used.
        max_width : int
            Width passed to the ``textwrap.TextWrapper`` stored in
            ``self.wrapper``.
        """
        # periodic job calling `self.flush`; the interval in seconds
        # comes from the `log_flush_interval` setting
        @_log_flush_timeloop.job(interval=datetime.timedelta(
            seconds=settings.get('log_flush_interval')))
        def _flush():

            self.flush()

        # NOTE(review): the timeloop is started for every new instance --
        # confirm that repeated `start` calls are harmless
        _log_flush_timeloop.start(block=False)

        self.wrapper = textwrap.TextWrapper(
            width=max_width,
            subsequent_indent=' ' * 22,
            break_long_words=False,
        )
        # `get_logdir` returns the path of the log directory
        self.logdir = self.get_logdir(logdir)
        self.fname = os.path.join(self.logdir, fname)
        # explicit `is not None` tests: level 0 is a valid value
        self.verbosity = (verbosity if verbosity is not None else
                          settings.get('log_verbosity'))
        self.console_level = (console_level if console_level is not None else
                              settings.get('console_verbosity'))
        self.open_logfile()

        # sending some greetings
        self.msg('Welcome!')
        self.msg('Logger started, logging into `%s`.' % self.fname)
コード例 #10
0
def get_cachedir(cachedir=None):
    """
    Returns the path of the cache directory, creating it if necessary.

    :param cachedir:
        Path of the cache directory; if not provided, the ``cachedir``
        setting is used.
    """

    path = cachedir if cachedir else settings.get('cachedir')
    os.makedirs(path, exist_ok=True)

    return path
コード例 #11
0
ファイル: app.py プロジェクト: jgray7700/pypath
    def get_param(self, key):
        """
        Looks up a parameter: the :py:attr:`param` dict of this object
        takes precedence, the module settings are the fallback.
        """

        return self.param[key] if key in self.param else settings.get(key)
コード例 #12
0
ファイル: app.py プロジェクト: jgray7700/pypath
    def __init__(self, rebuild=False, **kwargs):
        """
        Initializes the database manager.

        :param bool rebuild:
            Stored in the ``rebuild`` attribute.
        :param **kwargs:
            Stored as the ``param`` dict; ``get_param`` looks up
            parameters there before falling back to the settings.
        """

        session_mod.Logger.__init__(self, name='omnipath.dbmanager')

        self.param = kwargs
        self.rebuild = rebuild
        timestamp_format = settings.get('timestamp_format')
        self.timestamp = time.strftime(timestamp_format)
        self.datasets = self.get_param('datasets')
        self.ensure_dirs()
        self.network_dfs = {}

        self._log('The OmniPath database manager has been initialized.')
コード例 #13
0
ファイル: input_formats.py プロジェクト: jgray7700/pypath
    def __init__(
        self,
        type_,
        id_type_a,
        id_type_b,
        ncbi_tax_id=None,
    ):
        """
        Stores the type, the two ID types and the organism.

        :param ncbi_tax_id:
            If not provided, the ``default_organism`` setting is used.
        """

        if not ncbi_tax_id:

            ncbi_tax_id = settings.get('default_organism')

        self.type = type_
        self.id_type_a, self.id_type_b = id_type_a, id_type_b
        self.ncbi_tax_id = ncbi_tax_id
コード例 #14
0
ファイル: log.py プロジェクト: rfour92/pypath
    def get_logdir(self, dirname=None):
        """
        Returns the absolute path to the log directory.
        Also creates the directory if it does not exist.

        :param dirname:
            Path of the log directory; if not provided, ``<module_name>_log``
            is used, with the module name taken from the settings.
        """

        dirname = dirname or '%s_log' % settings.get('module_name')

        # `exist_ok` avoids the race between checking for the directory
        # and creating it, e.g. when several processes start at once
        os.makedirs(dirname, exist_ok=True)

        return os.path.abspath(dirname)
コード例 #15
0
    def __init__(self, input_args=None, **kwargs):
        """
        Sets up the CellChatDB complex resource.

        :param dict input_args:
            Arguments for the input method. If it has no ``organism`` key,
            the ``default_organism`` setting is added. The dict provided
            by the caller is not modified.
        :param **kwargs:
            Accepted but not used by this method.
        """

        # work on a copy: do not add keys to the dict owned by the caller
        input_args = dict(input_args or {})

        if 'organism' not in input_args:

            input_args['organism'] = settings.get('default_organism')

        AbstractComplexResource.__init__(
            self,
            name='CellChatDB',
            input_method='cellchatdb.cellchatdb_complexes',
            input_args=input_args,
        )
コード例 #16
0
ファイル: complex.py プロジェクト: rfour92/pypath
    def __init__(self, input_args = None, **kwargs):
        """
        Sets up the SIGNOR complex resource.

        :param dict input_args:
            Arguments for the input method. If it has no ``organism`` key,
            the ``default_organism`` setting is added. The dict provided
            by the caller is not modified.
        :param **kwargs:
            Accepted but not used by this method.
        """

        # work on a copy: do not add keys to the dict owned by the caller
        input_args = dict(input_args or {})

        if 'organism' not in input_args:

            input_args['organism'] = settings.get('default_organism')

        AbstractComplexResource.__init__(
            self,
            name = 'SIGNOR',
            input_method = 'signor.signor_complexes',
            input_args = input_args,
        )
コード例 #17
0
ファイル: reflists.py プロジェクト: rfour92/pypath
    def which_list(self, id_type, ncbi_tax_id=None):
        """
        Returns the list stored under the key ``(id_type, ncbi_tax_id)``.

        The current time is recorded in ``expiry`` for the key. If the
        list is not available yet, ``load`` is called for the key first.
        Returns ``None`` if the list is still missing after that.

        :param id_type:
            First element of the key.
        :param ncbi_tax_id:
            Second element of the key; if not provided, the
            ``default_organism`` setting is used.
        """

        if not ncbi_tax_id:

            ncbi_tax_id = settings.get('default_organism')

        key = (id_type, ncbi_tax_id)
        self.expiry[key] = time.time()

        if key not in self.lists:

            self.load(key)

        return self.lists[key] if key in self.lists else None
コード例 #18
0
    def __new__(
        cls,
        name,
        resource,
        parent=None,
        aspect='functional',
        scope='specific',
        source='resource_specific',
        args=None,
        exclude=None,
        transmitter=None,
        receiver=None,
        resource_name=None,
        limit=None,
        avoid=None,
        enabled=True,
    ):
        """
        Creates the instance after applying the defaults.

        ``resource_name`` is ``resource`` itself if ``_is_resource_name``
        accepts it; otherwise the provided ``resource_name``, then the
        ``annot_composite_database_name`` setting, then ``'Unknown'``.
        ``parent`` falls back to ``name``; ``limit`` and ``avoid`` are
        processed by ``_zero_one_or_more``.
        """

        if cls._is_resource_name(resource):

            resource_name = resource

        else:

            resource_name = (
                resource_name
                or settings.get('annot_composite_database_name')
                or 'Unknown'
            )

        fields = dict(
            name=name,
            resource=resource,
            parent=parent or name,
            aspect=aspect,
            scope=scope,
            source=source,
            args=args,
            exclude=exclude,
            transmitter=transmitter,
            receiver=receiver,
            resource_name=resource_name,
            limit=cls._zero_one_or_more(limit),
            avoid=cls._zero_one_or_more(avoid),
            enabled=enabled,
        )

        return super().__new__(cls, **fields)
コード例 #19
0
    def __init__(
        self,
        total=None,
        name='Progress',
        interval=None,
        percent=True,
        status='initializing',
        done=0,
        init=True,
        unit='it',
        off=None,
        iterable=None,
    ):
        """
        Stores the state of the progress bar and, unless disabled,
        initializes it by calling ``init_tqdm``.

        :param off:
            Disables the progress bar; if ``None``, the negated
            ``progressbars`` setting is used.
        :param interval:
            If ``None``, one hundredth of ``total`` (at least 1) is used
            for numeric ``total``, otherwise 1.
        :param percent:
            Not used in this method.
        """

        self.off = (not settings.get('progressbars')) if off is None else off

        if interval is not None:

            self.interval = interval

        elif isinstance(total, (int, float)):

            self.interval = max(int(total / 100), 1)

        else:

            self.interval = 1

        self.name = name
        self.total = total
        self.iterable = iterable
        self.done = done
        self.status = status
        self.unit = unit
        self.start_time = time.time()
        self.min_update_interval = 0.1
        self.last_printed_value = 0

        if not self.off and init:

            self.init_tqdm()
コード例 #20
0
def dorothea_expand_levels(resources=None, levels=None):
    """
    In a dictionary of resource definitions creates a separate
    ``NetworkResource`` object for each confidence level of DoRothEA
    just as if each level was a different resource.

    No matter if ``resources`` is a ``NetworkResource`` or a dict of
    network resources, always returns a dict of network resources.

    :param resources:
        A dict of network resources or one single resource (an object
        with a ``networkinput`` attribute). Defaults to ``transcription``.
        The objects passed are not modified, copies are created.
    :param levels:
        The confidence levels; defaults to the ``tfregulons_levels``
        setting.

    :return:
        For a dict input: a copy of the dict with the ``dorothea`` item
        replaced by the level specific ``dorothea_<level>`` items.
        For a single resource: a dict of the level specific items only.
    """

    resources = resources or transcription
    levels = levels or settings.get('tfregulons_levels')
    # a single resource definition has no dict interface, hence it must
    # not reach the `pop` and `update` calls below
    single_resource = hasattr(resources, 'networkinput')

    if single_resource:

        dorothea_original = resources

    elif 'dorothea' in resources:

        dorothea_original = resources['dorothea']

    else:

        dorothea_original = transcription['dorothea']

    dorothea = {}

    for level in levels:

        level_resource = copy.deepcopy(dorothea_original)
        level_resource.name = 'DoRothEA_%s' % level
        level_resource.networkinput.name = 'DoRothEA_%s' % level
        level_resource.networkinput.input_args = {'levels': {level}}
        dorothea['dorothea_%s' % level] = level_resource

    if single_resource or not resources:

        return dorothea

    expanded = copy.deepcopy(resources)
    expanded.pop('dorothea', None)
    expanded.update(dorothea)

    return expanded
コード例 #21
0
ファイル: uniprot.py プロジェクト: rfour92/pypath
def print_features(uniprot_ids,
                   *features,
                   fileobj=None,
                   width=None,
                   maxlen=None,
                   tablefmt='fancy_grid',
                   **kwargs):
    """
    Prints a table with the requested features of a list of UniProt IDs.
    The underlying table formatting module is ``tabulate``, a versatile
    module to export various ascii tables as well as HTML or LaTeX --
    check the docs for formatting options:
    https://github.com/astanin/python-tabulate

    :param uniprot_ids:
        UniProt IDs, passed to ``features_table``.
    :param *features:
        Names of the features; if none given, ``default_features`` is used.
    :param fileobj:
        File like object to write the table to; ``sys.stdout`` by default.
    :param width:
        Column width. If not provided, it is derived from the width of
        the terminal; if that is not available or too narrow, 40 is used.
    :param maxlen:
        If not provided, the ``uniprot_info_maxlen`` setting is used.
    :param tablefmt:
        Passed to ``features_table``.
    :param **kwargs:
        Passed to ``tabulate.tabulate``.

    :return:
        None.
    """

    maxlen = maxlen or settings.get('uniprot_info_maxlen')
    features = features or default_features
    fileobj = fileobj or sys.stdout

    # The terminal is consulted only if the width has not been provided:
    # an explicit width is always respected and printing into a file from
    # a process without terminal does not fail.
    if not width:

        try:

            columns = os.get_terminal_size().columns

        except OSError:

            # stdout is not attached to a terminal
            columns = 0

        term_width = (columns - 120) * 2 + 100 if columns else 0
        # narrow terminals result zero or negative values here,
        # these are replaced by the default width
        width = (
            max(int(term_width / len(features)), 1)
                if term_width > 0 else
            40
        )

    fileobj.write(
        features_table(uniprot_ids,
                       *features,
                       width=width,
                       maxlen=maxlen,
                       tablefmt=tablefmt,
                       **kwargs))
    fileobj.write(os.linesep)
    fileobj.flush()
コード例 #22
0
ファイル: trip.py プロジェクト: rfour92/pypath
def take_a_trip(cachefile=None):
    """
    Downloads TRIP data from webpage and preprocesses it.
    Saves preprocessed data into `cachefile` and next
    time loads from this file.

    :arg cachefile str:
        Path to pickle dump of preprocessed TRIP database. If does not exist
        the database will be downloaded and saved to this file. By default
        the path queried from the ``settings`` module.

    :return:
        A dict with the keys ``sc``, ``cc``, ``vvc``, ``vtc`` and ``fc``,
        each of them a dict populated by ``trip_process_table``.
    """

    cachefile = cachefile or settings.get('trip_preprocessed')

    if os.path.exists(cachefile):
        _log('Loading preprocessed TRIP database '
             'content from `%s`' % cachefile)

        # NOTE: pickle must be loaded only from trusted files; here the
        # file is expected to be the cache written by this function
        with open(cachefile, 'rb') as fp:
            result = pickle.load(fp)

        return result

    _log('No cache found, downloading and preprocessing TRIP database.')

    result = {'sc': {}, 'cc': {}, 'vvc': {}, 'vtc': {}, 'fc': {}}
    intrs = {}
    # titles of the tables on the webpage and their keys in the result
    titles = {
        'Characterization': 'cc',
        'Screening': 'sc',
        'Validation: In vitro validation': 'vtc',
        'Validation: In vivo validation': 'vvc',
        'Functional consequence': 'fc',
    }

    base_url = urls.urls['trip']['base']
    show_url = urls.urls['trip']['show']
    c = curl.Curl(base_url)
    mainhtml = c.result
    mainsoup = bs4.BeautifulSoup(mainhtml, 'html.parser')
    # links collected from the nested lists of the `trp_selector` element
    trppages = common.flat_list(
        [[a.attrs['href'] for a in ul.find_all('a')] for ul in mainsoup.find(
            'div', id='trp_selector').find('ul').find_all('ul')])

    for trpp in trppages:
        trp = trpp.split('/')[-1]
        trpurl = show_url % trp
        c = curl.Curl(trpurl, silent=False)
        trphtml = c.result
        trpsoup = bs4.BeautifulSoup(trphtml, 'html.parser')
        trp_uniprot = trip_find_uniprot(trpsoup)

        # only logged: the tables are processed even without a UniProt ID
        if trp_uniprot is None or len(trp_uniprot) < 6:
            _log('Could not find UniProt for %s' % trp)

        for tab in trpsoup.find_all('th', colspan=['11', '13']):
            ttl = titles[tab.text.strip()]
            tab = tab.find_parent('table')
            trip_process_table(tab, result[ttl], intrs, trp_uniprot)

    _log('Saving processed TRIP database content to `%s`' % cachefile)

    with open(cachefile, 'wb') as fp:
        pickle.dump(result, fp)

    return result
コード例 #23
0
def get_pubmed_data(pp, cachefile=None, htp_threshold=20):
    """
    For one PyPath object, obtains metadata for all PubMed IDs
    through NCBI E-utils.

    :param pp:
        ``pypath.PyPath`` object
    :param cachefile:
        Path to the pickle file where the data downloaded from PubMed is
        cached. By default the ``pubmed_cache`` setting is used.
    :param htp_threshold:
        The number of interactions for one reference
        above the study considered to be high-throughput.
        If ``None``, high-throughput references are not removed.

    :return:
        Two data frames: the first with one record for each database,
        reference and edge (columns: database, pmid, year, journal, eid);
        the second with the earliest year for each database and edge.
    """

    if cachefile is None:

        cachefile = settings.get('pubmed_cache')

    if htp_threshold is not None:
        pp.htp_stats()

    pubmeds = common.uniq_list(
        common.flat_list([[r.pmid for r in e['references']]
                          for e in pp.graph.es]))

    # the high-throughput references are looked up only once
    htrefs = (
        pp.htp[htp_threshold]['htrefs']
            if htp_threshold is not None else
        None
    )

    if htrefs is not None:
        pubmeds = set(pubmeds) - htrefs

    # a set for the membership tests below
    pubmeds = set(pubmeds)

    notpmid = [i for i in pubmeds if not i.isdigit()]

    sys.stdout.write('\t:: Number of non PubMed ID references: %u\n' %
                     len(notpmid))

    pmdata = {}
    if os.path.exists(cachefile):
        sys.stdout.write('\t:: Loading data previously downloaded '
                         'from PubMed, from file `%s`\n' % cachefile)

        # NOTE: pickle must be loaded only from trusted files; here the
        # file is expected to be the cache written by this function
        with open(cachefile, 'rb') as fp:
            pmdata = pickle.load(fp)

    missing = list(pubmeds - set(pmdata.keys()))
    sys.stdout.write('\t:: Downloading data from PubMed about %s papers\n' %
                     len(missing))
    cached_pubmeds_len = len(pmdata)
    pmdata_new = pubmed_input.get_pubmeds(missing)
    pmdata.update(pmdata_new)

    # the cache is written (and this reported) only if there is new data
    if len(pmdata) > cached_pubmeds_len:
        sys.stdout.write('\t:: Saving PubMed data to file `%s`\n' % cachefile)

        with open(cachefile, 'wb') as fp:
            pickle.dump(pmdata, fp)

    pmdata = dict(i for i in pmdata.items() if i[0] in pubmeds)

    points = []
    earliest = []

    for e in pp.graph.es:

        for s, rs in iteritems(e['refs_by_source']):

            pms = [
                r.pmid for r in rs
                if (htrefs is None or r.pmid not in htrefs)
                and r.pmid in pmdata and 'pubdate' in pmdata[r.pmid]
            ]
            if len(pms) > 0:
                # the first 4 characters of `pubdate` are used as the year
                yrs = [int(pmdata[pm]['pubdate'][:4]) for pm in pms]
                earliest.append((s, 0, min(yrs), '', e.index))
                for pm in pms:
                    points.append((s, pm, int(pmdata[pm]['pubdate'][:4]),
                                   pmdata[pm]['source'], e.index))

    points = common.uniq_list(points)
    earliest = common.uniq_list(earliest)

    # NOTE(review): with no records at all the column assignments below
    # presumably fail due to the length mismatch -- confirm
    points = pd.DataFrame.from_records(points)
    earliest = pd.DataFrame.from_records(earliest)
    points.columns = ['database', 'pmid', 'year', 'journal', 'eid']
    earliest.columns = ['database', 'none', 'year', 'none', 'eid']

    return points, earliest
コード例 #24
0
ファイル: intercell.py プロジェクト: rfour92/pypath
    def __init__(self,
                 class_definitions=None,
                 excludes=None,
                 excludes_extra=None,
                 cellphonedb_categories=None,
                 baccin_categories=None,
                 hpmr_categories=None,
                 surfaceome_categories=None,
                 gpcrdb_categories=None,
                 icellnet_categories=None,
                 build=True,
                 composite_resource_name=None,
                 **kwargs):
        """
        Sets up a database about the roles of proteins and complexes in
        intercellular communication. The existing documentation locates
        the built-in category definitions in the
        ``pypath.core.intercell_annot`` module.

        :param tuple class_definitions:
            A series of annotation class definitions, each represented by
            an instance of ``pypath.internals.annot_formats.AnnotDef``.
            These definitions carry the attributes and instructions to
            populate the classes. Defaults to
            ``intercell_annot.annot_combined_classes``.
        :param dict excludes:
            A dict with parent category names (strings) or category keys
            (tuples) as keys and sets of identifiers as values.
            The identifiers in this dict will be excluded from all the
            respective categories while building the database. E.g. if
            the UniProt ID `P00533` (EGFR) is in the set under the key of
            `adhesion` it will be excluded from the category `adhesion` and
            all its direct children. Defaults to ``intercell_annot.excludes``.
        :param dict excludes_extra:
            Same kind of dict as `excludes` but it will be added to the
            built-in default. The built in and the provided extra sets
            will be merged. If you want to overwrite or modify the built-in
            sets provide your custom dict as `excludes`.
        :param cellphonedb_categories:
            Categories for this resource, just like the other
            ``*_categories`` arguments. For each resource, if ``None``,
            the ``intercell_<resource>_categories`` setting is used.
        :param bool build:
            Execute the build upon instantiation or set up an empty object
            the build can be executed on later.
        :param composite_resource_name:
            Passed to ``annot.CustomAnnotation.__init__``.
        :param **kwargs:
            Passed to ``annot.CustomAnnotation.__init__``.
        """

        if not hasattr(self, '_log_name'):

            session.Logger.__init__(self, name='intercell')

        if not class_definitions:

            class_definitions = intercell_annot.annot_combined_classes

        if not excludes:

            excludes = intercell_annot.excludes

        # the categories provided by the caller for each resource
        provided_categories = (
            ('baccin', baccin_categories),
            ('cellphonedb', cellphonedb_categories),
            ('hpmr', hpmr_categories),
            ('surfaceome', surfaceome_categories),
            ('gpcrdb', gpcrdb_categories),
            ('icellnet', icellnet_categories),
        )

        self._resource_categories = {
            res: (
                categories
                    if categories is not None else
                settings.get('intercell_%s_categories' % res)
            )
            for res, categories in provided_categories
        }

        annot.CustomAnnotation.__init__(
            self,
            class_definitions=class_definitions,
            excludes=excludes,
            excludes_extra=excludes_extra,
            build=build,
            composite_resource_name=composite_resource_name,
            **kwargs)
コード例 #25
0
import datetime
import collections
import itertools
import timeloop
timeloop.app.logging.disable(level=9999)

import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.share.common as common
import pypath.share.session as session_mod
import pypath.share.settings as settings

# logger of this module
_logger = session_mod.Logger(name='uniprot_input')

# module level storage
# NOTE(review): `db`, `_lifetime` and `_last_used` are not used in the
# visible part of the file; presumably a cache of loaded data with the time
# of last use and a lifetime in seconds -- confirm against the users
db = {}
_cleanup_period = settings.get('mapper_cleanup_interval')
_lifetime = 300
_last_used = {}

# two characters of uppercase letters or whitespace, then optional
# whitespace and the rest of the line; used for splitting datasheets
_redatasheet = re.compile(r'([A-Z\s]{2})\s*([^\n\r]+)[\n\r]+')

# regex for matching UniProt AC format
# from https://www.uniprot.org/help/accession_numbers
reac = re.compile(r'[OPQ][0-9][A-Z0-9]{3}[0-9]|'
                  r'[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}')
# captures the value after `Name=`: word characters, parentheses and
# hyphens, up to the first other character
_rename = re.compile(r'Name=([\w\(\)-]+)\W')
# captures a series of digits between `=` and a non-digit character
_retaxid = re.compile(r'=(\d+)[^\d]')


def _all_uniprots(organism=9606, swissprot=None):
コード例 #26
0
ファイル: go.py プロジェクト: rfour92/pypath
    def _set_pickle_path(self):
        """
        Sets the default path of the pickle file if no path is set yet.

        The default is in the cache directory and its name is created by
        inserting the organism into the ``go_pickle_cache_fname`` setting.
        """

        if not self._pickle_file:

            fname = settings.get('go_pickle_cache_fname') % self.organism
            self._pickle_file = os.path.join(cache.get_cachedir(), fname)
コード例 #27
0
class Entity(session_mod.Logger):
    """
    Represents a molecular entity such as protein, miRNA, lncRNA or small
    molecule.

    :arg str identifier:
        An identifier from the reference database e.g. UniProt ID for
        proteins.
    :arg str entity_type:
        The type of the molecular entity, defaults to ``'protein'``.
    :arg str id_type:
        The type of the identifier (the reference database), default is
        ``'uniprot'``.
    :arg int taxon:
        The NCBI Taxonomy Identifier of the molecular entity, e.g. ``9606``
        for human. Use ``0`` for non taxon specific molecules e.g. metabolites
        or drug compounds.
    :arg NoneType,dict attrs:
        A dictionary of additional attributes.
    """

    __slots__ = [
        'identifier',
        'entity_type',
        'id_type',
        'taxon',
        'attrs',
        'label',
        'key',
    ]


    _default_id_types = settings.get('default_name_types')

    _id_type_to_entity_type = {
        'uniprot': 'protein',
        'genesymbol': 'protein',
        'mir-name': 'mirna',
        'mir-mat-name': 'mirna',
        'mir-pre': 'mirna',
        'mir-mat': 'mirna',
        'lncrna-genesymbol': 'lncrna',
    }

    _label_types = set(mapping.Mapper.label_type_to_id_type.keys())


    def __init__(
            self,
            identifier,
            entity_type = None,
            id_type = None,
            taxon = 9606,
            attrs = None,
        ):
        """
        Creates the entity from an identifier or from another entity.

        If ``identifier`` is an ``Entity`` or any object with an
        ``identifier`` attribute, its identifier, entity type, ID type and
        taxon override the respective arguments.
        """

        if (
            isinstance(identifier, Entity) or
            hasattr(identifier, 'identifier')
        ):

            other = identifier
            identifier = other.identifier
            entity_type = other.entity_type
            id_type = other.id_type
            taxon = other.taxon

        self._bootstrap(identifier, id_type, entity_type, taxon)
        self.key = self._key

        self.attrs = attrs or {}

        self.set_label()


    def reload(self):
        """
        Reloads the module of the class of this object and switches the
        class of the object to the reloaded one.
        """

        import importlib

        module_name = self.__class__.__module__
        module = __import__(
            module_name,
            fromlist = [module_name.split('.')[0]],
        )
        importlib.reload(module)
        self.__class__ = getattr(module, self.__class__.__name__)


    def _bootstrap(self, identifier, id_type, entity_type, taxon):
        """
        Resolves the identifier, its type, the entity type and the taxon,
        and stores them in the attributes of the object.

        Missing ID and entity types are inferred; identifiers of label
        types are translated by the ``mapping`` module.
        """

        # complexes: the types are fixed and the taxon comes from the
        # complex itself if it carries one
        if self._is_complex(identifier):

            entity_type = 'complex'
            id_type = 'complex'
            taxon = (
                identifier.ncbi_tax_id
                    if hasattr(identifier, 'ncbi_tax_id') else
                taxon
            )

        taxon = taxon or settings.get('default_organism')

        # entity type from the ID type, if the latter is known
        if not entity_type:

            if id_type and id_type in self._id_type_to_entity_type:

                entity_type = self._id_type_to_entity_type[id_type]


        # no ID type provided: let the mapping module guess
        if not id_type:

            id_type, entity_type = mapping.guess_type(
                identifier,
                entity_type = entity_type,
            )

        # the last resort for proteins or entities of unknown type
        if not id_type and (not entity_type or entity_type == 'protein'):

            id_type, entity_type = 'genesymbol', 'protein'

        # label type IDs are translated to identifiers
        if id_type in self._label_types:

            _identifier = mapping.id_from_label0(
                label = identifier,
                label_id_type = id_type,
                ncbi_tax_id = taxon,
            )



            # accepted only if the translation gave a different identifier
            if _identifier and _identifier != identifier:

                # NOTE(review): `mapping.mapper` here vs. `mapping.Mapper`
                # in the `_label_types` class attribute -- confirm both exist
                id_type = mapping.mapper.label_type_to_id_type[id_type]
                identifier = _identifier

            # `mir-pre` identifiers are translated further to `mirbase`
            if id_type == 'mir-pre':

                _identifier = mapping.map_name0(
                    identifier,
                    id_type,
                    'mirbase',
                    ncbi_tax_id = taxon,
                )

                if _identifier and _identifier != identifier:

                    identifier = _identifier
                    id_type = 'mirbase'

        # still no entity type: decide by the form of the identifier
        entity_type = entity_type or self._get_entity_type(identifier)

        self.identifier = identifier
        self.id_type = id_type
        self.entity_type = entity_type
        self.taxon = taxon




    @staticmethod
    def entity_name_str(entity):
        """
        Returns strings unchanged, anything else converted by ``str``.
        """

        if isinstance(entity, common.basestring):

            return entity

        return str(entity)


    @classmethod
    def igraph_vertex_name(cls, igraph_v):
        """
        Returns the ``name`` attribute of a vertex as a string.
        """

        vertex_name = igraph_v['name']

        return cls.entity_name_str(vertex_name)


    @staticmethod
    def igraph_vertex_label(igraph_v):
        """
        Returns the ``label`` attribute of a vertex.
        """

        vertex_label = igraph_v['label']

        return vertex_label


    @classmethod
    def igraph_vertex_name_label(cls, igraph_v):
        """
        Returns the name and the label of a vertex as a tuple.
        """

        vertex_name = cls.igraph_vertex_name(igraph_v)
        vertex_label = cls.igraph_vertex_label(igraph_v)

        return vertex_name, vertex_label


    @staticmethod
    def _is_protein(key):
        """
        Tells if ``key`` is a string starting neither with `MIMAT`
        nor with `COMPLEX`.
        """

        if not isinstance(key, common.basestring):

            return False

        return not key.startswith(('MIMAT', 'COMPLEX'))


    @staticmethod
    def _is_mirna(key):
        """
        Tells if ``key`` is a string starting with `MIMAT`.
        """

        if not isinstance(key, common.basestring):

            return False

        return key.startswith('MIMAT')


    @staticmethod
    def _is_complex(key):
        """
        Tells if ``key`` is an instance of a class named `Complex` or
        a string starting with `COMPLEX`.
        """

        if key.__class__.__name__ == 'Complex':

            return True

        return (
            isinstance(key, common.basestring) and
            key.startswith('COMPLEX')
        )


    @classmethod
    def _get_entity_type(cls, key):
        """
        Returns the entity type of ``key``: `complex`, `mirna` or,
        if it is neither of these, `protein`.
        """

        if cls._is_complex(key):

            return 'complex'

        if cls._is_mirna(key):

            return 'mirna'

        return 'protein'


    def is_protein(self):
        """
        Tells if the identifier of this entity passes ``_is_protein``.
        """

        identifier = self.identifier

        return self._is_protein(identifier)


    def is_mirna(self):
        """
        Tells if the identifier of this entity passes ``_is_mirna``.
        """

        identifier = self.identifier

        return self._is_mirna(identifier)


    def is_complex(self):
        """
        Tells if the identifier of this entity passes ``_is_complex``.
        """

        identifier = self.identifier

        return self._is_complex(identifier)


    def get_entity_type(self):
        """
        Returns the entity type derived from the identifier of this
        entity by ``_get_entity_type``.
        """

        identifier = self.identifier

        return self._get_entity_type(identifier)


    @classmethod
    def filter_entity_type(cls, entities, entity_type):
        """
        Keeps only the entities or identifiers of the requested type(s).

        :param iterable entities:
            A list, set, tuple or other iterable yielding entities or
            identifiers.
        :param str,set entity_type:
            One or more entity types e.g. ``{'protein', 'mirna'}``.

        :returns:
            ``entities`` itself if either argument is empty. Otherwise an
            object of the same type as ``entities`` if that is list like
            according to ``common.list_like``, else a generator.
        """

        if not (entity_type and entities):

            return entities

        wanted = common.to_set(entity_type)
        selected = (
            item
            for item in entities
            if cls._get_entity_type(item) in wanted
        )

        if isinstance(entities, common.list_like):

            return type(entities)(selected)

        return selected
    
    
    @classmethod
    def only_proteins(cls, entities):
        """
        Keeps only the proteins from ``entities``, see
        ``filter_entity_type``.
        """

        return cls.filter_entity_type(entities, 'protein')
    
    
    @classmethod
    def only_complexes(cls, entities):
        """
        Keeps only the complexes from ``entities``, see
        ``filter_entity_type``.
        """

        return cls.filter_entity_type(entities, 'complex')
    
    
    @classmethod
    def only_mirnas(cls, entities):
        """
        Keeps only the miRNAs from ``entities``, see
        ``filter_entity_type``.
        """

        return cls.filter_entity_type(entities, 'mirna')


    @classmethod
    def count_entity_type(cls, entities, entity_type):
        """
        Counts the elements of ``entities`` whose entity type is among the
        type(s) in ``entity_type``.

        :param iterable entities:
            A list, set, tuple or other iterable yielding entities or
            identifiers.
        :param str,set entity_type:
            One or more entity types e.g. ``{'protein', 'mirna'}``.

        :returns:
            int
        """

        if not isinstance(entities, common.list_like):

            # the filter returns a generator for other iterables, which
            # has no length, hence these are converted to list first
            entities = list(entities)

        matching = cls.filter_entity_type(entities, entity_type = entity_type)

        return len(matching)


    @property
    def _key(self):
        """
        Builds an ``EntityKey`` from the identifier, ID type, entity type
        and taxon of this entity.
        """

        fields = {
            'identifier': self.identifier,
            'id_type': self.id_type,
            'entity_type': self.entity_type,
            'taxon': self.taxon,
        }

        return EntityKey(**fields)


    def __hash__(self):

        return hash(self.key)


    def __eq__(self, other):

        return (
            self.__hash__() == other.__hash__()
                if hasattr(other, 'key') else
            self.identifier == other
        )


    def __lt__(self, other):

        return (
            self.key < other.key
                if hasattr(other, 'key') else
            self.identifier < other
        )


    def __gt__(self, other):

        return (
            self.key < other.key
                if hasattr(other, 'key') else
            self.identifier < other
        )


    def set_label(self):
        """
        Sets the ``label`` attribute to the value returned by
        ``mapping.label`` for this entity; if that evaluates to false,
        the identifier is used as label.
        """

        label = mapping.label(
            name = self.identifier,
            id_type = self.id_type,
            ncbi_tax_id = self.taxon,
        )

        self.label = label or self.identifier


    def __repr__(self):

        return '<Entity: %s>' % (self.label or self.identifier)


    def __iadd__(self, other):

        if self == other:

            self.update_attrs(**other.attrs)

        return self


    def update_attrs(self, **kwargs):
        """
        Updates the ``attrs`` dict of the entity. New keys are simply added,
        while for keys already present the old and the new values are
        combined by ``common.combine_attrs``.

        :param kwargs:
            Attribute names and values.
        """

        for name, value in iteritems(kwargs):

            self.attrs[name] = (
                common.combine_attrs((self.attrs[name], value))
                    if name in self.attrs else
                value
            )


    @classmethod
    def info(cls, identifier):
        """
        Looks up information about a protein by its identifier.

        :param identifier:
            An identifier; nothing is done unless it is a protein identifier
            according to ``_is_protein``.

        :returns:
            The value returned by ``pypath.utils.uniprot.info`` for
            proteins, otherwise ``None``.
        """

        if cls._is_protein(identifier):

            # local import; the alias has to be the name used in the call
            # below
            import pypath.utils.uniprot as utils_uniprot

            return utils_uniprot.info(identifier)
コード例 #28
0
    def _bootstrap(self, identifier, id_type, entity_type, taxon):
        """
        Works out the identifier, ID type, entity type and taxon of the
        entity and stores them in the attributes of the same names.

        Missing values are filled in step by step: complexes get fixed
        types, the taxon falls back to the ``default_organism`` setting,
        the entity type is looked up in ``_id_type_to_entity_type``, the
        ID type is guessed by ``mapping.guess_type`` and the last resort
        is a protein with Gene Symbol. Identifiers of the types listed in
        ``_label_types`` are translated by the ``mapping`` module.

        :param identifier:
            An identifier, a label or a complex object.
        :param id_type:
            The type of the identifier; may evaluate to false if not known.
        :param entity_type:
            The type of the entity; may evaluate to false if not known.
        :param taxon:
            The organism; if evaluates to false the default is used.
        """

        if self._is_complex(identifier):

            # both types are fixed for complexes; a complex object may
            # carry its own taxon which takes precedence
            id_type = entity_type = 'complex'
            taxon = getattr(identifier, 'ncbi_tax_id', taxon)

        if not taxon:

            taxon = settings.get('default_organism')

        if (
            not entity_type and
            id_type and
            id_type in self._id_type_to_entity_type
        ):

            entity_type = self._id_type_to_entity_type[id_type]

        if not id_type:

            id_type, entity_type = mapping.guess_type(
                identifier,
                entity_type = entity_type,
            )

        # still no ID type: fall back to Gene Symbol unless the entity
        # is known to be something else than a protein
        if not id_type and (entity_type == 'protein' or not entity_type):

            id_type = 'genesymbol'
            entity_type = 'protein'

        if id_type in self._label_types:

            from_label = mapping.id_from_label0(
                label = identifier,
                label_id_type = id_type,
                ncbi_tax_id = taxon,
            )

            if from_label and from_label != identifier:

                id_type = mapping.mapper.label_type_to_id_type[id_type]
                identifier = from_label

            if id_type == 'mir-pre':

                as_mirbase = mapping.map_name0(
                    identifier,
                    id_type,
                    'mirbase',
                    ncbi_tax_id = taxon,
                )

                if as_mirbase and as_mirbase != identifier:

                    identifier = as_mirbase
                    id_type = 'mirbase'

        if not entity_type:

            # derived from the final identifier
            entity_type = self._get_entity_type(identifier)

        self.identifier = identifier
        self.id_type = id_type
        self.entity_type = entity_type
        self.taxon = taxon
コード例 #29
0
def _msigdb_cookies(response):
    """
    Collects the cookies from the ``Set-Cookie`` headers of a response.

    :arg response:
        An object which may have a ``resp_headers`` attribute: an iterable
        of header lines as bytes.

    :returns:
        List of str: for each ``Set-Cookie`` header the part between the
        first colon and the first semicolon, in the order of the headers.
    """

    cookies = []

    for hdr in getattr(response, 'resp_headers', None) or ():
        if hdr.lower().startswith(b'set-cookie'):
            # only the first colon separates the name of the header from
            # its value, the cookie itself may also contain colons
            cookie = hdr.partition(b':')[2].partition(b';')[0].strip()
            cookies.append(cookie.decode('ascii'))

    return cookies


def msigdb_download(
    registered_email=None,
    collection='msigdb',
    id_type='symbols',
    force_download=False,
):
    """
    Downloads and preprocesses a collection of gmt format gene sets from
    MSigDB. Returns dict of sets with gene set names as keys and molecular
    identifiers as values.

    Logging in to MSigDB happens only if the file is not in the cache, the
    cached file is empty or the download is forced.

    :arg str,NoneType registered_email:
        An email address registered at MSigDB. If `None` the `msigdb_email`
        from ``pypath.settings`` will be used.
    :arg str collection:
        The name of the gene set collection. For available collections (e.g.
        `h.all` or `c2.cpg`) refer to the MSigDB website:
        http://software.broadinstitute.org/gsea/downloads.jsp#msigdb
        The default value `msigdb` contains all the genesets however you
        won't be able to distinguish which geneset comes from which
        collection. For this you need to download the collections one by one.
    :arg str id_type:
        MSigDB provides Gene Symbols (`symbols`) and Entrez Gene IDs
        (`entrez`).
    :arg bool force_download:
        Download even if cache content is available.

    :returns:
        Dict of sets; an empty dict if no email address is available, if
        the login failed or if the download gave no result.
    """

    registered_email = registered_email or settings.get('msigdb_email')

    if not registered_email:
        _log('To download MSigDB you must provide an email address '
             'you have previously registered at '
             '`http://software.broadinstitute.org/gsea/register.jsp`. '
             'Could not proceed, returning empty dict.')

        return {}

    url = urls.urls['msigdb']['url'] % (
        collection,
        id_type,
    )

    req_headers_1 = []

    # this object is created only to find out the path of the cache file
    c_nocall = curl.Curl(
        url,
        call=False,
        process=False,
        bypass_url_encoding=True,
    )

    if (not os.path.exists(c_nocall.cache_file_name)
            or os.path.getsize(c_nocall.cache_file_name) == 0
            or force_download):
        # first request: obtaining a session cookie
        c_login_1 = curl.Curl(
            urls.urls['msigdb']['login1'],
            cache=False,
            write_cache=False,
            follow=False,
            large=False,
            silent=True,
        )

        cookies = _msigdb_cookies(c_login_1)
        jsessionid = ''

        if cookies:
            # the first cookie sent is used here
            jsessionid = cookies[0]
            _log('msigdb cookie obtained: `%s`.' % jsessionid)

        if not jsessionid:
            _log('msigdb: could not get cookie, returning empty dict.')

            return {}

        req_headers = ['Cookie: %s' % jsessionid]

        # second request: logging in with the email address
        c_login_2 = curl.Curl(
            urls.urls['msigdb']['login2'],
            cache=False,
            write_cache=False,
            large=False,
            silent=True,
            req_headers=req_headers,
            post={
                'j_username': registered_email,
                # NOTE(review): this literal looks like a masked
                # placeholder -- confirm the value the login form expects
                'j_password': '******',
            },
            follow=False,
            empty_attempt_again=False,
        )

        cookies_1 = _msigdb_cookies(c_login_2)
        # if more than one cookie is sent, the last one is used here
        jsessionid_1 = cookies_1[-1] if cookies_1 else ''

        if not jsessionid_1:
            _log('msigdb: could not log in with email `%s`, '
                 'returning empty dict.' % registered_email)

            return {}

        # success is reported only once the new cookie is at hand
        _log('msigdb: logged in with email `%s`, '
             'new cookie obtained: `%s`.' %
             (registered_email, jsessionid_1))

        req_headers_1 = ['Cookie: %s' % jsessionid_1]

    c = curl.Curl(
        url,
        req_headers=req_headers_1,
        silent=False,
        large=True,
        bypass_url_encoding=True,
        cache=not force_download,
    )

    if c.result is None:
        _log('msigdb: no data retrieved from `%s`, '
             'returning empty dict.' % url)

        return {}

    result = {}

    for line in c.result:
        # the first field is the name of the gene set, the second is
        # skipped and the rest are the identifiers of the members
        fields = line.strip().split('\t')

        # blank lines must not create a gene set with empty name
        if not fields[0]:
            continue

        result[fields[0]] = set(fields[2:])

    return result