def test_gds_data(self):
        """Exercise the GDS helpers end to end: URL lookup, download, caching and parsing."""
        sample = self.test_sample

        # A download URL must be available for the sample.
        self.assertIsNotNone(gds_download_url(sample))

        # The sample is expected not to be cached before the download.
        self.assertFalse(gds_is_cached(sample))

        # Ensure the cache directory exists, then fetch the data set into it.
        try:
            makedirs(serverfiles.localpath(DOMAIN))
        except OSError:
            # Creation failing is acceptable only when the path is already
            # there; any other failure must surface.
            if not path.exists(serverfiles.localpath(DOMAIN)):
                raise
        gds_download(sample)

        # After the download the file must be reported as cached.
        self.assertIsNone(gds_ensure_downloaded(sample))
        self.assertTrue(gds_is_cached(sample))

        dataset = GDS(sample)
        info = dataset.info
        self.assertIsNotNone(info)
        self.assertEqual(info['gene_count'], 9561)
        self.assertEqual(len(info['samples']), 4)
        self.assertEqual(len(info['subsets']), 2)

        self.assertEqual(dataset.info['taxid'], self.test_organism)

        # Both orientations of the data must come back as Orange tables.
        for kwargs in ({}, {'transpose': True}):
            self.assertIsInstance(dataset.get_data(**kwargs), Table)
Exemple #2
0
    def __init__(self, gds_name, remove_unknown=None):
        """ Load a single GEO DataSet so it can be retrieved as a :obj:`Orange.data.Table`.

        The data file is looked up in the local cache directory first; if it is not there,
        it is downloaded from `NCBI's GEO FTP site <ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SOFT/GDS/>`_.
        The file is then parsed and ``self.info`` is extended with the ``taxid`` and
        ``gene_count`` entries.

        :param gds_name: NCBI's ID of the data set in the form "GDSn", where "n" is a GDS ID number.

        :param remove_unknown: Threshold on the proportion of samples with unknown values; spots
                               whose sample profiles exceed it are removed. If None, nothing is
                               removed.

        """

        self.gds_name = gds_name
        self.filename = serverfiles.localpath(DOMAIN, gds_name + '.soft.gz')
        gds_ensure_downloaded(gds_name)

        # Lookup tables and parsed content; populated while parsing the file.
        self.spot2gene, self.gene2spots = {}, {}
        self.info, self.gds_data = None, None
        self.parse_file(remove_unknown=remove_unknown)

        # Record a taxonomy id only when the organism name matches exactly one entry.
        matches = taxonomy.search(self.info["sample_organism"], exact=True)
        if len(matches) == 1:
            self.info["taxid"] = matches[0]
        else:
            self.info["taxid"] = None

        self.genes = sorted(self.gene2spots.keys())
        self.spots = sorted(self.spot2gene.keys())
        self.info["gene_count"] = len(self.genes)
class OMIM:
    """Disease/gene associations read from a local OMIM text file.

    Every non-empty line of the file is wrapped in a :obj:`Disease` and stored
    together with its raw text; gene names are parsed from that raw text on
    demand (see :meth:`disease_genes`).
    """

    VERSION = 1
    DEFAULT_DATABASE_PATH = serverfiles.localpath(DOMAIN)

    def __init__(self, local_database_path=None):
        """Load the OMIM file.

        :param local_database_path: Directory containing the OMIM file. When
            `None` (or equal to the default path) the file location is obtained
            from ``serverfiles.localpath_download``.
        """
        if local_database_path is None:
            local_database_path = self.DEFAULT_DATABASE_PATH
        self.local_database_path = local_database_path

        if self.local_database_path == self.DEFAULT_DATABASE_PATH:
            filename = serverfiles.localpath_download(DOMAIN, FILENAME)
        else:
            filename = os.path.join(self.local_database_path, FILENAME)

        self.load(filename)

    @classmethod
    def download_from_NCBI(cls, file=None):
        """Copy the content of ``FTP_URL`` into `file`.

        :param file: A path (opened for binary writing) or a writable binary
            file object. The file is closed before returning, also when the
            transfer fails.
        """
        if isinstance(file, str):
            file = open(file, "wb")
        try:
            stream = urlopen(FTP_URL)
            try:
                # Use the default buffer size; copying 10 bytes at a time
                # made the transfer needlessly slow.
                shutil.copyfileobj(stream, file)
            finally:
                stream.close()
        finally:
            if file is not None:
                file.close()

    @classmethod
    def get_instance(cls):
        """Return an instance sharing its state with all other such instances.

        The file is loaded only on the first call; later calls create a bare
        object whose ``__dict__`` is the shared one.
        """
        if not hasattr(cls, "_shared_dict"):
            omim = OMIM()
            cls._shared_dict = omim.__dict__
        instance = OMIM.__new__(OMIM)
        instance.__dict__ = cls._shared_dict
        return instance

    def load(self, filename):
        """Read `filename` and index its non-empty lines by :obj:`Disease`."""
        # The context manager guarantees the handle is released after reading.
        with open(filename, "r") as file:
            lines = file.read().splitlines()
        self._disease_dict = {Disease(line): line for line in lines if line}

    def diseases(self):
        """Return a view of all known :obj:`Disease` objects."""
        return self._disease_dict.keys()

    def genes(self):
        """Return a sorted list of all distinct genes over all diseases."""
        # A set comprehension avoids the quadratic cost of repeatedly
        # concatenating lists.
        return sorted({
            gene
            for disease in self.diseases()
            for gene in self.disease_genes(disease)
        })

    def disease_genes(self, disease):
        """Return the gene names listed for `disease`.

        The raw line is split on ``|`` and its second field is split on ``, ``.
        """
        return self._disease_dict[disease].split("|")[1].split(", ")

    def gene_diseases(self):
        """Return a mapping from gene name to the set of its diseases."""
        d = defaultdict(set)
        for disease in self.diseases():
            for gene in self.disease_genes(disease):
                d[gene].add(disease)
        return d
Exemple #4
0
 def updateInfo(self):
     """Refresh the info box with dataset counts and the filter result."""
     total = len(self.gds_info)
     # Cached data sets are the GDS* entries in the local "GEO" folder.
     cached = len(glob.glob(serverfiles.localpath("GEO") + "/GDS*"))
     summary = "%i datasets\n%i datasets cached\n" % (total, cached)

     visible_rows = self.treeWidget.model().rowCount()
     # Mention the filtered count only when filtering hides something.
     if len(self.gds) != visible_rows:
         summary += "%i after filtering" % visible_rows
     self.infoBox.setText(summary)
    def __move_to_serverfiles_folder(self, selected_file_path):
        """Copy the selected file into its serverfiles domain folder and create its .info file.

        The destination is built from the ``domain`` and ``filename`` entries
        of ``self.info_state``.
        """
        target_dir = serverfiles.localpath(self.info_state['domain'])
        destination = os.path.join(target_dir, self.info_state['filename'])
        create_folder(target_dir)

        try:
            copyfile(selected_file_path, destination)
        except IOError as copy_error:
            # TODO: handle error properly
            raise copy_error

        # Reached only when the copy succeeded: write the accompanying .info file.
        create_info_file(destination, **self.info_state)
Exemple #6
0
def gds_download(gds_name, progress=None):
    """ Fetch a GDS data set and place it in the local cache.

    The content is first written to a temporary file in the cache directory
    and moved over the final ``<gds_name>.soft.gz`` path only after the
    transfer finished without an error.

    :param gds_name: Name of the data set.
    :param progress: Passed through to ``retrieve_url``.
    """
    source_url = gds_download_url(gds_name)
    filename = gds_name + ".soft.gz"
    destination = os.path.join(serverfiles.localpath(DOMAIN), filename)

    partial = NamedTemporaryFile(
        prefix=filename + "-",
        dir=serverfiles.localpath(DOMAIN),
        delete=False,
    )
    try:
        retrieve_url(source_url, partial, progress=progress)
    except BaseException as err:
        # Best-effort removal of the partial download before re-raising.
        try:
            partial.close()
            os.remove(partial.name)
        except (OSError, IOError):
            pass
        raise err

    # The handler above always re-raises, so this point is reached only
    # after a successful transfer.
    partial.close()
    os.replace(partial.name, destination)
    def _update_tool_tip(self, fs):
        """Compose the tooltip for `fs` and set it on the item's columns.

        The text lists the state, the tags not starting with "#", the local
        file path (for CURRENT, OUTDATED and DEPRECATED states) and the server
        version.
        """
        state_label = self.STATE_STRINGS[fs.state]

        # NOTE(review): this compares ``fs`` itself, not ``fs.state``, with
        # DEPRECATED, while the "old ... days" text below additionally needs
        # the OUTDATED state -- looks like that text may never be shown;
        # confirm the intended condition.
        age = (fs.info_server.datetime - fs.info_local.datetime
               if fs == DEPRECATED else None)

        visible_tags = ', '.join(
            tag for tag in fs.tags if not tag.startswith("#"))
        tooltip = "State: {}\nTags: {}".format(state_label, visible_tags)

        if fs.state in [CURRENT, OUTDATED, DEPRECATED]:
            local_file = serverfiles.localpath(fs.domain, fs.filename)
            tooltip += "\nFile: {}".format(local_file)

        if fs.state == OUTDATED and age:
            tooltip += "\nServer version: {}\nStatus: old {} days".format(
                fs.datetime, age.days)
        else:
            tooltip += "\nServer version: {}".format(fs.datetime)

        # Columns 1 .. len(header_labels) - 2 receive the tooltip.
        for column in range(1, len(header_labels) - 1):
            self.setToolTip(column, tooltip)
    def _updateToolTip(self):
        """Compose the tooltip for ``self.item`` and set it on columns 1-3.

        The text lists the state, the tags not starting with "#", the local
        file path (for CURRENT, OUTDATED and DEPRECATED states) and the server
        version; for OUTDATED items with a known age the age in days is added.
        """
        state_str = self.STATE_STRINGS[self.item.state]
        try:
            diff_date = self.item.latest - self.item.local
        except Exception:
            # The two timestamps could not be subtracted (presumably one of
            # them is missing -- verify against the item type); no age is
            # shown then. Unlike a bare ``except``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            diff_date = None

        tooltip = ("State: %s\nTags: %s" %
                   (state_str, ", ".join(tag for tag in self.item.tags
                    if not tag.startswith("#"))))

        if self.item.state in [CURRENT, OUTDATED, DEPRECATED]:
            tooltip += ("\nFile: %s" %
                        serverfiles.localpath(self.item.domain,
                                              self.item.filename))

        if self.item.state == OUTDATED and diff_date:
            tooltip += ("\nServer version: %s\nStatus: old (%d days)" % (self.item.latest, diff_date.days))
        else:
            tooltip += ("\nServer version: %s" % self.item.latest)

        for i in range(1, 4):
            self.setToolTip(i, tooltip)
"""
from __future__ import absolute_import

import os

from six import StringIO

from orangecontrib.bioinformatics.utils import serverfiles

try:
    import ConfigParser as configparser
except ImportError:
    import configparser

# Local serverfiles path for the "KEGG2" domain.
kegg_dir = serverfiles.localpath("KEGG2")

# Default configuration text in INI format. ``%(kegg_dir)s`` is a
# placeholder -- presumably filled in through configparser interpolation
# with the ``kegg_dir`` value above; TODO confirm where it is substituted.
default = """
[cache]
# path = %(home)s/.obiKEGG/
path = %(kegg_dir)s/
store = sqlite3
invalidate = weekly

[service]
transport = urllib2
# transport = requests

"""

# Orange kegg files dir
Exemple #10
0
def gds_is_cached(gds_name):
    """Return True when ``<gds_name>.soft.gz`` exists as a file in the local cache folder."""
    cached_file = os.path.join(serverfiles.localpath(DOMAIN),
                               gds_name + ".soft.gz")
    return os.path.isfile(cached_file)
Exemple #11
0
def get_gds_model(progress=lambda val: None):
    """
    Build the item model that lists all known GDS data sets.

    :param progress: A progress callback; called with 1, 20 and 50.
    :return:
        A tuple of (QStandardItemModel, GDSInfo, list), where the list holds
        the data set entries in the same order as the model rows.

    .. note::
        When called outside the GUI thread, the model is moved to the
        GUI thread before it is returned.

    """
    progress(1)
    info = GDSInfo()
    search_keys = ["dataset_id", "title", "platform_organism", "description"]
    cache_dir = serverfiles.localpath(DOMAIN)
    gds_link = "http://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc={0}"
    pm_link = "http://www.ncbi.nlm.nih.gov/pubmed/{0}"

    def is_cached(entry):
        # A data set counts as cached when its .soft.gz file is on disk.
        soft_file = os.path.join(cache_dir, entry["dataset_id"]) + ".soft.gz"
        return os.path.exists(soft_file)

    def make_item(display_value, role_data=None):
        # Create one cell: the display value plus optional extra role data.
        cell = QStandardItem()
        cell.setData(display_value, Qt.DisplayRole)
        for role, value in (role_data or {}).items():
            cell.setData(value, role)
        return cell

    def build_row(entry):
        #: Lower-cased text of the searchable fields, for full-text search.
        haystack = " | ".join(
            [entry.get(key, "").lower() for key in search_keys])
        dataset_id = entry["dataset_id"]
        pubmed_id = entry.get("pubmed_id", "")
        # Link to PubMed only when an id is actually present.
        pubmed_url = pm_link.format(pubmed_id) if pubmed_id else None
        return [
            make_item(" " if is_cached(entry) else "",
                      {TextFilterRole: haystack}),
            make_item(dataset_id,
                      {gui.LinkRole: gds_link.format(dataset_id)}),
            make_item(entry["title"]),
            make_item(entry["platform_organism"]),
            make_item(len(entry["samples"])),
            make_item(entry["feature_count"]),
            make_item(entry["gene_count"]),
            make_item(len(entry["subsets"])),
            make_item(pubmed_id, {gui.LinkRole: pubmed_url}),
        ]

    model = QStandardItemModel()
    headers = [
        "", "ID", "Title", "Organism", "Samples", "Features", "Genes",
        "Subsets", "PubMedID"
    ]
    model.setHorizontalHeaderLabels(headers)
    progress(20)

    gds_list = []
    for entry in info.values():
        model.appendRow(build_row(entry))
        gds_list.append(entry)

    progress(50)

    gui_thread = QCoreApplication.instance().thread()
    if QThread.currentThread() is not gui_thread:
        model.moveToThread(gui_thread)
    return model, info, gds_list
class DictyMutants:
    """A collection of Dictybase mutants, loaded from a pickled phenotypes file."""

    DEFAULT_DATABASE_PATH = serverfiles.localpath(DOMAIN)  # use a default local folder for storing the genesets

    # Attribute set on a mutant -> Dictybase download listing its members.
    _FLAG_FILES = (
        ("null", "null-mutants.txt"),
        ("overexp", "overexpression-mutants.txt"),
        ("multiple", "multiple-mutants.txt"),
        ("develop", "developmental-mutants.txt"),
        ("other", "other-mutants.txt"),
    )

    def __init__(self, local_database_path=None):
        """  A collection of Dictybase mutants as a dictionary of :obj:`DictyMutant` objects.

        :param local_database_path: A path for storing D. dictyostelium mutants objects. If `None` then
                                    a default database path is used.
        """

        self.local_database_path = local_database_path \
            if local_database_path is not None else self.DEFAULT_DATABASE_PATH

        if not os.path.exists(self.local_database_path):
            # makedirs also creates missing parents, and exist_ok tolerates
            # another process creating the folder in the meantime.
            os.makedirs(self.local_database_path, exist_ok=True)

        # NOTE(security): unpickling can execute arbitrary code; this is only
        # safe as long as the phenotypes file comes from a trusted source.
        with open(serverfiles.localpath_download(DOMAIN, PHENOTYPES_FILENAME), "rb") as pickled:
            self._mutants = pickle.load(pickled)

    def update_file(self, name):
        """Download the Dictybase file `name` into the local database folder.

        The content is written to ``<name>_temp`` first and then moved over
        the final file.

        :return: Path of the downloaded file.
        """
        url = "http://dictybase.org/db/cgi-bin/dictyBase/download/download.pl?area=mutant_phenotypes&ID="
        filename = os.path.join(self.local_database_path, name)
        temp_file = os.path.join(self.local_database_path, name + "_temp")
        stream = urlopen(url + name)
        try:
            with open(temp_file, "wb") as file:
                shutil.copyfileobj(stream, file)
        finally:
            stream.close()

        # os.replace overwrites an existing target on every platform,
        # whereas os.rename fails on Windows when the target exists.
        os.replace(temp_file, filename)
        return filename

    def load_mutants(self, file):
        """Return the lines of `file`, without its first (header) line."""
        with open(file) as data:
            data.readline()  # remove data_header
            return data.read().splitlines()

    def download_mutants(self):
        """Download all mutant files and build the mutants dictionary.

        Every mutant from ``all-mutants.txt`` gets its ``null``, ``overexp``,
        ``multiple``, ``develop`` and ``other`` attributes set to True when
        its name appears in the corresponding file.

        :return: A dictionary mapping each :obj:`DictyMutant` to itself.
        """
        all_mutants = self.load_mutants(self.update_file("all-mutants.txt"))
        flagged_lines = {
            flag: self.load_mutants(self.update_file(filename))
            for flag, filename in self._FLAG_FILES
        }

        _mutants = [DictyMutant(mutant) for mutant in all_mutants]

        flagged_names = {
            flag: {DictyMutant(line).name for line in lines}
            for flag, lines in flagged_lines.items()
        }

        for mutant in _mutants:
            for flag, names in flagged_names.items():
                if mutant.name in names:
                    setattr(mutant, flag, True)

        final_mutants = {x: x for x in _mutants}
        return final_mutants

    def pickle_data(self):
        """Download the mutants and return them pickled with the highest protocol."""
        return pickle.dumps(self.download_mutants(), -1)

    @classmethod
    def get_instance(cls):
        """Return an instance sharing its state with all other such instances.

        The data is loaded only on the first call; later calls create a bare
        object whose ``__dict__`` is the shared one.
        """
        if not hasattr(cls, "_shared_dict"):
            dicty = DictyMutants()
            cls._shared_dict = dicty.__dict__
        instance = DictyMutants.__new__(DictyMutants)
        instance.__dict__ = cls._shared_dict
        return instance

    def mutants(self):
        """Return a list of all mutant objects."""
        return list(self._mutants.keys())

    def genes(self):
        """Return a sorted list of all distinct genes over all mutants."""
        # A set comprehension avoids quadratic list concatenation.
        return sorted({gene for mutant in self.mutants() for gene in self.mutant_genes(mutant)})

    def phenotypes(self):
        """Return a sorted list of all distinct phenotypes over all mutants."""
        return sorted({
            phenotype for mutant in self.mutants() for phenotype in self.mutant_phenotypes(mutant)
        })

    def mutant_genes(self, mutant):
        """Return the ``genes`` attribute of the stored `mutant`."""
        return self._mutants[mutant].genes

    def mutant_phenotypes(self, mutant):
        """Return the ``phenotypes`` attribute of the stored `mutant`."""
        return self._mutants[mutant].phenotypes

    def gene_mutants(self):
        """Return a mapping from gene to the set of mutants listing it."""
        dgm = defaultdict(set)
        for mutant in self.mutants():
            for gene in self.mutant_genes(mutant):
                dgm[gene].add(mutant)
        return dgm

    def phenotype_mutants(self):
        """Return a mapping from phenotype to the set of mutants listing it."""
        dpm = defaultdict(set)
        for mutant in self.mutants():
            for phenotype in self.mutant_phenotypes(mutant):
                dpm[phenotype].add(mutant)
        return dpm
"""  Gene Ontology module """
import os
import re
import sys
import tarfile
import warnings
from collections import namedtuple, defaultdict

import six

from orangecontrib.bioinformatics.ncbi import taxonomy
from orangecontrib.bioinformatics.utils import statistics, serverfiles, progress_bar_milestones
from orangecontrib.bioinformatics.go.config import DOMAIN, FILENAME_ONTOLOGY, FILENAME_ANNOTATION

# Module-level alias for ``sys.intern``.
intern = sys.intern
# DOMAIN joined onto the path returned by ``serverfiles.localpath()``.
default_database_path = os.path.join(serverfiles.localpath(), DOMAIN)

# Matches an optional "rev" prefix followed by one or more
# "<digits>.<digits>" groups, e.g. "1.2" or "rev1.2".
_CVS_REVISION_RE = re.compile(r"^(rev)?(\d+\.\d+)+$")

evidence_types = {
    # Experimental
    'EXP': 'Inferred from Experiment',
    'IDA': 'Inferred from Direct Assay',
    'IPI':
    'Inferred from Physical Interaction',  # [with <database:protein_name>]',
    'IMP': 'Inferred from Mutant Phenotype',
    'IGI':
    'Inferred from Genetic Interaction',  # [with <database:gene_symbol[allele_symbol]>]',
    'IEP': 'Inferred from Expression Pattern',
    # Computational Analysis Evidence Codes
    'ISS':
class DictyMutants:
    """A collection of Dictybase mutants, loaded from a JSON phenotypes file."""

    DEFAULT_DATABASE_PATH = serverfiles.localpath(
        DOMAIN)  # use a default local folder for storing the genesets

    def __init__(self, file_path=None):
        """  A collection of Dictybase mutants as a dictionary of :obj:`DictyMutant` objects.

        :param file_path: Path of a JSON file whose items are each turned into
                          a :obj:`DictyMutant`. If `None`, the path is obtained
                          from ``serverfiles.localpath_download``.
        """
        if file_path is None:
            file_path = serverfiles.localpath_download(DOMAIN,
                                                       PHENOTYPES_FILENAME)

        with open(file_path, 'r') as fp:
            records = json.load(fp)

        # Every mutant is stored as both key and value.
        self._mutants = {}
        for record in records:
            mutant = DictyMutant(record)
            self._mutants[mutant] = mutant

    @classmethod
    def get_instance(cls):
        """Return an instance sharing its state with all other such instances.

        The data is loaded only on the first call; later calls create a bare
        object whose ``__dict__`` is the shared one.
        """
        if not hasattr(cls, "_shared_dict"):
            dicty = DictyMutants()
            cls._shared_dict = dicty.__dict__
        instance = DictyMutants.__new__(DictyMutants)
        instance.__dict__ = cls._shared_dict
        return instance

    def mutants(self):
        """Return a list of all mutant objects."""
        return list(self._mutants.keys())

    def genes(self):
        """Return a sorted list of all distinct genes over all mutants."""
        # A set comprehension avoids quadratic list concatenation and also
        # works when a mutant's genes are not stored in a list.
        return sorted({
            gene
            for mutant in self.mutants()
            for gene in self.mutant_genes(mutant)
        })

    def phenotypes(self):
        """Return a sorted list of all distinct phenotypes over all mutants."""
        return sorted({
            phenotype
            for mutant in self.mutants()
            for phenotype in self.mutant_phenotypes(mutant)
        })

    def mutant_genes(self, mutant):
        """Return the ``genes`` attribute of the stored `mutant`."""
        return self._mutants[mutant].genes

    def mutant_phenotypes(self, mutant):
        """Return the ``phenotypes`` attribute of the stored `mutant`."""
        return self._mutants[mutant].phenotypes

    def gene_mutants(self):
        """Return a mapping from gene to the set of mutants listing it."""
        dgm = defaultdict(set)
        for mutant in self.mutants():
            for gene in self.mutant_genes(mutant):
                dgm[gene].add(mutant)
        return dgm

    def phenotype_mutants(self):
        """Return a mapping from phenotype to the set of mutants listing it."""
        dpm = defaultdict(set)
        for mutant in self.mutants():
            for phenotype in self.mutant_phenotypes(mutant):
                dpm[phenotype].add(mutant)
        return dpm