Example #1
    def _download_file(self, url, extract=True):
        """ Download the `file` from the ArrayExpress into a local
        repository directory.

        """
        rest, basename = posixpath.split(url)
        dirname = posixpath.basename(rest)
        repo_dir = serverfiles.localpath("ArrayExpress", dirname)
        try:
            os.makedirs(repo_dir)
        except OSError:
            pass
        stream = urllib2.urlopen(url)
        local_filename = os.path.join(repo_dir, basename)
        with open(local_filename, "wb") as f:
            shutil.copyfileobj(stream, f)

        if extract:
            _, extension = os.path.splitext(local_filename)
            if extension == ".zip":
                import zipfile
                zfile = zipfile.ZipFile(local_filename)
                zfile.extractall(repo_dir)
            elif extension == ".gz":
                import gzip
                gzfile = gzip.open(local_filename)
                gzfile.extractall(repo_dir)
            elif extension in [".tgz"]:
                import tarfile
                tfile = tarfile.TarFile(local_filename)
                tfile.extractall(repo_dir)
            elif extension == ".txt":
                pass
            else:
                raise ValueError("Unknown extension ('{0}').".format(basename))
Example #3
 def _local_filepath(self, url):
     """ Return the local file path for url.
     """
     rest, basename = posixpath.split(url)
     dirname = posixpath.basename(rest)
     return serverfiles.localpath("ArrayExpress",
                                  os.path.join(dirname, basename))
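
For illustration, a sketch of the URL-to-path mapping this helper computes;
the URL is hypothetical and the path is rooted at the serverfiles local
repository:

    # A URL ending in .../E-MEXP-31/E-MEXP-31.idf.txt maps to
    # <serverfiles root>/ArrayExpress/E-MEXP-31/E-MEXP-31.idf.txt
    path = self._local_filepath(
        "ftp://ftp.ebi.ac.uk/arrayexpress/E-MEXP-31/E-MEXP-31.idf.txt")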
Example #4
def _cache(name="AtlasGeneResult.shelve"):
    """ Return a open cache instance (a shelve object).
    """
    if not os.path.exists(serverfiles.localpath("GeneAtlas")):
        try:
            os.makedirs(serverfiles.localpath("GeneAtlas"))
        except OSError:
            pass
    cache = shelve.open(serverfiles.localpath("GeneAtlas", name))
    if cache.get(name + "__CACHE_VERSION__", None) == CACHE_VERSION:
        return cache
    else:
        cache.close()
        cache = shelve.open(serverfiles.localpath("GeneAtlas", name), "n")
        cache[name + "__CACHE_VERSION__"] = CACHE_VERSION
        return cache
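
A minimal sketch of how the returned shelve might be used; the key and the
fetch step are hypothetical. Note that the caller is responsible for closing
the cache:

    cache = _cache()
    try:
        key = "ENSG00000139618"            # hypothetical cache key
        if key not in cache:
            cache[key] = query_atlas(key)  # hypothetical fetch function
        result = cache[key]
    finally:
        cache.close()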
Example #7
 def updateInfo(self):
     gds_info = self.gds_info
     text = ("%i datasets\n%i datasets cached\n" %
             (len(gds_info),
              len(glob.glob(serverfiles.localpath("GEO") + "/GDS*"))))
     filtered = self.treeWidget.model().rowCount()
     if len(self.gds) != filtered:
         text += ("%i after filtering") % filtered
     self.infoBox.setText(text)
Example #9
    def _updateToolTip(self):
        state_str = self.STATE_STRINGS[self.item.state]
        tooltip = ("State: %s\nTags: %s" %
                   (state_str,
                    ", ".join(tag for tag in self.item.tags
                              if not tag.startswith("#"))))

        if self.item.state in [CURRENT, OUTDATED, DEPRECATED]:
            tooltip += ("\nFile: %s" %
                        serverfiles.localpath(self.item.domain,
                                              self.item.filename))
        for i in range(1, 4):
            self.setToolTip(i, tooltip)
Example #10
    def download_data(cls, address):
        """
        Pass the address of the latest BIOGRID-ALL release (in tab2 format).
        """
        stream = urllib2.urlopen(address)
        stream = StringIO(stream.read())
        zfile = zipfile.ZipFile(stream)
        # Expecting only one file.
        filename = zfile.namelist()[0]

        filepath = serverfiles.localpath("PPI", "BIOGRID-ALL.tab2")
        mkdir_p(os.path.dirname(filepath))

        with open(filepath, "wb") as f:
            shutil.copyfileobj(zfile.open(filename, "r"), f)

        cls.init_db(filepath)
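
A hedged invocation sketch; the enclosing class name (`BioGRID` here) and the
exact release URL are assumptions, but per the docstring the argument is the
address of a BIOGRID-ALL release in tab2 format:

    # Hypothetical class name and URL.
    BioGRID.download_data(
        "http://thebiogrid.org/downloads/archives/Latest%20Release/"
        "BIOGRID-ALL-LATEST.tab2.zip")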
Example #11
    def __init__(self, taxid=None, database=None, detailed_database=None):
        STRING.__init__(self, taxid, database)
        if taxid is not None and detailed_database is not None:
            raise ValueError("taxid and detailed_database are exclusive")

        db_file = serverfiles.localpath(self.DOMAIN, self.FILENAME)
        if taxid is not None and detailed_database is None:
            detailed_database = serverfiles.localpath_download(
                self.DOMAIN,
                self.FILENAME_DETAILED.format(taxid=taxid)
            )
        elif taxid is None and detailed_database is not None:
            pass  # Use the supplied detailed database path as-is.
        elif taxid is None and detailed_database is None:
            # Back compatibility
            detailed_database = serverfiles.localpath_download(
                "PPI", "string-protein-detailed.sqlite")

        self.db_detailed = sqlite3.connect(detailed_database)
        self.db_detailed.execute("ATTACH DATABASE ? as string", (db_file,))
Example #12
    def fgem_to_table(self):
        """ Retrieve the processed matrix from the Array Express FTP
        server and convert it to a :class:`Orange.data.Table`.

        """
        assert self.fgemdatafiles
        repo_dir = serverfiles.localpath("ArrayExpress", self.accession)
        # Find the file listing the data matrix files
        # (should be in sdrf but sometimes it is in 2column file only, why?)
        sdrf = self._search_files("sdrf", "txt")
        if sdrf:
            sdrf = SampleDataRelationship(self._open(sdrf[0].get("url")))
            if "Derived Array Data Matrix File" not in sdrf.header:
                twocol = self._search_files("twocolumn", "txt")
                if twocol:
                    sdrf = SampleDataRelationship(
                        self._open(twocol[0].get("url")))
        matrix_file = self._search_files("fgem")[0]
        self._open(matrix_file.get("url"))

        idf_file = self._search_files("idf", "txt")[0]
        self._open(idf_file.get("url"))  # To download if not cached
        return mage_tab_to_orange(os.path.join(repo_dir, idf_file.get("name")))
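
A hedged usage sketch; the enclosing class name and the accession are
illustrative:

    # Hypothetical enclosing class and accession.
    experiment = ArrayExpressExperiment("E-MEXP-31")
    table = experiment.fgem_to_table()  # an Orange.data.Table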
Example #13
    def _updateToolTip(self):
        state_str = self.STATE_STRINGS[self.item.state]
        try:
            diff_date = self.item.latest - self.item.local
        except Exception:
            diff_date = None

        tooltip = ("State: %s\nTags: %s" %
                   (state_str, ", ".join(tag for tag in self.item.tags
                    if not tag.startswith("#"))))

        if self.item.state in [CURRENT, OUTDATED, DEPRECATED]:
            tooltip += ("\nFile: %s" %
                        serverfiles.localpath(self.item.domain,
                                              self.item.filename))
       
        if self.item.state == OUTDATED and diff_date:
            tooltip += ("\nServer version: %s\nStatus: old (%d days)" %
                        (self.item.latest, diff_date.days))
        else:
            tooltip += ("\nServer version: %s" % self.item.latest)

        for i in range(1, 4):
            self.setToolTip(i, tooltip)
Example #15
def get_gds_model(progress=lambda val: None):
    """
    Initialize and return a GDS datasets model.

    :param progress: A progress callback.
    :rtype: tuple
        A tuple of (QStandardItemModel, geo.GDSInfo, [geo.GDS])

    .. note::
        The returned QStandardItemModel's thread affinity is set to
        the GUI thread.

    """
    progress(1)
    info = geo.GDSInfo()
    search_keys = ["dataset_id", "title", "platform_organism", "description"]
    cache_dir = serverfiles.localpath(geo.DOMAIN)
    gds_link = "http://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc={0}"
    pm_link = "http://www.ncbi.nlm.nih.gov/pubmed/{0}"
    gds_list = []

    def is_cached(gds):
        return os.path.exists(os.path.join(cache_dir, gds["dataset_id"]) +
                              ".soft.gz")

    def item(displayvalue, item_values={}):
        item = QStandardItem()
        item.setData(displayvalue, Qt.DisplayRole)
        for role, value in item_values.iteritems():
            item.setData(value, role)
        return item

    def gds_to_row(gds):
        #: Text for easier full search.
        search_text = unicode(
            " | ".join([gds.get(key, "").lower()
                        for key in search_keys]),
            errors="ignore"
        )
        row = [
            item(" " if is_cached(gds) else "",
                 {TextFilterRole: search_text}),
            item(gds["dataset_id"],
                 {LinkRole: gds_link.format(gds["dataset_id"])}),
            item(gds["title"]),
            item(gds["platform_organism"]),
            item(len(gds["samples"])),
            item(gds["feature_count"]),
            item(gds["gene_count"]),
            item(len(gds["subsets"])),
            item(gds.get("pubmed_id", ""),
                 {LinkRole: pm_link.format(gds["pubmed_id"])
                            if gds.get("pubmed_id")
                            else QVariant()})
        ]
        return row

    model = QStandardItemModel()
    model.setHorizontalHeaderLabels(
        ["", "ID", "Title", "Organism", "Samples", "Features",
         "Genes", "Subsets", "PubMedID"]
    )
    progress(20)
    for gds in info.values():
        model.appendRow(gds_to_row(gds))

        gds_list.append(gds)

    progress(50)

    if QThread.currentThread() is not QCoreApplication.instance().thread():
        model.moveToThread(QCoreApplication.instance().thread())
    return model, info, gds_list
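
For example, a console progress callback can be passed in; judging by the
`progress(...)` calls above, the callback receives integer percentages:

    def report(value):
        print "GDS model: %i%% done" % value

    model, info, gds_list = get_gds_model(progress=report)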
Example #16
 def _extract(self):
     self._tmpfile.seek(0, 0)
     archive = tarfile.open(fileobj=self._tmpfile)
     target_dir = serverfiles.localpath()
     archive.extractall(target_dir)
Example #17
    def init_db(cls, version, taxid, cache_dir=None, dbfilename=None):
        if cache_dir is None:
            cache_dir = serverfiles.localpath(cls.DOMAIN)
        if dbfilename is None:
            dbfilename = serverfiles.localpath(
                cls.DOMAIN,
                "string-protein-detailed.{taxid}.sqlite".format(taxid=taxid)
            )

        pjoin = os.path.join

        base_url = "http://string-db.org/newstring_download/"
        filename = "{taxid}.protein.links.detailed.{version}.txt.gz"
        filename = filename.format(version=version, taxid=taxid)
        url = base_url + "protein.links.detailed.{version}/" + filename
        url = url.format(version=version)

        if not os.path.exists(pjoin(cache_dir, filename)):
            wget(url, cache_dir, progress=True)

        links_fileobj = open(pjoin(cache_dir, filename), "rb")
        links_file = gzip.GzipFile(fileobj=links_fileobj)

        con = sqlite3.connect(dbfilename)
        with con:
            con.execute("""
                DROP TABLE IF EXISTS evidence
            """)

            con.execute("""
                CREATE TABLE evidence(
                     protein_id1 TEXT,
                     protein_id2 TEXT,
                     neighborhood INTEGER,
                     fusion INTEGER,
                     cooccurence INTEGER,
                     coexpression INTEGER,
                     experimental INTEGER,
                     database INTEGER,
                     textmining INTEGER
                    )
                """)

            links = csv.reader(links_file, delimiter=" ")
            links.next()  # Read header
            filesize = os.stat(pjoin(cache_dir, filename)).st_size

            progress = ConsoleProgressBar("Processing links file:")
            progress(1.0)

            def read_links(reader):
                for i, (p1, p2, n, f, c, cx, ex, db, t, _) in \
                        enumerate(reader):
                    yield p1, p2, n, f, c, cx, ex, db, t

                    if i % 10000 == 0:
                        progress(100.0 * links_fileobj.tell() / filesize)

            con.executemany("""
                INSERT INTO evidence
                VALUES  (?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, read_links(links))

            progress.finish()

            print "Indexing"
            con.execute("""\
                CREATE INDEX IF NOT EXISTS index_evidence
                    ON evidence (protein_id1, protein_id2)
            """)

            con.executescript("""
                DROP TABLE IF EXISTS version;

                CREATE TABLE version (
                     string_version text,
                     api_version text
                );
                """)

            con.execute("""
                INSERT INTO version
                VALUES (?, ?)""", (version, cls.VERSION))
Example #18
 def download(cls):
     src = urllib2.urlopen(
         "http://mips.helmholtz-muenchen.de/proj/ppi/data/mppi.gz")
     dest = serverfiles.localpath("PPI", "mppi.gz")
     with open(dest, "wb") as f:
         shutil.copyfileobj(src, f)
Example #19
    def __init__(self, parent=None, signalManager=None, name="Image viewer"):
        OWWidget.__init__(self, parent, signalManager, name, wantGraph=True)

        self.inputs = [("Data", ExampleTable, self.setData)]
        self.outputs = [("Data", ExampleTable)]

        self.imageAttr = 0
        self.titleAttr = 0
        self.zoom = 25
        self.autoCommit = False
        self.selectionChangedFlag = False

        #
        # GUI
        #

        self.loadSettings()

        self.info = OWGUI.widgetLabel(
            OWGUI.widgetBox(self.controlArea, "Info"), "Waiting for input\n")

        self.imageAttrCB = OWGUI.comboBox(
            self.controlArea,
            self,
            "imageAttr",
            box="Image Filename Attribute",
            tooltip="Attribute with image filenames",
            callback=[self.clearScene, self.setupScene],
            addSpace=True)

        self.titleAttrCB = OWGUI.comboBox(self.controlArea,
                                          self,
                                          "titleAttr",
                                          box="Title Attribute",
                                          tooltip="Attribute with image title",
                                          callback=self.updateTitles,
                                          addSpace=True)

        OWGUI.hSlider(self.controlArea,
                      self,
                      "zoom",
                      box="Zoom",
                      minValue=1,
                      maxValue=100,
                      step=1,
                      callback=self.updateZoom,
                      createLabel=False)

        OWGUI.separator(self.controlArea)

        box = OWGUI.widgetBox(self.controlArea, "Selection")
        b = OWGUI.button(box, self, "Commit", callback=self.commit)
        cb = OWGUI.checkBox(box,
                            self,
                            "autoCommit",
                            "Commit on any change",
                            tooltip="Send selections on any change",
                            callback=self.commitIf)

        OWGUI.setStopper(self,
                         b,
                         cb,
                         "selectionChangedFlag",
                         callback=self.commit)

        OWGUI.rubber(self.controlArea)

        self.scene = GraphicsScene()
        self.sceneView = QGraphicsView(self.scene, self)
        self.sceneView.setAlignment(Qt.AlignTop | Qt.AlignLeft)
        self.sceneView.setRenderHint(QPainter.Antialiasing, True)
        self.sceneView.setRenderHint(QPainter.TextAntialiasing, True)
        self.sceneView.setFocusPolicy(Qt.WheelFocus)
        self.sceneView.setVerticalScrollBarPolicy(Qt.ScrollBarAlwaysOn)
        self.sceneView.installEventFilter(self)
        self.mainArea.layout().addWidget(self.sceneView)

        self.scene.selectionChanged.connect(self.onSelectionChanged)
        self.scene.selectionRectPointChanged.connect(
            self.onSelectionRectPointChanged, Qt.QueuedConnection)
        self.graphButton.clicked.connect(self.saveScene)
        self.resize(800, 600)

        self.thumbnailWidget = None
        self.sceneLayout = None
        self.selectedExamples = []

        #: List of _ImageItems
        self.items = []

        self._errcount = 0
        self._successcount = 0

        self.loader = ImageLoader(self)

        # Add the "orange-sf" path prefix for locating files
        # distributed using `serverfiles`.
        sfdir = serverfiles.localpath()
        if sfdir not in [unicode(p) for p in QDir.searchPaths("orange-sf")]:
            QDir.addSearchPath("orange-sf", sfdir)
Example #21
 def default_db_filename(cls, taxid):
     return serverfiles.localpath(
         cls.DOMAIN, cls.FILENAME.format(taxid=taxid))
Example #24
class ArrayExpressConnection(object):
    """
    Constructs and runs REST queries on ArrayExpress.

    :param address: Address of the ArrayExpress API.
    :param timeout: Timeout for the connection.

    """

    DEFAULT_ADDRESS = "http://www.ebi.ac.uk/arrayexpress/{format}/v2/"
    DEFAULT_FORMAT = "json"
    DEFAULT_CACHE = serverfiles.localpath("ArrayExpress",
                                          "ArrayExpressCache.shelve")

    # Order of arguments in the query
    _ARGS_ORDER = ["keywords", "species", "array"]

    def __init__(self,
                 address=None,
                 timeout=30,
                 cache=None,
                 username=None,
                 password=None):
        self.address = address if address is not None else self.DEFAULT_ADDRESS
        self.timeout = timeout
        self.cache = cache if cache is not None else self.DEFAULT_CACHE
        self.username = username
        self.password = password

    def format_query(self, **kwargs):
        """Format the query arguments in `kwargs`.

        >>> conn.format_query(gxa=True, efcount=(1, 5))
        'efcount=[1 TO 5]&gxa=true'

        """

        # Formatters for individual query arguments:
        def format_default(val):
            if isinstance(val, basestring):
                return val
            else:
                return "+".join(val)

        def format_species(val):
            return '"%s"' % val.lower()

        def format_gxa(val):
            if val:
                return "true"
            else:
                raise ValueError("gxa={0}".format(val))

        def format_expandefo(val):
            if val:
                return "on"
            else:
                raise ValueError("expandefo={0}".format(val))

        def format_true_false(val):
            return "true" if val else "false"

        def format_interval(val):
            if isinstance(val, tuple):
                return "[{0} TO {1}]".format(*val)
            else:
                raise ValueError("Must be an interval argument (min, max)!")

        def format_date(val):
            # TODO check if val contains a datetime.date object
            # assert proper format
            return format_interval(val)

        def format_wholewords(val):
            if val:
                return "on"
            else:
                raise ValueError("wholewords={0}".format(val))

        formaters = {
            "species": format_species,
            "gxa": format_gxa,
            "expandefo": format_expandefo,
            "directsub": format_true_false,
            "assaycount": format_interval,
            "efcount": format_interval,
            "samplecount": format_interval,
            "sacount": format_interval,
            "rawcount": format_interval,
            "fgemcount": format_interval,
            "miamescore": format_interval,
            "date": format_date,
            "wholewords": format_wholewords,
        }
        parts = []
        arg_items = kwargs.items()

        arg_items = sorted(arg_items,
                           key=lambda arg: self._ARGS_ORDER.index(arg[0])
                           if arg[0] in self._ARGS_ORDER else 100)

        for key, value in arg_items:
            if key == "format":
                continue  # format is handled in query_url
            if key not in ARRAYEXPRESS_FIELDS:
                raise ValueError("Invalid argument name: '{0}'".format(key))
            if value is not None and value != []:
                fmt = formaters.get(key, format_default)
                value = fmt(value)
                parts.append("{0}={1}".format(key, value))

        return "&".join(parts)

    def query_url(self, what="experiments", **kwargs):
        """Return a formatted query URL for the query arguments.

        >>> conn.query_url(accession="E-MEXP-31")
        'http://www.ebi.ac.uk/arrayexpress/json/v2/experiments?accession=E-MEXP-31'

        """
        query = self.format_query(**kwargs)
        url = posixpath.join(self.address, what)
        url = url.format(format=kwargs.get("format", self.DEFAULT_FORMAT))
        url = url + ("?" + query if query else "")
        url = url.replace(" ", "%20")
        return url

    def query_url_experiments(self, **kwargs):
        """Return query URL of formatted experiments for the query arguments.
        """
        return self.query_url("experiments", **kwargs)

    def query_url_files(self, **kwargs):
        """ Return query URL of formatted experiments for the query arguments.
        """
        return self.query_url("files", **kwargs)

    def query_experiment(self, **kwargs):
        """Return an open stream to the experiments query results. 
           Takes the same arguments as the :obj:`query_experiments` function.
        """
        url = self.query_url_experiments(**kwargs)
        stream = self._cache_urlopen(url, timeout=self.timeout)
        return stream

    def query_files(self, **kwargs):
        """Return an open stream to the files query results.
           Takes the same arguments as the :obj:`query_files` function.
        """
        url = self.query_url_files(**kwargs)
        stream = self._cache_urlopen(url, timeout=self.timeout)
        return stream

    def open_file(self, accession, kind="raw", ext=None):
        """ Return a file handle to experiment data.
        
        :param str accession:
        :param str kind: Experiment data type.
        
        Possible values for the parameter `kind`:
            - raw: return the raw data if available
            - processed: return the processed data if available
            - biosamples: a png or svg design image
            - idf: investigation description
            - adf: array design description
            - mageml: MAGE-ML file

        Example::

            >>> raw_file = conn.open_file("E-TABM-1087", kind="raw")
            >>> processed_file = conn.open_file("E-TABM-1087", kind="processed")

        """
        stream = self.query_files(accession=accession, format="json")
        data = json.load(stream)
        try:
            files = data["files"]["experiment"]["file"]
        except KeyError:
            raise ValueError(accession)

        for file in files:
            filekind = file["kind"]
            fileext = file["extension"]
            if (filekind == kind) and (fileext == ext or ext is None):
                url = file["url"]
                return self._cache_urlopen(str(url), timeout=self.timeout)

        raise ValueError("%s does not have a file of kind: %r" %
                         (accession, kind))

    def _cache_urlopen(self, url, timeout=30):
        if self.cache is not None:
            with self.open_cache("r") as cache:
                if url in cache:
                    return StringIO(cache[url])

            stream = urllib2.urlopen(url, timeout=timeout)
            data = stream.read()
            with self.open_cache("w") as cache:
                cache[url] = data

            return StringIO(data)
        else:
            return urllib2.urlopen(url, timeout=timeout)

    def open_cache(self, flag="r"):
        if isinstance(self.cache, basestring):
            try:
                return closing(_open_shelve(self.cache, flag))
            except Exception:
                return _fake_closing({})
        elif hasattr(self.cache, "close"):
            return closing(self.cache)
        elif self.cache is None:
            return _fake_closing({})
        else:
            return _fake_closing(self.cache)
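
A usage sketch tying the pieces together; the accessions come from the
docstrings above, and `json` is assumed to be imported at module level (as
`open_file` already relies on it):

    conn = ArrayExpressConnection()
    url = conn.query_url_experiments(accession="E-MEXP-31")
    stream = conn.query_experiment(accession="E-MEXP-31")
    results = json.load(stream)
    processed = conn.open_file("E-TABM-1087", kind="processed")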
Example #25
    def init_db(cls, version, taxid, cache_dir=None, dbfilename=None):
        if cache_dir is None:
            cache_dir = serverfiles.localpath(cls.DOMAIN)

        if dbfilename is None:
            dbfilename = cls.default_db_filename(taxid)

        pjoin = os.path.join

        base_url = "http://string-db.org/newstring_download/"

        def paths(flatfile):
            url = "{flatfile}.{version}/{taxid}.{flatfile}.{version}.txt.gz"
            url = url.format(flatfile=flatfile, version=version, taxid=taxid)
            return posixpath.basename(url), base_url + url

        def ffname(pattern):
            return pattern.format(taxid=taxid, version=version)

        links_filename, links_url = paths("protein.links")

        actions_filename, actions_url = paths("protein.actions")

        aliases_filename, aliases_url = paths("protein.aliases")

        def download(filename, url):
            with open(pjoin(cache_dir, filename + ".tmp"), "wb") as dest:
                wget(url, dst_obj=dest, progress=True)

            shutil.move(pjoin(cache_dir, filename + ".tmp"),
                        pjoin(cache_dir, filename))

        for fname, url in [(links_filename, links_url),
                           (actions_filename, actions_url),
                           (aliases_filename, aliases_url)]:
            if not os.path.exists(pjoin(cache_dir, fname)):
                download(fname, url)

        links_fileobj = open(pjoin(cache_dir, links_filename), "rb")
        actions_fileobj = open(pjoin(cache_dir, actions_filename), "rb")
        aliases_fileobj = open(pjoin(cache_dir, aliases_filename), "rb")

        links_file = gzip.GzipFile(fileobj=links_fileobj)
        actions_file = gzip.GzipFile(fileobj=actions_fileobj)
        aliases_file = gzip.GzipFile(fileobj=aliases_fileobj)

        progress = ConsoleProgressBar("Processing {}:".format(links_filename))
        progress(0.0)

        def st_size(filename):
            return os.stat(pjoin(cache_dir, filename)).st_size

        filesize = st_size(links_filename)

        con = sqlite3.connect(dbfilename)

        with con:
            cls.clear_db(con)

            links_file.readline()  # read the header line

            reader = csv.reader(links_file, delimiter=" ")

            def read_links(reader, progress):
                for i, (p1, p2, score) in enumerate(reader):
                    yield p1, p2, int(score)

                    if i % 100000 == 0:
                        # Update the progress every 100000 lines
                        progress(100.0 * links_fileobj.tell() / filesize)

            con.executemany("INSERT INTO links VALUES (?, ?, ?)",
                            read_links(reader, progress))

            progress.finish()

            def part(string, sep, part):
                return string.split(sep)[part]

            con.create_function("part", 3, part)
            con.execute("""
                INSERT INTO proteins
                SELECT protein_id1, part(protein_id1, '.', 0)
                FROM (SELECT DISTINCT(protein_id1)
                     FROM links
                     ORDER BY protein_id1)
            """)

            filesize = st_size(actions_filename)

            actions_file.readline()  # read header line

            progress = ConsoleProgressBar("Processing actions:")
            reader = csv.reader(actions_file, delimiter="\t")

            def read_actions(reader):
                for i, (p1, p2, mode, action, a_is_acting, score) in \
                        enumerate(reader):
                    yield p1, p2, mode, action, int(score)

                    if i % 10000 == 0:
                        progress(100.0 * actions_fileobj.tell() / filesize)

            con.executemany("INSERT INTO actions VALUES (?, ?, ?, ?, ?)",
                            read_actions(reader))

            progress.finish()

            filesize = st_size(aliases_filename)
            aliases_file.readline()  # read header line

            progress = ConsoleProgressBar("Processing aliases:")

            reader = csv.reader(aliases_file, delimiter="\t")

            def read_aliases(reader, progress):
                for i, (taxid, name, alias, source) in enumerate(reader):
                    yield (".".join([taxid, name]),
                           alias.decode("utf-8", errors="ignore"),
                           source.decode("utf-8", errors="ignore"))
                    if i % 10000 == 0:
                        progress(100.0 * aliases_fileobj.tell() / filesize)

            con.executemany("INSERT INTO aliases VALUES (?, ?, ?)",
                            read_aliases(reader, progress))

            progress.finish()

            print "Indexing the database"
            cls.create_db_index(con)

            con.executescript("""
                DROP TABLE IF EXISTS version;
                CREATE TABLE version (
                     string_version text,
                     api_version text
                );""")

            con.execute("""
                INSERT INTO version
                VALUES (?, ?)""", (version, cls.VERSION))
Example #26
class GeneExpressionAtlasConenction(object):
    """
    A connection to the Gene Expression Atlas database.

    :param address:
        Address of the GXA server (default: http://www-test.ebi.ac.uk/gxa/api/deprecated).
    :param timeout:
        Socket timeout (default 30).
    :param cache:
        A dict like object to use as a cache.

    """
    DEFAULT_ADDRESS = "http://www-test.ebi.ac.uk/gxa/api/deprecated"
    DEFAULT_CACHE = serverfiles.localpath("GeneAtlas",
                                          "GeneAtlasConnectionCache.shelve")

    def __init__(self, address=None, timeout=30, cache=None):

        self.address = address if address is not None else self.DEFAULT_ADDRESS
        self.timeout = timeout
        self.cache = cache if cache is not None else self.DEFAULT_CACHE

    def query(self,
              condition,
              format="json",
              start=None,
              rows=None,
              indent=False):
        warnings.warn(
            "The Gene Expression Atlas REST api has been deprecated and " +
            "will be removed in the future.", UserWarning)

        url = self.address + "?" + condition.rest()
        if start is not None and rows is not None:
            url += "&start={0}&rows={1}".format(start, rows)
        url += "&format={0}".format(format)
        if indent:
            url += "&indent"

        if self.cache is not None:
            return self._query_cached(url, format)
        else:
            return urllib2.urlopen(url)

    def _query_cached(self, url, format):
        if self.cache is not None:
            with self.open_cache("r") as cache:
                if url in cache:
                    return StringIO(cache[url])

            response = urllib2.urlopen(url)
            contents = response.read()
            # Check that the contents are valid JSON or XML (sometimes
            # the stream just stops in the middle); this makes sure we
            # don't cache an invalid response.
            # TODO: what about errors (e.g. 'cannot handle the
            # query in a timely fashion')?
            if format == "json":
                parse_json(StringIO(contents))
            else:
                parse_xml(StringIO(contents))

            with self.open_cache("w") as cache:
                cache[url] = contents

            return StringIO(contents)
        else:
            return urllib2.urlopen(url)

    def open_cache(self, flag="r"):
        """
        Return a context manager for a dict like object.
        """
        if isinstance(self.cache, basestring):
            try:
                return closing(_open_shelve(self.cache, flag))
            except Exception:
                return fake_closing({})
        else:
            return fake_closing(self.cache)
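
A hedged usage sketch; `condition` stands for a query-condition object
exposing the `.rest()` method that `query` assumes, and `json` is assumed
imported:

    conn = GeneExpressionAtlasConenction()
    stream = conn.query(condition, format="json", start=0, rows=200)
    results = json.load(stream)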