def test_gds_data(self):
    # End-to-end check of the GDS helpers: URL lookup, cache state,
    # download, and the parsed dataset metadata.
    sample = self.test_sample

    # test url
    self.assertIsNotNone(gds_download_url(sample))

    # file not in cache
    self.assertFalse(gds_is_cached(sample))

    # download gds from serverfiles
    try:
        makedirs(serverfiles.localpath(DOMAIN))
    except OSError:
        # An already existing directory is fine; anything else is a real
        # creation error and must surface.
        if not path.exists(serverfiles.localpath(DOMAIN)):
            raise

    gds_download(sample)

    # file in cache
    self.assertIsNone(gds_ensure_downloaded(sample))
    self.assertTrue(gds_is_cached(sample))

    dataset = GDS(sample)
    info = dataset.info
    self.assertIsNotNone(info)
    self.assertEqual(info['gene_count'], 9561)
    self.assertEqual(len(info['samples']), 4)
    self.assertEqual(len(info['subsets']), 2)
    self.assertEqual(info['taxid'], self.test_organism)

    # both orientations must come back as a Table
    self.assertIsInstance(dataset.get_data(), Table)
    self.assertIsInstance(dataset.get_data(transpose=True), Table)
def __init__(self, gds_name, remove_unknown=None):
    """
    Retrieval of a specific GEO DataSet as a :obj:`Orange.data.Table`.

    The constructor returns an object that can retrieve a GEO DataSet
    (samples and gene expressions). The local cache directory is checked
    first; if the data file is not there it is downloaded from `NCBI's GEO
    FTP site <ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SOFT/GDS/>`_.

    :param gds_name: An NCBI's ID for the data set in the form "GDSn"
        where "n" is a GDS ID number.
    :param remove_unknown: Remove spots with sample profiles that include
        unknown values. They are removed if the proportion of samples with
        unknown values is above the threshold set by ``remove_unknown``.
        If None, nothing is removed.
    """
    self.gds_name = gds_name
    self.filename = serverfiles.localpath(DOMAIN, self.gds_name + '.soft.gz')
    gds_ensure_downloaded(self.gds_name)

    # Containers are set up before parse_file is invoked.
    self.spot2gene = dict()
    self.gene2spots = dict()
    self.info = None
    self.gds_data = None

    self.parse_file(remove_unknown=remove_unknown)

    # Only an unambiguous (single) taxonomy match is accepted as the taxid.
    matches = taxonomy.search(self.info["sample_organism"], exact=True)
    if len(matches) == 1:
        self.info["taxid"] = matches[0]
    else:
        self.info["taxid"] = None

    self.genes = sorted(self.gene2spots)
    self.spots = sorted(self.spot2gene)
    self.info["gene_count"] = len(self.genes)
class OMIM:
    """
    Access to an OMIM disease/gene listing stored as a text file.

    Each non-empty line of the file is wrapped in a ``Disease`` object and
    kept together with the raw line; gene names are read from the second
    ``|``-separated field of that line.
    """
    VERSION = 1
    DEFAULT_DATABASE_PATH = serverfiles.localpath(DOMAIN)

    def __init__(self, local_database_path=None):
        """
        Load the OMIM file.

        :param local_database_path: Directory containing ``FILENAME``. If
            None (or equal to the default path) the file is obtained with
            ``serverfiles.localpath_download``.
        """
        self.local_database_path = local_database_path \
            if local_database_path is not None else self.DEFAULT_DATABASE_PATH

        if self.local_database_path == self.DEFAULT_DATABASE_PATH:
            filename = serverfiles.localpath_download(DOMAIN, FILENAME)
        else:
            filename = os.path.join(self.local_database_path, FILENAME)

        self.load(filename)

    @classmethod
    def download_from_NCBI(cls, file=None):
        """
        Copy the content of ``FTP_URL`` into ``file``.

        :param file: A path (opened here in binary write mode) or an already
            opened writable binary file object. The file is closed when the
            copy finishes, as before, and now also when the copy fails.
        """
        if isinstance(file, str):
            file = open(file, "wb")
        try:
            stream = urlopen(FTP_URL)
            try:
                # Default buffer size; the former 10-byte buffer only made
                # the copy slower without changing the result.
                shutil.copyfileobj(stream, file)
            finally:
                stream.close()
        finally:
            file.close()

    @classmethod
    def get_instance(cls):
        """Return an OMIM object sharing state with one lazily loaded instance."""
        if not hasattr(cls, "_shared_dict"):
            omim = OMIM()
            cls._shared_dict = omim.__dict__
        instance = OMIM.__new__(OMIM)
        instance.__dict__ = cls._shared_dict
        return instance

    def load(self, filename):
        """Read ``filename`` and build the ``Disease`` -> raw line mapping."""
        # The context manager closes the handle, which was previously leaked.
        with open(filename, "r") as file:
            lines = file.read().splitlines()
        self._disease_dict = {Disease(line): line for line in lines if line}

    def diseases(self):
        """Return a view of all loaded ``Disease`` objects."""
        # A leftover debug print of the whole dictionary used to run here on
        # every call (including the calls made by genes()/gene_diseases()).
        return self._disease_dict.keys()

    def genes(self):
        """Return a sorted list of all distinct gene names."""
        # A set comprehension avoids the quadratic list concatenation.
        return sorted({
            gene
            for disease in self.diseases()
            for gene in self.disease_genes(disease)
        })

    def disease_genes(self, disease):
        """Return the list of gene names recorded for ``disease``."""
        return self._disease_dict[disease].split("|")[1].split(", ")

    def gene_diseases(self):
        """Return a ``defaultdict`` mapping each gene name to a set of diseases."""
        d = defaultdict(set)
        for disease in self.diseases():
            for gene in self.disease_genes(disease):
                d[gene].add(disease)
        return d
def updateInfo(self):
    """Show dataset, cached-dataset and (if filtered) visible-row counts in the info box."""
    total = len(self.gds_info)
    cached = len(glob.glob(serverfiles.localpath("GEO") + "/GDS*"))
    summary = "%i datasets\n%i datasets cached\n" % (total, cached)

    visible_rows = self.treeWidget.model().rowCount()
    # The filtered count is only mentioned when the filter hides something.
    if len(self.gds) != visible_rows:
        summary += "%i after filtering" % visible_rows

    self.infoBox.setText(summary)
def __move_to_serverfiles_folder(self, selected_file_path):
    """
    Copy the selected file into its serverfiles domain folder.

    The destination is built from ``self.info_state['domain']`` and
    ``self.info_state['filename']``. After a successful copy an .info file
    is created next to it from ``self.info_state``.
    """
    target_dir = serverfiles.localpath(self.info_state['domain'])
    target_file = os.path.join(target_dir, self.info_state['filename'])

    create_folder(target_dir)

    try:
        copyfile(selected_file_path, target_file)
    except IOError as copy_error:
        # TODO: handle error properly
        raise copy_error

    # if copy successful create .info file
    create_info_file(target_file, **self.info_state)
def gds_download(gds_name, progress=None):
    """
    Download the GDS dataset into the cache.

    The data is first written to a temporary file in the cache directory and
    only moved onto the final ``<gds_name>.soft.gz`` path once the transfer
    has finished. On any failure the temporary file is removed (best effort)
    and the error is re-raised.
    """
    source_url = gds_download_url(gds_name)
    archive_name = gds_name + ".soft.gz"
    destination = os.path.join(serverfiles.localpath(DOMAIN), archive_name)

    scratch = NamedTemporaryFile(prefix=archive_name + "-",
                                 dir=serverfiles.localpath(DOMAIN),
                                 delete=False)
    try:
        retrieve_url(source_url, scratch, progress=progress)
    except BaseException as err:
        # Clean up the partial download; cleanup problems must not hide
        # the original error.
        try:
            scratch.close()
            os.remove(scratch.name)
        except (OSError, IOError):
            pass
        raise err

    # Reached only when the transfer succeeded.
    scratch.close()
    os.replace(scratch.name, destination)
def _update_tool_tip(self, fs):
    """
    Build the tooltip text for ``fs`` and set it on the item's columns.

    The tooltip lists the state, the tags not starting with ``#``, the local
    file path (for CURRENT, OUTDATED and DEPRECATED states) and the server
    version. For an OUTDATED file with a non-zero age, the age in days is
    added as well.

    :param fs: File state object; ``state``, ``tags``, ``domain``,
        ``filename``, ``datetime``, ``info_server`` and ``info_local`` are
        read from it.
    """
    state_str = self.STATE_STRINGS[fs.state]

    # The age is only shown for OUTDATED files, so that is the state to test.
    # The previous check compared the ``fs`` object itself with DEPRECATED,
    # which left the "old N days" text unreachable.
    diff_date = None
    if fs.state == OUTDATED:
        try:
            diff_date = fs.info_server.datetime - fs.info_local.datetime
        except (AttributeError, TypeError):
            # Missing server/local info: fall back to the plain version line.
            diff_date = None

    tooltip = "State: {}\nTags: {}".format(
        state_str,
        ', '.join(tag for tag in fs.tags if not tag.startswith("#")))

    if fs.state in [CURRENT, OUTDATED, DEPRECATED]:
        tooltip += "\nFile: {}".format(
            serverfiles.localpath(fs.domain, fs.filename))

    if fs.state == OUTDATED and diff_date:
        tooltip += "\nServer version: {}\nStatus: old {} days".format(
            fs.datetime, diff_date.days)
    else:
        tooltip += "\nServer version: {}".format(fs.datetime)

    for i in range(1, len(header_labels) - 1):
        self.setToolTip(i, tooltip)
def _updateToolTip(self):
    """
    Build the tooltip text for ``self.item`` and set it on columns 1-3.

    The tooltip lists the state, the tags not starting with ``#``, the local
    file path (for CURRENT, OUTDATED and DEPRECATED states) and the server
    version. For an OUTDATED item with a non-zero age, the age in days is
    added as well.
    """
    state_str = self.STATE_STRINGS[self.item.state]

    # Best effort: the age cannot be computed when either timestamp is
    # unavailable. ``Exception`` replaces the former bare ``except`` so that
    # KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        diff_date = self.item.latest - self.item.local
    except Exception:
        diff_date = None

    tooltip = ("State: %s\nTags: %s" %
               (state_str,
                ", ".join(tag for tag in self.item.tags
                          if not tag.startswith("#"))))

    if self.item.state in [CURRENT, OUTDATED, DEPRECATED]:
        tooltip += ("\nFile: %s" %
                    serverfiles.localpath(self.item.domain,
                                          self.item.filename))

    if self.item.state == OUTDATED and diff_date:
        tooltip += ("\nServer version: %s\nStatus: old (%d days)" %
                    (self.item.latest, diff_date.days))
    else:
        tooltip += ("\nServer version: %s" % self.item.latest)

    for i in range(1, 4):
        self.setToolTip(i, tooltip)
""" from __future__ import absolute_import import os from six import StringIO from orangecontrib.bioinformatics.utils import serverfiles try: import ConfigParser as configparser except ImportError: import configparser kegg_dir = serverfiles.localpath("KEGG2") default = """ [cache] # path = %(home)s/.obiKEGG/ path = %(kegg_dir)s/ store = sqlite3 invalidate = weekly [service] transport = urllib2 # transport = requests """ # Orange kegg files dir
def gds_is_cached(gds_name):
    """Return True if ``<gds_name>.soft.gz`` exists as a file in the local ``DOMAIN`` directory."""
    cache_dir = serverfiles.localpath(DOMAIN)
    candidate = os.path.join(cache_dir, gds_name + ".soft.gz")
    return os.path.isfile(candidate)
def get_gds_model(progress=lambda val: None):
    """
    Initialize and return a GDS datasets model.

    :param progress: A progress callback. It is called with 1, 20 and 50
        at successive stages of the construction.
    :returns: A tuple of (QStandardItemModel, GDSInfo, [GDS])

    .. note::
        The returned QStandardItemModel's thread affinity is set to
        the GUI thread.
    """
    progress(1)
    info = GDSInfo()
    search_keys = ["dataset_id", "title", "platform_organism", "description"]
    cache_dir = serverfiles.localpath(DOMAIN)
    gds_link = "http://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc={0}"
    pm_link = "http://www.ncbi.nlm.nih.gov/pubmed/{0}"
    gds_list = []

    def is_cached(gds):
        # Path of the dataset's .soft.gz file in the cache directory.
        cached_path = os.path.join(cache_dir, gds["dataset_id"]) + ".soft.gz"
        return os.path.exists(cached_path)

    def make_item(display_value, role_values=None):
        # One model cell: the display value plus optional extra role data.
        cell = QStandardItem()
        cell.setData(display_value, Qt.DisplayRole)
        if role_values is not None:
            for role, value in role_values.items():
                cell.setData(value, role)
        return cell

    def gds_to_row(gds):
        #: Text for easier full search.
        search_text = " | ".join(
            gds.get(key, "").lower() for key in search_keys)

        cached_cell = make_item(" " if is_cached(gds) else "",
                                {TextFilterRole: search_text})
        id_cell = make_item(
            gds["dataset_id"],
            {gui.LinkRole: gds_link.format(gds["dataset_id"])})
        title_cell = make_item(gds["title"])
        organism_cell = make_item(gds["platform_organism"])
        samples_cell = make_item(len(gds["samples"]))
        features_cell = make_item(gds["feature_count"])
        genes_cell = make_item(gds["gene_count"])
        subsets_cell = make_item(len(gds["subsets"]))

        # The PubMed link is only set when an id is present.
        if gds.get("pubmed_id"):
            pubmed_url = pm_link.format(gds["pubmed_id"])
        else:
            pubmed_url = None
        pubmed_cell = make_item(gds.get("pubmed_id", ""),
                                {gui.LinkRole: pubmed_url})

        return [
            cached_cell, id_cell, title_cell, organism_cell, samples_cell,
            features_cell, genes_cell, subsets_cell, pubmed_cell,
        ]

    model = QStandardItemModel()
    model.setHorizontalHeaderLabels([
        "", "ID", "Title", "Organism", "Samples", "Features", "Genes",
        "Subsets", "PubMedID"
    ])
    progress(20)

    for gds in info.values():
        model.appendRow(gds_to_row(gds))
        gds_list.append(gds)
    progress(50)

    # Hand the model over to the GUI thread when built on another thread.
    if QThread.currentThread() is not QCoreApplication.instance().thread():
        gui_thread = QCoreApplication.instance().thread()
        model.moveToThread(gui_thread)

    return model, info, gds_list
class DictyMutants:
    """
    A collection of Dictybase mutants, loaded from a pickled file and
    optionally rebuilt from the Dictybase download pages.
    """
    # use a default local folder for storing the genesets
    DEFAULT_DATABASE_PATH = serverfiles.localpath(DOMAIN)

    def __init__(self, local_database_path=None):
        """
        A collection of Dictybase mutants as a dictionary of
        :obj:`DictyMutant` objects.

        :param local_database_path: A path for storing D. dictyostelium
            mutants objects. If `None` then a default database path is used.
        """
        self.local_database_path = local_database_path \
            if local_database_path is not None else self.DEFAULT_DATABASE_PATH

        if not os.path.exists(self.local_database_path):
            os.mkdir(self.local_database_path)

        # NOTE(review): this unpickles a downloaded file; pickle can execute
        # arbitrary code, so the source of this file must be trusted.
        # The context manager closes the handle, which was previously leaked.
        phenotypes_path = serverfiles.localpath_download(DOMAIN, PHENOTYPES_FILENAME)
        with open(phenotypes_path, "rb") as pickled:
            self._mutants = pickle.load(pickled)

    def update_file(self, name):
        """
        Download the Dictybase file ``name`` into the local database folder.

        The content is written to ``<name>_temp`` first and renamed onto the
        final path afterwards.

        :returns: The path of the downloaded file.
        """
        url = "http://dictybase.org/db/cgi-bin/dictyBase/download/download.pl?area=mutant_phenotypes&ID="
        filename = os.path.join(self.local_database_path, name)
        temp_file = os.path.join(self.local_database_path, name + "_temp")
        stream = urlopen(url + name)
        try:
            with open(temp_file, "wb") as file:
                shutil.copyfileobj(stream, file)
        finally:
            # The response used to be left open.
            stream.close()
        os.rename(temp_file, filename)
        return filename

    def load_mutants(self, file):
        """Return the lines of ``file`` with the first (header) line dropped."""
        with open(file) as data:
            data.readline()  # remove data_header
            return data.read().splitlines()

    def download_mutants(self):
        """
        Download all mutant listings and build the mutants dictionary.

        Every mutant from ``all-mutants.txt`` becomes a :obj:`DictyMutant`;
        the attribute named after a category is set to True on mutants whose
        name appears in that category's listing.

        :returns: A dict mapping each :obj:`DictyMutant` to itself.
        """
        # (attribute set on matching mutants, listing file), in download order
        categories = [
            ("null", "null-mutants.txt"),
            ("overexp", "overexpression-mutants.txt"),
            ("multiple", "multiple-mutants.txt"),
            ("develop", "developmental-mutants.txt"),
            ("other", "other-mutants.txt"),
        ]

        all_mutants = self.load_mutants(self.update_file("all-mutants.txt"))
        category_lines = [
            (flag, self.load_mutants(self.update_file(listing)))
            for flag, listing in categories
        ]

        _mutants = [DictyMutant(mutant) for mutant in all_mutants]
        category_names = [
            (flag, {DictyMutant(line).name for line in lines})
            for flag, lines in category_lines
        ]

        for mutant in _mutants:
            for flag, names in category_names:
                if mutant.name in names:
                    setattr(mutant, flag, True)

        return {x: x for x in _mutants}

    def pickle_data(self):
        """Return the freshly downloaded mutants dictionary as a pickle byte string."""
        return pickle.dumps(self.download_mutants(), -1)

    @classmethod
    def get_instance(cls):
        """Return a DictyMutants object sharing state with one lazily loaded instance."""
        if not hasattr(cls, "_shared_dict"):
            dicty = DictyMutants()
            cls._shared_dict = dicty.__dict__
        instance = DictyMutants.__new__(DictyMutants)
        instance.__dict__ = cls._shared_dict
        return instance

    def mutants(self):
        """Return a list of all mutants."""
        return list(self._mutants.keys())

    def genes(self):
        """Return a sorted list of all distinct genes over all mutants."""
        # A set comprehension avoids the quadratic list concatenation.
        return sorted({
            gene
            for mutant in self.mutants()
            for gene in self.mutant_genes(mutant)
        })

    def phenotypes(self):
        """Return a sorted list of all distinct phenotypes over all mutants."""
        return sorted({
            phenotype
            for mutant in self.mutants()
            for phenotype in self.mutant_phenotypes(mutant)
        })

    def mutant_genes(self, mutant):
        """Return the ``genes`` attribute of the stored ``mutant``."""
        return self._mutants[mutant].genes

    def mutant_phenotypes(self, mutant):
        """Return the ``phenotypes`` attribute of the stored ``mutant``."""
        return self._mutants[mutant].phenotypes

    def gene_mutants(self):
        """Return a ``defaultdict`` mapping each gene to the set of its mutants."""
        dgm = defaultdict(set)
        for mutant in self.mutants():
            for gene in self.mutant_genes(mutant):
                dgm[gene].add(mutant)
        return dgm

    def phenotype_mutants(self):
        """Return a ``defaultdict`` mapping each phenotype to the set of its mutants."""
        dpm = defaultdict(set)
        for mutant in self.mutants():
            for phenotype in self.mutant_phenotypes(mutant):
                dpm[phenotype].add(mutant)
        return dpm
""" Gene Ontology module """ import os import re import sys import tarfile import warnings from collections import namedtuple, defaultdict import six from orangecontrib.bioinformatics.ncbi import taxonomy from orangecontrib.bioinformatics.utils import statistics, serverfiles, progress_bar_milestones from orangecontrib.bioinformatics.go.config import DOMAIN, FILENAME_ONTOLOGY, FILENAME_ANNOTATION intern = sys.intern default_database_path = os.path.join(serverfiles.localpath(), DOMAIN) _CVS_REVISION_RE = re.compile(r"^(rev)?(\d+\.\d+)+$") evidence_types = { # Experimental 'EXP': 'Inferred from Experiment', 'IDA': 'Inferred from Direct Assay', 'IPI': 'Inferred from Physical Interaction', # [with <database:protein_name>]', 'IMP': 'Inferred from Mutant Phenotype', 'IGI': 'Inferred from Genetic Interaction', # [with <database:gene_symbol[allele_symbol]>]', 'IEP': 'Inferred from Expression Pattern', # Computational Analysis Evidence Codes 'ISS':
class DictyMutants:
    # use a default local folder for storing the genesets
    DEFAULT_DATABASE_PATH = serverfiles.localpath(DOMAIN)

    def __init__(self, file_path=None):
        """
        A collection of Dictybase mutants as a dictionary of
        :obj:`DictyMutant` objects.

        :param file_path: JSON file with the mutant records. If None, the
            file is obtained with ``serverfiles.localpath_download``.
        """
        if file_path is None:
            file_path = serverfiles.localpath_download(DOMAIN, PHENOTYPES_FILENAME)

        with open(file_path, 'r') as handle:
            parsed = [DictyMutant(record) for record in json.load(handle)]

        # Each mutant is stored as both key and value.
        by_mutant = {}
        for entry in parsed:
            by_mutant[entry] = entry
        self._mutants = by_mutant

    @classmethod
    def get_instance(cls):
        """Return a DictyMutants object sharing state with one lazily loaded instance."""
        if not hasattr(cls, "_shared_dict"):
            cls._shared_dict = DictyMutants().__dict__

        shared = DictyMutants.__new__(DictyMutants)
        shared.__dict__ = cls._shared_dict
        return shared

    def mutants(self):
        """Return a list of all mutants."""
        return list(self._mutants.keys())

    def genes(self):
        """Return a sorted list of all distinct genes over all mutants."""
        gene_lists = [self.mutant_genes(mutant) for mutant in self.mutants()]
        return sorted(set(reduce(list.__add__, gene_lists, [])))

    def phenotypes(self):
        """Return a sorted list of all distinct phenotypes over all mutants."""
        phenotype_lists = [
            self.mutant_phenotypes(mutant) for mutant in self.mutants()
        ]
        return sorted(set(reduce(list.__add__, phenotype_lists, [])))

    def mutant_genes(self, mutant):
        """Return the ``genes`` attribute of the stored ``mutant``."""
        stored = self._mutants[mutant]
        return stored.genes

    def mutant_phenotypes(self, mutant):
        """Return the ``phenotypes`` attribute of the stored ``mutant``."""
        stored = self._mutants[mutant]
        return stored.phenotypes

    def gene_mutants(self):
        """Return a ``defaultdict`` mapping each gene to the set of its mutants."""
        index = defaultdict(set)
        pairs = [(mutant, self.mutant_genes(mutant))
                 for mutant in self.mutants()]
        for mutant, gene_list in pairs:
            for gene in gene_list:
                index[gene].add(mutant)
        return index

    def phenotype_mutants(self):
        """Return a ``defaultdict`` mapping each phenotype to the set of its mutants."""
        index = defaultdict(set)
        pairs = [(mutant, self.mutant_phenotypes(mutant))
                 for mutant in self.mutants()]
        for mutant, phenotype_list in pairs:
            for phenotype in phenotype_list:
                index[phenotype].add(mutant)
        return index