def download_data(cls, taxid=None, progress_callback=None):
    """
    Download the data for ``taxid`` from the GeneMANIA website and
    initialize the local database.
    """
    import tarfile

    baseurl = "http://genemania.org/data/current/"
    directory = orngServerFiles.localpath("PPI")
    if not os.path.exists(directory):
        os.makedirs(directory)

    if taxid is None:
        taxid = cls.common_taxids()

    if isinstance(taxid, (list, tuple)):
        taxids = taxid
    else:
        taxids = [taxid]

    for taxid in taxids:
        name = obiTaxonomy.name(taxid)
        name = name.replace(" ", "_")

        if progress_callback is None:
            progress = True
        else:
            progress = progress_callback

        # Per-organism interaction networks.
        filename = name + ".tgz"
        url = baseurl + "networks/" + filename
        wget(url, directory=directory, progress=progress)

        tgz_filename = os.path.join(directory, filename)
        tgz = tarfile.open(tgz_filename)
        tgz.extractall(directory)

        # Precombined network for the same organism.
        filename = name + ".COMBINED.tgz"
        url = baseurl + "precombined/" + filename
        wget(url, directory=directory, progress=progress)

        tgz_filename = os.path.join(directory, filename)
        tgz = tarfile.open(tgz_filename)
        tgz.extractall(directory)

        cls.init_db([taxid])
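# Usage sketch (an assumption, not confirmed by this file): download_data is
# a classmethod on the GeneMANIA database class in this module, so fetching
# and indexing the H. sapiens networks would look roughly like
#
#     GeneMANIA.download_data("9606")
#
# where "9606" is the NCBI taxonomy id for Homo sapiens.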
def download(filename, url):
    with open(pjoin(cache_dir, filename + ".tmp"), "wb") as dest:
        wget(url, dst_obj=dest, progress=True)

    shutil.move(pjoin(cache_dir, filename + ".tmp"),
                pjoin(cache_dir, filename))
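# Usage sketch (hypothetical; download relies on cache_dir, pjoin and wget
# being bound in the enclosing scope, as they are in init_db below):
#
#     download("9606.protein.links.detailed.txt.gz", url)
#
# Writing to "<filename>.tmp" and renaming afterwards means an interrupted
# transfer never leaves a truncated file under the final name.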
def init_db(cls, version, taxid, cache_dir=None, dbfilename=None):
    if cache_dir is None:
        cache_dir = serverfiles.localpath(cls.DOMAIN)

    if dbfilename is None:
        dbfilename = serverfiles.localpath(
            cls.DOMAIN,
            "string-protein-detailed.{taxid}.sqlite".format(taxid=taxid)
        )

    pjoin = os.path.join

    base_url = "http://string-db.org/newstring_download/"
    filename = "{taxid}.protein.links.detailed.{version}.txt.gz"
    filename = filename.format(version=version, taxid=taxid)
    url = base_url + "protein.links.detailed.{version}/" + filename
    url = url.format(version=version)

    if not os.path.exists(pjoin(cache_dir, filename)):
        wget(url, cache_dir, progress=True)

    links_fileobj = open(pjoin(cache_dir, filename), "rb")
    links_file = gzip.GzipFile(fileobj=links_fileobj)

    con = sqlite3.connect(dbfilename)
    with con:
        con.execute("""
            DROP TABLE IF EXISTS evidence
        """)

        con.execute("""
            CREATE TABLE evidence(
                protein_id1 TEXT,
                protein_id2 TEXT,
                neighborhood INTEGER,
                fusion INTEGER,
                cooccurence INTEGER,
                coexpression INTEGER,
                experimental INTEGER,
                database INTEGER,
                textmining INTEGER
            )
        """)

        links = csv.reader(links_file, delimiter=" ")
        next(links)  # skip the header line
        filesize = os.stat(pjoin(cache_dir, filename)).st_size

        progress = ConsoleProgressBar("Processing links file:")
        progress(1.0)

        def read_links(reader):
            # Drop the trailing combined-score column and report progress
            # based on how far into the compressed file we have read.
            for i, (p1, p2, n, f, c, cx, ex, db, t, _) in enumerate(reader):
                yield p1, p2, n, f, c, cx, ex, db, t
                if i % 10000 == 0:
                    progress(100.0 * links_fileobj.tell() / filesize)

        con.executemany("""
            INSERT INTO evidence
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, read_links(links))

        progress.finish()

        print("Indexing")

        con.execute("""
            CREATE INDEX IF NOT EXISTS index_evidence
                ON evidence (protein_id1, protein_id2)
        """)

        con.executescript("""
            DROP TABLE IF EXISTS version;

            CREATE TABLE version (
                string_version text,
                api_version text
            );
        """)

        con.execute("""
            INSERT INTO version
            VALUES (?, ?)
        """, (version, cls.VERSION))
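# Query sketch for the database built above (a minimal example; the class
# name, taxid and protein id are illustrative assumptions):
#
#     dbpath = serverfiles.localpath(
#         STRINGDetailed.DOMAIN, "string-protein-detailed.9606.sqlite")
#     con = sqlite3.connect(dbpath)
#     rows = con.execute(
#         "SELECT protein_id2, experimental, textmining "
#         "FROM evidence WHERE protein_id1 = ?",
#         ("9606.ENSP00000000233",)
#     ).fetchall()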