def wget(url, directory=".", dst_obj=None, progress=None):
    """Download `url` into `directory` (or into an already-open `dst_obj`).

    Parameters
    ----------
    url : str
        The URL to fetch (opened with ``urllib2.urlopen``).
    directory : str
        Target directory; used only when `dst_obj` is None.
    dst_obj : file-like or None
        If given, the downloaded bytes are written to this object and the
        caller retains ownership (it is NOT closed here).  If None, a file
        named after the URL's basename is created in `directory`, and it is
        closed before returning.
    progress : True or callable or None
        If ``True`` a ``ConsoleProgressBar`` is created and driven for the
        duration of the transfer; otherwise passed through to
        ``copyfileobj`` unchanged.
    """
    stream = urllib2.urlopen(url)
    try:
        length = stream.headers.get("content-length", None)
        # Unknown length: use a huge sentinel so copyfileobj never thinks
        # the transfer is complete prematurely.
        length = sys.maxint if length is None else int(length)

        basename = posixpath.basename(url)
        own_dst = dst_obj is None
        if own_dst:
            dst_obj = open(os.path.join(directory, basename), "wb")
        try:
            if progress is True:
                from Orange.utils import ConsoleProgressBar
                progress = ConsoleProgressBar("Downloading %r." % basename)
                with finishing(progress):
                    copyfileobj(stream, dst_obj, buffer=2**10,
                                content_len=length, progress=progress)
            else:
                copyfileobj(stream, dst_obj, buffer=2**10,
                            content_len=length, progress=progress)
        finally:
            # Close the destination only if we opened it ourselves;
            # the original code leaked this handle.
            if own_dst:
                dst_obj.close()
    finally:
        stream.close()
def getstring(self):
    """Return the progress line extended with total size, speed and ETA.

    Guards against ZeroDivisionError: the elapsed time is clamped to at
    least 0.1 s (calling this immediately after construction gives a zero
    interval) and the computed speed to at least 1 byte/s (no data moved
    yet).  These are the same guards used by the sibling implementation
    elsewhere in this file.
    """
    elapsed = max(time.time() - self.starttime, 0.1)
    speed = max(int(self.state * self.size / 100.0 / elapsed), 1)
    eta = (100 - self.state) * self.size / 100.0 / speed
    return ConsoleProgressBar.getstring(self) + \
        " %s %12s/s %3i:%02i ETA" % (self.sizeof_fmt(self.size),
                                     self.sizeof_fmt(speed),
                                     eta / 60, eta % 60)
def getstring(self):
    """Format the progress line with total size, transfer speed and ETA."""
    elapsed = time.time() - self.starttime
    if elapsed < 0.1:
        elapsed = 0.1  # clamp: avoid dividing by a zero interval
    speed = int(self.state * self.size / 100.0 / elapsed)
    if speed < 1:
        speed = 1  # clamp: avoid zero division in the ETA estimate
    eta = (100 - self.state) * self.size / 100.0 / speed
    base = ConsoleProgressBar.getstring(self)
    suffix = " %s %12s/s %3i:%02i ETA" % (
        self.sizeof_fmt(self.size), self.sizeof_fmt(speed),
        eta / 60, eta % 60)
    return base + suffix
def __call__(self, *args, **kwargs):
    """Advance the console progress bar, then mirror the new state to the
    optional `redirect` callback before returning the base-class result."""
    result = ConsoleProgressBar.__call__(self, *args, **kwargs)
    if self.redirect:
        self.redirect(self.state)
    return result
def __init__(self, filename, size): print "Downloading", filename ConsoleProgressBar.__init__(self, "progress:", 20) self.size = size self.starttime = time.time() self.speed = 0.0
def __init__(self, filename, size):
    """Print which file is being downloaded, initialize the underlying
    20-column progress bar, and stash the size/start-time bookkeeping
    used by the speed and ETA display."""
    print("Downloading", filename)
    ConsoleProgressBar.__init__(self, "progress:", 20)
    self.starttime = time.time()
    self.speed = 0.0
    self.size = size
def init_db(cls, version, taxid, cache_dir=None, dbfilename=None):
    """Download the STRING flat files (links, actions, aliases) for one
    organism and import them into a local sqlite database.

    Parameters
    ----------
    version : str
        STRING release version, spliced into the download URLs.
    taxid : str
        NCBI taxonomy id selecting the organism's flat files.
    cache_dir : str or None
        Where the downloaded ``.txt.gz`` files are cached; defaults to the
        serverfiles local path for ``cls.DOMAIN``.
    dbfilename : str or None
        Target sqlite file; defaults to ``cls.default_db_filename(taxid)``.
    """
    if cache_dir is None:
        cache_dir = serverfiles.localpath(cls.DOMAIN)
    if dbfilename is None:
        dbfilename = cls.default_db_filename(taxid)
    pjoin = os.path.join
    base_url = "http://string-db.org/newstring_download/"

    def paths(flatfile):
        # Build (local filename, full download URL) for one flat file kind.
        url = "{flatfile}.{version}/{taxid}.{flatfile}.{version}.txt.gz"
        url = url.format(flatfile=flatfile, version=version, taxid=taxid)
        return posixpath.basename(url), base_url + url

    # NOTE(review): `ffname` is defined but never used in this function.
    def ffname(pattern):
        return pattern.format(taxid=taxid, version=version)

    links_filename, links_url = paths("protein.links")
    actions_filename, actions_url = paths("protein.actions")
    aliases_filename, aliases_url = paths("protein.aliases")

    def download(filename, url):
        # Download to a ".tmp" name first, then rename, so an interrupted
        # transfer never leaves a half-written file under the final name.
        with open(pjoin(cache_dir, filename + ".tmp"), "wb") as dest:
            wget(url, dst_obj=dest, progress=True)
        shutil.move(pjoin(cache_dir, filename + ".tmp"),
                    pjoin(cache_dir, filename))

    # Fetch each flat file only if it is not already cached.
    for fname, url in [(links_filename, links_url),
                       (actions_filename, actions_url),
                       (aliases_filename, aliases_url)]:
        if not os.path.exists(pjoin(cache_dir, fname)):
            download(fname, url)

    # Keep the raw file objects around separately from the gzip wrappers:
    # the generators below call .tell() on the *compressed* stream to
    # report progress against the on-disk file size.
    # NOTE(review): none of these six file objects is ever closed here.
    links_fileobj = open(pjoin(cache_dir, links_filename), "rb")
    actions_fileobj = open(pjoin(cache_dir, actions_filename), "rb")
    aliases_fileobj = open(pjoin(cache_dir, aliases_filename), "rb")

    links_file = gzip.GzipFile(fileobj=links_fileobj)
    actions_file = gzip.GzipFile(fileobj=actions_fileobj)
    aliases_file = gzip.GzipFile(fileobj=aliases_fileobj)

    progress = ConsoleProgressBar("Processing {}:".format(links_filename))
    progress(0.0)

    def st_size(filename):
        # Size of the cached (compressed) file, for progress reporting.
        return os.stat(pjoin(cache_dir, filename)).st_size

    filesize = st_size(links_filename)

    con = sqlite3.connect(dbfilename)
    # `with con` wraps the whole import in a single transaction.
    with con:
        cls.clear_db(con)

        links_file.readline()  # read the header line

        reader = csv.reader(links_file, delimiter=" ")

        def read_links(reader, progress):
            # Stream (protein1, protein2, score) rows into executemany.
            for i, (p1, p2, score) in enumerate(reader):
                yield p1, p2, int(score)
                if i % 100000 == 0:
                    # Update the progress every 100000 lines
                    progress(100.0 * links_fileobj.tell() / filesize)

        con.executemany("INSERT INTO links VALUES (?, ?, ?)",
                        read_links(reader, progress))
        progress.finish()

        def part(string, sep, part):
            # SQL helper: nth '.'-separated component of a string.
            return string.split(sep)[part]

        con.create_function("part", 3, part)
        # Derive the proteins table from the distinct link endpoints;
        # part(id, '.', 0) extracts the leading taxonomy-id component.
        con.execute("""
            INSERT INTO proteins
            SELECT protein_id1, part(protein_id1, '.', 0)
            FROM (SELECT DISTINCT(protein_id1)
                  FROM links ORDER BY protein_id1)
        """)

        filesize = st_size(actions_filename)
        actions_file.readline()  # read header line
        progress = ConsoleProgressBar("Processing actions:")
        reader = csv.reader(actions_file, delimiter="\t")

        def read_actions(reader):
            # `a_is_acting` is parsed but intentionally not stored.
            for i, (p1, p2, mode, action, a_is_acting, score) in \
                    enumerate(reader):
                yield p1, p2, mode, action, int(score)
                if i % 10000 == 0:
                    progress(100.0 * actions_fileobj.tell() / filesize)

        con.executemany("INSERT INTO actions VALUES (?, ?, ?, ?, ?)",
                        read_actions(reader))
        progress.finish()

        filesize = st_size(aliases_filename)
        aliases_file.readline()  # read header line
        progress = ConsoleProgressBar("Processing aliases:")
        reader = csv.reader(aliases_file, delimiter="\t")

        def read_aliases(reader, progress):
            # NOTE(review): the loop variable `taxid` shadows the outer
            # `taxid` parameter; here it is the per-row taxonomy column.
            # Protein ids are stored as "<taxid>.<name>".
            for i, (taxid, name, alias, source) in enumerate(reader):
                yield (".".join([taxid, name]),
                       alias.decode("utf-8", errors="ignore"),
                       source.decode("utf-8", errors="ignore"))
                if i % 10000 == 0:
                    progress(100.0 * aliases_fileobj.tell() / filesize)

        con.executemany("INSERT INTO aliases VALUES (?, ?, ?)",
                        read_aliases(reader, progress))
        progress.finish()

        print("Indexing the database")
        cls.create_db_index(con)

        # Record which STRING release and which schema version built this db.
        con.executescript("""
            DROP TABLE IF EXISTS version;
            CREATE TABLE version (
                string_version text,
                api_version text
            );""")
        con.execute("""
            INSERT INTO version
            VALUES (?, ?)""", (version, cls.VERSION))
def init_db(cls, version, taxid, cache_dir=None, dbfilename=None):
    """Download the STRING *detailed* protein-links flat file for one
    organism and import its per-channel evidence scores into a local
    sqlite database (table ``evidence``).

    Parameters
    ----------
    version : str
        STRING release version, spliced into the download URL.
    taxid : str
        NCBI taxonomy id selecting the organism's flat file.
    cache_dir : str or None
        Where the downloaded ``.txt.gz`` file is cached; defaults to the
        serverfiles local path for ``cls.DOMAIN``.
    dbfilename : str or None
        Target sqlite file; defaults to
        ``string-protein-detailed.<taxid>.sqlite`` under ``cls.DOMAIN``.
    """
    if cache_dir is None:
        cache_dir = serverfiles.localpath(cls.DOMAIN)
    if dbfilename is None:
        dbfilename = serverfiles.localpath(
            cls.DOMAIN,
            "string-protein-detailed.{taxid}.sqlite".format(taxid=taxid)
        )
    pjoin = os.path.join
    base_url = "http://string-db.org/newstring_download/"
    filename = "{taxid}.protein.links.detailed.{version}.txt.gz"
    filename = filename.format(version=version, taxid=taxid)
    url = base_url + "protein.links.detailed.{version}/" + filename
    url = url.format(version=version)

    # Fetch only if not already cached.
    if not os.path.exists(pjoin(cache_dir, filename)):
        wget(url, cache_dir, progress=True)

    # Raw file object kept separately: .tell() on the compressed stream
    # drives the progress bar against the on-disk file size.
    # NOTE(review): links_fileobj is never closed here.
    links_fileobj = open(pjoin(cache_dir, filename), "rb")
    links_file = gzip.GzipFile(fileobj=links_fileobj)

    con = sqlite3.connect(dbfilename)
    # `with con` wraps the whole import in a single transaction.
    with con:
        con.execute("""
            DROP TABLE IF EXISTS evidence
        """)
        con.execute("""
            CREATE TABLE evidence(
                 protein_id1 TEXT,
                 protein_id2 TEXT,
                 neighborhood INTEGER,
                 fusion INTEGER,
                 cooccurence INTEGER,
                 coexpression INTEGER,
                 experimental INTEGER,
                 database INTEGER,
                 textmining INTEGER
            )
        """)
        links = csv.reader(links_file, delimiter=" ")
        # NOTE(review): `.next()` is Python-2 only (py3 would need next()).
        links.next()  # Read header
        filesize = os.stat(pjoin(cache_dir, filename)).st_size

        progress = ConsoleProgressBar("Processing links file:")
        progress(1.0)

        def read_links(reader):
            # Each row has 10 columns; the trailing `_` (presumably the
            # combined score — TODO confirm against the STRING format)
            # is discarded, leaving 9 values for the evidence table.
            for i, (p1, p2, n, f, c, cx, ex, db, t, _) in \
                    enumerate(reader):
                yield p1, p2, n, f, c, cx, ex, db, t
                if i % 10000 == 0:
                    progress(100.0 * links_fileobj.tell() / filesize)

        con.executemany("""
            INSERT INTO evidence
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, read_links(links))

        progress.finish()

        print("Indexing")
        con.execute("""\
            CREATE INDEX IF NOT EXISTS index_evidence
                ON evidence (protein_id1, protein_id2)
        """)

        # Record which STRING release and which schema version built this db.
        con.executescript("""
            DROP TABLE IF EXISTS version;

            CREATE TABLE version (
                string_version text,
                api_version text
            );
        """)
        con.execute("""
            INSERT INTO version
            VALUES (?, ?)""", (version, cls.VERSION))
def getstring(self):
    """Return the progress line extended with total size, speed and ETA.

    Clamps the elapsed time to at least 0.1 s and the computed speed to at
    least 1 byte/s, so calling this immediately after construction (or
    before any data has been transferred) cannot raise ZeroDivisionError —
    matching the guarded variant of this method elsewhere in the file.
    """
    elapsed = max(time.time() - self.starttime, 0.1)
    speed = max(int(self.state * self.size / 100.0 / elapsed), 1)
    eta = (100 - self.state) * self.size / 100.0 / speed
    return ConsoleProgressBar.getstring(self) + \
        " %s %12s/s %3i:%02i ETA" % (self.sizeof_fmt(self.size),
                                     self.sizeof_fmt(speed),
                                     eta / 60, eta % 60)