import os
import csv
import gzip
import shutil
import sqlite3
import posixpath

# `serverfiles`, `wget` and `ConsoleProgressBar` are helpers assumed to be
# provided by the surrounding package (this excerpt shows two classmethods
# without their enclosing classes).


@classmethod
def init_db(cls, version, taxid, cache_dir=None, dbfilename=None):
    if cache_dir is None:
        cache_dir = serverfiles.localpath(cls.DOMAIN)

    if dbfilename is None:
        dbfilename = cls.default_db_filename(taxid)

    pjoin = os.path.join

    base_url = "http://string-db.org/newstring_download/"

    def paths(flatfile):
        # Return the (local filename, download url) pair for a flat file.
        url = "{flatfile}.{version}/{taxid}.{flatfile}.{version}.txt.gz"
        url = url.format(flatfile=flatfile, version=version, taxid=taxid)
        return posixpath.basename(url), base_url + url

    links_filename, links_url = paths("protein.links")
    actions_filename, actions_url = paths("protein.actions")
    aliases_filename, aliases_url = paths("protein.aliases")

    def download(filename, url):
        # Download to a temporary file and rename on success, so a
        # partial download is never mistaken for a complete cached file.
        with open(pjoin(cache_dir, filename + ".tmp"), "wb") as dest:
            wget(url, dst_obj=dest, progress=True)

        shutil.move(pjoin(cache_dir, filename + ".tmp"),
                    pjoin(cache_dir, filename))

    for fname, url in [(links_filename, links_url),
                       (actions_filename, actions_url),
                       (aliases_filename, aliases_url)]:
        if not os.path.exists(pjoin(cache_dir, fname)):
            download(fname, url)

    links_fileobj = open(pjoin(cache_dir, links_filename), "rb")
    actions_fileobj = open(pjoin(cache_dir, actions_filename), "rb")
    aliases_fileobj = open(pjoin(cache_dir, aliases_filename), "rb")

    links_file = gzip.GzipFile(fileobj=links_fileobj)
    actions_file = gzip.GzipFile(fileobj=actions_fileobj)
    aliases_file = gzip.GzipFile(fileobj=aliases_fileobj)

    progress = ConsoleProgressBar("Processing {}:".format(links_filename))
    progress(0.0)

    def st_size(filename):
        return os.stat(pjoin(cache_dir, filename)).st_size

    filesize = st_size(links_filename)

    con = sqlite3.connect(dbfilename)
    with con:
        cls.clear_db(con)

        links_file.readline()  # skip the header line

        reader = csv.reader(links_file, delimiter=" ")

        def read_links(reader, progress):
            for i, (p1, p2, score) in enumerate(reader):
                yield p1, p2, int(score)
                if i % 100000 == 0:
                    # Update the progress every 100000 lines; progress is
                    # estimated from the compressed file position.
                    progress(100.0 * links_fileobj.tell() / filesize)

        con.executemany("INSERT INTO links VALUES (?, ?, ?)",
                        read_links(reader, progress))

        progress.finish()

        def part(string, sep, index):
            return string.split(sep)[index]

        con.create_function("part", 3, part)
        # STRING protein ids have the form '{taxid}.{protein name}';
        # store each distinct id together with its taxonomy prefix.
        con.execute("""
            INSERT INTO proteins
            SELECT protein_id1, part(protein_id1, '.', 0)
            FROM (SELECT DISTINCT(protein_id1)
                  FROM links
                  ORDER BY protein_id1)
        """)

        filesize = st_size(actions_filename)

        actions_file.readline()  # skip the header line

        progress = ConsoleProgressBar("Processing actions:")
        reader = csv.reader(actions_file, delimiter="\t")

        def read_actions(reader):
            for i, (p1, p2, mode, action, a_is_acting, score) in \
                    enumerate(reader):
                yield p1, p2, mode, action, int(score)
                if i % 10000 == 0:
                    progress(100.0 * actions_fileobj.tell() / filesize)

        con.executemany("INSERT INTO actions VALUES (?, ?, ?, ?, ?)",
                        read_actions(reader))
        progress.finish()

        filesize = st_size(aliases_filename)

        aliases_file.readline()  # skip the header line

        progress = ConsoleProgressBar("Processing aliases:")
        reader = csv.reader(aliases_file, delimiter="\t")

        def read_aliases(reader, progress):
            for i, (taxid, name, alias, source) in enumerate(reader):
                yield (".".join([taxid, name]),
                       alias.decode("utf-8", "ignore"),
                       source.decode("utf-8", "ignore"))
                if i % 10000 == 0:
                    progress(100.0 * aliases_fileobj.tell() / filesize)

        con.executemany("INSERT INTO aliases VALUES (?, ?, ?)",
                        read_aliases(reader, progress))
        progress.finish()

        print("Indexing the database")
        cls.create_db_index(con)

        con.executescript("""
            DROP TABLE IF EXISTS version;

            CREATE TABLE version (
                string_version text,
                api_version text
            );
        """)
        con.execute("""
            INSERT INTO version
            VALUES (?, ?)""", (version, cls.VERSION))
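
# A hedged sketch, not part of the original module: `cls.clear_db` and
# `cls.create_db_index` are called above but not shown in this excerpt.
# Plausible implementations, with the table schemas inferred from the
# INSERT statements in init_db (the real helpers may differ), could look
# like this:

def _clear_db_sketch(con):
    # Drop and recreate the four tables populated by init_db.
    con.executescript("""
        DROP TABLE IF EXISTS links;
        DROP TABLE IF EXISTS proteins;
        DROP TABLE IF EXISTS actions;
        DROP TABLE IF EXISTS aliases;

        CREATE TABLE links
            (protein_id1 TEXT, protein_id2 TEXT, score INTEGER);
        CREATE TABLE proteins
            (protein_id TEXT, taxid TEXT);
        CREATE TABLE actions
            (protein_id1 TEXT, protein_id2 TEXT, mode TEXT,
             action TEXT, score INTEGER);
        CREATE TABLE aliases
            (protein_id TEXT, alias TEXT, source TEXT);
    """)


def _create_db_index_sketch(con):
    # Index the columns that lookups are likely to filter on; the column
    # choices here are an assumption.
    con.executescript("""
        CREATE INDEX IF NOT EXISTS index_links_protein_id1
            ON links (protein_id1);
        CREATE INDEX IF NOT EXISTS index_actions_protein_id1
            ON actions (protein_id1);
        CREATE INDEX IF NOT EXISTS index_aliases_alias
            ON aliases (alias);
    """)
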
@classmethod
def init_db(cls, version, taxid, cache_dir=None, dbfilename=None):
    # Variant that builds the 'detailed' links database, storing the
    # individual evidence channel scores.
    if cache_dir is None:
        cache_dir = serverfiles.localpath(cls.DOMAIN)

    if dbfilename is None:
        dbfilename = serverfiles.localpath(
            cls.DOMAIN,
            "string-protein-detailed.{taxid}.sqlite".format(taxid=taxid)
        )

    pjoin = os.path.join

    base_url = "http://string-db.org/newstring_download/"
    filename = "{taxid}.protein.links.detailed.{version}.txt.gz"
    filename = filename.format(version=version, taxid=taxid)
    url = base_url + "protein.links.detailed.{version}/" + filename
    url = url.format(version=version)

    if not os.path.exists(pjoin(cache_dir, filename)):
        wget(url, cache_dir, progress=True)

    links_fileobj = open(pjoin(cache_dir, filename), "rb")
    links_file = gzip.GzipFile(fileobj=links_fileobj)

    con = sqlite3.connect(dbfilename)
    with con:
        con.execute("""
            DROP TABLE IF EXISTS evidence
        """)

        con.execute("""
            CREATE TABLE evidence(
                protein_id1 TEXT,
                protein_id2 TEXT,
                neighborhood INTEGER,
                fusion INTEGER,
                cooccurence INTEGER,
                coexpression INTEGER,
                experimental INTEGER,
                database INTEGER,
                textmining INTEGER
            )
        """)

        links = csv.reader(links_file, delimiter=" ")
        next(links)  # skip the header line

        filesize = os.stat(pjoin(cache_dir, filename)).st_size

        progress = ConsoleProgressBar("Processing links file:")
        progress(1.0)

        def read_links(reader):
            # The last column (the combined score) is not stored.
            for i, (p1, p2, n, f, c, cx, ex, db, t, _) in \
                    enumerate(reader):
                yield p1, p2, n, f, c, cx, ex, db, t

                if i % 10000 == 0:
                    progress(100.0 * links_fileobj.tell() / filesize)

        con.executemany("""
            INSERT INTO evidence
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, read_links(links))

        progress.finish()

        print("Indexing")
        con.execute("""
            CREATE INDEX IF NOT EXISTS index_evidence
                ON evidence (protein_id1, protein_id2)
        """)

        con.executescript("""
            DROP TABLE IF EXISTS version;

            CREATE TABLE version (
                string_version text,
                api_version text
            );
        """)
        con.execute("""
            INSERT INTO version
            VALUES (?, ?)""", (version, cls.VERSION))
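
# A hedged usage sketch, not part of the original module: once the
# detailed database has been built, the per-channel evidence scores can
# be read back with plain sqlite3. The function name and the protein ids
# passed to it are placeholders:

def _query_evidence_sketch(dbfilename, protein_id1, protein_id2):
    con = sqlite3.connect(dbfilename)
    try:
        cur = con.execute("""
            SELECT neighborhood, fusion, cooccurence, coexpression,
                   experimental, database, textmining
            FROM evidence
            WHERE protein_id1 = ? AND protein_id2 = ?
        """, (protein_id1, protein_id2))
        return cur.fetchall()
    finally:
        con.close()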