# Imports assumed by the functions in this excerpt. "db_open" is taken to be
# a thin alias of sqlite3.connect (an assumption; the project may define its
# own connection helper). fetch_pool() and parse_bgc_gbk() are project
# helpers defined elsewhere and are not shown here.
from datetime import datetime
from multiprocessing import cpu_count
from os import path, listdir
from shutil import copytree
from sqlite3 import connect as db_open  # assumed alias, see note above
from subprocess import run, DEVNULL  # DEVNULL only used by a commented line
from sys import argv, stderr
from tempfile import TemporaryDirectory
from time import sleep


def main():
    antismash_jobid = argv[1]
    instance_folder = path.join(path.dirname(__file__), "..", "instance")
    jobs_db = path.join(instance_folder, "bigfam_jobs.db")

    # create the jobs database on first use
    if not path.exists(jobs_db):
        print("creating jobs db ({})...".format(jobs_db))
        with db_open(jobs_db) as con:
            cur = con.cursor()
            schema_sql = path.join(path.dirname(__file__), "jobs_schema.sql")
            with open(schema_sql, "r") as sql_script:
                cur.executescript(sql_script.read())
            con.commit()

    with db_open(jobs_db) as con:
        cur = con.cursor()
        # check if the job already exists
        if cur.execute(("select count(name)"
                        " from jobs"
                        " where name like ?"),
                       (antismash_jobid, )).fetchall()[0][0] > 0:
            print("Job ID exists!")
            return 0
        else:
            cur.execute(("insert into jobs"
                         " (name,submitted,status)"
                         " values(?,?,?)"),
                        (antismash_jobid, datetime.now(), 0))
            con.commit()
            print("Inserted new job!")
            return 0
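# --- editorial sketch (not in the original excerpt) ---
# The integer status codes written by these scripts are used consistently
# across main(), fetch_pending_jobs() and deploy_jobs(); the names below are
# taken from the inline comments in deploy_jobs() and are an illustrative
# mapping, not a definition from the project itself.
from enum import IntEnum


class JobStatus(IntEnum):
    FAILED = -1       # download/processing error; reason in "comment" column
    PENDING = 0       # inserted by the submit script, awaiting a worker
    DOWNLOADING = 1   # fetching antiSMASH result gbk files
    PROCESSING = 2    # running the BiG-SLICE query
    PROCESSED = 3     # finished successfully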
def main():
    if len(argv) > 1:
        num_threads = int(argv[1])
    else:
        num_threads = cpu_count()
    instance_folder = path.join(path.dirname(__file__), "..", "instance")
    jobs_db = path.join(instance_folder, "bigfam_jobs.db")

    if not path.exists(jobs_db):
        print("creating jobs db ({})...".format(jobs_db))
        with db_open(jobs_db) as con:
            cur = con.cursor()
            schema_sql = path.join(path.dirname(__file__), "jobs_schema.sql")
            with open(schema_sql, "r") as sql_script:
                cur.executescript(sql_script.read())
            con.commit()

    print("workers are running...")
    while True:
        pending = fetch_pending_jobs(jobs_db)
        if len(pending) > 0:
            print("deploying {} jobs...".format(len(pending)))
            deploy_jobs(pending, jobs_db, instance_folder, num_threads)
        sleep(5)

    return 0
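# Assumed invocation for the worker loop above (the entry-point wiring is
# not shown in this excerpt; the script name and argument are illustrative):
#
#   python worker.py 8    # poll bigfam_jobs.db, process jobs on 8 threads
#
# if __name__ == "__main__":
#     raise SystemExit(main())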
def fetch_pending_jobs(jobs_db):
    with db_open(jobs_db) as con:
        cur = con.cursor()
        return [
            row[0] for row in cur.execute(
                ("select name"
                 " from jobs"
                 " where status=0"
                 " order by submitted asc")).fetchall()
        ]
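# --- editorial sketch (not in the original excerpt) ---
# jobs_schema.sql is not shown here, but every column the scripts touch is
# visible above (name, submitted, started, finished, status, comment). A
# minimal schema consistent with those accesses might read as follows; the
# exact types and constraints in the real file are unknown.
JOBS_SCHEMA_SKETCH = """
create table if not exists jobs (
    id integer primary key autoincrement,
    name text not null,
    submitted text not null,
    started text,
    finished text,
    status integer not null default 0,
    comment text
);
"""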
def main():
    instance_folder = path.join(path.dirname(__file__), "..", "instance")
    linkage_db = path.join(instance_folder, "linkage.db")
    source_db = path.join(instance_folder, "result", "data.db")
    input_folder = argv[1]
    if len(argv) > 2:
        num_jobs = int(argv[2])
    else:
        num_jobs = 1
    pool = fetch_pool(num_jobs)

    # check if linkage.db exists
    if path.exists(linkage_db):
        print("{} exists!".format(linkage_db))
        return 1

    # parse original datasets information
    print("parsing datasets.tsv...")
    datasets = {}
    with open(path.join(input_folder, "datasets.tsv"), "r") as dstv:
        for line in dstv:
            if not line.startswith("#"):
                ds_name, ds_folder, _, _ = line.rstrip("\n").split("\t")
                datasets[ds_name] = {
                    "path": path.join(input_folder, ds_folder)
                }

    # scan BGCs in database and populate args for multiprocessing
    print("populating list of BGCs to parse...")
    with db_open(source_db) as con:
        cur = con.cursor()
        # datasets[...]["path"] already includes input_folder, so it is not
        # joined with input_folder a second time here
        to_process = [
            (path.join(datasets[row[1]]["path"], row[2], row[3]), row)
            for row in cur.execute(
                ("select bgc.id, dataset.name"
                 ", bgc.orig_folder, bgc.orig_filename"
                 " from bgc, dataset"
                 " where bgc.dataset_id=dataset.id"
                 " order by bgc.id asc")).fetchall()
        ]
    print("found {:,} BGCs!".format(len(to_process)))

    # parse original input files
    print("parsing {:,} BGCs using {} threads...".format(
        len(to_process), num_jobs))
    to_insert_mibig = []
    to_insert_ncbi = []
    to_insert_antismashdb = []
    i = 0
    for data in pool.imap_unordered(parse_bgc_gbk, to_process):
        i += 1
        stderr.write("\r{}/{}".format(i, len(to_process)))
        if data is None:
            continue
        elif data[0] == "mibig":
            to_insert_mibig.append(data[1])
        elif data[0] == "ncbi":
            to_insert_ncbi.append(data[1])
        elif data[0] == "antismashdb":
            to_insert_antismashdb.append(data[1])

    # create linkage db
    print("creating linkage db...")
    with db_open(linkage_db) as con:
        cur = con.cursor()
        schema_sql = path.join(path.dirname(__file__), "linkage_schema.sql")
        with open(schema_sql, "r") as sql_script:
            cur.executescript(sql_script.read())
        con.commit()

        if len(to_insert_mibig) > 0:
            # insert mibig data
            print("inserting mibig linkages...")
            cur.executemany(("insert into linkage_mibig(bgc_id, mibig_acc)"
                             " values (?, ?)"), to_insert_mibig)
        else:
            print("found no mibig linkage")

        if len(to_insert_ncbi) > 0:
            # insert ncbi data
            print("inserting ncbi linkages...")
            cur.executemany(("insert into linkage_ncbi("
                             "bgc_id, nuccore_acc,"
                             " start_loc, end_loc)"
                             " values (?, ?, ?, ?)"), to_insert_ncbi)
        else:
            print("found no ncbi linkage")

        if len(to_insert_antismashdb) > 0:
            # insert antismashdb data
            print("inserting antismashdb linkages...")
            cur.executemany(("insert into linkage_antismashdb("
                             "bgc_id, nuccore_acc,"
                             " start_loc, end_loc)"
                             " values (?, ?, ?, ?)"), to_insert_antismashdb)
        else:
            print("found no antismashdb linkage")
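# --- editorial sketch (not in the original excerpt) ---
# parse_bgc_gbk() and fetch_pool() are project helpers defined elsewhere.
# From how the results are consumed above, each worker takes one
# (gbk_path, row) task and returns either None (no linkage found) or a
# (linkage_type, values) pair whose values match the executemany column
# order used above:
#
#   ("mibig",       (bgc_id, mibig_acc))
#   ("ncbi",        (bgc_id, nuccore_acc, start_loc, end_loc))
#   ("antismashdb", (bgc_id, nuccore_acc, start_loc, end_loc))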
def main():
    instance_folder = path.join(path.dirname(__file__), "..", "instance")
    precalc_db = path.join(instance_folder, "precalculated.db")
    source_db = path.join(instance_folder, "result", "data.db")

    # make sure that source db only contains 1 clustering record
    # and 1 hmmdb record
    with db_open(source_db) as con:
        cur = con.cursor()
        if cur.execute(
                ("select count(id) from clustering")).fetchall()[0][0] != 1:
            print("Expecting only 1 clustering record!")
            return 1
        if cur.execute(
                ("select count(id) from hmm_db")).fetchall()[0][0] != 1:
            print("Expecting only 1 hmmdb record!")
            return 1

    if not path.exists(precalc_db):
        print("creating precalculated db ({})...".format(precalc_db))
        with db_open(precalc_db) as con:
            cur = con.cursor()
            schema_sql = path.join(path.dirname(__file__), "cache_schema.sql")
            with open(schema_sql, "r") as sql_script:
                cur.executescript(sql_script.read())
            con.commit()

        with db_open(precalc_db) as con:
            cur = con.cursor()

            # attach source_db
            cur.execute(("attach database ? as source"), (source_db, ))

            # fetch clustering id
            clustering_id, threshold = cur.execute(
                ("select id, threshold from clustering"
                 " limit 1")).fetchall()[0]

            # generate bgc summary
            cur.executemany(("insert into bgc_summary values(?,0)"),
                            cur.execute(("select id"
                                         " from source.bgc"
                                         " order by id asc")).fetchall())
            print("calculating bgc cds counts...")
            cur.executemany(
                ("update bgc_summary set cds_count=? where bgc_id=?"),
                cur.execute(("select count(source.cds.id), source.cds.bgc_id"
                             " from source.cds"
                             " group by source.cds.bgc_id")).fetchall())

            # generate gcf members summary
            cur.executemany(("insert into gcf_summary values(?,0,0)"),
                            cur.execute(("select id"
                                         " from source.gcf"
                                         " where clustering_id=?"
                                         " order by id asc"),
                                        (clustering_id, )).fetchall())

            # inserting core member counts
            print("calculating gcf core member counts...")
            cur.executemany(
                ("update gcf_summary set core_members=? where gcf_id=?"),
                cur.execute(
                    ("select count(source.gcf_membership.bgc_id)"
                     ", source.gcf_membership.gcf_id"
                     " from source.gcf_membership,source.gcf"
                     " where source.gcf.id=source.gcf_membership.gcf_id"
                     " and source.gcf.clustering_id=?"
                     " and source.gcf_membership.rank=0"
                     " and source.gcf_membership.membership_value<=?"
                     " group by source.gcf_membership.gcf_id"),
                    (clustering_id, threshold)).fetchall())

            # inserting putative member counts
            print("calculating gcf putative member counts...")
            cur.executemany(
                ("update gcf_summary set putative_members=? where gcf_id=?"),
                cur.execute(
                    ("select count(source.gcf_membership.bgc_id)"
                     ", source.gcf_membership.gcf_id"
                     " from source.gcf_membership,source.gcf"
                     " where source.gcf.id=source.gcf_membership.gcf_id"
                     " and source.gcf.clustering_id=?"
                     " and source.gcf_membership.rank=0"
                     " and source.gcf_membership.membership_value>?"
                     " group by source.gcf_membership.gcf_id"),
                    (clustering_id, threshold)).fetchall())

            # generate gcf members summary per dataset
            print("calculating gcf dataset counts...")
            cur.executemany(
                ("insert into gcf_summary_dataset values(?,?,?)"),
                cur.execute(
                    ("select source.gcf_membership.gcf_id"
                     ", source.bgc.dataset_id"
                     ", count(source.bgc.id)"
                     " from source.gcf_membership,source.gcf,source.bgc"
                     " where source.gcf.id=source.gcf_membership.gcf_id"
                     " and source.gcf.clustering_id=?"
                     " and source.gcf_membership.rank=0"
                     " and source.gcf_membership.membership_value<=?"
                     " and source.bgc.id=source.gcf_membership.bgc_id"
                     " group by source.gcf_membership.gcf_id"
                     ", source.bgc.dataset_id"),
                    (clustering_id, threshold)).fetchall())

            # generate gcf members summary per class
            print("calculating gcf class counts...")
            cur.executemany(
                ("insert into gcf_summary_class values(?,?,?)"),
                cur.execute(
                    ("select source.gcf_membership.gcf_id"
                     ", source.bgc_class.chem_subclass_id"
                     ", count(source.bgc.id)"
                     " from source.gcf_membership,source.gcf,source.bgc"
                     ",source.bgc_class"
                     " where source.gcf.id=source.gcf_membership.gcf_id"
                     " and source.gcf.clustering_id=?"
                     " and source.gcf_membership.rank=0"
                     " and source.gcf_membership.membership_value<=?"
                     " and source.bgc.id=source.gcf_membership.bgc_id"
                     " and source.bgc.id=source.bgc_class.bgc_id"
                     " group by source.gcf_membership.gcf_id"
                     ", source.bgc_class.chem_subclass_id"),
                    (clustering_id, threshold)).fetchall())

            # generate gcf members summary per taxon
            print("calculating gcf taxon counts...")
            cur.executemany(
                ("insert into gcf_summary_taxon values(?,?,?)"),
                cur.execute(
                    ("select source.gcf_membership.gcf_id"
                     ", source.bgc_taxonomy.taxon_id"
                     ", count(source.bgc.id)"
                     " from source.gcf_membership,source.gcf,source.bgc"
                     ",source.bgc_taxonomy"
                     " where source.gcf.id=source.gcf_membership.gcf_id"
                     " and source.gcf.clustering_id=?"
                     " and source.gcf_membership.rank=0"
                     " and source.gcf_membership.membership_value<=?"
                     " and source.bgc.id=source.gcf_membership.bgc_id"
                     " and source.bgc.id=source.bgc_taxonomy.bgc_id"
                     " group by source.gcf_membership.gcf_id"
                     ", source.bgc_taxonomy.taxon_id"),
                    (clustering_id, threshold)).fetchall())

            # make bgc_domains table
            print("inserting bgc_domains entries...")
            cur.executemany(("insert into bgc_domains values(?,?)"),
                            cur.execute(("select bgc_id, hmm_id"
                                         " from source.bgc_features"
                                         " where value >= 255")).fetchall())

            # make gcf_domains table
            print("inserting gcf_domains entries...")
            cur.executemany(
                ("insert into gcf_domains values(?,?)"),
                cur.execute(("select gcf_id, hmm_id"
                             " from source.gcf_models,source.gcf"
                             " where source.gcf.clustering_id=?"
                             " and source.gcf_models.gcf_id=source.gcf.id"
                             " and value >= 200"),
                            (clustering_id, )).fetchall())
        return 0
    else:
        print("precalculated db exists!")
        return 1
def deploy_jobs(pending, jobs_db, instance_folder, num_threads):
    inputs_folder = path.join(instance_folder, "query_inputs")
    for name in pending:
        # update status to "DOWNLOADING"
        with db_open(jobs_db) as con:
            cur = con.cursor()
            cur.execute(("update jobs"
                         " set status=?, started=?"
                         " where name like ?"),
                        (1, datetime.now(), name))
            con.commit()

        # download antiSMASH result
        query_input = path.join(inputs_folder, name)
        print("downloading {}...".format(query_input))
        with TemporaryDirectory() as temp_dir:
            if name.startswith("bacteria-"):
                as_type = "antismash"
            elif name.startswith("fungi-"):
                as_type = "fungismash"
            else:
                # update status to "FAILED"
                with db_open(jobs_db) as con:
                    cur = con.cursor()
                    cur.execute(("update jobs"
                                 " set status=?, finished=?,"
                                 " comment=?"
                                 " where name like ?"),
                                (-1, datetime.now(), "unknown_as_type", name))
                    con.commit()
                print("unknown job id!")
                return 1

            antismash_url = (
                "https://{}.secondarymetabolites.org/upload/{}/").format(
                    as_type, name)
            commands = [
                "wget", "-nd", "-r", "-A", "*.region*.gbk", antismash_url
            ]
            is_failed = True
            if run(commands, cwd=temp_dir).returncode == 0:  # success
                # check if file exists at all
                file_exists = False
                for fname in listdir(temp_dir):
                    print(fname)
                    if fname.endswith(".gbk"):
                        file_exists = True
                        break
                if file_exists and not path.exists(query_input):
                    copytree(temp_dir, query_input)
                    is_failed = False
            if is_failed:  # failed
                # update status to "FAILED"
                with db_open(jobs_db) as con:
                    cur = con.cursor()
                    cur.execute(("update jobs"
                                 " set status=?, finished=?,"
                                 " comment=?"
                                 " where name like ?"),
                                (-1, datetime.now(), "download_failed", name))
                    con.commit()
                print("download failed!")
                return 1
            else:
                # update status to "PROCESSING"
                with db_open(jobs_db) as con:
                    cur = con.cursor()
                    cur.execute(("update jobs"
                                 " set status=?"
                                 " where name like ?"), (2, name))
                    con.commit()
                is_failed = False

        # run BiG-SLICE query
        commands = [
            "bigslice", "-t", str(num_threads), "--query", query_input,
            "--query_name", name, instance_folder
        ]
        print("processing {}...".format(query_input))
        # if run(commands, stdout=DEVNULL) == 0:  # success
        if run(commands).returncode == 0:  # success
            # update status to "PROCESSED"
            with db_open(jobs_db) as con:
                cur = con.cursor()
                cur.execute(("update jobs"
                             " set status=?, finished=?"
                             " where name like ?"),
                            (3, datetime.now(), name))
                con.commit()
        else:  # failed
            # update status to "FAILED"
            with db_open(jobs_db) as con:
                cur = con.cursor()
                cur.execute(("update jobs"
                             " set status=?, finished=?,"
                             " comment=?"
                             " where name like ?"),
                            (-1, datetime.now(), "query_failed", name))
                con.commit()
            print("run failed!")
            return 1
    return 0
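# --- editorial sketch (not in the original excerpt) ---
# deploy_jobs() shells out to wget for the download step; a small guard like
# this hypothetical helper could fail fast at worker startup instead of
# failing per-job inside the loop:
from shutil import which


def assert_wget_available():
    # raise immediately if the external downloader is missing from PATH
    if which("wget") is None:
        raise RuntimeError(
            "wget not found on PATH; required by deploy_jobs()")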