Example #1
# submit a new antiSMASH job ID into the BiG-FAM jobs queue database
from sys import argv
from os import path
from datetime import datetime
# db_open is a project-local sqlite3 helper; a sketch follows this example

def main():

    antismash_jobid = argv[1]

    instance_folder = path.join(path.dirname(__file__), "..", "instance")
    jobs_db = path.join(instance_folder, "bigfam_jobs.db")

    if not path.exists(jobs_db):
        print("creating jobs db ({})...".format(jobs_db))
        with db_open(jobs_db) as con:
            cur = con.cursor()
            schema_sql = path.join(path.dirname(__file__), "jobs_schema.sql")
            with open(schema_sql, "r") as sql_script:
                cur.executescript(sql_script.read())
                con.commit()

    with db_open(jobs_db) as con:
        cur = con.cursor()
        # check if the job ID already exists
        if cur.execute(("select count(name)"
                        " from jobs"
                        " where name like ?"),
                       (antismash_jobid, )).fetchall()[0][0] > 0:
            print("Job ID exists!")
            return 0
        else:
            cur.execute(("insert into jobs"
                         " (name,submitted,status)"
                         " values(?,?,?)"),
                        (antismash_jobid, datetime.now(), 0))
            con.commit()
            print("Inserted new job!")
    return 0
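
All six examples use a project-local db_open helper that is not shown in this listing. Judging from how it is used (a context manager that yields a sqlite3 connection, with examples #4 and #5 relying on a final commit they never issue explicitly), it is presumably a thin wrapper around sqlite3.connect. A minimal sketch under that assumption:

# A minimal sketch of the assumed db_open helper; the real project helper
# may set pragmas, timeouts, or row factories differently.
import sqlite3
from contextlib import contextmanager

@contextmanager
def db_open(db_path):
    con = sqlite3.connect(db_path)
    try:
        yield con
        con.commit()  # assumed: examples #4 and #5 rely on an implicit commit
    finally:
        con.close()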
Example #2
# worker loop: poll the jobs database and deploy any pending jobs
from sys import argv
from os import path
from time import sleep
from multiprocessing import cpu_count

def main():

    if len(argv) > 1:
        num_threads = int(argv[1])
    else:
        num_threads = cpu_count()

    instance_folder = path.join(path.dirname(__file__), "..", "instance")
    jobs_db = path.join(instance_folder, "bigfam_jobs.db")

    if not path.exists(jobs_db):
        print("creating jobs db ({})...".format(jobs_db))
        with db_open(jobs_db) as con:
            cur = con.cursor()
            schema_sql = path.join(path.dirname(__file__), "jobs_schema.sql")
            with open(schema_sql, "r") as sql_script:
                cur.executescript(sql_script.read())
                con.commit()

    print("workers are running...")
    while True:
        pending = fetch_pending_jobs(jobs_db)
        if len(pending) > 0:
            print("deploying {} jobs...".format(len(pending)))
            deploy_jobs(pending, jobs_db, instance_folder, num_threads)

        sleep(5)

    return 0
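
The jobs_schema.sql file executed above is not part of this listing. From the columns these scripts read and write (name, submitted, started, finished, status, comment), a minimal reconstruction might look like the following; the column types and constraints are assumptions.

# A sketch of the assumed jobs_schema.sql content, reconstructed from the
# columns used across these examples; the real schema may differ.
JOBS_SCHEMA = """
create table if not exists jobs (
    id integer primary key autoincrement,
    name text not null,
    submitted timestamp not null,
    started timestamp,
    finished timestamp,
    status integer not null default 0,
    comment text
);
"""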
Example #3
def fetch_pending_jobs(jobs_db):
    """Return the names of all pending (status=0) jobs, oldest first."""
    with db_open(jobs_db) as con:
        cur = con.cursor()
        return [
            row[0]
            for row in cur.execute(("select name"
                                    " from jobs"
                                    " where status=0"
                                    " order by submitted asc")).fetchall()
        ]
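
The integer status values written by these scripts map onto the job states named in the comments of Example #6. Collected in one place for reference:

# Job status codes as used across these scripts; the state names are taken
# from the comments in the deploy_jobs example (Example #6).
JOB_STATUS = {
    -1: "FAILED",
    0: "PENDING",
    1: "DOWNLOADING",
    2: "PROCESSING",
    3: "PROCESSED",
}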
Example #4
# build linkage.db: map every BGC in data.db back to its original source
# record (MIBiG accession, NCBI nuccore entry, or antiSMASH-DB entry)
from sys import argv, stderr
from os import path

def main():

    instance_folder = path.join(path.dirname(__file__), "..", "instance")
    linkage_db = path.join(instance_folder, "linkage.db")
    source_db = path.join(instance_folder, "result", "data.db")
    input_folder = argv[1]
    if len(argv) > 2:
        num_jobs = int(argv[2])
    else:
        num_jobs = 1
    pool = fetch_pool(num_jobs)

    # check if linkage.db exists
    if path.exists(linkage_db):
        print("{} exists!".format(linkage_db))
        return 1

    # parse original datasets information
    print("parsing datasets.tsv...")
    datasets = {}
    with open(path.join(input_folder, "datasets.tsv"), "r") as dstv:
        for line in dstv:
            if not line.startswith("#"):
                ds_name, ds_folder, _, _ = line.rstrip("\n").split("\t")
                datasets[ds_name] = {
                    "path": path.join(input_folder, ds_folder)
                }

    # scan BGCs in database and populate args for multiprocessing
    print("populating list of BGCs to parse...")
    with db_open(source_db) as con:
        cur = con.cursor()

        to_process = [
            # datasets[...]["path"] already includes input_folder, so avoid
            # joining input_folder a second time here
            (path.join(datasets[row[1]]["path"], row[2], row[3]), row)
            for row in cur.execute(("select bgc.id, dataset.name"
                                    ", bgc.orig_folder, bgc.orig_filename"
                                    " from bgc, dataset"
                                    " where bgc.dataset_id=dataset.id"
                                    " order by bgc.id asc")).fetchall()
        ]
    print("found {:,} BGCs!".format(len(to_process)))

    # parse original input files
    print("parsing {:,} BGCs using {} threads...".format(
        len(to_process), num_jobs))
    to_insert_mibig = []
    to_insert_ncbi = []
    to_insert_antismashdb = []
    i = 0
    for data in pool.imap_unordered(parse_bgc_gbk, to_process):
        i += 1
        stderr.write("\r{}/{}".format(i, len(to_process)))
        if data is None:
            continue
        elif data[0] == "mibig":
            to_insert_mibig.append(data[1])
        elif data[0] == "ncbi":
            to_insert_ncbi.append(data[1])
        elif data[0] == "antismashdb":
            to_insert_antismashdb.append(data[1])
    stderr.write("\n")  # terminate the "\r" progress line

    # create linkage db
    print("creating linkage db...")
    with db_open(linkage_db) as con:
        cur = con.cursor()
        schema_sql = path.join(path.dirname(__file__), "linkage_schema.sql")
        with open(schema_sql, "r") as sql_script:
            cur.executescript(sql_script.read())
            con.commit()

        if len(to_insert_mibig) > 0:
            # insert mibig data
            print("inserting mibig linkages...")
            cur.executemany(("insert into linkage_mibig(bgc_id, mibig_acc)"
                             " values (?, ?)"), to_insert_mibig)
        else:
            print("found no mibig linkage")

        if len(to_insert_ncbi) > 0:
            # insert ncbi data
            print("inserting ncbi linkages...")
            cur.executemany(("insert into linkage_ncbi("
                             "bgc_id, nuccore_acc,"
                             " start_loc, end_loc)"
                             " values (?, ?, ?, ?)"), to_insert_ncbi)
        else:
            print("found no ncbi linkage")

        if len(to_insert_antismashdb) > 0:
            # insert antismashdb data
            print("inserting antismashdb linkages...")
            cur.executemany(("insert into linkage_antismashdb("
                             "bgc_id, nuccore_acc,"
                             " start_loc, end_loc)"
                             " values (?, ?, ?, ?)"), to_insert_antismashdb)
        else:
            print("found no antismashdb linkage")

    return 0
Example #5
# build precalculated.db: precompute per-BGC and per-GCF summary tables
# from data.db
from os import path

def main():

    instance_folder = path.join(path.dirname(__file__), "..", "instance")
    precalc_db = path.join(instance_folder, "precalculated.db")
    source_db = path.join(instance_folder, "result", "data.db")

    # make sure that source db only contains 1 clustering record and 1 hmmdb record
    with db_open(source_db) as con:
        cur = con.cursor()
        if cur.execute(
            ("select count(id) from clustering")).fetchall()[0][0] != 1:
            print("Expecting only 1 clustering record!")
            return 1
        if cur.execute(("select count(id) from hmm_db")).fetchall()[0][0] != 1:
            print("Expecting only 1 hmmdb record!")
            return 1

    if not path.exists(precalc_db):
        print("creating precalculated db ({})...".format(precalc_db))
        with db_open(precalc_db) as con:
            cur = con.cursor()
            schema_sql = path.join(path.dirname(__file__), "cache_schema.sql")
            with open(schema_sql, "r") as sql_script:
                cur.executescript(sql_script.read())
                con.commit()

        with db_open(precalc_db) as con:
            cur = con.cursor()

            # attach source_db
            cur.execute(("attach database ? as source"), (source_db, ))

            # fetch clustering id
            clustering_id, threshold = cur.execute(
                ("select id, threshold from clustering limit 1")).fetchall()[0]

            # generate bgc summary
            cur.executemany(("insert into bgc_summary values(?,0)"),
                            cur.execute(("select id"
                                         " from source.bgc"
                                         " order by id asc")).fetchall())
            print("calculating bgc cds counts...")
            cur.executemany(
                ("update bgc_summary set cds_count=? where bgc_id=?"),
                cur.execute(("select count(source.cds.id), source.cds.bgc_id"
                             " from source.cds"
                             " group by source.cds.bgc_id")).fetchall())

            # generate gcf members summary
            cur.executemany(("insert into gcf_summary values(?,0,0)"),
                            cur.execute(("select id"
                                         " from source.gcf"
                                         " where clustering_id=?"
                                         " order by id asc"),
                                        (clustering_id, )).fetchall())
            # inserting core member counts
            print("calculating gcf core member counts...")
            cur.executemany(
                ("update gcf_summary set core_members=? where gcf_id=?"),
                cur.execute(
                    ("select count(source.gcf_membership.bgc_id)"
                     ", source.gcf_membership.gcf_id"
                     " from source.gcf_membership,source.gcf"
                     " where source.gcf.id=source.gcf_membership.gcf_id"
                     " and source.gcf.clustering_id=?"
                     " and source.gcf_membership.rank=0"
                     " and source.gcf_membership.membership_value<=?"
                     " group by source.gcf_membership.gcf_id"),
                    (clustering_id, threshold)).fetchall())
            # inserting putative member counts
            print("calculating gcf putative member counts...")
            cur.executemany(
                ("update gcf_summary set putative_members=? where gcf_id=?"),
                cur.execute(
                    ("select count(source.gcf_membership.bgc_id)"
                     ", source.gcf_membership.gcf_id"
                     " from source.gcf_membership,source.gcf"
                     " where source.gcf.id=source.gcf_membership.gcf_id"
                     " and source.gcf.clustering_id=?"
                     " and source.gcf_membership.rank=0"
                     " and source.gcf_membership.membership_value>?"
                     " group by source.gcf_membership.gcf_id"),
                    (clustering_id, threshold)).fetchall())

            # generate gcf members summary dataset
            print("calculating gcf dataset counts...")
            cur.executemany(
                ("insert into gcf_summary_dataset values(?,?,?)"),
                cur.execute(
                    ("select source.gcf_membership.gcf_id"
                     ", source.bgc.dataset_id"
                     ", count(source.bgc.id)"
                     " from source.gcf_membership,source.gcf,source.bgc"
                     " where source.gcf.id=source.gcf_membership.gcf_id"
                     " and source.gcf.clustering_id=?"
                     " and source.gcf_membership.rank=0"
                     " and source.gcf_membership.membership_value<=?"
                     " and source.bgc.id=source.gcf_membership.bgc_id"
                     " group by source.gcf_membership.gcf_id"
                     ", source.bgc.dataset_id"),
                    (clustering_id, threshold)).fetchall())

            # generate gcf members summary class
            print("calculating gcf class counts...")
            cur.executemany(
                ("insert into gcf_summary_class values(?,?,?)"),
                cur.execute(
                    ("select source.gcf_membership.gcf_id"
                     ", source.bgc_class.chem_subclass_id"
                     ", count(source.bgc.id)"
                     " from source.gcf_membership,source.gcf,source.bgc"
                     ",source.bgc_class"
                     " where source.gcf.id=source.gcf_membership.gcf_id"
                     " and source.gcf.clustering_id=?"
                     " and source.gcf_membership.rank=0"
                     " and source.gcf_membership.membership_value<=?"
                     " and source.bgc.id=source.gcf_membership.bgc_id"
                     " and source.bgc.id=source.bgc_class.bgc_id"
                     " group by source.gcf_membership.gcf_id"
                     ", source.bgc_class.chem_subclass_id"),
                    (clustering_id, threshold)).fetchall())

            # generate gcf members summary taxon
            print("calculating gcf taxon counts...")
            cur.executemany(
                ("insert into gcf_summary_taxon values(?,?,?)"),
                cur.execute(
                    ("select source.gcf_membership.gcf_id"
                     ", source.bgc_taxonomy.taxon_id"
                     ", count(source.bgc.id)"
                     " from source.gcf_membership,source.gcf,source.bgc"
                     ",source.bgc_taxonomy"
                     " where source.gcf.id=source.gcf_membership.gcf_id"
                     " and source.gcf.clustering_id=?"
                     " and source.gcf_membership.rank=0"
                     " and source.gcf_membership.membership_value<=?"
                     " and source.bgc.id=source.gcf_membership.bgc_id"
                     " and source.bgc.id=source.bgc_taxonomy.bgc_id"
                     " group by source.gcf_membership.gcf_id"
                     ", source.bgc_taxonomy.taxon_id"),
                    (clustering_id, threshold)).fetchall())

            # make bgc_domains table
            print("inserting bgc_domains entries...")
            cur.executemany(("insert into bgc_domains values(?,?)"),
                            cur.execute(("select bgc_id, hmm_id"
                                         " from source.bgc_features"
                                         " where value >= 255")).fetchall())

            # make gcf_domains table
            print("inserting gcf_domains entries...")
            cur.executemany(
                ("insert into gcf_domains values(?,?)"),
                cur.execute(("select gcf_id, hmm_id"
                             " from source.gcf_models,source.gcf"
                             " where source.gcf.clustering_id=?"
                             " and source.gcf_models.gcf_id=source.gcf.id"
                             " and value >= 200"),
                            (clustering_id, )).fetchall())

        return 0
    else:
        print("precalculated db exists!")
        return 1
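
After a successful run, the new tables can be spot-checked directly. A minimal sketch, assuming the instance folder layout used above and the table names created by cache_schema.sql:

# A minimal sanity-check sketch against the freshly built precalculated.db;
# the relative path assumes it is run from the project root.
import sqlite3

con = sqlite3.connect("instance/precalculated.db")
for table in ("bgc_summary", "gcf_summary", "gcf_summary_dataset",
              "gcf_summary_class", "gcf_summary_taxon",
              "bgc_domains", "gcf_domains"):
    count = con.execute("select count(*) from {}".format(table)).fetchone()[0]
    print("{}: {:,} rows".format(table, count))
con.close()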
Example #6
# for each pending job: download its antiSMASH result, then run a BiG-SLICE
# query against the instance database
from os import path, listdir
from datetime import datetime
from shutil import copytree
from subprocess import DEVNULL, run
from tempfile import TemporaryDirectory

def deploy_jobs(pending, jobs_db, instance_folder, num_threads):
    inputs_folder = path.join(instance_folder, "query_inputs")
    for name in pending:
        # update status to "DOWNLOADING"
        with db_open(jobs_db) as con:
            cur = con.cursor()
            cur.execute(("update jobs"
                         " set status=?, started=?"
                         " where name like ?"), (1, datetime.now(), name))
            con.commit()

        # download antiSMASH result
        query_input = path.join(inputs_folder, name)
        print("downloading {}...".format(query_input))
        with TemporaryDirectory() as temp_dir:
            if name.startswith("bacteria-"):
                as_type = "antismash"
            elif name.startswith("fungi-"):
                as_type = "fungismash"
            else:
                # update status to "FAILED"
                with db_open(jobs_db) as con:
                    cur = con.cursor()
                    cur.execute(("update jobs"
                                 " set status=?, finished=?,"
                                 " comment=?"
                                 " where name like ?"),
                                (-1, datetime.now(), "unknown_as_type", name))
                    con.commit()
                print("unknown job id!")
                return 1
            antismash_url = (
                "https://{}.secondarymetabolites.org/upload/{}/").format(
                    as_type, name)
            commands = [
                "wget", "-nd", "-r", "-A", "*.region*.gbk", antismash_url
            ]
            is_failed = True
            if run(commands, cwd=temp_dir).returncode == 0:  # success
                # check if file exists at all
                file_exists = False
                for fname in listdir(temp_dir):
                    print(fname)
                    if fname.endswith(".gbk"):
                        file_exists = True
                        break
                if file_exists and not path.exists(query_input):
                    copytree(temp_dir, query_input)
                    is_failed = False

            if is_failed:  # failed
                # update status to "FAILED"
                with db_open(jobs_db) as con:
                    cur = con.cursor()
                    cur.execute(("update jobs"
                                 " set status=?, finished=?,"
                                 " comment=?"
                                 " where name like ?"),
                                (-1, datetime.now(), "download_failed", name))
                    con.commit()
                print("download failed!")
                return 1
            else:
                # update status to "PROCESSING"
                with db_open(jobs_db) as con:
                    cur = con.cursor()
                    cur.execute(("update jobs"
                                 " set status=?"
                                 " where name like ?"), (2, name))
                    con.commit()

        # run BiG-SLICE query
        commands = [
            "bigslice", "-t",
            str(num_threads), "--query", query_input, "--query_name", name,
            instance_folder
        ]
        print("processing {}...".format(query_input))
        # if run(commands, stdout=DEVNULL).returncode == 0:  # silenced variant
        if run(commands).returncode == 0:  # success
            # update status to "PROCESSED"
            with db_open(jobs_db) as con:
                cur = con.cursor()
                cur.execute(("update jobs"
                             " set status=?, finished=?"
                             " where name like ?"), (3, datetime.now(), name))
                con.commit()
        else:  # failed
            # update status to "FAILED"
            with db_open(jobs_db) as con:
                cur = con.cursor()
                cur.execute(("update jobs"
                             " set status=?, finished=?,"
                             " comment=?"
                             " where name like ?"),
                            (-1, datetime.now(), "query_failed", name))
                con.commit()
            print("run failed!")
            return 1
    return 0
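
Since every state transition is written to the jobs table, progress can be watched from a separate process with plain sqlite3. A minimal sketch, assuming the default instance layout used by these scripts:

# A minimal monitoring sketch; assumes instance/bigfam_jobs.db relative to
# the current working directory.
import sqlite3

con = sqlite3.connect("instance/bigfam_jobs.db")
for name, status, comment in con.execute(
        "select name, status, comment from jobs order by submitted asc"):
    print(name, status, comment)
con.close()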