Example #1
def get_pca_job(conf):
    logger.debug(f"""Generate job for PCA:\n"""
                 f"""  uid -        {conf["uid"]}\n"""
                 f"""  expression - {conf["expression"]}\n""")
    connect_db = HookConnect()
    setting_data = connect_db.get_settings_data()
    job = {
        "expression_file": [],
        "legend_name": [],
        "output_prefix": conf["uid"] + "_",
        "output_folder": os.path.join(setting_data["anl_data"], conf["uid"]),
        "uid": conf["uid"]
    }
    for idx, uid in enumerate(conf["expression"].split()):
        for genelist_data in connect_db.fetchall(
                f"SELECT tableName, name FROM genelist WHERE leaf=1 AND (parent_id like '{uid}' OR id like '{uid}')"
        ):
            exp_data = get_exp_data(genelist_data["tableName"])
            job["expression_file"].append(
                fill_template(
                    """{{"class": "File",
                                                             "location": "{outputs[rpkm_isoforms][location]}",
                                                             "format": "http://edamontology.org/format_3752"}}""",
                    exp_data))
            job["legend_name"].append(genelist_data["name"])
    return job
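Most examples in this listing hand fill_template a JSON-like string with str.format-style placeholders. Its implementation is not shown here; a minimal sketch of what it is assumed to do (substitute placeholders by indexing into nested dicts, then parse the result as JSON):

from json import loads

def fill_template(template, data):
    # str.format resolves fields such as "{outputs[rpkm_isoforms][location]}"
    # by indexing into nested dicts; doubled braces {{ }} survive as literal
    # braces, so the filled string parses as JSON (assumed behavior).
    return loads(template.format(**data))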
Example #2
def get_genelist_data(uid):
    logger.debug(f"Collecting data from genelist for: {uid}")
    connect_db = HookConnect()
    sql_query = f"""SELECT name,
                           leaf,
                           type,
                           conditions,
                           gblink,
                           db,
                           tableName,
                           labdata_id,
                           rtype_id,
                           atype_id,
                           project_id,
                           parent_id,
                           params as outputs
                    FROM genelist
                    WHERE id LIKE '{uid}'"""
    logger.debug(f"Running SQL query:\n{sql_query}")
    glist_data = connect_db.fetchone(sql_query)
    # MySQL DECIMAL columns come back as decimal.Decimal, which json.dumps
    # cannot serialize, so cast them to int
    glist_data = {
        key: (value if not isinstance(value, decimal.Decimal) else int(value))
        for key, value in glist_data.items()
    }
    glist_data['outputs'] = loads(glist_data['outputs']) if glist_data['outputs'] else {}
    logger.debug(
        f"Collected data from genelist for: {uid}\n{dumps(glist_data, indent=4)}"
    )
    return glist_data
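Note that these helpers interpolate uid straight into SQL with f-strings, which is only safe while the identifiers come from trusted internal tables. With a plain DB-API connection the same lookup can be parameterized; a sketch using PyMySQL (connection settings and the uid value are placeholders, since HookConnect's internals are not shown):

import pymysql

uid = "example-uid"  # placeholder value
connection = pymysql.connect(host="localhost", user="user", password="password",
                             db="ems", cursorclass=pymysql.cursors.DictCursor)
with connection.cursor() as cursor:
    # %s placeholders let the driver escape the value instead of
    # formatting it into the query string
    cursor.execute("SELECT name, leaf, type, tableName FROM genelist WHERE id LIKE %s", (uid,))
    glist_data = cursor.fetchone()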
Example #3
def update_atdp_table_for_heatmap(conf, job_result):
    logger.debug(f"""Updating atdp table with heatmap results for {conf["uid"]}""")
    connect_db = HookConnect()
    connect_db.execute(f"""UPDATE atdp
                           SET params='{dumps(job_result)}'
                           WHERE genelist_id='{conf["uid"]}' AND
                                 tbl1_id='{conf["data_uid"]}' AND
                                 tbl2_id='{conf["intervals_uid"]}' AND
                                 pltname='{conf["name"]}'""")
Example #4
def get_atdp_data(uid):
    logger.debug(f"Collecting data from atdp for: {uid}")
    connect_db = HookConnect()
    sql_query = f"""SELECT
                        tbl1_id as data_uid,
                        tbl2_id as intervals_uid,
                        pltname as name,
                        params as outputs
                    FROM atdp
                    WHERE genelist_id='{uid}'"""
    logger.debug(f"Running SQL query:\n{sql_query}")
    atdp_data = []
    for data in connect_db.fetchall(sql_query):
        data['outputs'] = loads(data['outputs']) if data['outputs'] else {}
        atdp_data.append(data)
    logger.debug(
        f"Collected data from atdp for: {uid}\n{dumps(atdp_data, indent=4)}")
    return atdp_data
Example #5
def get_deseq_job(conf):
    logger.debug(f"Collecting data for genelists:\n"
                 f"  name -           {conf['name']}\n"
                 f"  project_uid -    {conf['project_uid']}\n"
                 f"  uid -            {conf['uid']}\n"
                 f"  untreated -      {conf['condition'][0]}\n"
                 f"  treated -        {conf['condition'][1]}\n"
                 f"  groupby -        {conf['groupby']}\n")

    connect_db = HookConnect()
    setting_data = connect_db.get_settings_data()
    job = {
        "untreated_files": [],
        "treated_files": [],
        "output_filename": conf["uid"] + "_deseq.tsv",
        "threads": int(setting_data["threads"]),
        "output_folder": os.path.join(setting_data["anl_data"], conf["uid"]),
        "uid": conf["uid"]
    }

    # conf["groupby"] selects the expression grouping: 1 - isoforms, 2 - genes, 3 - common tss
    file_templates = {
        1: '{{"class": "File", "location": "{outputs[rpkm_isoforms][location]}", "format": "http://edamontology.org/format_3752"}}',
        2: '{{"class": "File", "location": "{outputs[rpkm_genes][location]}", "format": "http://edamontology.org/format_3475"}}',
        3: '{{"class": "File", "location": "{outputs[rpkm_common_tss][location]}", "format": "http://edamontology.org/format_3475"}}'
    }
    current_file_template = file_templates[conf["groupby"]]

    for idx, uid in enumerate(conf['condition']):
        logger.debug(f"Get experiment IDs for {uid}")
        sql_query = f"SELECT tableName FROM genelist WHERE leaf=1 AND (parent_id like '{uid}' OR id like '{uid}')"
        target_files = job["untreated_files"] if idx == 0 else job["treated_files"]
        for record in connect_db.fetchall(sql_query):
            exp_data = get_exp_data(record["tableName"])
            target_files.append(fill_template(current_file_template, exp_data))
    return job
Example #6
def get_exp_data(uid):
    logger.debug(f"Collecting data for: {uid}")
    connect_db = HookConnect()
    sql_query = f"""SELECT 
                        l.params as outputs,
                        e.id     as exp_type_id,
                        g.findex as genome_type,
                        l.fragmentsize as fragment_size
                    FROM labdata l
                    INNER JOIN (experimenttype e, genome g) ON (e.id=l.experimenttype_id AND g.id=l.genome_id)
                    LEFT JOIN (antibody a) ON (l.antibody_id=a.id)
                    WHERE l.uid='{uid}'"""
    logger.debug(f"Running SQL query:\n{sql_query}")
    exp_data = connect_db.fetchone(sql_query)
    exp_data = {
        key: (value if not isinstance(value, decimal.Decimal) else int(value))
        for key, value in exp_data.items()
    }
    exp_data['outputs'] = loads(exp_data['outputs']) if exp_data['outputs'] else {}
    logger.debug(f"Collected data for: {uid}\n{dumps(exp_data, indent=4)}")
    return exp_data
Example #7
def get_heatmap_job(conf):
    logger.debug(f"Collecting data for genelist:\n"
                 f"  name          - {conf['name']}\n"
                 f"  uid           - {conf['uid']}\n"
                 f"  data_uid      - {conf['data_uid']}\n"
                 f"  intervals_uid - {conf['intervals_uid']}\n")
    connect_db = HookConnect()
    setting_data = connect_db.get_settings_data()
    exp_data = get_exp_data(get_genelist_data(conf['data_uid'])["tableName"])
    job = {
        "bam_file": fill_template(
            '{{"class": "File", "location": "{outputs[bambai_pair][location]}", "format": "http://edamontology.org/format_2572"}}',
            exp_data),
        "genelist_file": get_genelist_file(conf['intervals_uid']),
        "fragment_size": exp_data["fragment_size"],
        "json_filename": "-".join([conf['data_uid'], conf['intervals_uid']]),
        "plot_name": conf['name'],
        "data_uid": conf['data_uid'],
        "data_name": get_genelist_data(conf['data_uid'])["name"],
        "intervals_uid": conf['intervals_uid'],
        "intervals_name": get_genelist_data(conf['intervals_uid'])["name"],
        "threads": int(setting_data["threads"]),
        "output_folder": os.path.join(setting_data["anl_data"], conf["uid"]),
        "uid": conf["uid"]
    }
    return job
Example #8
def update_genelist_table_for_deseq(conf, job_result):
    logger.debug(f"Updating genelist table with DESeq results for {conf['uid']}")
    connect_db = HookConnect()
    genelist_info = [get_genelist_data(uid) for uid in conf['condition']]

    grouping = {1: "isoforms", 2: "genes", 3: "common tss"}[conf["groupby"]]
    gblink = "&".join([item["gblink"] for item in genelist_info])
    names = [item["name"] for item in genelist_info]
    table_name = conf["uid"].replace("-","")
    comment = f"""Annotation grouping ({grouping}) were used for DESeq analysis.<br>Data from "{names[0]}" vs "{names[1]}" has been chosen."""

    sql_header = f"""INSERT INTO genelist (id,name,leaf,`type`,conditions,gblink,db,tableName,rtype_id,project_id,params) VALUES
                     ('{conf["uid"]}',
                      '{conf["name"]}',
                       1,
                       3,
                      '{comment}',
                      '{gblink}',
                      '',
                      '{table_name}',
                       {conf["groupby"]},
                      '{conf["project_uid"]}',
                      '{dumps(job_result)}')"""
    connect_db.execute(sql_query)
Example #9
def upload_deseq_results(conf, job_result):
    connect_db = HookConnect()
    filename = strip_filepath(job_result["diff_expr_file"]["location"])
    table_name = connect_db.get_settings_data()["experimentsdb"] + '.`' + conf["uid"].replace("-", "") + '`'
    logger.debug(f"Uploading DESeq results from file {filename} to {table_name}")
    connect_db.execute(f"DROP TABLE IF EXISTS {table_name}")
    logger.debug(f"Drop {table_name} if exist")
    with open(filename, 'r') as input_file:
        header = input_file.readline().strip().split()
    u_rpkm, t_rpkm = header[6], header[7]  # condition-specific RPKM column names
    connect_db.execute(f"""CREATE TABLE {table_name}
                             (refseq_id VARCHAR(1000) NOT NULL,
                             gene_id VARCHAR(500) NOT NULL,
                             chrom VARCHAR(45) NOT NULL,
                             txStart INT NULL,
                             txEnd INT NULL,
                             strand VARCHAR(1),
                             {u_rpkm} DOUBLE,
                             {t_rpkm} DOUBLE,
                             LOGR DOUBLE,
                             pvalue DOUBLE,
                             padj DOUBLE,
                             INDEX refseq_id_idx (refseq_id ASC) USING BTREE,
                             INDEX gene_id_idx (gene_id ASC) USING BTREE,
                             INDEX chr_idx (chrom ASC) USING BTREE,
                             INDEX txStart_idx (txStart ASC) USING BTREE,
                             INDEX txEnd_idx (txEnd ASC) USING BTREE
                            ) ENGINE=MyISAM DEFAULT CHARSET=utf8 """)
    logger.debug(f"Create {table_name}")
    sql_header = f"INSERT INTO {table_name} VALUES "
    value_list = []
    with open(filename, 'r') as input_file:
        for line in input_file.read().splitlines():
            # RefseqId, GeneId, Chrom, TxStart, TxEnd, Strand, uRpkm, tRpkm, log2FoldChange, pvalue, padj
            if not line or "RefseqId" in line or "GeneId" in line:
                continue
            value_list.append("('{}','{}','{}',{},{},'{}',{},{},{},{},{})".format(*line.split()))
    connect_db.execute(sql_header + ",".join(value_list))
    logger.debug(f"Insert data into {table_name}")
Example #10
def upload_atdp_results(conf, job_result):
    connect_db = HookConnect()
    filename = strip_filepath(job_result["json_file"]["location"])
    table_name = connect_db.get_settings_data()["experimentsdb"] + '.`' + conf["uid"] + '`'

    logger.debug(f"""Uploading ATDP results from file {filename} to {table_name}""")

    atdp_tables = connect_db.fetchall(f"""SELECT tbl1_id,tbl2_id,pltname
                                          FROM atdp
                                          WHERE genelist_id='{conf["uid"]}'
                                          ORDER BY tbl1_id, tbl2_id""")  # we need this to display results in correct order

    idx = atdp_tables.index({"tbl1_id": conf["data_uid"],
                             "tbl2_id": conf["intervals_uid"],
                             "pltname": conf["name"]})

    with open(filename, 'r') as input_stream:
        atdp_data = load(input_stream)["atdp"]

    try:
        logger.debug(f"""Creating table {table_name}""")
        connect_db.execute(";".join([f"""CREATE TABLE {table_name}
                                         ( 
                                           X INT NULL,
                                           {", ".join(["Y{} FLOAT NULL".format(i) for i in range(len(atdp_tables))])},
                                           INDEX X_idx (X) using btree
                                         ) ENGINE=MyISAM DEFAULT CHARSET=utf8"""] +
                                    [f"""INSERT INTO {table_name}
                                           (X)
                                         VALUES ({i})""" for i in atdp_data["index"]]))
    except Error as e:
        logger.debug(f"""Failed to create table {table_name}: {e}""")

    logger.debug(f"""Inserting data into {table_name}""")
    connect_db.execute(";".join([f"""UPDATE {table_name} 
                                     SET Y{idx}={r[3]}
                                     WHERE X={i}""" for i, r in zip(atdp_data["index"], atdp_data["data"])]))
Example #11
def get_genelist_file(uid):
    genelist_data = get_genelist_data(uid)
    genelist_file_template = '{{"class": "File", "location": "{outputs[genelist_file][location]}", "format": "http://edamontology.org/format_3475"}}'
    try:
        genelist_file = fill_template(genelist_file_template, genelist_data)
    except KeyError:
        logger.debug(f"Failed to find genelist file for: {uid}")
        connect_db = HookConnect()
        filename = os.path.join(connect_db.get_settings_data()["anl_data"],
                                uid, uid + "_genelist.tsv")

        data = connect_db.fetchall(
            f"""SELECT * FROM experiments.`{genelist_data["tableName"]}`""")
        data_str = ""
        for idx, record in enumerate(data):
            if idx == 0:
                data_str += "\t".join([str(item)
                                       for item in record.keys()]) + "\n"
            else:
                data_str += "\t".join([str(item)
                                       for item in record.values()]) + "\n"

        export_to_file(data_str, filename)
        logger.debug(f"Export genelist file to: {filename}")
        genelist_data["outputs"].update({
            "genelist_file": {
                "class": "File",
                "location": filename,
                "format": "http://edamontology.org/format_3475"
            }
        })
        connect_db.execute(
            f"""UPDATE genelist SET params='{dumps(genelist_data["outputs"])}' WHERE id='{uid}'"""
        )
        logger.debug(
            f"""Update params for {uid}\n{dumps(genelist_data["outputs"], indent=4)}"""
        )
        genelist_file = fill_template(genelist_file_template, genelist_data)
    return genelist_file
Example #12
def setup_airflow(config):
    HookConnect(config)  # Run it with config at least once to create the connection
    create_pools("biowardrobe_advanced", 10, "Pool to run BioWardrobe Advanced Analysis")
    create_dags()
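create_pools and create_dags are not part of this listing. For orientation, a pool helper could look roughly like this sketch against Airflow's metadata session (assumes Airflow 1.x-style imports; the actual helper may differ):

from airflow import settings
from airflow.models import Pool

def create_pools(name, slots, description):
    # Register the pool in the Airflow metadata DB unless it already exists
    session = settings.Session()
    try:
        if not session.query(Pool).filter(Pool.pool == name).first():
            session.add(Pool(pool=name, slots=slots, description=description))
            session.commit()
    finally:
        session.close()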
Example #13
def update_genelist_table_for_atdp(conf):
    logger.debug(f"""Updating genelist table with ATDP results for {conf["uid"]}""")
    connect_db = HookConnect()
    connect_db.execute(f"""UPDATE genelist SET tableName='{conf["uid"]}' WHERE id='{conf["uid"]}'""")