def get_pca_job(conf):
    logger.debug(f"""Generate job for PCA:\n"""
                 f"""  uid        - {conf["uid"]}\n"""
                 f"""  expression - {conf["expression"]}\n""")
    connect_db = HookConnect()
    setting_data = connect_db.get_settings_data()
    job = {
        "expression_file": [],
        "legend_name": [],
        "output_prefix": conf["uid"] + "_",
        "output_folder": os.path.join(setting_data["anl_data"], conf["uid"]),
        "uid": conf["uid"]
    }
    for uid in conf["expression"].split():
        for genelist_data in connect_db.fetchall(
                f"SELECT tableName, name FROM genelist WHERE leaf=1 AND (parent_id like '{uid}' OR id like '{uid}')"):
            exp_data = get_exp_data(genelist_data["tableName"])
            job["expression_file"].append(
                fill_template(
                    '{{"class": "File", "location": "{outputs[rpkm_isoforms][location]}", "format": "http://edamontology.org/format_3752"}}',
                    exp_data))
            job["legend_name"].append(genelist_data["name"])
    return job

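# NOTE: `fill_template` is provided by a shared helper module and is not
# defined in this file. A minimal sketch of its assumed behavior, shown only
# as an illustration (substitute values from the data dict into the JSON
# template with str.format, then parse the result):
#
#   def fill_template(template, data):
#       return loads(template.format(**data))
#
# Under this assumption the template above resolves to a CWL File object
# pointing at the experiment's rpkm_isoforms output, and a missing output key
# raises the KeyError that get_genelist_file catches.
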
def get_genelist_data(uid):
    logger.debug(f"Collecting data from genelist for: {uid}")
    connect_db = HookConnect()
    sql_query = f"""SELECT name, leaf, type, conditions, gblink, db, tableName,
                           labdata_id, rtype_id, atype_id, project_id, parent_id,
                           params as outputs
                    FROM genelist
                    WHERE id LIKE '{uid}'"""
    logger.debug(f"Running SQL query:\n{sql_query}")
    glist_data = connect_db.fetchone(sql_query)
    # MySQL returns DECIMAL columns as decimal.Decimal; cast them to int
    glist_data = {key: (int(value) if isinstance(value, decimal.Decimal) else value)
                  for key, value in glist_data.items()}
    glist_data.update({"outputs": loads(glist_data["outputs"]) if glist_data["outputs"] else {}})
    logger.debug(f"Collected data from genelist for: {uid}\n{dumps(glist_data, indent=4)}")
    return glist_data

def update_atdp_table_for_heatmap(conf, job_result):
    logger.debug(f"""Updating atdp table with heatmap results for {conf["uid"]}""")
    connect_db = HookConnect()
    connect_db.execute(f"""UPDATE atdp
                           SET params='{dumps(job_result)}'
                           WHERE genelist_id='{conf["uid"]}'
                             AND tbl1_id='{conf["data_uid"]}'
                             AND tbl2_id='{conf["intervals_uid"]}'
                             AND pltname='{conf["name"]}'""")

def get_atdp_data(uid):
    logger.debug(f"Collecting data from atdp for: {uid}")
    connect_db = HookConnect()
    sql_query = f"""SELECT tbl1_id as data_uid,
                           tbl2_id as intervals_uid,
                           pltname as name,
                           params as outputs
                    FROM atdp
                    WHERE genelist_id='{uid}'"""
    logger.debug(f"Running SQL query:\n{sql_query}")
    atdp_data = []
    for data in connect_db.fetchall(sql_query):
        data.update({"outputs": loads(data["outputs"]) if data["outputs"] else {}})
        atdp_data.append(data)
    logger.debug(f"Collected data from atdp for: {uid}\n{dumps(atdp_data, indent=4)}")
    return atdp_data

def get_deseq_job(conf):
    logger.debug(f"Collecting data for genelists:\n"
                 f"  name        - {conf['name']}\n"
                 f"  project_uid - {conf['project_uid']}\n"
                 f"  uid         - {conf['uid']}\n"
                 f"  untreated   - {conf['condition'][0]}\n"
                 f"  treated     - {conf['condition'][1]}\n"
                 f"  groupby     - {conf['groupby']}\n")
    connect_db = HookConnect()
    setting_data = connect_db.get_settings_data()
    job = {
        "untreated_files": [],
        "treated_files": [],
        "output_filename": conf["uid"] + "_deseq.tsv",
        "threads": int(setting_data["threads"]),
        "output_folder": os.path.join(setting_data["anl_data"], conf["uid"]),
        "uid": conf["uid"]
    }
    # groupby: 1 - isoforms, 2 - genes, 3 - common tss
    file_templates = {
        1: '{{"class": "File", "location": "{outputs[rpkm_isoforms][location]}", "format": "http://edamontology.org/format_3752"}}',
        2: '{{"class": "File", "location": "{outputs[rpkm_genes][location]}", "format": "http://edamontology.org/format_3475"}}',
        3: '{{"class": "File", "location": "{outputs[rpkm_common_tss][location]}", "format": "http://edamontology.org/format_3475"}}'
    }
    current_file_template = file_templates[conf["groupby"]]
    # conf["condition"][0] is the untreated group, conf["condition"][1] the treated one
    for idx, uid in enumerate(conf["condition"]):
        logger.debug(f"Get experiment IDs for {uid}")
        sql_query = f"SELECT tableName FROM genelist WHERE leaf=1 AND (parent_id like '{uid}' OR id like '{uid}')"
        target = job["untreated_files"] if idx == 0 else job["treated_files"]
        for record in connect_db.fetchall(sql_query):
            exp_data = get_exp_data(record["tableName"])
            target.append(fill_template(current_file_template, exp_data))
    return job

def get_exp_data(uid):
    logger.debug(f"Collecting data for: {uid}")
    connect_db = HookConnect()
    sql_query = f"""SELECT l.params as outputs,
                           e.id as exp_type_id,
                           g.findex as genome_type,
                           l.fragmentsize as fragment_size
                    FROM labdata l
                    INNER JOIN (experimenttype e, genome g) ON (e.id=l.experimenttype_id AND g.id=l.genome_id)
                    LEFT JOIN (antibody a) ON (l.antibody_id=a.id)
                    WHERE l.uid='{uid}'"""
    logger.debug(f"Running SQL query:\n{sql_query}")
    exp_data = connect_db.fetchone(sql_query)
    # MySQL returns DECIMAL columns as decimal.Decimal; cast them to int
    exp_data = {key: (int(value) if isinstance(value, decimal.Decimal) else value)
                for key, value in exp_data.items()}
    exp_data.update({"outputs": loads(exp_data["outputs"]) if exp_data["outputs"] else {}})
    logger.debug(f"Collected data for: {uid}\n{dumps(exp_data, indent=4)}")
    return exp_data

def get_heatmap_job(conf):
    logger.debug(f"Collecting data for genelist:\n"
                 f"  name          - {conf['name']}\n"
                 f"  uid           - {conf['uid']}\n"
                 f"  data_uid      - {conf['data_uid']}\n"
                 f"  intervals_uid - {conf['intervals_uid']}\n")
    connect_db = HookConnect()
    setting_data = connect_db.get_settings_data()
    data_genelist = get_genelist_data(conf["data_uid"])
    exp_data = get_exp_data(data_genelist["tableName"])
    job = {
        "bam_file": fill_template(
            '{{"class": "File", "location": "{outputs[bambai_pair][location]}", "format": "http://edamontology.org/format_2572"}}',
            exp_data),
        "genelist_file": get_genelist_file(conf["intervals_uid"]),
        "fragment_size": exp_data["fragment_size"],
        "json_filename": "-".join([conf["data_uid"], conf["intervals_uid"]]),
        "plot_name": conf["name"],
        "data_uid": conf["data_uid"],
        "data_name": data_genelist["name"],
        "intervals_uid": conf["intervals_uid"],
        "intervals_name": get_genelist_data(conf["intervals_uid"])["name"],
        "threads": int(setting_data["threads"]),
        "output_folder": os.path.join(setting_data["anl_data"], conf["uid"]),
        "uid": conf["uid"]
    }
    return job

def update_genelist_table_for_deseq(conf, job_result):
    logger.debug(f"Updating genelist table with DESeq results for {conf['uid']}")
    connect_db = HookConnect()
    genelist_info = [get_genelist_data(uid) for uid in conf["condition"]]
    grouping = {1: "isoforms", 2: "genes", 3: "common tss"}[conf["groupby"]]
    gblink = "&".join([item["gblink"] for item in genelist_info])
    names = [item["name"] for item in genelist_info]
    table_name = conf["uid"].replace("-", "")
    comment = f"""Annotation grouping ({grouping}) was used for DESeq analysis.<br>Data from "{names[0]}" vs "{names[1]}" has been chosen."""
    sql_query = f"""INSERT INTO genelist (id, name, leaf, `type`, conditions, gblink, db, tableName, rtype_id, project_id, params)
                    VALUES ('{conf["uid"]}',
                            '{conf["name"]}',
                            1,
                            3,
                            '{comment}',
                            '{gblink}',
                            '',
                            '{table_name}',
                            {conf["groupby"]},
                            '{conf["project_uid"]}',
                            '{dumps(job_result)}')"""
    connect_db.execute(sql_query)

def upload_deseq_results(conf, job_result):
    connect_db = HookConnect()
    filename = strip_filepath(job_result["diff_expr_file"]["location"])
    table_name = connect_db.get_settings_data()["experimentsdb"] + '.`' + conf["uid"].replace("-", "") + '`'
    logger.debug(f"Uploading DESeq results from file {filename} to {table_name}")

    connect_db.execute(f"DROP TABLE IF EXISTS {table_name}")
    logger.debug(f"Dropped {table_name} if it existed")

    # The 7th and 8th header fields hold the condition-specific RPKM column
    # names; they become the corresponding column names of the new table
    with open(filename, 'r') as input_file:
        header = input_file.readline().strip().split()
        u_rpkm, t_rpkm = header[6], header[7]

    connect_db.execute(f"""CREATE TABLE {table_name}
                           (refseq_id VARCHAR(1000) NOT NULL,
                            gene_id VARCHAR(500) NOT NULL,
                            chrom VARCHAR(45) NOT NULL,
                            txStart INT NULL,
                            txEnd INT NULL,
                            strand VARCHAR(1),
                            {u_rpkm} DOUBLE,
                            {t_rpkm} DOUBLE,
                            LOGR DOUBLE,
                            pvalue DOUBLE,
                            padj DOUBLE,
                            INDEX refseq_id_idx (refseq_id ASC) USING BTREE,
                            INDEX gene_id_idx (gene_id ASC) USING BTREE,
                            INDEX chr_idx (chrom ASC) USING BTREE,
                            INDEX txStart_idx (txStart ASC) USING BTREE,
                            INDEX txEnd_idx (txEnd ASC) USING BTREE
                           ) ENGINE=MyISAM DEFAULT CHARSET=utf8""")
    logger.debug(f"Created {table_name}")

    sql_header = f"INSERT INTO {table_name} VALUES "
    value_list = []
    with open(filename, 'r') as input_file:
        # Columns: RefseqId, GeneId, Chrom, TxStart, TxEnd, Strand, uRpkm, tRpkm, log2FoldChange, pvalue, padj
        for line in input_file.read().splitlines():
            if not line or "RefseqId" in line or "GeneId" in line:  # skip empty and header lines
                continue
            value_list.append("('{}','{}','{}',{},{},'{}',{},{},{},{},{})".format(*line.split()))
    if value_list:  # guard against an empty results file
        connect_db.execute(sql_header + ",".join(value_list))
    logger.debug(f"Inserted data into {table_name}")

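# NOTE: `strip_filepath` is a shared helper as well; it is assumed to turn a
# CWL "location" URI (e.g. file:///anl_data/uid/uid_deseq.tsv) into a plain
# filesystem path, roughly equivalent to this sketch:
#
#   def strip_filepath(location):
#       return re.sub("^file://", "", location)
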
def upload_atdp_results(conf, job_result):
    connect_db = HookConnect()
    filename = strip_filepath(job_result["json_file"]["location"])
    table_name = connect_db.get_settings_data()["experimentsdb"] + '.`' + conf["uid"] + '`'
    logger.debug(f"Uploading ATDP results from file {filename} to {table_name}")

    # Fetch all heatmap rows for this genelist in a fixed order, so that every
    # (data_uid, intervals_uid, name) combination maps to a stable Y column and
    # results are displayed in the correct order
    atdp_tables = connect_db.fetchall(f"""SELECT tbl1_id, tbl2_id, pltname
                                          FROM atdp
                                          WHERE genelist_id='{conf["uid"]}'
                                          ORDER BY tbl1_id, tbl2_id""")
    idx = atdp_tables.index({"tbl1_id": conf["data_uid"],
                             "tbl2_id": conf["intervals_uid"],
                             "pltname": conf["name"]})

    with open(filename, 'r') as input_stream:
        atdp_data = load(input_stream)["atdp"]

    try:
        logger.debug(f"Creating table {table_name}")
        connect_db.execute(";".join(
            [f"""CREATE TABLE {table_name}
                 (X INT NULL,
                  {", ".join(["Y{} FLOAT NULL".format(i) for i in range(len(atdp_tables))])},
                  INDEX X_idx (X) USING BTREE
                 ) ENGINE=MyISAM DEFAULT CHARSET=utf8"""] +
            [f"INSERT INTO {table_name} (X) VALUES ({i})" for i in atdp_data["index"]]))
    except Error as e:
        # The table may already exist if another heatmap for the same genelist ran first
        logger.debug(f"Failed to create table {table_name}: {e}")

    logger.debug(f"Inserting data into {table_name}")
    connect_db.execute(";".join(
        [f"UPDATE {table_name} SET Y{idx}={r[3]} WHERE X={i}"
         for i, r in zip(atdp_data["index"], atdp_data["data"])]))

def get_genelist_file(uid):
    genelist_data = get_genelist_data(uid)
    genelist_file_template = '{{"class": "File", "location": "{outputs[genelist_file][location]}", "format": "http://edamontology.org/format_3475"}}'
    try:
        genelist_file = fill_template(genelist_file_template, genelist_data)
    except KeyError:
        logger.debug(f"Failed to find genelist file for: {uid}")
        connect_db = HookConnect()
        filename = os.path.join(connect_db.get_settings_data()["anl_data"], uid, uid + "_genelist.tsv")
        data = connect_db.fetchall(f"""SELECT * FROM experiments.`{genelist_data["tableName"]}`""")
        data_str = ""
        for idx, record in enumerate(data):
            if idx == 0:
                # Write the column names once, taken from the first record's keys
                data_str += "\t".join(str(item) for item in record.keys()) + "\n"
            # Write every record's values, including the first one (the original
            # if/else skipped the first row's values)
            data_str += "\t".join(str(item) for item in record.values()) + "\n"
        export_to_file(data_str, filename)
        logger.debug(f"Exported genelist file to: {filename}")
        genelist_data["outputs"].update({
            "genelist_file": {
                "class": "File",
                "location": filename,
                "format": "http://edamontology.org/format_3475"
            }
        })
        connect_db.execute(f"""UPDATE genelist SET params='{dumps(genelist_data["outputs"])}' WHERE id='{uid}'""")
        logger.debug(f"""Updated params for {uid}\n{dumps(genelist_data["outputs"], indent=4)}""")
        genelist_file = fill_template(genelist_file_template, genelist_data)
    return genelist_file

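# NOTE: `export_to_file` is assumed to be a small helper that writes a string
# to the given path; a hypothetical minimal equivalent:
#
#   def export_to_file(data_str, filename):
#       os.makedirs(os.path.dirname(filename), exist_ok=True)
#       with open(filename, "w") as output_stream:
#           output_stream.write(data_str)
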
def setup_airflow(config):
    HookConnect(config)  # need to run it with config at least once to create the connection
    create_pools("biowardrobe_advanced", 10, "Pool to run BioWardrobe Advanced Analysis")
    create_dags()

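# Typical call site (hypothetical; the exact shape of `config` is whatever
# HookConnect accepts):
#
#   setup_airflow(config)  # run once at deployment, so the stored connection,
#                          # the pool, and the DAGs exist before scheduling
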
def update_genelist_table_for_atdp(conf):
    logger.debug(f"""Updating genelist table with ATDP results for {conf["uid"]}""")
    connect_db = HookConnect()
    connect_db.execute(f"""UPDATE genelist SET tableName='{conf["uid"]}' WHERE id='{conf["uid"]}'""")