def main():
    """Entry point: parse eggNOG-mapper output for a TRAPID experiment, then perform
    gene family assignment, GO annotation and KO annotation, updating the experiment log.
    """
    # Parse command-line arguments and load experiment configuration
    cmd_args = parse_arguments()
    config = common.load_config(cmd_args.ini_file_initial)
    # TRAPID and ref. db data (lists containing all needed parameters for `common.db_connect()`)
    trapid_db_data = common.get_db_connection_data(config, 'trapid_db')
    ref_db_data = common.get_db_connection_data(config, 'reference_db')
    exp_id = config['experiment']['exp_id']
    try:
        # Clean TRAPID db (in case of previous results)
        db_conn = common.db_connect(*trapid_db_data)
        cleanup_db(db_conn, exp_id)
        db_conn.close()
        # Parse eggNOG-mapper's output file
        emapper_output = os.path.join(config['experiment']['tmp_exp_dir'],
                                      "emapper_%s.emapper.annotations" % exp_id)
        if not os.path.exists(emapper_output):
            sys.stderr.write("Error: emapper output file (%s) not found!\n" % emapper_output)
            # [Fix] Previously execution continued with the missing file; raise instead so
            # the `except` clause below flags the experiment as errored and stops.
            raise IOError("emapper output file not found: %s" % emapper_output)
        emapper_results = parse_emapper_output(emapper_output)
        # Perform GF assignment
        db_conn = common.db_connect(*trapid_db_data)
        perform_gf_assignment(emapper_results, db_conn, exp_id)
        common.update_experiment_log(exp_id, "infer_genefamilies", "", 3, db_conn)
        db_conn.close()
        # Get GO data
        db_conn = common.db_connect(*ref_db_data)
        go_data = get_go_data(db_conn)
        db_conn.close()
        # Perform GO annotation
        # Read RFAM GO data
        rfam_go_file = os.path.join(config['experiment']['tmp_exp_dir'], "rfam_go_data.tsv")
        rfam_transcript_gos = read_rfam_go_data(rfam_go_file)
        db_conn = common.db_connect(*trapid_db_data)
        perform_go_annotation(emapper_results, rfam_transcript_gos, db_conn, go_data, exp_id)
        db_conn.close()
        # Perform KO annotation
        db_conn = common.db_connect(*trapid_db_data)
        perform_ko_annotation(emapper_results, db_conn, exp_id)
        common.update_experiment_log(exp_id, "infer_functional_annotation", "", 3, db_conn)
        db_conn.close()
    # If any exception was raised, update the experiment's log, set status to 'error', and exit
    except Exception:
        print_exc()
        common.stop_initial_processing_error(exp_id, trapid_db_data)
def flag_rna_genes(exp_id, infernal_results, trapid_db_data):
    """Flag a set of transcripts as RNA genes in TRAPID's database.
    :param exp_id: TRAPID experiment id
    :param infernal_results: parsed (filtered) infernal results
    :param trapid_db_data: TRAPID db connection data
    """
    # Before updating `transcripts` with the current Infernal results, clear previous content
    clear_transcripts_table(exp_id, trapid_db_data)
    sys.stderr.write('[Message] Flag RNA genes in `transcripts` table. \n')
    query_str = "UPDATE `transcripts` SET `is_rna_gene`=1, `rf_ids`='{rf_ids}' WHERE `experiment_id`='{exp_id}' and transcript_id='{transcript_id}';"
    db_conn = common.db_connect(*trapid_db_data)
    cursor = db_conn.cursor()
    # Map each transcript to the set of experiment-prefixed Rfam families it matched
    families_per_transcript = {}
    for hit in infernal_results:
        prefixed_acc = "%s_%s" % (exp_id, hit["cm_acc"])
        families_per_transcript.setdefault(hit["query"], set()).add(prefixed_acc)
    # One UPDATE per transcript; its families are stored as a sorted CSV string
    for trs_id in sorted(families_per_transcript):
        csv_fams = ",".join(sorted(families_per_transcript[trs_id]))
        cursor.execute(query_str.format(exp_id=exp_id, rf_ids=csv_fams, transcript_id=trs_id))
    db_conn.commit()
    db_conn.close()
def main(args):
    """Connect to the SQLite database and populate it with freshly fetched market data."""
    # Build the SQLAlchemy-style database URL and open the connection
    db_url = "sqlite:///{}".format(args.database_path)
    connection = db_connect(db_url, args.debug)
    populate_db(connection, get_market_data())
def store_tree_db(tree_file, tree_program, exp_id, gf_id, trapid_db_data, verbose=False):
    """Read a gene family's newick tree file and upload its content to TRAPID database.
    :param tree_file: path to newick tree file to upload
    :param tree_program: program used to generate the tree (also stored in the database)
    :param exp_id: TRAPID experiment id
    :param gf_id: gene family id
    :param trapid_db_data: TRAPID database connection data (parameters for common.db_connect())
    :param verbose: whether to be verbose (print extra information to stderr if set to True)
    """
    query_str = "UPDATE `gene_families` SET `tree`='{tree_data}', `tree_params`='{tree_params}' WHERE `experiment_id`='{exp_id}' AND `gf_id`='{gf_id}';"
    if verbose:
        sys.stderr.write("[Message] Store newick tree data ('%s') for GF '%s' of experiment '%d' to TRAPID database.\n" % (tree_file, gf_id, exp_id))
    # Collapse the newick file to a single line before storing it
    with open(tree_file, 'r') as tree_handle:
        newick_str = tree_handle.read().replace('\n', '')
    connection = common.db_connect(*trapid_db_data)
    db_cursor = connection.cursor()
    db_cursor.execute(query_str.format(tree_data=newick_str, tree_params=tree_program,
                                       exp_id=exp_id, gf_id=gf_id))
    connection.commit()
    connection.close()
def get_transcript_sqces(gf_data, transl_tables, trapid_db_data, verbose=False):
    """Retrieve gene family transcript sequences for MSA/tree creation, translate them & return them as a dictionary.
    :param gf_data: all gene family data retrieved from the TRAPID database (output of `get_gf_data()`)
    :param transl_tables: translation tables dictionary
    :param trapid_db_data: TRAPID database connection data (parameters for common.db_connect())
    :param verbose: whether to be verbose (print extra information to stderr if set to True)
    :return: translated transcript sequences as a sequence_id:sequence dictionary
    """
    trs_sqces = {}
    query_str = "SELECT `transcript_id`, UNCOMPRESS(`orf_sequence`) as `orf_sequence`, `transl_table` FROM `transcripts` WHERE `experiment_id`='{exp_id}' AND `gf_id`='{gf_id}';"
    # Transcripts explicitly excluded from the GF are stored as a CSV string and skipped below
    excluded_trs = set([])
    if gf_data['exclude_transcripts']:
        excluded_trs = set(gf_data['exclude_transcripts'].split(','))
        if verbose:
            sys.stderr.write("[Message] These transcripts will be excluded: %s.\n" % ", ".join(excluded_trs))
    db_conn = common.db_connect(*trapid_db_data)
    cursor = db_conn.cursor(MS.cursors.DictCursor)
    cursor.execute(query_str.format(exp_id=gf_data['experiment_id'], gf_id=gf_data['gf_id']))
    for record in common.ResultIter(cursor):
        if record['transcript_id'] not in excluded_trs:
            trs_sqces[record['transcript_id']] = translate_dna_to_aa(
                dna_string=record['orf_sequence'], transl_tables=transl_tables,
                transl_table_idx=record['transl_table'])
    # [Fix] The connection was previously left open; close it before returning
    # (all sibling helpers close their connections).
    db_conn.close()
    return trs_sqces
def get_gf_data(exp_id, gf_id, trapid_db_data, verbose=False):
    """Retrieve all gene family data for a TRAPID experiment's gene family.
    :param exp_id: TRAPID experiment id
    :param gf_id: gene family id, for which data is retrieved
    :param trapid_db_data: TRAPID database connection data (parameters for common.db_connect())
    :param verbose: whether to be verbose (print extra information to stderr if set to True)
    :return: gene family data as a field:value dictionary
    """
    if verbose:
        sys.stderr.write("[Message] Retrieve data for GF '%s' (experiment '%d') from TRAPID DB. \n" % (gf_id, exp_id))
    query_str = "SELECT * FROM `gene_families` WHERE `experiment_id`='{exp_id}' AND `gf_id`='{gf_id}';"
    connection = common.db_connect(*trapid_db_data)
    dict_cursor = connection.cursor(MS.cursors.DictCursor)
    dict_cursor.execute(query_str.format(exp_id=exp_id, gf_id=gf_id))
    gf_record = dict_cursor.fetchone()
    connection.close()
    # No matching row is a fatal error for downstream MSA/tree creation
    if not gf_record:
        sys.stderr.write("[Error] No GF data could be retrieved (GF '%s', experiment '%d')!\n" % (gf_id, exp_id))
        sys.exit(1)
    return gf_record
def get_ref_db_sqce_type(trapid_db_data, ref_db_name, verbose=False):
    """Get sequence type (`DNA` or `AA`) of a reference database.
    :param trapid_db_data: TRAPID database connection data (parameters for common.db_connect())
    :param ref_db_name: reference database for which to retrieve sequence type
    :param verbose: whether to be verbose (print extra information to stderr if set to True)
    :return: reference database sequence type
    """
    if verbose:
        sys.stderr.write("[Message] Retrieve sequence type for reference database '%s' from TRAPID DB. \n" % ref_db_name)
    query_str = "SELECT `seq_type` FROM `data_sources` WHERE `db_name`='{ref_db_name}';"
    db_conn = common.db_connect(*trapid_db_data)
    cursor = db_conn.cursor()
    cursor.execute(query_str.format(ref_db_name=ref_db_name))
    # [Fix] fetchone() returns None when the db is unknown; indexing it directly raised an
    # opaque TypeError and made the error branch below unreachable. Check the row first.
    row = cursor.fetchone()
    db_conn.close()
    if not row or not row[0]:
        sys.stderr.write("[Error] Impossible to retrieve sequence type for reference database '%s'!\n" % (ref_db_name))
        sys.exit(1)
    sqce_type = row[0]
    if sqce_type not in ['DNA', 'AA']:
        sys.stderr.write("[Error] Invalid sequence type for reference DB '%s'!\n" % ref_db_name)
        # [Fix] Previously fell through and returned the invalid value; exit like sibling helpers.
        sys.exit(1)
    return sqce_type
def get_exp_data(exp_id, trapid_db_data, verbose=False):
    """Get reference database name and gene family type used for a TRAPID experiment.
    :param exp_id: TRAPID experiment id
    :param trapid_db_data: TRAPID database connection data (parameters for common.db_connect())
    :param verbose: whether to be verbose (print extra information to stderr if set to True)
    :return: experiment's reference database and gene family type
    """
    if verbose:
        sys.stderr.write("[Message] Retrieve data for experiment '%d' from TRAPID DB. \n" % exp_id)
    query_str = "SELECT `used_plaza_database`, `genefamily_type` FROM `experiments` WHERE `experiment_id`='{exp_id}';"
    connection = common.db_connect(*trapid_db_data)
    dict_cursor = connection.cursor(MS.cursors.DictCursor)
    dict_cursor.execute(query_str.format(exp_id=exp_id))
    exp_record = dict_cursor.fetchone()
    connection.close()
    # An unknown experiment id is a fatal error
    if not exp_record:
        sys.stderr.write("[Error] Impossible to retrieve experiment data (experiment '%d')!\n" % (exp_id))
        sys.exit(1)
    return exp_record
def main():
    """Run a functional enrichment analysis for a TRAPID experiment and store its results."""
    # Parse command-line arguments
    cmd_args = parse_arguments()
    # Read enrichment configuration file
    config = common.load_config(cmd_args.ini_file_enrichment,
                                {'trapid_db', 'reference_db', 'experiment', 'enrichment'})
    # Connection parameter lists for `common.db_connect()` (TRAPID db and reference db)
    trapid_db_data = common.get_db_connection_data(config, 'trapid_db')
    ref_db_data = common.get_db_connection_data(config, 'reference_db')
    exp_id = config['experiment']['exp_id']
    tmp_dir = config['experiment']['tmp_exp_dir']
    enricher_bin = config['enrichment']['enricher_bin']
    # Check existence of enricher bin
    fe.check_enricher_bin(enricher_bin, cmd_args.verbose)
    # GO data is only needed when the functional annotation type is GO
    go_data = {}
    if cmd_args.fa_type == 'go':
        conn = common.db_connect(*ref_db_data)
        go_data = fe.get_go_data(conn)
        conn.close()
    # Delete previous enrichment results from TRAPID DB
    conn = common.db_connect(*trapid_db_data)
    fe.delete_previous_results(conn, exp_id, cmd_args.fa_type, cmd_args.subset,
                               cmd_args.max_pval, cmd_args.verbose)
    conn.close()
    # Run enricher
    enrichment_data = fe.run_enricher(trapid_db_data, exp_id, cmd_args.fa_type, cmd_args.subset,
                                      cmd_args.max_pval, go_data, enricher_bin, tmp_dir,
                                      cmd_args.keep_tmp, cmd_args.verbose)
    # Create result records and upload them to TRAPID DB
    enrichment_rows = fe.create_enrichment_rows(enrichment_data['results'], enrichment_data['gf_data'],
                                                exp_id, cmd_args.subset, cmd_args.fa_type,
                                                cmd_args.max_pval, go_data)
    conn = common.db_connect(*trapid_db_data)
    fe.upload_results_to_db(conn, enrichment_rows, cmd_args.verbose)
    conn.close()
def get_gf_sqces(gf_data, ref_db_data, gf_type, ref_sqce_type, transl_tables):
    """Retrieve gene family reference/external protein sequences for MSA and tree creation.
    :param gf_data: all gene family data retrieved from the TRAPID database (output of `get_gf_data()`)
    :param ref_db_data: reference database data (parameters for `common.db_connect()`)
    :param gf_type: gene family type of the current TRAPID experiment
    :param ref_sqce_type: sequence type of reference database sequences (DNA or AA)
    :param transl_tables: translation tables dictionary
    :return: reference protein sequences as a sequence_id:sequence dictionary
    """
    if gf_type not in ['HOM', 'IORTHO']:
        sys.stderr.write("[Error] Invalid GF type ('%s'), cannot retrieve GF sequences!\n" % gf_type)
        # [Fix] Abort on invalid GF type (previously execution continued and queried the db anyway)
        sys.exit(1)
    # Get used species (species name) based on `gf_data` tax ids
    species_query_str = "SELECT `species` FROM `annot_sources` WHERE `tax_id` IN ({tax_id_str});"
    tax_id_str = "'%s'" % "','".join(gf_data['used_species'].split(','))
    used_species = set([])
    used_genes = set([])
    used_sqces = {}
    db_conn = common.db_connect(*ref_db_data)
    cursor = db_conn.cursor(MS.cursors.DictCursor)
    cursor.execute(species_query_str.format(tax_id_str=tax_id_str))
    for rec in cursor.fetchall():
        used_species.add(rec['species'])
    # To replace (update)? Would it be better to fetch filtered data directly?
    gf_data_query_str = "SELECT gene_id, species FROM `gf_data` WHERE `gf_id` = '{gf_id}';"
    cursor.execute(gf_data_query_str.format(gf_id=gf_data['plaza_gf_id']))
    for record in common.ResultIter(cursor):
        if record['species'] in used_species:
            used_genes.add(record['gene_id'])
    # If we are dealing with an IORTHO GF, further filtering is required (only keep sequences that are in `gf_content`)
    # NOTE(review): the code *removes* the `gf_content` ids from `used_genes`, which appears to
    # contradict the comment above -- confirm the intended semantics of `gf_content`.
    if gf_type == "IORTHO" and gf_data['gf_content']:
        to_remove = set(gf_data['gf_content'].split(','))
        used_genes = used_genes - to_remove
    # Retrieve and translate sequences for genes in `used_genes`
    sqce_query_str = "SELECT `gene_id`, `seq`, `transl_table` FROM annotation WHERE `gene_id` IN ({gene_id_str});"
    gene_id_str = "'%s'" % "','".join(sorted(list(used_genes)))
    cursor.execute(sqce_query_str.format(gene_id_str=gene_id_str))
    # Store & return the sequences, translating DNA to AA when needed
    if ref_sqce_type == 'DNA':
        for record in common.ResultIter(cursor):
            used_sqces[record['gene_id']] = translate_dna_to_aa(
                dna_string=record['seq'], transl_tables=transl_tables,
                transl_table_idx=record['transl_table'])
    else:
        for record in common.ResultIter(cursor):
            used_sqces[record['gene_id']] = record['seq']
    db_conn.close()
    return used_sqces
def main(args):
    """Print the daily return of each market stored in the database.
    :param args: parsed command-line arguments (needs `database_path` and `debug`)
    """
    # Initialize database
    path = "sqlite:///{}".format(args.database_path)
    conn = db_connect(path, args.debug)
    # query SQL
    s = select([ms.c.Last, ms.c.PrevDay, ms.c.MarketName])
    rp = conn.execute(s)
    # show report
    for record in rp:
        daily_return = record.Last / record.PrevDay - 1
        # [Fix] '.2%' scales the fraction by 100 so the printed value matches the '%' label
        # (previously a raw fraction such as 0.05 was printed as "0.05%").
        print("{} {:.2%}".format(record.MarketName, daily_return))
def clear_transcripts_table(exp_id, trapid_db_data):
    """Clear content from the `transcripts` table of the TRAPID database, which is necessary
    prior to updating the table with new Infernal results.
    :param exp_id: TRAPID experiment id
    :param trapid_db_data: TRAPID db connection data
    """
    sys.stderr.write('[Message] Clear content in `transcripts` table. \n')
    clear_query_str = "UPDATE `transcripts` SET `is_rna_gene`=0, `rf_ids`=NULL WHERE `experiment_id`='{exp_id}';"
    connection = common.db_connect(*trapid_db_data)
    db_cursor = connection.cursor()
    # Reset the RNA-gene flag and Rfam family list for every transcript of the experiment
    db_cursor.execute(clear_query_str.format(exp_id=exp_id))
    connection.commit()
    connection.close()
def get_infernal_z_value(exp_id, trapid_db_data):
    """Retrieve value needed for cmscan/cmsearch `-Z` parameter (search space size in megabase,
    here the total length of query sequences in Mb multiplied by 2).
    :param exp_id: TRAPID experiment id
    :param trapid_db_data: TRAPID db connection data
    :return: value for `-Z` parameter
    """
    query_str = "SELECT SUM(`len`) FROM (SELECT CHAR_LENGTH(UNCOMPRESS(`transcript_sequence`)) AS len FROM `transcripts` WHERE experiment_id ='{exp_id}') tr;"
    db_conn = common.db_connect(*trapid_db_data)
    cursor = db_conn.cursor()
    cursor.execute(query_str.format(exp_id=exp_id))
    total_nts = float(cursor.fetchone()[0])
    db_conn.close()
    # [Fix] Convert nucleotides to megabases with 1e6 -- the previous divisor `10e6` equals 1e7,
    # underestimating the search space tenfold. The factor 2 accounts for both strands.
    return (total_nts / 1e6) * 2
def cleanup_table(exp_id, table_name, trapid_db_data):
    """Cleanup a table from the db for an experiment.
    :param exp_id: TRAPID experiment id
    :param table_name: the table to cleanup
    :param trapid_db_data: TRAPID db connection data
    """
    query_str = "DELETE FROM `{table_name}` WHERE `experiment_id`='{exp_id}'"
    # Cleanup previous Infernal results for the experiment
    sys.stderr.write('[Message] Cleanup previous data from `{table_name}`. \n'.format(table_name=table_name))
    connection = common.db_connect(*trapid_db_data)
    db_cursor = connection.cursor()
    db_cursor.execute(query_str.format(table_name=table_name, exp_id=exp_id))
    connection.commit()
    connection.close()
def retrieve_rfam_go_data(trapid_db_data):
    """Retrieve Rfam GO annotation stored the `configuration` table of TRAPID's database.
    Return it as cm:gos dictionary.
    :param trapid_db_data: TRAPID db connection data
    :return: Rfam RNA family GO annotations as cm:gos dictionary.
    """
    sys.stderr.write("[Message] Retrieve Rfam GO annotation from `configuration`.\n")
    query_str = "SELECT `key`, `value` FROM `configuration` WHERE `method`='rfam_annotation' AND `attr`='go'"
    connection = common.db_connect(*trapid_db_data)
    dict_cursor = connection.cursor(MS.cursors.DictCursor)
    dict_cursor.execute(query_str)
    # Each `value` is a CSV list of GO terms associated with the covariance model in `key`
    rfam_go = {row['key']: row['value'].split(',') for row in dict_cursor.fetchall()}
    connection.close()
    return rfam_go
def get_ref_db(exp_id, trapid_db_data, verbose=False):
    """Get reference database name used for a TRAPID experiment from TRAPID DB.
    :param exp_id: TRAPID experiment id
    :param trapid_db_data: TRAPID database connection data (parameters for common.db_connect())
    :param verbose: whether to be verbose (print extra information to stderr if set to True)
    :return: the experiment's reference database name
    """
    if verbose:
        sys.stderr.write("[Message] Retrieve reference database name of experiment '%d' from TRAPID DB. \n" % exp_id)
    query_str = "SELECT `used_plaza_database` FROM `experiments` WHERE `experiment_id`='{exp_id}';"
    db_conn = common.db_connect(*trapid_db_data)
    cursor = db_conn.cursor()
    cursor.execute(query_str.format(exp_id=exp_id))
    # [Fix] fetchone() returns None for an unknown experiment; indexing it directly raised an
    # opaque TypeError. Check the row and fail explicitly, as sibling helpers do.
    row = cursor.fetchone()
    db_conn.close()
    if not row:
        sys.stderr.write("[Error] Impossible to retrieve reference database name (experiment '%d')!\n" % (exp_id))
        sys.exit(1)
    return row[0]
def store_rna_similarities(exp_id, infernal_results, trapid_db_data):
    """Store Infernal tabulated output data in the `rna_similarities` table of TRAPID's db.
    :param exp_id: TRAPID experiment id
    :param infernal_results: parsed infernal results to store
    :param trapid_db_data: TRAPID db connection data
    """
    # First cleanup the table
    cleanup_table(exp_id=exp_id, table_name="rna_similarities", trapid_db_data=trapid_db_data)
    # Process hits best-score-first so each transcript's similarity string is score-ordered
    by_score = sorted(infernal_results, key=lambda hit: float(hit['score']), reverse=True)
    sys.stderr.write('[Message] Store Infernal results in `rna_similarities`. \n')
    query_str = "INSERT INTO `rna_similarities` (`experiment_id`,`transcript_id`,`similarity_data`) VALUES ('{exp_id}','{transcript_id}', COMPRESS(\"{infernal_data}\"))"
    # Get and format similarity data (one CSV record per hit, ';'-joined per transcript)
    fields_to_keep = [
        "cm_acc", "cm_id", "clan", "e_value", "score", "bias",
        "mdl_from", "mdl_to", "trunc", "seq_from", "seq_to"
    ]
    per_transcript = {}
    for hit in by_score:
        csv_record = ",".join([hit[f] for f in fields_to_keep])
        per_transcript.setdefault(hit["query"], []).append(csv_record)
    sim_strings = {trs: ";".join(records) for trs, records in per_transcript.items()}
    # Store Infernal results in `rna_similarities`
    db_conn = common.db_connect(*trapid_db_data)
    cursor = db_conn.cursor()
    for trs_id in sorted(sim_strings.keys()):
        cursor.execute(query_str.format(exp_id=exp_id, transcript_id=trs_id,
                                        infernal_data=sim_strings[trs_id]))
    db_conn.commit()
    db_conn.close()
def get_tax_data(ref_db_data, verbose=False):
    """Get species, common name and tax ids from the `annot_sources` table of a reference database.
    :param ref_db_data: reference database connection data (parameters for common.db_connect())
    :param verbose: whether to be verbose (print extra information to stderr if set to True)
    :return: return fetched `annot_sources` data as a dictionary (species:{tax_id, common_name})
    """
    if verbose:
        sys.stderr.write("[Message] Retrieve taxonomy data from reference DB '%s'. \n" % ref_db_data[-1])
    query_str = "SELECT `species`, `tax_id`, `common_name` FROM `annot_sources`;"
    connection = common.db_connect(*ref_db_data)
    dict_cursor = connection.cursor(MS.cursors.DictCursor)
    dict_cursor.execute(query_str)
    tax_data = {row['species']: {'tax_id': row['tax_id'], 'common_name': row['common_name']}
                for row in dict_cursor.fetchall()}
    connection.close()
    return tax_data
def create_infernal_files(exp_id, tmp_exp_dir, rfam_dir, exp_clans, trapid_db_data):
    """Create `cm` and `clanin` files needed by Infernal for user-selected Rfam clans.
    :param exp_id: TRAPID experiment id
    :param tmp_exp_dir: experiment's directory
    :param rfam_dir: directory with Rfam data
    :param exp_clans: user-selected Rfam clans
    :param trapid_db_data: TRAPID db connection data
    """
    rfam_clans_file = "Rfam_%s.clanin" % exp_id
    rfam_cm_file = os.path.join(tmp_exp_dir, "Rfam_%s.cm" % exp_id)
    exp_cms_file = "rfam_cms_%s.lst" % exp_id
    sys.stderr.write("[Message] Create Rfam `cm` and `clanin` files for Infernal ('%s' and '%s').\n" % (rfam_cm_file, rfam_clans_file))
    # Get clan membership information from `configuration` table
    # Since there are only 111 clans, we can retrieve this information for all of them
    query_str = "SELECT `key`, `value` FROM `configuration` WHERE `method`='rfam_clans' AND `attr`='families'"
    connection = common.db_connect(*trapid_db_data)
    dict_cursor = connection.cursor(MS.cursors.DictCursor)
    dict_cursor.execute(query_str)
    clan_members = {row['key']: row['value'].split(',') for row in dict_cursor.fetchall()}
    connection.close()
    # Create `clanin` file (one line per clan: clan name + tab-separated member CMs),
    # collecting the experiment's CM accessions along the way for `cmfetch`
    exp_cms = set()
    clanin_str = "{clan}\t{members}\n"
    with open(os.path.join(tmp_exp_dir, rfam_clans_file), "w") as clanin_handle:
        for clan in exp_clans:
            clanin_handle.write(clanin_str.format(clan=clan, members="\t".join(clan_members[clan])))
            exp_cms.update(clan_members[clan])
    # Create file listing the experiment's CM (input file for `cmfetch`)
    with open(os.path.join(tmp_exp_dir, exp_cms_file), "w") as cms_handle:
        cms_handle.write('\n'.join(sorted(exp_cms)) + '\n')
    # Create `cm` file using Infernal's `cmfetch` command
    run_cmfetch(exp_id, tmp_exp_dir, rfam_dir)
def get_gene_species(gf_id, ref_db_data, verbose=False):
    """Retrieve the species of the members of a gene family from a reference database.
    :param gf_id: gene family for which to retrieve reference data
    :param ref_db_data: reference database connection data (parameters for common.db_connect())
    :param verbose: whether to be verbose (print extra information to stderr if set to True)
    :return: gene family species data as a dictionary (gene:species).
    """
    # Drop the experiment prefix from the GF id (presumably `<exp_id>_<ref_gf_id>` -- confirm format)
    ref_gf_id = gf_id.split('_')[1]
    if verbose:
        sys.stderr.write("[Message] Retrieve species for members of GF '%s' ('%s' reference DB). \n" % (ref_gf_id, ref_db_data[-1]))
    query_str = "SELECT `gene_id`, `species` FROM `gf_data` WHERE `gf_id`='{ref_gf_id}';"
    connection = common.db_connect(*ref_db_data)
    dict_cursor = connection.cursor(MS.cursors.DictCursor)
    dict_cursor.execute(query_str.format(ref_gf_id=ref_gf_id))
    gene_species = {row['gene_id']: row['species'] for row in dict_cursor.fetchall()}
    connection.close()
    return gene_species
def store_msa_db(msa_file, msa_program, exp_id, gf_id, trapid_db_data, verbose=False):
    """Read faln MSA file generated for a GF and upload its content to TRAPID database.
    :param msa_file: path to faln MSA file to upload
    :param msa_program: program used to perform the MSA (also stored in the database)
    :param exp_id: TRAPID experiment id
    :param gf_id: gene family id
    :param trapid_db_data: TRAPID database connection data (parameters for common.db_connect())
    :param verbose: whether to be verbose (print extra information to stderr if set to True)
    """
    query_str = "UPDATE `gene_families` SET `msa`='{msa_data}', `msa_params`='{msa_params}' WHERE `experiment_id`='{exp_id}' AND `gf_id`='{gf_id}';"
    if verbose:
        sys.stderr.write("[Message] Store MSA data ('%s') for GF '%s' of experiment '%d' to TRAPID database.\n" % (msa_file, gf_id, exp_id))
    # Flatten the FASTA alignment: each record becomes `>header;sequence` with no newlines
    records = []
    with open(msa_file, 'r') as msa_handle:
        for raw_line in msa_handle:
            if raw_line.startswith(">"):
                records.append(raw_line.strip() + ";")
            else:
                records[-1] += raw_line.strip()
    msa_data_str = ''.join(records)
    # Only touch the database when the file actually contained alignment data
    if msa_data_str:
        connection = common.db_connect(*trapid_db_data)
        db_cursor = connection.cursor()
        db_cursor.execute(query_str.format(msa_data=msa_data_str, msa_params=msa_program,
                                           exp_id=exp_id, gf_id=gf_id))
        connection.commit()
        connection.close()
def upload_phyloxml_tree(exp_id, gf_id, phyloxml_tree, trapid_db_data, verbose=False):
    """Upload a PhyloXML tree generated for a gene family to TRAPID db.
    :param exp_id: TRAPID experiment id
    :param gf_id: gene family for which a PhyloXML tree is generated
    :param phyloxml_tree: PhyloXML tree string
    :param trapid_db_data: TRAPID database connection data (parameters for common.db_connect())
    :param verbose: whether to be verbose (print extra information to stderr if set to True)
    """
    if verbose:
        sys.stderr.write("[Message] Upload PhyloXML tree for GF '%s' (experiment '%d'). \n" % (gf_id, exp_id))
    # Minify XML tree string (strip inter-element whitespace) before storing it
    parsed_tree = etree.XML(phyloxml_tree, parser=etree.XMLParser(remove_blank_text=True))
    minified_tree_str = etree.tostring(parsed_tree)
    # Upload minified XML tree
    query_str = "UPDATE `gene_families` SET `xml_tree`='{xml_str}' WHERE `experiment_id`='{exp_id}' AND `gf_id`='{gf_id}';"
    connection = common.db_connect(*trapid_db_data)
    db_cursor = connection.cursor()
    db_cursor.execute(query_str.format(xml_str=minified_tree_str, exp_id=exp_id, gf_id=gf_id))
    connection.commit()
    connection.close()
def delete_db_job(exp_id, gf_id, trapid_db_data, msa_only, verbose=False):
    """Delete MSA/tree creation job from TRAPID db.
    :param exp_id: TRAPID experiment id
    :param gf_id: gene family id
    :param trapid_db_data: TRAPID database connection data (parameters for common.db_connect())
    :param msa_only: whether only an MSA, and no tree, was generated
    :param verbose: whether to be verbose (print extra information to stderr if set to True)
    """
    if verbose:
        sys.stderr.write("[Message] Delete MSA/tree creation job from TRAPID database...\n")
    # Jobs are identified by "<job_name> <gf_id>"
    job_name = "create_msa" if msa_only else "create_tree"
    comment_str = "{} {}".format(job_name, gf_id)
    connection = common.db_connect(*trapid_db_data)
    common.delete_experiment_job(experiment_id=exp_id, job_name=comment_str, db_conn=connection)
    connection.close()
def send_end_email(exp_id, gf_id, trapid_db_data, msa_only):
    """Send an email to the owner of an experiment to warn them of job completion.
    :param exp_id: TRAPID experiment id
    :param gf_id: gene family id for which an MSA/tree was generated
    :param trapid_db_data: TRAPID database connection data (parameters for common.db_connect())
    :param msa_only: whether only an MSA, and no tree, was generated
    """
    # Get title & email address associated to the experiment
    query_str = "SELECT a.`title`, b.`email` FROM `experiments` a,`authentication` b WHERE a.`experiment_id`='{exp_id}' AND b.`user_id`=a.`user_id`;"
    page_url = '/'.join([common.TRAPID_BASE_URL, 'tools', 'create_tree', str(exp_id), gf_id])
    db_conn = common.db_connect(*trapid_db_data)
    cursor = db_conn.cursor()
    cursor.execute(query_str.format(exp_id=exp_id))
    exp_data = cursor.fetchone()
    db_conn.close()
    # Send email
    if not exp_data:
        sys.stderr.write("[Error] Impossible to retrieve experiment title/email address (experiment '%d')!\n" % (exp_id))
        sys.exit(1)
    email_subject = "TRAPID phylogenetic tree finished (%s)\n" % gf_id
    result_label = "phylogenetic tree"
    if msa_only:
        email_subject = "TRAPID MSA finished (%s)\n" % gf_id
        # [Fix] The email body previously always said "phylogenetic tree", even for MSA-only jobs
        result_label = "multiple sequence alignment"
    email_content = (
        "Dear user,\n\n"
        "The {result_label} for gene family '{gf_id}' in experiment '{exp_title}' has been created.\n\n"
        "You can now view it at this URL: {page_url}\n\n"
        "Thank you for using TRAPID.\n").format(result_label=result_label, gf_id=gf_id,
                                                page_url=page_url, exp_title=exp_data[0])
    common.send_mail(to=[exp_data[1]], subject=email_subject, text=email_content)
def get_subsets(exp_id, gf_id, trapid_db_data, verbose=False):
    """Retrieve subset information for transcripts assigned to a gene family from the TRAPID database.
    :param exp_id: TRAPID experiment id
    :param gf_id: gene family containing transcripts for which to retrieve subset information
    :param trapid_db_data: TRAPID database connection data (parameters for common.db_connect())
    :param verbose: whether to be verbose (print extra information to stderr if set to True)
    :return: transcript subset information as a dictionary (transcript_id:subsets)
    """
    if verbose:
        sys.stderr.write("[Message] Retrieve subsets for transcripts of GF '%s' (experiment '%d'). \n" % (gf_id, exp_id))
    tr_query_str = "SELECT `transcript_id` FROM `transcripts` WHERE `experiment_id`='{exp_id}' AND `gf_id`='{gf_id}';"
    subset_query_str = "SELECT `transcript_id`,`label` FROM `transcripts_labels` WHERE `experiment_id`='{exp_id}' AND `transcript_id` IN ({transcripts});"
    connection = common.db_connect(*trapid_db_data)
    db_cursor = connection.cursor()
    # Get transcript ids (fetchall is fine: data for a single GF only)
    db_cursor.execute(tr_query_str.format(exp_id=exp_id, gf_id=gf_id))
    gf_transcripts = {row[0] for row in db_cursor.fetchall()}
    # Create a string for `IN` clause of SQL query
    transcripts_str = "'%s'" % "', '".join(sorted(gf_transcripts))
    # Get subset information for transcripts
    tr_subsets = {}
    db_cursor.execute(subset_query_str.format(exp_id=exp_id, transcripts=transcripts_str))
    for trs_id, label in db_cursor.fetchall():
        tr_subsets.setdefault(trs_id, []).append(label)
    connection.close()
    if verbose:
        sys.stderr.write("[Message] Retrieved subset information: %s\n" % str(tr_subsets))
    return tr_subsets
def get_meta_annotation(exp_id, gf_id, trapid_db_data, verbose=False):
    """Retrieve meta-annotation for transcripts assigned to a gene family from the TRAPID database.
    :param exp_id: TRAPID experiment id
    :param gf_id: gene family containing transcripts for which to retrieve meta-annotation
    :param trapid_db_data: TRAPID database connection data (parameters for common.db_connect())
    :param verbose: whether to be verbose (print extra information to stderr if set to True)
    :return: GF transcript meta-annotation as a dictionary (transcript_id:meta_annotation)
    """
    if verbose:
        sys.stderr.write("[Message] Retrieve meta-annotation for transcripts of GF '%s' (experiment '%d'). \n" % (gf_id, exp_id))
    query_str = "SELECT `transcript_id`, `meta_annotation` FROM `transcripts` WHERE `experiment_id`='{exp_id}' AND `gf_id`='{gf_id}';"
    connection = common.db_connect(*trapid_db_data)
    db_cursor = connection.cursor()
    db_cursor.execute(query_str.format(exp_id=exp_id, gf_id=gf_id))
    # fetchall is fine here: data for a single GF only
    tr_meta_annotation = {row[0]: row[1] for row in db_cursor.fetchall()}
    connection.close()
    if verbose:
        sys.stderr.write("[Message] Retrieved meta-annotation: %s\n" % str(tr_meta_annotation))
    return tr_meta_annotation
def store_rna_families(exp_id, infernal_results, trapid_db_data):
    """Populate `rna_families` for the experiment from Infernal results.
    :param exp_id: TRAPID experiment id
    :param infernal_results: parsed infernal results
    :param trapid_db_data: TRAPID db connection data
    """
    # First cleanup the table
    cleanup_table(exp_id=exp_id, table_name="rna_families", trapid_db_data=trapid_db_data)
    sys.stderr.write('[Message] Store Infernal results in `rna_families`. \n')
    query_str = "INSERT INTO `rna_families` (`experiment_id`,`rf_id`,`rfam_rf_id`,`rfam_clan_id`, `num_transcripts`) VALUES ('{e}','{f}','{rf}','{c}','{n}')"
    # Aggregate per covariance model: its clan and the number of transcript hits
    fam_stats = {}
    for hit in infernal_results:
        acc = hit["cm_acc"]
        if acc in fam_stats:
            fam_stats[acc]["n_transcripts"] += 1
        else:
            fam_stats[acc] = {"clan": hit["clan"], "n_transcripts": 1}
    db_conn = common.db_connect(*trapid_db_data)
    cursor = db_conn.cursor()
    # One row per family, with an experiment-prefixed family id
    for rf_id in sorted(fam_stats.keys()):
        exp_rf_id = "%s_%s" % (exp_id, rf_id)
        cursor.execute(query_str.format(e=exp_id, f=exp_rf_id, rf=rf_id,
                                        c=fam_stats[rf_id]["clan"],
                                        n=fam_stats[rf_id]["n_transcripts"]))
    db_conn.commit()
    db_conn.close()
def main():
    """Run the Kaiju taxonomic-classification step of TRAPID initial processing.

    Pipeline: load config -> log start -> run Kaiju (via a generated shell script,
    one run per split database) -> merge split results -> generate visualization
    outputs (Krona, Treeview JSON, rank composition tables) -> store results in
    the TRAPID database -> log completion. Visualization failures are best-effort
    (logged, not fatal); DB storage and logging are not guarded here.
    """
    sys.stderr.write('[Message] Starting Kaiju procedure: %s\n' % time.strftime('%Y/%m/%d %H:%M:%S'))
    cmd_args = parse_arguments()
    # sys.stderr.write(str(cmd_args)+'\n') # Debug
    # Read experiment's initial processing configuration file
    config = common.load_config(cmd_args.ini_file_initial, {"tax_binning", "trapid_db", "experiment"})
    # A list containing all needed parameters for `common.db_connect()`
    trapid_db_data = common.get_db_connection_data(config, 'trapid_db')
    exp_id = config['experiment']['exp_id']
    # Input / output paths, all rooted in the experiment's temporary directory
    input_file = os.path.join(config['experiment']['tmp_exp_dir'], "transcripts_%s.fasta" % exp_id)
    output_dir = os.path.join(config['experiment']['tmp_exp_dir'], "kaiju")
    output_script = os.path.join(config['experiment']['tmp_exp_dir'], "run_kaiju_split.sh")
    # Paths to files needed for taxonomic classification and post-processing
    # (split Kaiju DB directory, extra Kaiju CLI flags, NCBI taxonomy dumps)
    split_db_dir = config['tax_binning']['splitted_db_dir']
    kaiju_parameters = config['tax_binning']['kaiju_parameters']
    names_dmp_file = config['tax_binning']['names_dmp_file']
    nodes_dmp_file = config['tax_binning']['nodes_dmp_file']
    # Update experiment log (status 2 = processing step started)
    db_connection = common.db_connect(*trapid_db_data)
    common.update_experiment_log(exp_id, 'start_tax_classification', 'kaiju_mem', 2, db_connection)
    db_connection.close()
    # Create and run shell script that will run Kaiju (one invocation per split DB file).
    kaiju_split_script = create_shell_script(
        get_split_db_files(split_db_dir), os.path.join(output_dir, "split_results"),
        output_script, kaiju_parameters, nodes_dmp_file, input_file)
    # Impossible to qsub directly from the webcluster... So run Kaiju shell script from here
    # qsub_and_wait(script_name=kaiju_split_script, n_cores=1, mem_per_core=8)
    os.chmod(kaiju_split_script, 0o755)
    # Blocking call: wait until all Kaiju runs in the script have finished
    job = subprocess.Popen(kaiju_split_script)
    job.communicate()
    # Merge per-split Kaiju results into a single output file
    merge_kaiju_split_results.merge_results(
        os.path.join(output_dir, "split_results"), nodes_dmp_file,
        os.path.abspath(os.path.join(output_dir, "kaiju_merged.out")))
    # Process output file to generate graphical outputs; `data_dict` centralizes
    # every input/output path used by the visualization helpers below.
    kaiju_data_dir = os.path.abspath(output_dir)
    data_dict = {
        'names_dmp': names_dmp_file,
        'nodes_dmp': nodes_dmp_file,
        'kaiju_output': os.path.join(kaiju_data_dir, "kaiju_merged.out"),
        # Domain, phylum, order, genus composition files
        'domain_comp': os.path.join(kaiju_data_dir, 'top_tax.domain.tsv'),
        'phylum_comp': os.path.join(kaiju_data_dir, 'top_tax.phylum.tsv'),
        'order_comp': os.path.join(kaiju_data_dir, 'top_tax.order.tsv'),
        'genus_comp': os.path.join(kaiju_data_dir, 'top_tax.genus.tsv'),
        # Krona HTML
        'kaiju_tsv_output': os.path.join(kaiju_data_dir, 'kaiju_merged.to_krona.out'),
        'krona_html_file': os.path.join(kaiju_data_dir, 'kaiju_merged.krona.html'),
        # Treeview JSON
        'treeview_json': os.path.join(kaiju_data_dir, 'kaiju_merged.to_treeview.json')
    }
    # Generate all visualization output files. Each step is wrapped individually so
    # one failing visualization does not prevent the others (or DB storage) from running.
    # 1. Krona
    try:
        kaiju_viz.kaiju_to_krona(data_dict['kaiju_output'], data_dict['kaiju_tsv_output'],
                                 data_dict['krona_html_file'], data_dict['names_dmp'],
                                 data_dict['nodes_dmp'])
    except Exception as e:
        print(e)
        sys.stderr.write("[Error] Unable to produce Krona output. \n")
    # 2. Treeview JSON data
    try:
        kaiju_viz.kaiju_to_treeview(data_dict['kaiju_output'], data_dict['treeview_json'],
                                    data_dict['names_dmp'], data_dict['nodes_dmp'])
    except Exception as e:
        print(e)
        sys.stderr.write("[Error] Unable to produce Treeview JSON file. \n")
    # 3. Pie/barcharts summaries data
    try:
        # At superkingdom/domain level
        kaiju_viz.kaiju_to_domain_summary_data(data_dict['kaiju_tsv_output'], data_dict['domain_comp'],
                                               data_dict['names_dmp'], data_dict['nodes_dmp'])
        # At selected taxonomic ranks (iterating over a rank->rank composition file dictionary)
        tax_ranks = {
            'phylum': 'phylum_comp',
            'order': 'order_comp',
            'genus': 'genus_comp'
        }
        for rank, rank_comp in tax_ranks.items():
            kaiju_viz.kaiju_to_tax_summary_data(data_dict['kaiju_output'], data_dict[rank_comp],
                                                data_dict['names_dmp'], data_dict['nodes_dmp'],
                                                rank_limit=rank)
    except Exception as e:
        print(e)
        sys.stderr.write(
            "[Error] Unable to produce domain/taxonomic rank summaries. \n")
    # Now, let's cleanup existing results and store current results in the TRAPID database
    db_connection = common.db_connect(*trapid_db_data)
    clear_db_content(exp_id, db_connection)
    db_connection.close()
    db_connection = common.db_connect(*trapid_db_data)
    kaiju_output_to_db(exp_id, data_dict['kaiju_output'], db_connection)
    db_connection.close()
    # Also update experiment log (status 2 = processing step finished)
    db_connection = common.db_connect(*trapid_db_data)
    common.update_experiment_log(exp_id, 'stop_tax_classification', 'kaiju_mem', 2, db_connection)
    db_connection.close()
    sys.stderr.write('[Message] Finished Kaiju procedure: %s\n' % time.strftime('%Y/%m/%d %H:%M:%S'))
from flask import Flask, render_template from common import db_connect, DATABASE_FILENAME, db_exec db = db_connect(DATABASE_FILENAME) app = Flask(__name__) @app.route("/") def index(): c_books = db_exec( db, """ select b.id, b.title, b.isbn from books b order by b.title; """) c_authors = db_exec( db, """ select ba.book_id, a.id, a.surname, a.given_name from books_authors ba, authors a where ba.author_id = a.id; """) books = {x[0]: {"title": x[1], "isbn": x[2]} for x in c_books} authors = dict() for book_id, author_id, surname, given_name in c_authors: if book_id not in authors.keys(): authors[book_id] = [{ "id": int(author_id),
def run(self):
    """Parse a cluster/pattern input file and submit the patterns to the database.

    Reads `self.infname` row by row, dispatches each row to the parser selected by
    `self.parser_type` (via `self.parser_dict`), filters out clusters smaller than
    `self.min_cluster_size` (except for parser type 5), computes each cluster's
    unknown-gene ratio, and submits it to `self.pattern_table`. Commits only when
    `self.needcommit` is set.

    Original changelog:
    03-18-05 mapping_dict all changed to haiyan_no2gene_no
    04-12-05 use min_cluster_size to cut off some small clusters
    07-03-05 construct graph_modeling's cor_cut_off vector first
    10-14-05 add calculate_unknown_gene_ratio()
    12-06-05 add gene_no2incidence_array to parser_type ==4
    05-31-06 add type 5 (haifeng's output)

    Call graph:
    --db_connect()
    --get_haiyan_no2gene_no()
    --get_known_genes_dict()
    --get_gene_id2gene_no()
    --create_tables()
    --graph_modeling.cor_cut_off_vector_construct()
    (loop over inf)
        --parser_dict[parser_type]() (codense_parser(), copath_parser() )
            --get_combined_cor_vector
            --parse_recurrence
            --parse_connectivity
            --get_vertex_set_gim_array() (parser_type=4 only)
        --calculate_unknown_gene_ratio()
        --db_submit()
    """
    inf = csv.reader(open(self.infname, 'r'), delimiter=self.delimiter)
    (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
    # setup the haiyan_no2gene_no mapping (only needed for parser types 1 and 2)
    if self.mapping_file != None:
        haiyan_no2gene_no = get_haiyan_no2gene_no(self.mapping_file)
    else:
        haiyan_no2gene_no = {}  # a blank dictionary,
    known_gene_no2go_no_set = get_known_genes_dict(curs)  # 10-14-05 used to get unknown_gene_ratio
    if self.parser_type == 4 or self.parser_type==5:  # 12-06-05
        # Parser types 4 and 5 need the gene incidence matrix input file
        if self.gim_inputfname == None:
            sys.stderr.write("\n parser_type = 4 needs gim_inputfname.\n")
            sys.exit(3)
        gene_id2gene_no = get_gene_id2gene_no(curs)
        gene_no2incidence_array = get_gene_no2incidence_array(self.gim_inputfname, gene_id2gene_no)
    else:
        gene_no2incidence_array = None
    # Per-parser-type auxiliary mapping passed as second argument to the parser
    mapping_dict = {1:haiyan_no2gene_no, 2:haiyan_no2gene_no, 3:None, 4:gene_no2incidence_array, 5:gene_no2incidence_array}
    self.create_tables(curs, self.table, self.mcl_table, self.pattern_table)
    no = 0  # counter of submitted patterns
    graph_modeling.cor_cut_off_vector_construct(0, 0.8)  # 07-03-05 compute the cor cutoff vector for graph_modeling, use 0.8 as cutoff
    # graph_modeling.ind_min_cor() requires the cor_cut_off vector to be constructed ahead.
    graph_modeling.set_jk_cut_off(6)  # 07-03-05 haiyan's cutoff is 6, different from my default value, 7.
    for row in inf:
        # A single input row may yield several clusters
        cluster_list = self.parser_dict[self.parser_type](row, mapping_dict[self.parser_type], curs)
        for cluster in cluster_list:
            if self.parser_type!=5 and len(cluster.vertex_set)<self.min_cluster_size:
                # too small, ignore, 2006-08-29 if it's haifeng_output_parser, no restriction for cluster size, haifeng imposes 4
                continue
            # 10-14-05 unknown_gene_ratio to submit to pattern_table
            cluster.unknown_gene_ratio = self.calculate_unknown_gene_ratio(cluster.vertex_set, known_gene_no2go_no_set)
            self.db_submit(curs, cluster, self.pattern_table)
            no+=1
            if self.report and no%1000==0:
                # '\x08' is backspace: rewrite the progress counter in place on stderr
                sys.stderr.write('%s%d'%('\x08'*20, no))
    if self.report:
        sys.stderr.write('%s%d'%('\x08'*20, no))
    if self.needcommit:
        conn.commit()
    sys.stderr.write('\n\tTotal patterns: %d\n'%no)
def main():
    """Main function: run Infernal, filter results and flag RNA genes in TRAPID db.

    Steps: load config -> bail out if no Rfam clans were selected -> run Infernal
    against the experiment's transcripts -> keep best results -> flag RNA genes,
    store similarities/families -> perform Rfam-GO annotation -> update log.
    On any exception inside the main procedure, the experiment is flagged as
    being in error via `common.stop_initial_processing_error()`.
    """
    cmd_args = parse_arguments()
    # Read experiment's initial processing configuration file
    config = common.load_config(cmd_args.ini_file_initial, {"infernal", "trapid_db", "experiment"})
    # The web application sets the Rfam clan string to 'None' in case the user chose no clans
    # If this is the case, exit the script with an information message
    if config["infernal"]["rfam_clans"] == "None":
        sys.stderr.write(
            "[Message] No Rfam clans selected: skip ncRNA annotation step.\n")
        sys.exit()
    # Retrieve experiment data and DB connection parameters *before* the try-block:
    # the except handler references `exp_id` and `trapid_db_data`, which previously
    # could raise NameError if an exception occurred before their assignment.
    exp_id = config["experiment"]["exp_id"]
    tmp_exp_dir = config["experiment"]["tmp_exp_dir"]
    rfam_dir = config["infernal"]["rfam_dir"]
    exp_clans = config["infernal"]["rfam_clans"].split(",")
    # Lists containing all needed parameters for `common.db_connect()` (TRAPID + reference DB)
    trapid_db_data = common.get_db_connection_data(config, 'trapid_db')
    reference_db_data = common.get_db_connection_data(config, 'reference_db')
    try:
        # Run Infernal, parse and export results to DB
        sys.stderr.write(
            '[Message] Starting ncRNA annotation procedure: %s\n'
            % time.strftime('%Y/%m/%d %H:%M:%S'))
        db_connection = common.db_connect(*trapid_db_data)
        common.update_experiment_log(exp_id, 'start_nc_rna_search', 'Infernal', 2, db_connection)
        db_connection.close()
        create_infernal_files(exp_id, tmp_exp_dir, rfam_dir, exp_clans, trapid_db_data)
        # Z-value: total number of megabases searched, needed for E-value computation
        total_m_nts = get_infernal_z_value(exp_id, trapid_db_data)
        infernal_tblout = run_infernal(exp_id, tmp_exp_dir, total_m_nts)
        # Filter Infernal tabulated output (keep best non-overlapping matches)
        infernal_tblout_filtered = keep_best_results(exp_id, tmp_exp_dir, infernal_tblout)
        # Get filtered results as list of dict and add clan information
        # Read Rfam clan information from `clanin` file. Would it make more sense to retrieve it when creating it?
        cm_clans = get_exp_cm_clans(exp_id, tmp_exp_dir)
        filtered_infernal_results = infernal_tblout_to_list(
            infernal_tblout_filtered, cm_clans)
        infernal_results = infernal_tblout_to_list(infernal_tblout, cm_clans)
        # Flag potential rna genes (set `is_rna_gene` value to 1 and `rf_ids` in `transcripts` table)
        flag_rna_genes(exp_id, filtered_infernal_results, trapid_db_data)
        # Store results in `rna_similarities` (NOTE(review): this intentionally uses the
        # unfiltered results, unlike `rna_families` below — confirm this is the desired behavior)
        store_rna_similarities(exp_id, infernal_results, trapid_db_data)
        # ... and `rna_families` (filtered results only)
        store_rna_families(exp_id, filtered_infernal_results, trapid_db_data)
        # Annotate transcripts using GO terms from Rfam
        rfam_go = retrieve_rfam_go_data(trapid_db_data)
        go_data = get_go_data(reference_db_data)
        perform_go_annotation(exp_id, filtered_infernal_results, rfam_go, go_data, tmp_exp_dir)
        # That's it for now
        db_connection = common.db_connect(*trapid_db_data)
        common.update_experiment_log(exp_id, 'stop_nc_rna_search', 'Infernal', 2, db_connection)
        db_connection.close()
        sys.stderr.write(
            '[Message] Finished ncRNA annotation procedure: %s\n'
            % time.strftime('%Y/%m/%d %H:%M:%S'))
    # If any exception was raised, update the experiment's log, set status to 'error', and exit
    except Exception:
        print_exc()
        common.stop_initial_processing_error(exp_id, trapid_db_data)