def fix_miscolored_phams(engine): print("Phixing Phalsely Hued Phams...") # Phams which are colored as though they are orphams, when really # they are multi-member phams query = "SELECT * FROM (SELECT g.PhamID, COUNT(GeneID) AS count, "\ "p.Color FROM gene AS g INNER JOIN pham AS p ON g.PhamID " \ "= p.PhamID GROUP BY PhamID) AS c WHERE Color = '#FFFFFF' "\ "AND count > 1" results = mysqldb_basic.query_dict_list(engine, query) print(f"Found {len(results)} miscolored phams to fix") commands = [] for dictionary in results: pham_id = dictionary["PhamID"] count = dictionary["count"] color = dictionary["Color"] h = s = v = 0 while h <= 0: h = random.random() while s < 0.5: s = random.random() while v < 0.8: v = random.random() rgb = colorsys.hsv_to_rgb(h, s, v) rgb = (rgb[0] * 255, rgb[1] * 255, rgb[2] * 255) hexrgb = "#{:02x}{:02x}{:02x}".format(int(rgb[0]), int(rgb[1]), int(rgb[2])) new_color = hexrgb commands.append(f"UPDATE pham SET Color = '{new_color}' WHERE PhamID = '{pham_id}'") mysqldb.execute_transaction(engine, commands) print("Phixing Phalsely Phlagged Orphams...") # Phams which are colored as though they are multi-member phams # when really they are orphams query = "SELECT * FROM (SELECT g.PhamID, COUNT(GeneID) AS count, "\ "p.Color FROM gene AS g INNER JOIN pham AS p ON g.PhamID "\ "=p.PhamID GROUP BY PhamID) AS c WHERE Color != '#FFFFFF' "\ "AND count = 1" results = mysqldb_basic.query_dict_list(engine, query) print(f"Found {len(results)} miscolored orphams to fix...") commands = [] for dictionary in results: pham_id = dictionary["PhamID"] count = dictionary["count"] color = dictionary["Color"] new_color = "#FFFFFF" commands.append(f"UPDATE pham SET Color = '{new_color}' WHERE PhamID = '{pham_id}'") mysqldb.execute_transaction(engine, commands)
def fix_white_phams(engine): """ Find any phams with 2+ members which are colored as though they are orphams (#FFFFFF in pham.Color). :param engine: sqlalchemy Engine allowing access to the database :return: """ query = "SELECT c.PhamID FROM (SELECT g.PhamID, COUNT(GeneID) AS count, "\ "p.Color FROM gene AS g INNER JOIN pham AS p ON g.PhamID " \ "= p.PhamID GROUP BY PhamID) AS c WHERE Color = '#FFFFFF' "\ "AND count > 1" results = mysqldb_basic.query_dict_list(engine, query) print(f"Found {len(results)} white phams...") commands = [] for dictionary in results: pham_id = dictionary["PhamID"] h = s = v = 0 while h <= 0: h = random.random() while s < 0.5: s = random.random() while v < 0.8: v = random.random() rgb = colorsys.hsv_to_rgb(h, s, v) rgb = (rgb[0] * 255, rgb[1] * 255, rgb[2] * 255) hexrgb = "#{:02x}{:02x}{:02x}".format(int(rgb[0]), int(rgb[1]), int(rgb[2])) new_color = hexrgb.upper() commands.append(f"UPDATE pham SET Color = '{new_color}' WHERE " f"PhamID = '{pham_id}'") mysqldb.execute_transaction(engine, commands)
def fix_colored_orphams(engine): """ Find any single-member phams which are colored as though they are multi-member phams (not #FFFFFF in pham.Color). :param engine: sqlalchemy Engine allowing access to the database :return: """ query = "SELECT * FROM (SELECT g.PhamID, COUNT(GeneID) AS count, " \ "p.Color FROM gene AS g INNER JOIN pham AS p ON g.PhamID " \ "=p.PhamID GROUP BY PhamID) AS c WHERE Color != '#FFFFFF' " \ "AND count = 1" results = mysqldb_basic.query_dict_list(engine, query) print(f"Found {len(results)} non-white orphams...") commands = [] for dictionary in results: pham_id = dictionary["PhamID"] count = dictionary["count"] color = dictionary["Color"] new_color = "#FFFFFF" commands.append(f"UPDATE pham SET Color = '{new_color}' WHERE " f"PhamID = '{pham_id}'") mysqldb.execute_transaction(engine, commands)
def get_pham_genes(engine, phamid): """ Queries the database for the geneids and translations found in the indicated pham. Returns a dictionary mapping the pham's geneids to their associated translations. All geneid:translation pairs will be represented (i.e. no redundant gene filtering is done...). :param engine: the Engine allowing access to the database :type engine: sqlalchemy Engine :param phamid: the pham whose genes are to be returned :type phamid: str :return: pham_genes :rtype: dict """ pham_genes = dict() # Query will return the pham's GeneIDs and Translations grouped by genes # that share the same sequence query = f"SELECT GeneID, Translation FROM gene WHERE PhamID = {phamid} " \ f"ORDER BY Translation, GeneID ASC" query_results = mysqldb_basic.query_dict_list(engine, query) for dictionary in query_results: geneid = dictionary["GeneID"] translation = dictionary["Translation"].decode("utf-8") pham_genes[geneid] = translation return pham_genes
def test_build_select_3(self): """Verify build_select() appends ORDER BY clauses to executable. """ select_query = querying.build_select(self.graph, self.PhageID, order_by=self.PhageID) dict_list = query_dict_list(self.engine, select_query) phage_ids = [] dict_list = query_dict_list(self.engine, select_query) for dict in dict_list: phage_ids.append(dict["PhageID"]) self.assertEqual("Alice", phage_ids[0]) self.assertTrue("Myrna" in phage_ids) self.assertTrue("D29" in phage_ids) self.assertTrue("Trixie" in phage_ids)
def main(unparsed_args_list): """Run main get_gb_records pipeline.""" # Parse command line arguments args = parse_args(unparsed_args_list) # Filters input: phage.Status=draft AND phage.HostGenus=Mycobacterium # Args structure: [['phage.Status=draft'], ['phage.HostGenus=Mycobacterium']] filters = args.filters ncbi_cred_dict = ncbi.get_ncbi_creds(args.ncbi_credentials_file) output_folder = basic.set_path(args.output_folder, kind="dir", expect=True) working_dir = pathlib.Path(RESULTS_FOLDER) working_path = basic.make_new_dir(output_folder, working_dir, attempt=50) if working_path is None: print(f"Invalid working directory '{working_dir}'") sys.exit(1) # Verify database connection and schema compatibility. print("Connecting to the MySQL database...") alchemist = AlchemyHandler(database=args.database) alchemist.connect(pipeline=True) engine = alchemist.engine mysqldb.check_schema_compatibility(engine, "the get_gb_records pipeline") # Get SQLAlchemy metadata Table object # table_obj.primary_key.columns is a # SQLAlchemy ColumnCollection iterable object # Set primary key = 'phage.PhageID' alchemist.build_metadata() table = querying.get_table(alchemist.metadata, TARGET_TABLE) for column in table.primary_key.columns: primary_key = column # Create filter object and then add command line filter strings db_filter = Filter(alchemist=alchemist, key=primary_key) db_filter.values = [] # Attempt to add filters and exit if needed. add_filters(db_filter, filters) # Performs the query db_filter.update() # db_filter.values now contains list of PhageIDs that pass the filters. # Get the accessions associated with these PhageIDs. keep_set = set(db_filter.values) # Create data sets print("Retrieving accessions from the database...") query = construct_accession_query(keep_set) list_of_dicts = mysqldb_basic.query_dict_list(engine, query) id_acc_dict = get_id_acc_dict(list_of_dicts) acc_id_dict = get_acc_id_dict(id_acc_dict) engine.dispose() if len(acc_id_dict.keys()) > 0: get_data(working_path, acc_id_dict, ncbi_cred_dict) else: print("There are no records to retrieve.")
def test_build_count_1(self): """Verify build_count() creates valid SQLAlchemy executable. """ count_query = querying.build_count(self.graph, self.PhageID) dict_list = query_dict_list(self.engine, count_query) count_dict = dict_list[0] self.assertTrue(isinstance(count_dict["count_1"], int))
def test_build_distinct_2(self): """Verify build_distinct() cannot variable aggregated columns. MySQL does not accept DISTINCT queries with aggregated and non-aggregated columns. """ distinct_query = querying.build_distinct( self.graph, [self.PhageID, func.count(self.Cluster)]) with self.assertRaises(InternalError): dict_list = query_dict_list(self.engine, distinct_query)
def test_build_select_4(self): """Verify build_select() handles many-to-one relations as expected. build_select() queries should duplicate 'one' when filtering 'many' """ where_clause = (self.Subcluster == "A2") select_query = querying.build_select(self.graph, self.Cluster, where=where_clause) dict_list = query_dict_list(self.engine, select_query) self.assertTrue(len(dict_list) > 1)
def test_build_count_2(self): """Verify build_count() appends WHERE clauses to executable. """ where_clause = (self.PhageID == "Trixie") count_query = querying.build_count(self.graph, self.PhageID, where=where_clause) dict_list = query_dict_list(self.engine, count_query) count_dict = dict_list[0] self.assertEqual(count_dict["count_1"], 1)
def test_5_map_geneids_to_translations(self): """Verify we get back a dictionary""" gs_to_ts = map_geneids_to_translations(self.engine) command = "SELECT distinct(GeneID) FROM gene" results = mysqldb_basic.query_dict_list(self.engine, command) # gs_to_ts should be a dictionary with self.subTest(): self.assertEqual(type(gs_to_ts), type(dict())) # gs_to_ts should have the right number of geneids with self.subTest(): self.assertEqual(len(gs_to_ts), len(results))
def test_build_count_3(self): """Verify build_count() recognizes multiple inputs as expected. """ where_clause = (self.Cluster == "A") count_query = querying.build_count( self.graph, [self.PhageID, self.Cluster.distinct()], where=where_clause) dict_list = query_dict_list(self.engine, count_query) count_dict = dict_list[0] self.assertTrue(count_dict["count_1"] > 1) self.assertEqual(count_dict["count_2"], 1)
def test_6_map_translations_to_geneids(self): """Verify we get back a dictionary""" ts_to_gs = map_translations_to_geneids(self.engine) command = "SELECT distinct(CONVERT(Translation USING utf8)) FROM gene" results = mysqldb_basic.query_dict_list(self.engine, command) # ts_to_gs should be a dictionary with self.subTest(): self.assertEqual(type(ts_to_gs), type(dict())) # ts_to_gs should have the right number of translations with self.subTest(): self.assertEqual(len(ts_to_gs), len(results))
def test_build_select_1(self): """Verify build_select() creates valid SQLAlchemy executable. """ select_query = querying.build_select(self.graph, self.PhageID) phage_ids = [] dict_list = query_dict_list(self.engine, select_query) for dict in dict_list: phage_ids.append(dict["PhageID"]) self.assertTrue("Myrna" in phage_ids) self.assertTrue("D29" in phage_ids) self.assertTrue("Trixie" in phage_ids)
def test_build_distinct_1(self): """Verify build_distinct() handles many-to-one relations as expected. build_distinct() should not duplicate 'one' when handling 'many.' """ where_clause = (self.Subcluster == "A2") distinct_query = querying.build_distinct(self.graph, self.Cluster, where=where_clause) dict_list = query_dict_list(self.engine, distinct_query) self.assertEqual(len(dict_list), 1) distinct_dict = dict_list[0] self.assertEqual(distinct_dict["Cluster"], "A")
def test_build_select_2(self): """Verify build_select() appends WHERE clauses to executable. """ where_clause = (self.Cluster == "A") select_query = querying.build_select(self.graph, self.PhageID, where=where_clause) phage_ids = [] dict_list = query_dict_list(self.engine, select_query) for dict in dict_list: phage_ids.append(dict["PhageID"]) self.assertTrue("Trixie" in phage_ids) self.assertTrue("D29" in phage_ids) self.assertFalse("Myrna" in phage_ids)
def get_pham_colors(engine): """ Queries the database for the colors of existing phams :param engine: the Engine allowing access to the database :return: pham_colors """ pham_colors = dict() color_query = "SELECT PhamID, Color FROM pham" color_results = mysqldb_basic.query_dict_list(engine, color_query) for dictionary in color_results: pham_id = dictionary["PhamID"] color = dictionary["Color"] pham_colors[pham_id] = color.upper() return pham_colors
def get_new_geneids(engine): """ Queries the database for those genes that are not yet phamerated. :param engine: the Engine allowing access to the database :return: new_geneids """ new_geneids = list() gene_query = "SELECT GeneID FROM gene WHERE PhamID IS NULL" gene_results = mysqldb_basic.query_dict_list(engine, gene_query) # At scale, much cheaper to convert list to set than to build the # set one gene at a time for dictionary in gene_results: geneid = dictionary["GeneID"] new_geneids.append(geneid) return set(new_geneids)
def get_geneids_and_translations(engine): """ Constructs a dictionary mapping all geneids to their translations. :param engine: the Engine allowing access to the database :return: gs_to_ts """ gs_to_ts = dict() query = ("SELECT GeneID, CONVERT(Translation USING utf8) as Translation " "FROM gene") results = mysqldb_basic.query_dict_list(engine, query) for dictionary in results: geneid = dictionary["GeneID"] translation = dictionary["Translation"] gs_to_ts[geneid] = translation return gs_to_ts
def get_new_geneids(engine): """ Queries the database for those genes that are not yet phamerated. :param engine: the Engine allowing access to the database :return: new_geneids """ new_geneids = set() gene_query = "SELECT GeneID FROM gene WHERE PhamID IS NULL" gene_results = mysqldb_basic.query_dict_list(engine, gene_query) for dictionary in gene_results: geneid = dictionary["GeneID"] new_geneids = new_geneids | {geneid} print(f"Found {len(new_geneids)} genes not in phams...") return new_geneids
def get_pham_geneids(engine): """ Queries the database for those genes that are already phamerated. :param engine: the Engine allowing access to the database :return: pham_geneids """ pham_geneids = dict() geneid_query = "SELECT GeneID, PhamID FROM gene WHERE PhamID IS NOT NULL" geneid_results = mysqldb_basic.query_dict_list(engine, geneid_query) for dictionary in geneid_results: pham_id = dictionary["PhamID"] geneid = dictionary["GeneID"] if pham_id in pham_geneids.keys(): pham_geneids[pham_id] = pham_geneids[pham_id] | {geneid} else: pham_geneids[pham_id] = {geneid} return pham_geneids
def get_translation_groups(engine): """ Constructs a dictionary mapping all unique translations to their groups of geneids that share them :param engine: the Engine allowing access to the database :return: ts_to_gs """ ts_to_gs = dict() query = ("SELECT GeneID, CONVERT(Translation USING utf8) as Translation " "FROM gene") results = mysqldb_basic.query_dict_list(engine, query) for dictionary in results: geneid = dictionary["GeneID"] trans = dictionary["Translation"] geneids = ts_to_gs.get(trans, []) geneids.append(geneid) ts_to_gs[trans] = geneids return ts_to_gs
def main(argument_list): """ :param argument_list: :return: """ # Setup argument parser cdd_parser = setup_argparser() # Use argument parser to parse argument_list args = cdd_parser.parse_args(argument_list) # Store arguments in more easily accessible variables database = args.database cdd_dir = expand_path(args.cdd) cdd_name = learn_cdd_name(cdd_dir) threads = args.threads evalue = args.evalue rpsblast = args.rpsblast tmp_dir = args.tmp_dir output_folder = args.output_folder reset = args.reset batch_size = args.batch_size # Set up directory. output_folder = basic.set_path(output_folder, kind="dir", expect=True) results_folder = pathlib.Path(RESULTS_FOLDER) results_path = basic.make_new_dir(output_folder, results_folder, attempt=50) if results_path is None: print("Unable to create output_folder.") sys.exit(1) log_file = pathlib.Path(results_path, MAIN_LOG_FILE) # Set up root logger. logging.basicConfig(filename=log_file, filemode="w", level=logging.DEBUG, format="pdm_utils find_domains: %(levelname)s: %(message)s") logger.info(f"pdm_utils version: {VERSION}") logger.info(f"CDD run date: {constants.CURRENT_DATE}") logger.info(f"Command line arguments: {' '.join(argument_list)}") logger.info(f"Results directory: {results_path}") # Early exit if either 1) cdd_name == "" or 2) no rpsblast given and we are # unable to find one if cdd_name == "": msg = (f"Unable to learn CDD database name. Make sure the files in " f"{cdd_dir} all have the same basename.") logger.error(msg) print(msg) return # Get the rpsblast command and path. if rpsblast == "": command = get_rpsblast_command() rpsblast = get_rpsblast_path(command) # Verify database connection and schema compatibility. alchemist = AlchemyHandler(database=database) alchemist.connect(pipeline=True) engine = alchemist.engine logger.info(f"Connected to database: {database}.") mysqldb.check_schema_compatibility(engine, "the find_domains pipeline") logger.info(f"Schema version is compatible.") logger.info("Command line arguments verified.") if reset: logger.info("Clearing all domain data currently in the database.") clear_domain_data(engine) # Get gene data that needs to be processed # in dict format where key = column name, value = stored value. cdd_genes = mysqldb_basic.query_dict_list(engine, GET_GENES_FOR_CDD) msg = f"{len(cdd_genes)} genes to search for conserved domains..." logger.info(msg) print(msg) # Only run the pipeline if there are genes returned that need it if len(cdd_genes) > 0: log_gene_ids(cdd_genes) make_tempdir(tmp_dir) # Identify unique translations to process mapped to GeneIDs. cds_trans_dict = create_cds_translation_dict(cdd_genes) unique_trans = list(cds_trans_dict.keys()) msg = (f"{len(unique_trans)} unique translations " "to search for conserved domains...") logger.info(msg) print(msg) # Process translations in batches. Otherwise, searching could take # so long that MySQL connection closes resulting in 1 or more # transaction errors. batch_indices = basic.create_indices(unique_trans, batch_size) total_rolled_back = 0 for indices in batch_indices: start = indices[0] stop = indices[1] msg = f"Processing translations {start + 1} to {stop}..." logger.info(msg) print(msg) sublist = unique_trans[start:stop] batch_rolled_back = search_translations( rpsblast, cdd_name, tmp_dir, evalue, threads, engine, sublist, cds_trans_dict) total_rolled_back += batch_rolled_back search_summary(total_rolled_back) engine.dispose() return