def main(unparsed_args_list): """Run main get_gb_records pipeline.""" # Parse command line arguments args = parse_args(unparsed_args_list) date = time.strftime("%Y%m%d") args.output_folder = basic.set_path(args.output_folder, kind="dir", expect=True) working_dir = pathlib.Path(f"{date}_get_gb_records") working_path = basic.make_new_dir(args.output_folder, working_dir, attempt=10) if working_path is None: print(f"Invalid working directory '{working_dir}'") sys.exit(1) ncbi_cred_dict = ncbi.get_ncbi_creds(args.ncbi_credentials_file) # Verify database connection and schema compatibility. print("Connecting to the MySQL database...") engine = mysqldb.connect_to_db(args.database) mysqldb.check_schema_compatibility(engine, "the get_gb_records pipeline") # Create data sets print("Retrieving accessions from the database...") accessions = mysqldb.get_distinct_data(engine, "phage", "Accession") engine.dispose() if "" in accessions: accessions.remove("") if None in accessions: accessions.remove(None) get_genbank_data(working_path, accessions, ncbi_cred_dict)
def test_connect_to_db_2(self, get_engine_mock, sys_exit_mock):
    """Verify that sys.exit is called when engine is None."""
    get_engine_mock.return_value = (None, "")
    engine = mysqldb.connect_to_db(db2)
    with self.subTest():
        self.assertTrue(get_engine_mock.called)
    with self.subTest():
        self.assertTrue(sys_exit_mock.called)
def test_connect_to_db_1(self, getpass_mock):
    """Verify that an engine is returned when valid info is provided."""
    getpass_mock.side_effect = [user, pwd]
    engine = mysqldb.connect_to_db(db2)
    with self.subTest():
        self.assertTrue(getpass_mock.called)
    with self.subTest():
        self.assertIsNotNone(engine)
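
# In the original test module, the two tests above carry unittest.mock.patch
# decorators that supply the mock arguments; they are not shown in this
# excerpt. A sketch of the likely setup (the patch targets are assumptions
# inferred from the parameter names; mock arguments are passed bottom-up, so
# the decorator closest to the function fills the first argument after self):
#
#   from unittest.mock import patch
#
#   @patch("sys.exit")
#   @patch("pdm_utils.functions.mysqldb.get_engine")
#   def test_connect_to_db_2(self, get_engine_mock, sys_exit_mock):
#       ...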
def main(unparsed_args_list, engine1=None):
    """Run main conversion pipeline."""
    # Parse command line arguments
    args = parse_args(unparsed_args_list)
    if engine1 is None:
        engine1 = mysqldb.connect_to_db(args.database)
    target = args.schema_version
    actual = mysqldb.get_schema_version(engine1)
    steps, direction = get_conversion_direction(actual, target)

    # Iterate through list of versions and implement SQL files.
    if direction == "none":
        if args.verbose:
            print("No schema conversion is needed.")
        convert = False
    else:
        convert = True

    if convert:
        if (args.new_database_name is not None
                and args.new_database_name != args.database):
            result = mysqldb.drop_create_db(engine1, args.new_database_name)
            if result == 0:
                result = mysqldb.copy_db(engine1, args.new_database_name)
                if result == 0:
                    # Create a new connection to the new database.
                    engine2, msg = mysqldb.get_engine(
                        database=args.new_database_name,
                        username=engine1.url.username,
                        password=engine1.url.password,
                        echo=False)
                else:
                    print("Error: Unable to copy the database for conversion.")
                    convert = False
            else:
                print("Error: Unable to create the new database for conversion.")
                convert = False
        else:
            engine2 = engine1

    if convert:
        stop_step, summary = convert_schema(engine2, actual, direction, steps,
                                            verbose=args.verbose)
        engine2.dispose()
        if stop_step == target:
            if args.verbose:
                print("\n\nThe database schema conversion was successful.")
        else:
            print("\n\nError: "
                  "The database schema conversion was not successful. "
                  f"Unable to proceed past schema version {stop_step}.")
        if args.verbose:
            print_summary(summary)
    engine1.dispose()
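
# Sketch of the get_conversion_direction() contract assumed above (inferred
# from usage, not the function's actual code): it compares the actual and
# target schema versions and returns the intermediate version steps plus a
# direction string, where "none" means the versions already match, e.g.:
#
#   steps, direction = get_conversion_direction(actual=5, target=5)
#   # -> ([], "none")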
def main(unparsed_args): """Runs the complete update pipeline.""" args = parse_args(unparsed_args) # Verify database connection and schema compatibility. print("Connecting to the MySQL database...") engine = mysqldb.connect_to_db(args.database) mysqldb.check_schema_compatibility(engine, "the update pipeline") if args.version == True: mysqldb.change_version(engine) print("Database version updated.") if args.ticket_table is not None: update_table_path = basic.set_path(args.ticket_table, kind="file", expect=True) # Iterate through the tickets and process them sequentially. list_of_update_tickets = [] with update_table_path.open(mode='r') as f: file_reader = csv.DictReader(f) for dict in file_reader: list_of_update_tickets.append(dict) # Variables to be used for end summary processed = 0 succeeded = 0 failed = 0 for dict in list_of_update_tickets: conn = engine.connect() # Pass the raw db_api connection handler = RandomFieldUpdateHandler(conn.connection) handler.table = dict["table"] # Which table will be updated? handler.field = dict["field"] # Which field will be updated? handler.value = dict[ "value"] # What value will be put in that field? handler.key_name = dict[ "key_name"] # How will we know which row is the right one? handler.key_value = dict[ "key_value"] # How will we know which row is the right one? handler.validate_ticket( ) # Make sure all handler attributes are valid status = handler.execute_ticket() # Do what was requested if status == 1: processed += 1 succeeded += 1 else: processed += 1 failed += 1 engine.dispose() print("\nDone iterating through tickets.") if succeeded > 0: print(f"{succeeded} / {processed} tickets successfully handled.") if failed > 0: print(f"{failed} / {processed} tickets failed to be handled.")
def main(unparsed_args_list): """Run main freeze database pipeline.""" args = parse_args(unparsed_args_list) # engine1, msg = mysqldb.get_engine(database=args.database, echo=False) # Verify database connection and schema compatibility. print("Connecting to the MySQL database...") engine1 = mysqldb.connect_to_db(args.database) mysqldb.check_schema_compatibility(engine1, "the freeze pipeline") # Get the number of draft genomes. query = "SELECT count(*) as count FROM phage WHERE Status != 'draft'" result = engine1.execute(query).fetchall() phage_count = result[0]["count"] # Create the new frozen database folder e.g. Actinobacteriophage_XYZ. prefix = get_prefix() new_database = f"{prefix}_{phage_count}" # Create the new database result = mysqldb.drop_create_db(engine1, new_database) # Copy database. if result == 0: result = mysqldb.copy_db(engine1, new_database) if result == 0: print(f"Deleting 'draft' genomes...") engine2, msg = mysqldb.get_engine(username=engine1.url.username, password=engine1.url.password, database=new_database) statement3 = "DELETE FROM phage WHERE Status = 'draft'" engine2.execute(statement3) statement4 = "UPDATE version SET Version = 0" engine2.execute(statement4) # Close up all connections in the connection pool. engine2.dispose() else: print("Unable to copy the database.") # Close up all connections in the connection pool. engine1.dispose() else: print(f"Error creating new database: {new_database}.") print("Freeze database script completed.")
def main(argument_list): """ :param argument_list: :return: """ # Setup argument parser cdd_parser = setup_argparser() # Use argument parser to parse argument_list args = cdd_parser.parse_args(argument_list) # Store arguments in more easily accessible variables database = args.db cdd_dir = expand_path(args.dir) cdd_name = learn_cdd_name(cdd_dir) threads = args.threads evalue = args.evalue rpsblast = args.rpsblast tmp_dir = args.tmp_dir output_folder = args.output_folder log_file = args.log_file reset = args.reset # Set up directory. output_folder = basic.set_path(output_folder, kind="dir", expect=True) results_folder = pathlib.Path(RESULTS_FOLDER) results_path = basic.make_new_dir(output_folder, results_folder, attempt=10) if results_path is None: print("Unable to create output_folder.") sys.exit(1) log_file = pathlib.Path(results_path, log_file) # Set up root logger. logging.basicConfig( filename=log_file, filemode="w", level=logging.DEBUG, format="pdm_utils find_domains: %(levelname)s: %(message)s") logger.info(f"pdm_utils version: {VERSION}") logger.info(f"CDD run date: {constants.CURRENT_DATE}") logger.info(f"Command line arguments: {' '.join(argument_list)}") logger.info(f"Results directory: {results_path}") # Early exit if either 1) cdd_name == "" or 2) no rpsblast given and we are # unable to find one if cdd_name == "": msg = (f"Unable to learn CDD database name. Make sure the files in " f"{cdd_dir} all have the same basename.") logger.error(msg) print(msg) return # Get the rpsblast command and path. if rpsblast == "": command = get_rpsblast_command() rpsblast = get_rpsblast_path(command) # Verify database connection and schema compatibility. engine = mysqldb.connect_to_db(database) logger.info(f"Connected to database: {database}.") mysqldb.check_schema_compatibility(engine, "the find_domains pipeline") logger.info(f"Schema version is compatible.") logger.info("Command line arguments verified.") if reset: logger.info("Clearing all domain data currently in the database.") clear_domain_data(engine) # Get gene data that needs to be processed # in dict format where key = column name, value = stored value. # result = engine.execute(GET_GENES_FOR_CDD) cdd_genes = mysqldb.query_dict_list(engine, GET_GENES_FOR_CDD) msg = f"{len(cdd_genes)} genes to search for conserved domains..." logger.info(msg) print(msg) # Only run the pipeline if there are genes returned that need it if len(cdd_genes) > 0: log_gene_ids(cdd_genes) # Create temp_dir make_tempdir(tmp_dir) # TODO dev # translations = get_unique_translations(cdd_genes) # Build jobs list jobs = [] # TODO dev # translation_id = 0 # for translation in translations: # translation_id += 1 # jobs.append((rpsblast, cdd_name, tmp_dir, evalue, # translation_id, translation)) for cdd_gene in cdd_genes: jobs.append((rpsblast, cdd_name, tmp_dir, evalue, cdd_gene["GeneID"], cdd_gene["Translation"])) results = parallelize(jobs, threads, search_and_process) print("\n") # TODO dev # results_dict = create_results_dict(results) # map_results_to_genes(cdd_genes, results_dict) insert_domain_data(engine, results) engine.dispose() return
def main(argument_list):
    """Run the main phameration pipeline."""
    # Set up the argument parser.
    phamerate_parser = setup_argparser()

    # Parse arguments.
    args = phamerate_parser.parse_args(argument_list)
    program = args.program
    temp_dir = args.temp_dir

    # Initialize SQLAlchemy engine with database provided at CLI.
    engine = mysqldb.connect_to_db(args.db)

    # If we made it past the above connection check, database access works
    # (the user at least has SELECT privileges on the indicated database).
    # We'll assume they also have UPDATE, INSERT, and TRUNCATE privileges.

    # Record start time.
    start = datetime.datetime.now()

    # Refresh temp_dir.
    if os.path.exists(temp_dir):
        try:
            shutil.rmtree(temp_dir)
        except OSError:
            print(f"Failed to delete existing temp directory '{temp_dir}'")
            return
    try:
        os.makedirs(temp_dir)
    except OSError:
        print(f"Failed to create new temp directory '{temp_dir}'")
        return

    # Get old pham data and un-phamerated genes.
    old_phams = get_pham_geneids(engine)
    old_colors = get_pham_colors(engine)
    unphamerated = get_new_geneids(engine)

    # Get GeneIDs & translations, and translation groups.
    genes_and_trans = map_geneids_to_translations(engine)
    translation_groups = map_translations_to_geneids(engine)

    # Write input fasta file.
    write_fasta(translation_groups, temp_dir)

    # Create clusterdb and perform clustering.
    program_params = get_program_params(program, args)
    create_clusterdb(program, temp_dir)
    phamerate(program_params, program, temp_dir)

    # Parse phameration output.
    new_phams = parse_output(program, temp_dir)
    new_phams = reintroduce_duplicates(new_phams, translation_groups,
                                       genes_and_trans)

    # Preserve old pham names and colors.
    new_phams, new_colors = preserve_phams(old_phams, new_phams, old_colors,
                                           unphamerated)

    # Early exit if we don't have new phams or new colors - this avoids
    # overwriting the existing pham data with potentially incomplete new data.
    if len(new_phams) == 0 or len(new_colors) == 0:
        print("Failed to parse new pham/color data... Terminating pipeline")
        return

    # If we got past the early exit, we are probably safe to truncate the
    # pham table and insert the new pham data.
    # Clear old pham data - auto-commits at the end of the transaction - this
    # will also set all PhamID values in the gene table to NULL.
    commands = ["DELETE FROM pham"]
    mysqldb.execute_transaction(engine, commands)

    # Insert new pham/color data.
    reinsert_pham_data(new_phams, new_colors, engine)

    # Fix miscolored phams/orphams.
    fix_miscolored_phams(engine)

    # Close all connections in the connection pool.
    engine.dispose()

    # Record stop time and report elapsed time.
    stop = datetime.datetime.now()
    print("Elapsed time: {}".format(str(stop - start)))
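
# Hypothetical invocation of the pipeline above. The flag names and the
# "mmseqs" program choice are assumptions; the authoritative CLI layout is
# defined in setup_argparser(), which is not shown here:
#
#   main(["Actinobacteriophage", "--program", "mmseqs",
#         "--temp_dir", "/tmp/phamerate"])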
def main(unparsed_args_list): """Run main retrieve_updates pipeline.""" # Parse command line arguments args = parse_args(unparsed_args_list) date = time.strftime("%Y%m%d") args.output_folder = basic.set_path(args.output_folder, kind="dir", expect=True) working_dir = pathlib.Path(f"{date}_get_data") working_path = basic.make_new_dir(args.output_folder, working_dir, attempt=10) if working_path is None: print(f"Invalid working directory '{working_dir}'") sys.exit(1) ncbi_cred_dict = ncbi.get_ncbi_creds(args.ncbi_credentials_file) # Verify database connection and schema compatibility. print("Preparing genome data sets from the MySQL database...") engine = mysqldb.connect_to_db(args.database) mysqldb.check_schema_compatibility(engine, "the get_data pipeline") # Get existing data from MySQL to determine what needs to be updated. query = ("SELECT PhageID, Name, HostGenus, Status, Cluster, " "DateLastModified, Accession, RetrieveRecord, Subcluster, " "AnnotationAuthor FROM phage") mysqldb_genome_list = mysqldb.parse_genome_data(engine=engine, phage_query=query, gnm_type="mysqldb") engine.dispose() mysqldb_genome_dict = {} for gnm in mysqldb_genome_list: mysqldb_genome_dict[gnm.id] = gnm # Get data from PhagesDB if (args.updates or args.final or args.draft) is True: print("Retrieving data from PhagesDB...") phagesdb_phages = phagesdb.get_phagesdb_data(constants.API_SEQUENCED) phagesdb_phages_dict = basic.convert_list_to_dict( phagesdb_phages, "phage_name") phagesdb_genome_dict = phagesdb.parse_genomes_dict( phagesdb_phages_dict, gnm_type="phagesdb", seq=False) # Exit if all phage data wasn't retrieved. if len(phagesdb_genome_dict) == 0: sys.exit(1) # Returns a list of tuples. match_output = match_genomes(mysqldb_genome_dict, phagesdb_genome_dict) matched_genomes = match_output[0] unmatched_phagesdb_ids = match_output[1] if args.updates is True: get_update_data(working_path, matched_genomes) if args.final is True: get_final_data(working_path, matched_genomes) if args.genbank is True: get_genbank_data(working_path, mysqldb_genome_dict, ncbi_cred_dict, args.genbank_results) if args.draft is True: get_draft_data(working_path, unmatched_phagesdb_ids) print("\n\n\nRetrieve updates script completed.")