def get_files(directory, file, ignore):
    """Get the list of file(s) that need to be uploaded.

    :param directory: (optional) directory containing files for upload
    :type directory: pathlib.Path
    :param file: (optional) file to upload
    :type file: pathlib.Path
    :param ignore: file(s) to ignore during upload process
    :type ignore: set
    :return: file_list
    """
    file_list = []
    if directory is not None:
        directory = basic.set_path(directory, kind="dir", expect=True)
        folder_files = basic.identify_contents(directory, kind="file",
                                               ignore_set=ignore)
        file_list.extend(folder_files)
    if file is not None:
        file = basic.set_path(file, kind="file", expect=True)
        file_list.append(file)
    return file_list

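# Hedged usage sketch for get_files: build the upload list from a staging
# folder while skipping macOS metadata files. The staging path is
# hypothetical, for illustration only; the ignore value mirrors the
# set([".DS_Store"]) used by the push_db pipeline below.
def _example_get_files():
    import pathlib
    staging = pathlib.Path("/tmp/staging")  # hypothetical directory
    return get_files(staging, None, ignore={".DS_Store"})
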
def main(unparsed_args_list):
    """Run the get_db pipeline.

    The database data can be retrieved from three places:
    The server, which needs to be downloaded to a new folder.
    A local file, in which no download and no new folder are needed.
    The empty schema stored within pdm_utils, in which no download,
    new folder, or local file are needed.
    """
    args = parse_args(unparsed_args_list)

    # Set all values to get rid of args object and to set additional values.
    database = args.database
    option = args.option
    install = True
    schema_version = None
    db_filepath = None

    if option == "file":
        db_filepath = basic.set_path(args.filename, kind="file", expect=True)
    elif option == "new":
        schema_version = args.schema_version
    else:
        # option must be "server"
        output_folder = basic.set_path(args.output_folder, kind="dir",
                                       expect=True)
        download = True
        remove = True
        results_folder = pathlib.Path(RESULTS_FOLDER)
        results_path = basic.make_new_dir(output_folder, results_folder,
                                          attempt=50)
        if args.download_only:
            install = False
            remove = False

        if results_path is None:
            print("Unable to create results folder.")
            sys.exit(1)
        else:
            version_filepath, status1 = prepare_download(
                results_path, constants.DB_WEBSITE, args.database, "version")
            db_filepath, status2 = prepare_download(
                results_path, constants.DB_WEBSITE, args.database, "sql")
            if not status1 or not status2:
                print("Unable to download data from server.")
                sys.exit(1)

    # If downloading from server, user may have selected to not
    # install the database file.
    if install:
        install_db(database, db_filepath=db_filepath,
                   schema_version=schema_version)

    # The output folder was only created for downloading from server.
    if option == "server" and remove:
        print("Removing downloaded data.")
        shutil.rmtree(results_path)

def main(unparsed_args_list):
    """Run the push_db pipeline."""
    args = parse_args(unparsed_args_list)

    file_list = []
    if args.directory is not None:
        args.directory = basic.set_path(args.directory, kind="dir",
                                        expect=True)
        folder_files = basic.identify_contents(args.directory, kind="file",
                                               ignore_set=set([".DS_Store"]))
        file_list.extend(folder_files)
    if args.file is not None:
        args.file = basic.set_path(args.file, kind="file", expect=True)
        file_list.append(args.file)

    status = True
    if len(file_list) == 0:
        print("There are no files to upload.")
        status = False

    if status:
        server.set_log_file(str(args.log_file))
        transport = server.get_transport(constants.DB_HOST)
        if transport is None:
            status = False

    if status:
        sftp = server.setup_sftp_conn(transport, attempts=3)
        if sftp is None:
            status = False

    success = []
    fail = []
    if status:
        for local_filepath in file_list:
            print(f"Uploading {local_filepath.name}...")
            remote_filepath = pathlib.Path(constants.DB_HOST_DIR,
                                           local_filepath.name)
            result = server.upload_file(sftp, str(local_filepath),
                                        str(remote_filepath))
            if result:
                success.append(local_filepath.name)
            else:
                fail.append(local_filepath.name)
        sftp.close()
        transport.close()

    if len(fail) > 0:
        print("The following files were not uploaded:")
        for file in fail:
            print(file)

def main(unparsed_args_list):
    """Run main get_gb_records pipeline."""
    # Parse command line arguments
    args = parse_args(unparsed_args_list)
    date = time.strftime("%Y%m%d")
    args.output_folder = basic.set_path(args.output_folder, kind="dir",
                                        expect=True)
    working_dir = pathlib.Path(f"{date}_get_gb_records")
    working_path = basic.make_new_dir(args.output_folder, working_dir,
                                      attempt=10)
    if working_path is None:
        print(f"Invalid working directory '{working_dir}'")
        sys.exit(1)

    ncbi_cred_dict = ncbi.get_ncbi_creds(args.ncbi_credentials_file)

    # Verify database connection and schema compatibility.
    print("Connecting to the MySQL database...")
    engine = mysqldb.connect_to_db(args.database)
    mysqldb.check_schema_compatibility(engine, "the get_gb_records pipeline")

    # Create data sets
    print("Retrieving accessions from the database...")
    accessions = mysqldb.get_distinct_data(engine, "phage", "Accession")
    engine.dispose()
    if "" in accessions:
        accessions.remove("")
    if None in accessions:
        accessions.remove(None)

    get_genbank_data(working_path, accessions, ncbi_cred_dict)

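# Hedged note: the two removals above drop placeholder accessions before
# querying GenBank. A compact equivalent (a sketch, not from the source;
# returns a set rather than mutating in place):
def _drop_placeholder_accessions(accessions):
    return {acc for acc in accessions if acc not in ("", None)}
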
def test_set_path_4(self, verify_path2_mock):
    """Verify '..' directory resolution."""
    test_file = Path("/dir1/dir2/../file.txt")
    verify_path2_mock.return_value = (True, None)
    output = basic.set_path(test_file, kind="file", expect=True)
    exp = Path("/dir1/file.txt")
    self.assertEqual(output, exp)

def main(unparsed_args):
    """Run the complete update pipeline."""
    args = parse_args(unparsed_args)

    # Verify database connection and schema compatibility.
    print("Connecting to the MySQL database...")
    engine = mysqldb.connect_to_db(args.database)
    mysqldb.check_schema_compatibility(engine, "the update pipeline")

    if args.version:
        mysqldb.change_version(engine)
        print("Database version updated.")

    if args.ticket_table is not None:
        update_table_path = basic.set_path(args.ticket_table, kind="file",
                                           expect=True)

        # Iterate through the tickets and process them sequentially.
        list_of_update_tickets = []
        with update_table_path.open(mode='r') as f:
            file_reader = csv.DictReader(f)
            for ticket in file_reader:
                list_of_update_tickets.append(ticket)

        # Variables to be used for end summary
        processed = 0
        succeeded = 0
        failed = 0

        for ticket in list_of_update_tickets:
            conn = engine.connect()
            # Pass the raw db_api connection
            handler = RandomFieldUpdateHandler(conn.connection)
            handler.table = ticket["table"]  # Which table will be updated?
            handler.field = ticket["field"]  # Which field will be updated?
            handler.value = ticket["value"]  # What value goes in that field?
            handler.key_name = ticket["key_name"]    # Identifies the row.
            handler.key_value = ticket["key_value"]  # Identifies the row.
            handler.validate_ticket()  # Make sure all attributes are valid
            status = handler.execute_ticket()  # Do what was requested

            if status == 1:
                processed += 1
                succeeded += 1
            else:
                processed += 1
                failed += 1

        engine.dispose()
        print("\nDone iterating through tickets.")
        if succeeded > 0:
            print(f"{succeeded} / {processed} tickets successfully handled.")
        if failed > 0:
            print(f"{failed} / {processed} tickets failed to be handled.")

def convert_file_path(path: str):
    """Convert argparse input to a working file path.

    :param path: A string to be converted into a Path object.
    :type path: str
    :returns: A Path object converted from the input string.
    :rtype: Path
    """
    return basic.set_path(Path(path), kind="file")

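# Hedged wiring sketch: convert_file_path is shaped to serve as an argparse
# type converter. The parser and argument name below are hypothetical, for
# illustration only; the real parsers live in the pipeline modules.
def _example_parser():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("import_table", type=convert_file_path)
    return parser
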
def create_empty_config_file(dir, file, null_value):
    """Create an empty config file with all available settings."""
    output_path = basic.set_path(dir, kind="dir", expect=True)
    config_path = basic.make_new_file(output_path, file, "txt", attempt=50)
    if config_path is None:
        print("Unable to create config file. File already exists.")
    else:
        parser = default_parser(null_value)
        write_config(parser, config_path)

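# Hedged usage sketch: write a settings template into the current
# directory. The folder, file stem, and null placeholder are all
# hypothetical values chosen for illustration.
def _example_create_config():
    import pathlib
    create_empty_config_file(pathlib.Path("."), "pdm_utils_config", "none")
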
def main(unparsed_args_list):
    """Run main get_gb_records pipeline."""
    # Parse command line arguments
    args = parse_args(unparsed_args_list)

    # Filters input: phage.Status=draft AND phage.HostGenus=Mycobacterium
    # Args structure: [['phage.Status=draft'], ['phage.HostGenus=Mycobacterium']]
    filters = args.filters
    ncbi_cred_dict = ncbi.get_ncbi_creds(args.ncbi_credentials_file)
    output_folder = basic.set_path(args.output_folder, kind="dir",
                                   expect=True)
    working_dir = pathlib.Path(RESULTS_FOLDER)
    working_path = basic.make_new_dir(output_folder, working_dir, attempt=50)
    if working_path is None:
        print(f"Invalid working directory '{working_dir}'")
        sys.exit(1)

    # Verify database connection and schema compatibility.
    print("Connecting to the MySQL database...")
    alchemist = AlchemyHandler(database=args.database)
    alchemist.connect(pipeline=True)
    engine = alchemist.engine
    mysqldb.check_schema_compatibility(engine, "the get_gb_records pipeline")

    # Get SQLAlchemy metadata Table object.
    # table_obj.primary_key.columns is a
    # SQLAlchemy ColumnCollection iterable object.
    # Set primary key = 'phage.PhageID'
    alchemist.build_metadata()
    table = querying.get_table(alchemist.metadata, TARGET_TABLE)
    for column in table.primary_key.columns:
        primary_key = column

    # Create filter object and then add command line filter strings
    db_filter = Filter(alchemist=alchemist, key=primary_key)
    db_filter.values = []

    # Attempt to add filters and exit if needed.
    add_filters(db_filter, filters)

    # Perform the query.
    db_filter.update()

    # db_filter.values now contains list of PhageIDs that pass the filters.
    # Get the accessions associated with these PhageIDs.
    keep_set = set(db_filter.values)

    # Create data sets
    print("Retrieving accessions from the database...")
    query = construct_accession_query(keep_set)
    list_of_dicts = mysqldb_basic.query_dict_list(engine, query)
    id_acc_dict = get_id_acc_dict(list_of_dicts)
    acc_id_dict = get_acc_id_dict(id_acc_dict)
    engine.dispose()

    if len(acc_id_dict) > 0:
        get_data(working_path, acc_id_dict, ncbi_cred_dict)
    else:
        print("There are no records to retrieve.")

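# Hedged illustration of the filters structure consumed by add_filters,
# mirroring the comment at the top of main: nested lists of "column=value"
# strings. The values shown are examples only.
EXAMPLE_FILTERS = [["phage.Status=draft"], ["phage.HostGenus=Mycobacterium"]]
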
def test_set_path_5(self, verify_path2_mock):
    """Verify home directory expansion and '..' directory resolution."""
    home = Path("~")
    home = home.expanduser()
    test_file = Path("~/dir1/dir2/../file.txt")
    verify_path2_mock.return_value = (True, None)
    output = basic.set_path(test_file, kind="file", expect=True)
    exp = Path(home, "dir1/file.txt")
    self.assertEqual(output, exp)

def main(unparsed_args):
    """Run the complete update pipeline."""
    args = parse_args(unparsed_args[2:])

    # Verify database connection and schema compatibility.
    print("Connecting to the MySQL database...")

    # Create config object with data obtained from file and/or defaults.
    config = configfile.build_complete_config(args.config_file)
    mysql_creds = config["mysql"]

    alchemist = AlchemyHandler(database=args.database,
                               username=mysql_creds["user"],
                               password=mysql_creds["password"])
    alchemist.connect(pipeline=True)
    engine = alchemist.engine
    mysqldb.check_schema_compatibility(engine, "the update pipeline")

    if args.version:
        mysqldb.change_version(engine)
        print("Database version updated.")

    if args.ticket_table is not None:
        update_table_path = basic.set_path(args.ticket_table, kind="file",
                                           expect=True)

        # Iterate through the tickets and process them sequentially.
        list_of_update_tickets = []
        with update_table_path.open(mode='r') as f:
            file_reader = csv.DictReader(f)
            for ticket in file_reader:
                list_of_update_tickets.append(ticket)

        # Variables to be used for end summary
        processed = 0
        succeeded = 0
        failed = 0

        for ticket in list_of_update_tickets:
            status = update_field(alchemist, ticket)

            if status == 1:
                processed += 1
                succeeded += 1
            else:
                processed += 1
                failed += 1

        engine.dispose()
        print("\nDone iterating through tickets.")
        if succeeded > 0:
            print(f"{succeeded} / {processed} tickets successfully handled.")
        if failed > 0:
            print(f"{failed} / {processed} tickets failed to be handled.")

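# Hedged sketch of a ticket table row as consumed above. The column names
# (table, field, value, key_name, key_value) mirror the handler attributes
# used by the older update pipeline; the row values are hypothetical.
EXAMPLE_UPDATE_TICKET = {
    "table": "phage",
    "field": "HostGenus",
    "value": "Mycobacterium",
    "key_name": "PhageID",
    "key_value": "Trixie",
}
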
def test_set_path_1(self):
    """Verify output when file exists and is expected to exist."""
    self.file.touch()
    # Since using tempfile, there is an added quirk:
    # the tempfile path may be a symlink, so passing it through set_path
    # will resolve the symlink, changing the path and breaking the test.
    self.file = self.file.resolve()
    output = basic.set_path(self.file, kind="file", expect=True)
    with self.subTest():
        self.assertIsInstance(output, Path)
    with self.subTest():
        self.assertEqual(str(self.file), str(output))

def parse_config(file, parser=None):
    """Get parameters from config file."""
    filepath = basic.set_path(file, kind="file", expect=True)
    if parser is None:
        parser = configparser.ConfigParser()
    try:
        parser.read(filepath)
    except Exception:
        print("Unable to parse config file")
        sys.exit(1)
    else:
        return parser

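# Hedged usage sketch: the config path is hypothetical, and the "mysql"
# section mirrors the one read by the pipeline code elsewhere in this file.
def _example_parse_config():
    import pathlib
    parser = parse_config(pathlib.Path("~/pdm_utils.cfg"))
    if parser.has_section("mysql"):
        return parser["mysql"]["user"]
    return None
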
def execute_get_file_db(alchemist, database, filename, config_file=None,
                        schema_version=None, verbose=False):
    """Install a database from a local SQL file."""
    db_filepath = basic.set_path(filename, kind="file", expect=True)

    install_db(alchemist, database, db_filepath=db_filepath,
               config_file=config_file, schema_version=schema_version,
               verbose=verbose)

def get_ncbi_creds(filename):
    """Get NCBI credentials from a file.

    :param filename: Path to config file containing NCBI login credentials.
    :type filename: Path
    :return: Dictionary of NCBI login credentials.
    :rtype: dict
    """
    ncbi_cred_dict = {}
    ncbi_cred_dict["ncbi_api_key"] = None
    ncbi_cred_dict["ncbi_email"] = None
    ncbi_cred_dict["ncbi_tool"] = None
    if filename is not None:
        filepath = basic.set_path(filename, kind="file", expect=True)
        config_dict = basic.parse_config_file(filepath)
        try:
            ncbi_cred_dict["ncbi_api_key"] = config_dict["ncbi_api_key"]
            ncbi_cred_dict["ncbi_email"] = config_dict["ncbi_email"]
            ncbi_cred_dict["ncbi_tool"] = config_dict["ncbi_tool"]
        except (KeyError, TypeError):
            print(f"Unable to parse NCBI credentials from {filepath.name}")
    return ncbi_cred_dict

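# Hedged usage sketch; the credentials file path is hypothetical. With no
# file supplied, the function falls back to a dict of None values.
def _example_get_ncbi_creds():
    import pathlib
    creds = get_ncbi_creds(pathlib.Path("~/ncbi_creds.txt"))
    return creds["ncbi_api_key"], creds["ncbi_email"], creds["ncbi_tool"]
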
def main(unparsed_args_list):
    """Run main retrieve_updates pipeline."""
    # Parse command line arguments
    args = parse_args(unparsed_args_list)
    force = args.force_download
    args.output_folder = basic.set_path(args.output_folder, kind="dir",
                                        expect=True)
    working_dir = pathlib.Path(RESULTS_FOLDER)
    working_path = basic.make_new_dir(args.output_folder, working_dir,
                                      attempt=50)
    if working_path is None:
        print(f"Invalid working directory '{working_dir}'")
        sys.exit(1)

    # Create config object with data obtained from file and/or defaults.
    config = configfile.build_complete_config(args.config_file)
    mysql_creds = config["mysql"]
    ncbi_creds = config["ncbi"]

    # Verify database connection and schema compatibility.
    print("Preparing genome data sets from the MySQL database...")
    alchemist = AlchemyHandler(database=args.database,
                               username=mysql_creds["user"],
                               password=mysql_creds["password"])
    alchemist.connect(pipeline=True)
    engine = alchemist.engine
    mysqldb.check_schema_compatibility(engine, "the get_data pipeline")

    # Get existing data from MySQL to determine what needs to be updated.
    query = ("SELECT PhageID, Name, HostGenus, Status, Cluster, "
             "DateLastModified, Accession, RetrieveRecord, Subcluster, "
             "AnnotationAuthor FROM phage")
    mysqldb_genome_list = mysqldb.parse_genome_data(engine=engine,
                                                    phage_query=query,
                                                    gnm_type="mysqldb")
    engine.dispose()
    mysqldb_genome_dict = {}
    for gnm in mysqldb_genome_list:
        # With default date, the date of all records retrieved will be newer.
        if force:
            gnm.date = constants.EMPTY_DATE
        mysqldb_genome_dict[gnm.id] = gnm

    # Get data from PhagesDB
    if args.updates or args.final or args.draft:
        print("Retrieving data from PhagesDB...")
        phagesdb_phages = phagesdb.get_phagesdb_data(constants.API_SEQUENCED)
        phagesdb_phages_dict = basic.convert_list_to_dict(phagesdb_phages,
                                                          "phage_name")
        phagesdb_genome_dict = phagesdb.parse_genomes_dict(
            phagesdb_phages_dict, gnm_type="phagesdb", seq=False)

        # Exit if all phage data wasn't retrieved.
        if len(phagesdb_genome_dict) == 0:
            sys.exit(1)

        # Returns a list of tuples.
        tup = match_genomes(mysqldb_genome_dict, phagesdb_genome_dict)
        matched_genomes = tup[0]
        unmatched_phagesdb_ids = tup[1]

    if args.updates:
        get_update_data(working_path, matched_genomes)
    if args.final:
        get_final_data(working_path, matched_genomes)
    if args.genbank:
        get_genbank_data(working_path, mysqldb_genome_dict, ncbi_creds,
                         args.genbank_results, force=force)
    if args.draft:
        if force:
            # Add all draft genomes currently in database to the list of
            # draft genomes to be downloaded.
            drafts = get_matched_drafts(matched_genomes)
            unmatched_phagesdb_ids |= drafts
        get_draft_data(working_path, unmatched_phagesdb_ids)

def main(argument_list):
    """Run the find_domains pipeline.

    :param argument_list: command line arguments for the pipeline
    :type argument_list: list
    """
    # Setup argument parser
    cdd_parser = setup_argparser()

    # Use argument parser to parse argument_list
    args = cdd_parser.parse_args(argument_list)

    # Store arguments in more easily accessible variables
    database = args.db
    cdd_dir = expand_path(args.dir)
    cdd_name = learn_cdd_name(cdd_dir)
    threads = args.threads
    evalue = args.evalue
    rpsblast = args.rpsblast
    tmp_dir = args.tmp_dir
    output_folder = args.output_folder
    log_file = args.log_file
    reset = args.reset

    # Set up directory.
    output_folder = basic.set_path(output_folder, kind="dir", expect=True)
    results_folder = pathlib.Path(RESULTS_FOLDER)
    results_path = basic.make_new_dir(output_folder, results_folder,
                                      attempt=10)
    if results_path is None:
        print("Unable to create output_folder.")
        sys.exit(1)

    log_file = pathlib.Path(results_path, log_file)

    # Set up root logger.
    logging.basicConfig(
        filename=log_file, filemode="w", level=logging.DEBUG,
        format="pdm_utils find_domains: %(levelname)s: %(message)s")
    logger.info(f"pdm_utils version: {VERSION}")
    logger.info(f"CDD run date: {constants.CURRENT_DATE}")
    logger.info(f"Command line arguments: {' '.join(argument_list)}")
    logger.info(f"Results directory: {results_path}")

    # Early exit if either 1) cdd_name == "" or 2) no rpsblast given and we
    # are unable to find one.
    if cdd_name == "":
        msg = ("Unable to learn CDD database name. Make sure the files in "
               f"{cdd_dir} all have the same basename.")
        logger.error(msg)
        print(msg)
        return

    # Get the rpsblast command and path.
    if rpsblast == "":
        command = get_rpsblast_command()
        rpsblast = get_rpsblast_path(command)

    # Verify database connection and schema compatibility.
    engine = mysqldb.connect_to_db(database)
    logger.info(f"Connected to database: {database}.")
    mysqldb.check_schema_compatibility(engine, "the find_domains pipeline")
    logger.info("Schema version is compatible.")
    logger.info("Command line arguments verified.")

    if reset:
        logger.info("Clearing all domain data currently in the database.")
        clear_domain_data(engine)

    # Get gene data that needs to be processed
    # in dict format where key = column name, value = stored value.
    # result = engine.execute(GET_GENES_FOR_CDD)
    cdd_genes = mysqldb.query_dict_list(engine, GET_GENES_FOR_CDD)
    msg = f"{len(cdd_genes)} genes to search for conserved domains..."
    logger.info(msg)
    print(msg)

    # Only run the pipeline if there are genes returned that need it
    if len(cdd_genes) > 0:
        log_gene_ids(cdd_genes)

        # Create temp_dir
        make_tempdir(tmp_dir)

        # TODO dev
        # translations = get_unique_translations(cdd_genes)

        # Build jobs list
        jobs = []
        # TODO dev
        # translation_id = 0
        # for translation in translations:
        #     translation_id += 1
        #     jobs.append((rpsblast, cdd_name, tmp_dir, evalue,
        #                  translation_id, translation))
        for cdd_gene in cdd_genes:
            jobs.append((rpsblast, cdd_name, tmp_dir, evalue,
                         cdd_gene["GeneID"], cdd_gene["Translation"]))

        results = parallelize(jobs, threads, search_and_process)
        print("\n")

        # TODO dev
        # results_dict = create_results_dict(results)
        # map_results_to_genes(cdd_genes, results_dict)

        insert_domain_data(engine, results)
    engine.dispose()
    return

def main(argument_list):
    """Run the find_domains pipeline.

    :param argument_list: command line arguments for the pipeline
    :type argument_list: list
    """
    # Setup argument parser
    cdd_parser = setup_argparser()

    # Use argument parser to parse argument_list
    args = cdd_parser.parse_args(argument_list)

    # Store arguments in more easily accessible variables
    database = args.database
    cdd_dir = expand_path(args.cdd)
    cdd_name = learn_cdd_name(cdd_dir)
    threads = args.threads
    evalue = args.evalue
    rpsblast = args.rpsblast
    tmp_dir = args.tmp_dir
    output_folder = args.output_folder
    reset = args.reset
    batch_size = args.batch_size

    # Set up directory.
    output_folder = basic.set_path(output_folder, kind="dir", expect=True)
    results_folder = pathlib.Path(RESULTS_FOLDER)
    results_path = basic.make_new_dir(output_folder, results_folder,
                                      attempt=50)
    if results_path is None:
        print("Unable to create output_folder.")
        sys.exit(1)

    log_file = pathlib.Path(results_path, MAIN_LOG_FILE)

    # Set up root logger.
    logging.basicConfig(
        filename=log_file, filemode="w", level=logging.DEBUG,
        format="pdm_utils find_domains: %(levelname)s: %(message)s")
    logger.info(f"pdm_utils version: {VERSION}")
    logger.info(f"CDD run date: {constants.CURRENT_DATE}")
    logger.info(f"Command line arguments: {' '.join(argument_list)}")
    logger.info(f"Results directory: {results_path}")

    # Early exit if either 1) cdd_name == "" or 2) no rpsblast given and we
    # are unable to find one.
    if cdd_name == "":
        msg = ("Unable to learn CDD database name. Make sure the files in "
               f"{cdd_dir} all have the same basename.")
        logger.error(msg)
        print(msg)
        return

    # Get the rpsblast command and path.
    if rpsblast == "":
        command = get_rpsblast_command()
        rpsblast = get_rpsblast_path(command)

    # Verify database connection and schema compatibility.
    alchemist = AlchemyHandler(database=database)
    alchemist.connect(pipeline=True)
    engine = alchemist.engine
    logger.info(f"Connected to database: {database}.")
    mysqldb.check_schema_compatibility(engine, "the find_domains pipeline")
    logger.info("Schema version is compatible.")
    logger.info("Command line arguments verified.")

    if reset:
        logger.info("Clearing all domain data currently in the database.")
        clear_domain_data(engine)

    # Get gene data that needs to be processed
    # in dict format where key = column name, value = stored value.
    cdd_genes = mysqldb_basic.query_dict_list(engine, GET_GENES_FOR_CDD)
    msg = f"{len(cdd_genes)} genes to search for conserved domains..."
    logger.info(msg)
    print(msg)

    # Only run the pipeline if there are genes returned that need it
    if len(cdd_genes) > 0:
        log_gene_ids(cdd_genes)
        make_tempdir(tmp_dir)

        # Identify unique translations to process mapped to GeneIDs.
        cds_trans_dict = create_cds_translation_dict(cdd_genes)
        unique_trans = list(cds_trans_dict.keys())
        msg = (f"{len(unique_trans)} unique translations "
               "to search for conserved domains...")
        logger.info(msg)
        print(msg)

        # Process translations in batches. Otherwise, searching could take
        # so long that the MySQL connection closes, resulting in one or more
        # transaction errors.
        batch_indices = basic.create_indices(unique_trans, batch_size)
        total_rolled_back = 0
        for indices in batch_indices:
            start = indices[0]
            stop = indices[1]
            msg = f"Processing translations {start + 1} to {stop}..."
            logger.info(msg)
            print(msg)
            sublist = unique_trans[start:stop]
            batch_rolled_back = search_translations(
                rpsblast, cdd_name, tmp_dir, evalue, threads, engine,
                sublist, cds_trans_dict)
            total_rolled_back += batch_rolled_back

        search_summary(total_rolled_back)
    engine.dispose()
    return

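# Hedged sketch of the (start, stop) contract assumed of
# basic.create_indices above: slice bounds covering the list in
# batch_size-sized chunks. A stand-in with that presumed behavior:
def _batch_indices(items, batch_size):
    return [(i, min(i + batch_size, len(items)))
            for i in range(0, len(items), batch_size)]
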
def main(unparsed_args_list):
    """Run main retrieve_updates pipeline."""
    # Parse command line arguments
    args = parse_args(unparsed_args_list)
    date = time.strftime("%Y%m%d")
    args.output_folder = basic.set_path(args.output_folder, kind="dir",
                                        expect=True)
    working_dir = pathlib.Path(f"{date}_get_data")
    working_path = basic.make_new_dir(args.output_folder, working_dir,
                                      attempt=10)
    if working_path is None:
        print(f"Invalid working directory '{working_dir}'")
        sys.exit(1)

    ncbi_cred_dict = ncbi.get_ncbi_creds(args.ncbi_credentials_file)

    # Verify database connection and schema compatibility.
    print("Preparing genome data sets from the MySQL database...")
    engine = mysqldb.connect_to_db(args.database)
    mysqldb.check_schema_compatibility(engine, "the get_data pipeline")

    # Get existing data from MySQL to determine what needs to be updated.
    query = ("SELECT PhageID, Name, HostGenus, Status, Cluster, "
             "DateLastModified, Accession, RetrieveRecord, Subcluster, "
             "AnnotationAuthor FROM phage")
    mysqldb_genome_list = mysqldb.parse_genome_data(engine=engine,
                                                    phage_query=query,
                                                    gnm_type="mysqldb")
    engine.dispose()
    mysqldb_genome_dict = {}
    for gnm in mysqldb_genome_list:
        mysqldb_genome_dict[gnm.id] = gnm

    # Get data from PhagesDB
    if args.updates or args.final or args.draft:
        print("Retrieving data from PhagesDB...")
        phagesdb_phages = phagesdb.get_phagesdb_data(constants.API_SEQUENCED)
        phagesdb_phages_dict = basic.convert_list_to_dict(phagesdb_phages,
                                                          "phage_name")
        phagesdb_genome_dict = phagesdb.parse_genomes_dict(
            phagesdb_phages_dict, gnm_type="phagesdb", seq=False)

        # Exit if all phage data wasn't retrieved.
        if len(phagesdb_genome_dict) == 0:
            sys.exit(1)

        # Returns a list of tuples.
        match_output = match_genomes(mysqldb_genome_dict,
                                     phagesdb_genome_dict)
        matched_genomes = match_output[0]
        unmatched_phagesdb_ids = match_output[1]

    if args.updates:
        get_update_data(working_path, matched_genomes)
    if args.final:
        get_final_data(working_path, matched_genomes)
    if args.genbank:
        get_genbank_data(working_path, mysqldb_genome_dict, ncbi_cred_dict,
                         args.genbank_results)
    if args.draft:
        get_draft_data(working_path, unmatched_phagesdb_ids)

    print("\n\n\nRetrieve updates script completed.")

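# Hedged sketch of the behavior assumed of basic.convert_list_to_dict
# above: index a list of record dicts by one of their fields. A stand-in
# illustrating that presumed contract:
def _convert_list_to_dict(dict_list, key):
    return {record[key]: record for record in dict_list}
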
def test_set_path_2(self, sys_exit_mock):
    """Verify script exits when file does not exist but is expected
    to exist."""
    output = basic.set_path(self.file, kind="file", expect=True)
    self.assertTrue(sys_exit_mock.called)