def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = None):
    """Parse arguments, prepare logging and output, then run the CAZyme predictions.

    :param argv: optional list of arguments. When given it is forwarded to
        ``build_parser``; otherwise the parser is built without arguments.
    :param logger: optional logger. When ``None`` a logger named after this
        module is fetched and ``config_logger`` is called with the parsed args.

    Return nothing.
    """
    # NOTE(review): parse_args() is called without arguments in both cases, so
    # argv only reaches build_parser -- confirm build_parser consumes it.
    parser = build_parser() if argv is None else build_parser(argv)
    args = parser.parse_args()

    # Only set up logging when the caller did not hand one in.
    if logger is None:
        logger = logging.getLogger(__name__)
        config_logger(args)

    # Verify the working directory before the prediction tools are invoked.
    check_cwd()

    # sys.stdout acts as the "no output directory requested" sentinel.
    if args.output is not sys.stdout:
        make_output_directory(args.output, args.force, args.nodelete)

    # Run the prediction tools.
    get_predictions(args)

    logger.info("Program finished, and no terminating.")
def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = None):
    """Parse arguments, load the CAZy data and predictions, then evaluate performance.

    :param argv: optional list of arguments. When given it is forwarded to
        ``build_parser``; otherwise the parser is built without arguments.
    :param logger: optional logger. When ``None`` a logger named after this
        module is fetched and ``config_logger`` is called with the parsed args.

    Return nothing.
    """
    # NOTE(review): parse_args() is called without arguments in both cases, so
    # argv only reaches build_parser -- confirm build_parser consumes it.
    parser = build_parser() if argv is None else build_parser(argv)
    args = parser.parse_args()

    # Only set up logging when the caller did not hand one in.
    if logger is None:
        logger = logging.getLogger(__name__)
        config_logger(args)

    # sys.stdout acts as the "no output directory requested" sentinel.
    if args.output is not sys.stdout:
        make_output_directory(args.output, args.force, args.nodelete)

    # Load the CAZy dict and the predictions found under the input path.
    cazy_dict = get_cazy_dict(args.cazy)
    predictions = get_predictions(args.input)  # USED IN R EVALUATION

    # Statistical evaluation of the predictions against the CAZy dict.
    stats.evaluate_performance(predictions, cazy_dict, args)

    # The family-frequency step is optional.
    if args.fam_freq is None:
        return

    # Date stamp (year_month_day) handed to the family-frequency step.
    date_stamp = datetime.now().strftime("%Y_%m_%d")
    stats.get_fam_freq(args, cazy_dict, date_stamp)  # USED IN R EVALUATION
def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = None):
    """Build a test set for every genomic assembly listed in the YAML configuration.

    :param argv: optional list of arguments. When given it is forwarded to
        ``build_parser``; otherwise the parser is built without arguments.
    :param logger: optional logger. When ``None``, ``config_logger`` is called
        with the parsed args and a logger named after this module is fetched.

    Return nothing.
    """
    # NOTE(review): parse_args() is called without arguments in both cases, so
    # argv only reaches build_parser -- confirm build_parser consumes it.
    parser = build_parser() if argv is None else build_parser(argv)
    args = parser.parse_args()

    # Only set up logging when the caller did not hand one in.
    if logger is None:
        config_logger(args)
        logger = logging.getLogger(__name__)

    Entrez.email = args.email

    make_output_directory(args.output, args.force, args.nodelete)

    # Assemblies to process, keyed by txid, and the CAZy lookup dict.
    assemblies_by_txid = retrieve_assemblies_dict(args.yaml)
    cazy_dict = get_cazy_dict(args.cazy)

    # Scratch directory shared by all assemblies.
    scratch_dir = args.output / "temp_alignment_dir"

    for txid in tqdm(assemblies_by_txid, desc="Parsing assemblies in config file"):
        for assembly in assemblies_by_txid[txid]:
            # Start every assembly from a clean scratch directory.
            prepare_output_dir(scratch_dir)

            # Fetch the assembly and extract its protein sequences to FASTA.
            assembly_path = get_genomic_assembly(assembly, txid)
            proteins_fasta = get_protein_seqs(assembly_path, assembly, txid)

            # Split the proteins into CAZymes and non-CAZymes.
            split_result = differentiate_cazymes_and_noncazymes(
                cazy_dict,
                proteins_fasta,
                scratch_dir,
            )
            chosen_cazymes, cazyme_fasta, non_cazymes, noncazyme_fasta = split_result

            # A None selection means no test set is written for this assembly.
            if chosen_cazymes is None:
                continue

            alignment_df = align_cazymes_and_noncazymes(
                cazyme_fasta,
                noncazyme_fasta,
                scratch_dir,
            )
            # Likewise skip the assembly when no alignment result came back.
            if alignment_df is None:
                continue

            test_set_path = compile_output_file_path(proteins_fasta)
            write_out_test_set(chosen_cazymes, non_cazymes, alignment_df, test_set_path)

    # Remove the scratch directory once every assembly has been processed.
    shutil.rmtree(scratch_dir)
def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = None):
    """Set up the parser, logger and output directory, then build the species table.

    The species table is produced by ``parse_input_file``, extended with an
    "NCBI Accession Numbers" column and finally written out.

    :param argv: optional list of arguments. When given it is forwarded to
        ``build_parser``; otherwise the parser is built without arguments.
    :param logger: optional logger. When ``None``, ``config_logger`` is called
        with the parsed args and a logger named after this module is fetched.

    Return nothing. Exits with status 1 when no user email was supplied.
    """
    # NOTE(review): parse_args() is called without arguments in both cases, so
    # argv only reaches build_parser -- confirm build_parser consumes it.
    parser = build_parser() if argv is None else build_parser(argv)
    args = parser.parse_args()

    # Only set up logging when the caller did not hand one in.
    if logger is None:
        config_logger(args)
        logger = logging.getLogger(__name__)

    logger.info("Run initated")

    # An email address is mandatory; bail out early without one.
    if args.user is None:
        logger.error(
            "No user email provided. Email MUST be provided. Terminating programme"
        )
        sys.exit(1)
    Entrez.email = args.user

    # sys.stdout acts as the "no output directory requested" sentinel.
    if args.output is not sys.stdout:
        make_output_directory(args.output, args.force, args.nodelete)

    # Build the species table from the input file.
    species_table = parse_input_file(args.input_file, args.retries)

    # Row-wise lookup of accession numbers; args is passed through to the helper.
    accession_numbers = species_table.apply(
        get_accession_numbers,
        args=(args, ),
        axis=1,
    )
    species_table["NCBI Accession Numbers"] = accession_numbers
    logger.info("Generated species table")

    # Write the table straight to stdout, or via the helper to the given path.
    if args.dataframe is sys.stdout:
        species_table.to_csv(args.dataframe)
    else:
        write_out_dataframe(species_table, args.dataframe, args.force)

    logger.info("Program finished and exiting")
def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = None):
    """Set up the parser, logger and output directory, then call ``read_configuration``.

    :param argv: optional list of arguments. When given it is forwarded to
        ``build_parser``; otherwise the parser is built without arguments.
    :param logger: optional logger. When ``None`` the value returned by
        ``config_logger`` is used.

    Return nothing. Exits with status 1 when the input path is missing.
    """
    # NOTE(review): parse_args() is called without arguments in both cases, so
    # argv only reaches build_parser -- confirm build_parser consumes it.
    parser = build_parser() if argv is None else build_parser(argv)
    args = parser.parse_args()

    # Only build a logger when the caller did not hand one in.
    if logger is None:
        logger = config_logger(args)

    # The configuration file must be named and must exist on disk.
    if args.input is None or not os.path.exists(args.input):
        logger.error("No configuration file found. Terminating.")
        sys.exit(1)

    # sys.stdout acts as the "no output directory requested" sentinel.
    if args.outdir is not sys.stdout:
        make_output_directory(args.outdir, logger, args.force, args.nodelete)

    # Hand over to the script's main routine.
    read_configuration(args, logger)
def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = None):
    """Coordinate building the protein annotation dataframe and writing FASTA output.

    Builds the parser, logger and output directory, reads the input dataframe
    from ``args.input_df``, builds the protein annotation dataframe with
    ``create_dataframe``, optionally writes it out, and calls ``write_fasta``
    for every row.

    :param argv: optional list of arguments. When given it is forwarded to
        ``build_parser``; otherwise the parser is built without arguments.
    :param logger: optional logger. When ``None``, ``config_logger`` is called
        with the parsed args and a logger named after this module is fetched.

    Return nothing.
    """
    # NOTE(review): parse_args() is called without arguments in both branches,
    # so argv only reaches build_parser -- confirm build_parser consumes it.
    if argv is None:
        parser = build_parser()
        args = parser.parse_args()
    else:
        parser = build_parser(argv)
        args = parser.parse_args()

    if logger is None:
        config_logger(args)
        logger = logging.getLogger(__name__)

    # If specified output directory, create output directory to write FASTA files to
    if args.output is not sys.stdout:
        make_output_directory(args.output, args.force, args.nodelete)

    # Open input dataframe
    logger.info("Opening input dataframe %s", args.input_df)
    input_df = pd.read_csv(args.input_df, header=0, index_col=0)

    # Build dataframe
    protein_annotation_df = create_dataframe(input_df, args)

    # Write out dataframe
    if args.output_df is not None:
        write_out_dataframe(protein_annotation_df, args.output_df, args.force)

    # Write out FASTA files, one write_fasta call per row.
    # The loop variable is managed by the for statement itself, so no manual
    # initialisation or increment is needed.
    row_count = len(protein_annotation_df["Genus"])
    for index in tqdm(range(row_count), desc="Writing protein to FASTA"):
        df_row = protein_annotation_df.iloc[index]
        write_fasta(df_row, logger, args)

    logger.info("Programme finsihed. Terminating.")
def test_config_logger_v_true(logger_args_true):
    """Smoke test: config_logger runs without raising for the ``logger_args_true`` args."""
    parsed_args = logger_args_true["args"]
    config_logger(parsed_args)
def test_config_logger_v_false(logger_args_false):
    """Smoke test: config_logger runs without raising for the ``logger_args_false`` args."""
    parsed_args = logger_args_false["args"]
    config_logger(parsed_args)