def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = None):
    """Coordinate the prediction of CAZymes.

    Builds the parser, sets up logging, checks the current working directory,
    prepares the output directory and then invokes the prediction tools.

    :param argv: optional list of arguments; when given it is handed to ``build_parser``
    :param logger: optional logger; when omitted one is retrieved for this module
        and ``config_logger`` is called with the parsed arguments

    Return nothing.
    """
    # NOTE(review): argv is only forwarded to build_parser(); parse_args() is
    # always called without arguments -- confirm this matches build_parser's contract.
    parser = build_parser() if argv is None else build_parser(argv)
    args = parser.parse_args()

    # Only set up logging when the caller did not supply a logger
    # (original note: a log file is only created if specified at the command line)
    if logger is None:
        logger = logging.getLogger(__name__)
        config_logger(args)

    # The CAZyme prediction tools must be accessible from the current working directory
    check_cwd()

    # An output directory is only prepared when output is not going to stdout
    writes_to_directory = args.output is not sys.stdout
    if writes_to_directory:
        make_output_directory(args.output, args.force, args.nodelete)

    # Invoke the prediction tools
    get_predictions(args)

    logger.info("Program finished, and no terminating.")
Beispiel #2
0
def main(argv: Optional[List[str]] = None,
         logger: Optional[logging.Logger] = None):
    """Coordinate the evaluation of CAZyme predictions.

    Builds the parser, sets up logging, prepares the output directory, loads
    the CAZy dict, gathers the predictions and runs the statistical evaluation.
    When ``args.fam_freq`` is set, the CAZy family frequencies are retrieved too.

    :param argv: optional list of arguments; when given it is handed to ``build_parser``
    :param logger: optional logger; when omitted one is retrieved for this module
        and ``config_logger`` is called with the parsed arguments

    Return nothing.
    """
    # NOTE(review): argv is only forwarded to build_parser(); parse_args() is
    # always called without arguments -- confirm this matches build_parser's contract.
    parser = build_parser() if argv is None else build_parser(argv)
    args = parser.parse_args()

    # Only set up logging when the caller did not supply a logger
    # (original note: a log file is only created if specified at the command line)
    if logger is None:
        logger = logging.getLogger(__name__)
        config_logger(args)

    # An output directory is only prepared when output is not going to stdout
    writes_to_directory = args.output is not sys.stdout
    if writes_to_directory:
        make_output_directory(args.output, args.force, args.nodelete)

    # Load the CAZy dict, then the predictions (predictions are used in the R evaluation)
    cazy_dict = get_cazy_dict(args.cazy)
    predictions = get_predictions(args.input)

    # Statistical evaluation of the predictions against CAZy
    stats.evaluate_performance(predictions, cazy_dict, args)

    # The family-frequency step is optional
    if args.fam_freq is None:
        return

    # Frequency of each CAZy family across all test sets (used in the R evaluation)
    time_stamp = datetime.now().strftime("%Y_%m_%d")
    stats.get_fam_freq(args, cazy_dict, time_stamp)
Beispiel #3
0
def main(argv: Optional[List[str]] = None,
         logger: Optional[logging.Logger] = None):
    """Coordinate the creation of a test set for each configured genomic assembly.

    Builds the parser, sets up logging, registers the Entrez email, prepares the
    output directory, then for every assembly listed in the YAML file downloads
    the assembly, extracts its protein sequences, separates CAZymes from
    non-CAZymes, aligns them and writes out the resulting test set.

    :param argv: optional list of arguments; when given it is handed to ``build_parser``
    :param logger: optional logger; when omitted ``config_logger`` is called with
        the parsed arguments

    Return nothing.
    """
    # NOTE(review): argv is only forwarded to build_parser(); parse_args() is
    # always called without arguments -- confirm this matches build_parser's contract.
    if argv is None:
        parser = build_parser()
        args = parser.parse_args()
    else:
        parser = build_parser(argv)
        args = parser.parse_args()

    if logger is None:
        config_logger(args)
    logger = logging.getLogger(__name__)

    Entrez.email = args.email

    make_output_directory(args.output, args.force, args.nodelete)

    # get the YAML file containing the genomic assemblies to be used for creating test sets
    assembly_dict = retrieve_assemblies_dict(args.yaml)

    # get dict containing the genomic assemblies of all CAZymes in CAZy
    cazy_dict = get_cazy_dict(args.cazy)

    temp_alignment_dir = args.output / "temp_alignment_dir"

    # create a test set for each genomic assembly
    for txid in tqdm(assembly_dict, desc="Parsing assemblies in config file"):
        for assembly in assembly_dict[txid]:
            # wipe the temporary dir clean before handling each assembly
            prepare_output_dir(temp_alignment_dir)

            # download genomic assembly
            assembly_path = get_genomic_assembly(assembly, txid)

            # create a FASTA file containing all proteins sequences in the genomic assembly
            fasta_path = get_protein_seqs(assembly_path, assembly, txid)

            # differentiate between CAZymes and non-CAZymes and get test set of 100 CAZymes
            selected_cazymes, cazyme_fasta, non_cazymes, noncazyme_fasta = differentiate_cazymes_and_noncazymes(
                cazy_dict,
                fasta_path,
                temp_alignment_dir,
            )

            # no CAZymes selected for this assembly: move on to the next one
            if selected_cazymes is None:
                continue

            alignment_df = align_cazymes_and_noncazymes(
                cazyme_fasta, noncazyme_fasta, temp_alignment_dir)
            # no alignment result for this assembly: move on to the next one
            if alignment_df is None:
                continue

            final_fasta = compile_output_file_path(fasta_path)

            write_out_test_set(selected_cazymes, non_cazymes, alignment_df,
                               final_fasta)

    # delete the temporary alignment dir; it is only touched inside the loop
    # above, so it may not exist when no assemblies were configured, and an
    # unconditional rmtree would then raise FileNotFoundError
    if temp_alignment_dir.exists():
        shutil.rmtree(temp_alignment_dir)
def main(argv: Optional[List[str]] = None,
         logger: Optional[logging.Logger] = None):
    """Coordinate the retrieval of genomes from NCBI.

    Builds the parser, sets up logging, registers the user's email with Entrez,
    prepares the output directory, builds the species table from the input file,
    adds the NCBI accession numbers to it and writes the table out.

    :param argv: optional list of arguments; when given it is handed to ``build_parser``
    :param logger: optional logger; when omitted ``config_logger`` is called with
        the parsed arguments

    Return nothing; the species table is written to ``args.dataframe``.
    Exits with status 1 when no user email is provided.
    """
    # NOTE(review): argv is only forwarded to build_parser(); parse_args() is
    # always called without arguments -- confirm this matches build_parser's contract.
    parser = build_parser() if argv is None else build_parser(argv)
    args = parser.parse_args()

    if logger is None:
        config_logger(args)
    logger = logging.getLogger(__name__)
    logger.info("Run initated")

    # An email address is mandatory for Entrez
    if args.user is not None:
        Entrez.email = args.user
    else:
        logger.error(
            "No user email provided. Email MUST be provided. Terminating programme"
        )
        sys.exit(1)

    # An output directory is only prepared when output is not going to stdout
    writes_to_directory = args.output is not sys.stdout
    if writes_to_directory:
        make_output_directory(args.output, args.force, args.nodelete)

    # Dataframe of genus, species and NCBI Taxonomy ID (per the original comment)
    species_table = parse_input_file(args.input_file, args.retries)

    # Add a column of accession numbers, computed row by row
    accession_numbers = species_table.apply(
        get_accession_numbers, args=(args, ), axis=1)
    species_table["NCBI Accession Numbers"] = accession_numbers
    logger.info("Generated species table")

    # Write the table to stdout directly, or to file via the helper
    if args.dataframe is sys.stdout:
        species_table.to_csv(args.dataframe)
    else:
        write_out_dataframe(species_table, args.dataframe, args.force)

    logger.info("Program finished and exiting")
def main(argv: Optional[List[str]] = None,
         logger: Optional[logging.Logger] = None):
    """Set up the parser, logger and output directory, then read the configuration.

    :param argv: optional list of arguments; when given it is handed to ``build_parser``
    :param logger: optional logger; when omitted the value returned by
        ``config_logger`` is used

    Return nothing. Exits with status 1 when the configuration file is missing.
    """
    # NOTE(review): argv is only forwarded to build_parser(); parse_args() is
    # always called without arguments -- confirm this matches build_parser's contract.
    parser = build_parser() if argv is None else build_parser(argv)
    args = parser.parse_args()

    # Only build a logger when the caller did not supply one
    # (original note: a log file is only created if specified at the command line)
    if logger is None:
        logger = config_logger(args)

    # The configuration file must have been given and must exist
    config_missing = args.input is None or not os.path.exists(args.input)
    if config_missing:
        logger.error("No configuration file found. Terminating.")
        sys.exit(1)

    # An output directory is only prepared when output is not going to stdout
    writes_to_directory = args.outdir is not sys.stdout
    if writes_to_directory:
        make_output_directory(args.outdir, logger, args.force, args.nodelete)

    # Hand over to the script's main function
    read_configuration(args, logger)
Beispiel #6
0
def main(argv: Optional[List[str]] = None,
         logger: Optional[logging.Logger] = None):
    """Coordinate the retrieval of protein annotations from GenBank (.gbff) files.

    Builds the parser, sets up logging, prepares the output directory, reads the
    input dataframe, builds the protein annotation dataframe, optionally writes
    it out and writes every row of it to FASTA.

    :param argv: optional list of arguments; when given it is handed to ``build_parser``
    :param logger: optional logger; when omitted ``config_logger`` is called with
        the parsed arguments

    Return nothing; results are written out by the helper functions.
    """
    # NOTE(review): argv is only forwarded to build_parser(); parse_args() is
    # always called without arguments -- confirm this matches build_parser's contract.
    if argv is None:
        parser = build_parser()
        args = parser.parse_args()
    else:
        parser = build_parser(argv)
        args = parser.parse_args()

    if logger is None:
        config_logger(args)
    logger = logging.getLogger(__name__)

    # If specified output directory, create output directory to write FASTA files to
    if args.output is not sys.stdout:
        make_output_directory(args.output, args.force, args.nodelete)

    # Open input dataframe
    logger.info("Opening input dataframe %s", args.input_df)
    input_df = pd.read_csv(args.input_df, header=0, index_col=0)

    # Build dataframe
    protein_annotation_df = create_dataframe(input_df, args)

    # Write out dataframe
    if args.output_df is not None:
        write_out_dataframe(protein_annotation_df, args.output_df, args.force)

    # Write out FASTA files, one dataframe row at a time.
    # The loop variable is managed by ``range``; no manual initialisation or
    # increment is needed (the previous ones had no effect).
    for index in tqdm(range(len(protein_annotation_df["Genus"])),
                      desc="Writing protein to FASTA"):
        df_row = protein_annotation_df.iloc[index]
        write_fasta(df_row, logger, args)

    logger.info("Programme finsihed. Terminating.")
Beispiel #7
0
def test_config_logger_v_true(logger_args_true):
    """Check that config_logger runs with the args from the ``logger_args_true`` fixture."""
    parsed_args = logger_args_true["args"]
    config_logger(parsed_args)
Beispiel #8
0
def test_config_logger_v_false(logger_args_false):
    """Check that config_logger runs with the args from the ``logger_args_false`` fixture."""
    parsed_args = logger_args_false["args"]
    config_logger(parsed_args)