Code Example #1
def main(argv: Optional[List[str]] = None,
         logger: Optional[logging.Logger] = None):
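    """Coordinate retrieving protein data from UniProt and adding it to the local CAZyme database."""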
    time_stamp = datetime.now().strftime(
        "%Y-%m-%d_%H-%M-%S")  # used in naming files
    start_time = datetime.now().strftime(
        "%Y-%m-%d %H:%M:%S")  # used in terminating message
    start_time = pd.to_datetime(start_time)

    # Program preparation
    if argv is None:
        parser = build_parser()
        args = parser.parse_args()
    else:
        parser = build_parser(argv)
        args = parser.parse_args()

    if logger is None:
        logger = logging.getLogger(__name__)
        config_logger(args)

    # connect to the existing local CAZyme database
    connection, logger_name, cache_dir = connect_existing_db(
        args, time_stamp, start_time)

    # build cache directory
    if args.cache_dir is not None:  # use user defined cache dir
        cache_dir = args.cache_dir
        make_output_directory(cache_dir, args.force, args.nodelete_cache)
    else:
        cache_dir = cache_dir / "uniprot_data_retrieval"
        make_output_directory(cache_dir, args.force, args.nodelete_cache)

    (
        config_dict,
        class_filters,
        family_filters,
        kingdom_filters,
        taxonomy_filter_dict,
        ec_filters,
    ) = get_expansion_configuration(args)

    # add log to the local CAZyme database
    logger.info("Adding log of scrape to the local CAZyme database")

    retrieved_annotations = "UniProt accessions, Protein names"
    if args.ec:
        retrieved_annotations += ", EC numbers"
    if args.pdb:
        retrieved_annotations += ", PDB accessions"
    if args.sequence:
        retrieved_annotations += ", Protein sequence"
    if args.seq_update:
        retrieved_annotations += ", Updated UniProt protein sequences"

    with sql_orm.Session(bind=connection) as session:
        sql_interface.log_scrape_in_db(
            time_stamp,
            config_dict,
            taxonomy_filter_dict,
            kingdom_filters,
            ec_filters,
            'UniProt',
            retrieved_annotations,
            session,
            args,
        )

    # retrieve dict of genbank accession and genbank db ids from the local CAZyme db
    if args.genbank_accessions is not None:
        logger.warning(
            f"Getting GenBank accessions from file: {args.genbank_accessions}")
        with open(args.genbank_accessions, "r") as fh:
            lines = fh.read().splitlines()

        accessions = [line.strip() for line in lines]
        accessions = set(accessions)

        gbk_dict = get_selected_gbks.get_ids(accessions, connection)

    else:
        gbk_dict = get_selected_gbks.get_genbank_accessions(
            class_filters,
            family_filters,
            taxonomy_filter_dict,
            kingdom_filters,
            ec_filters,
            connection,
        )

    logger.warning(f"Retrieving UniProt data for {len(gbk_dict.keys())}")

    # if using cache, skip accession retrieval
    if args.use_uniprot_cache is not None:
        logger.warning(
            f"Using UniProt data from cache: {args.use_uniprot_cache}")
        with open(args.use_uniprot_cache, "r") as fh:
            uniprot_dict = json.load(fh)

        if args.ec:
            all_ecs = get_ecs_from_cache(uniprot_dict)
        else:
            all_ecs = set()

    else:

        # Get the UniProt accessions/IDs for the corresponding GenBank accessions
        if args.skip_uniprot_accessions is not None:
            logger.warning(
                f"Using UniProt accessions from cache: {args.skip_uniprot_accessions}"
            )
            with open(args.skip_uniprot_accessions, "r") as fh:
                uniprot_gkb_dict = json.load(fh)

        else:
            uniprot_gkb_dict = get_uniprot_accessions(
                gbk_dict,
                args)  # {uniprot_acc: {'gbk_acc': str, 'db_id': int}}

            uniprot_acc_cache = cache_dir / f"uniprot_accessions_{time_stamp}.json"
            with open(uniprot_acc_cache, "w") as fh:
                json.dump(uniprot_gkb_dict, fh)

        # get data from UniProt
        uniprot_dict, all_ecs = get_uniprot_data(uniprot_gkb_dict, cache_dir,
                                                 args)

        # converts sets to lists for json serialisation
        for uniprot_accession in uniprot_dict:
            try:
                uniprot_dict[uniprot_accession]['ec'] = list(
                    uniprot_dict[uniprot_accession]['ec'])
            except KeyError:
                pass
            try:
                uniprot_dict[uniprot_accession]['pdb'] = list(
                    uniprot_dict[uniprot_accession]['pdb'])
            except KeyError:
                pass

        uniprot_data_cache = cache_dir / f"uniprot_data_{time_stamp}.json"
        with open(uniprot_data_cache, "w") as fh:
            json.dump(uniprot_dict, fh)

    # add uniprot accessions (and sequences if seq retrieval is enabled)
    logger.warning("Adding data to the local CAZyme database")
    add_uniprot_accessions(uniprot_dict, gbk_dict, connection, args)

    # add ec numbers
    if (args.ec) and (len(all_ecs) != 0):
        add_ec_numbers(uniprot_dict, all_ecs, gbk_dict, connection, args)

    # add pdb accessions
    if args.pdb:
        add_pdb_accessions(uniprot_dict, gbk_dict, connection, args)

    closing_message("get_uniprot_data", start_time, args)
Code Example #2
def main(argv: Optional[List[str]] = None,
         logger: Optional[logging.Logger] = None):
    """Set up parser, logger and coordinate overal scrapping of CAZy."""
    cazy_home_url = "http://www.cazy.org"

    time_stamp = datetime.now().strftime(
        "%Y-%m-%d_%H-%M-%S")  # used in naming files
    start_time = datetime.now().strftime(
        "%Y-%m-%d %H:%M:%S")  # used in terminating message
    start_time = pd.to_datetime(start_time)

    # Program preparation
    if argv is None:
        parser = build_parser()
        args = parser.parse_args()
    else:
        parser = build_parser(argv)
        args = parser.parse_args()

    if logger is None:
        logger = logging.getLogger(__name__)
        config_logger(args)

    # check if printing out version or citation information
    if args.version:
        print(VERSION_INFO)
        return

    if args.citation:
        print(CITATION_INFO)
        return

    # check correct output was provided, exit if not operable
    if args.database is not None and args.db_output is not None:
        warning_message = (
            "Target path for a NEW database (--db_output, -d) and\n"
            "a path to an EXISTING database (--database, -D) were provided.\n"
            "Please provide one OR the other.\n"
            "Terminating program.")
        logger.warning(termcolour(warning_message, "red"))
        closing_message("cazy_webscraper", start_time, args)
        return

    if args.db_output is not None and args.db_output.exists():
        if args.force:
            logger.warning(f"Local db {args.db_output} already exists\n"
                           "Force is True\n"
                           "Ovewriting existing database.")
            os.remove(args.db_output)
        else:
            logger.warning(f"Local db {args.db_output} already exists\n"
                           "Force is False\n"
                           "Not ovewriting existing database\n"
                           "Termianting program")
            closing_message("cazy_webscraper", start_time, args)
            return

    Entrez.email = args.email

    logger.info("Parsing configuration")
    (
        excluded_classes,
        config_dict,
        cazy_class_synonym_dict,
        class_filters,
        fam_filters,
        kingdom_filters,
        taxonomy_filter_dict,
        taxonomy_filter_set,
    ) = parse_configuration.parse_configuration(args)

    scrape_config_message = (
        "Configuration:\n"
        f"Classes to scrape: {config_dict['classes']}\n"
        f"GH fams to scrape: {config_dict['Glycoside Hydrolases (GHs)']}\n"
        f"GT fams to scrape: {config_dict['GlycosylTransferases (GTs)']}\n"
        f"PL fams to scrape: {config_dict['Polysaccharide Lyases (PLs)']}\n"
        f"CE fams to scrape: {config_dict['Carbohydrate Esterases (CEs)']}\n"
        f"AA fams to scrape: {config_dict['Auxiliary Activities (AAs)']}\n"
        f"CBM fams to scrape: {config_dict['Carbohydrate-Binding Modules (CBMs)']}\n"
        f"Scraping subfamilies: {args.subfamilies}")

    if len(taxonomy_filter_set) != 0:
        scrape_config_message += "\nTaxonomy filters applied."

    if len(kingdom_filters) < 5:
        scrape_config_message += f"\nScraping only tax kingdoms: {kingdom_filters}"

    logger.info(termcolour(scrape_config_message, "cyan"))

    if args.database is not None:  # adding data to an EXISTING database
        connection, logger_name, cache_dir = connect_existing_db(
            args, time_stamp, start_time)
    else:  # build a new database
        connection, logger_name, cache_dir = connect_to_new_db(
            args, time_stamp, start_time)

    logger.info("Adding log of scrape to the local CAZyme database")
    with sql_orm.Session(bind=connection) as session:
        sql_interface.log_scrape_in_db(
            time_stamp,
            config_dict,
            kingdom_filters,
            taxonomy_filter_dict,
            set(),  # ec_filters not applied when scraping CAZy
            'CAZy',
            'CAZy annotations',
            session,
            args,
        )

    if args.cache_dir is not None:  # use user defined cache dir
        cache_dir = args.cache_dir
        make_output_directory(cache_dir, args.force, args.nodelete_cache)
    else:
        make_output_directory(cache_dir, args.force, args.nodelete_cache)

    logger.warning(f"Created cache dir: {cache_dir}")

    if args.log is not None:  # write additional log files to user specified dir
        logger_name = args.log.name
        if logger_name.endswith(".log"):
            logger_name = logger_name[:-4]
        make_output_directory(args.log, args.force, args.nodelete_log)
    else:
        # write the additional log files to the .cazy_webscraper cache dir
        logger_name = "log"

    logger.info("Starting retrieval of data from CAZy")

    if args.cazy_data is not None:
        logger.warning(
            f"Retrieving CAZy data from predownloaded CAZy db dump at:\n{args.cazy_data}"
        )

    get_cazy_data(
        cazy_home_url,
        excluded_classes,
        cazy_class_synonym_dict,
        config_dict,
        class_filters,
        fam_filters,
        kingdom_filters,
        taxonomy_filter_set,
        connection,
        cache_dir,
        logger_name,
        time_stamp,
        args,
    )

    closing_message("cazy_webscraper", start_time, args)
Code Example #3
def get_validation_data(
    cazy_home_url,
    excluded_classes,
    cazy_synonym_dict,
    config_dict,
    cache_dir,
    connection_failures_logger,
    time_stamp,
    args,
):
    """Coordinate retrieving the population sizes of CAZy familes from the CAZy website.
    
    :param cazy_home_url: str, URL to CAZy home page
    :param excluded_classes: list of CAZy classes NOT to scrape
    :param cazy_synonym_dict: dict of accepted CAZy class name synonyms
    :param config_dict: dict keyed by CAZy classes, values by set of CAZy families to scrape
    :param cache_dir: path to cache dir
    :param connection_failures_logger: logger, logg incorrect URLs and URLs to which a connection 
            could not be made
    :param time_stamp: str, time cazy_webscraper was invoked
    :param args: cmd-line args parser
    
    Return dict, keyed by CAZy family (str) and valued by population size (int)
    """
    # make dir for caching HTML files
    cache_dir = cache_dir / "html"
    file_io.make_output_directory(cache_dir, args.force, args.nodelete_cache)

    cazy_fam_populations = {}  # {fam(str): population(int)}

    # retrieve list of CAZy class instances, one instance per class to be scraped
    cazy_classes = get_cazy_classes(
        cazy_home_url,
        excluded_classes,
        cazy_synonym_dict,
        cache_dir,
        time_stamp,
        args,
    )
    if cazy_classes is None:
        return

    for cazy_class in tqdm(cazy_classes,
                           desc="Retrieving CAZy family population sizes"):

        # first attempt of scraping, retrieve URLs to CAZy families
        if len(cazy_class.failed_families) == 0:
            fam_pops_to_retrieve = config_dict[cazy_class.name]  # retrieve user specified fams
        else:
            fam_pops_to_retrieve = list(
                cazy_class.failed_families.keys())  # retry failed connections

        family_populations, err_message, incorrect_urls, failed_families = get_cazy_family_pops(
            cazy_class.name,
            cazy_class.url,
            cazy_home_url,
            fam_pops_to_retrieve,
            cache_dir,
            time_stamp,
            args,
        )

        if incorrect_urls is not None:  # log families for which compiled URL is incorrect
            for url_message in incorrect_urls:
                connection_failures_logger.warning(url_message)

        if family_populations is None:  # couldn't retrieve family populations
            cazy_class.tries += 1

            # check if maximum number of attempts to connect have been met
            if cazy_class.tries == (args.retries +
                                    1):  # Maximum number of tries met
                connection_failures_logger.warning(
                    f"{cazy_class.url}\t"
                    f"{cazy_class.name}\t"
                    f"CAZy family populations not retrieved from {cazy_class.name}\t"
                    f"{err_message}")

            else:
                for fam in failed_families:
                    try:
                        cazy_class.failed_families[fam] += 1
                        if cazy_class.failed_families[fam] == (args.retries +
                                                               1):
                            # max number of attempts made, do not retry connection
                            del cazy_class.failed_families[fam]
                    except KeyError:
                        cazy_class.failed_families[fam] = 1

                cazy_classes.append(
                    cazy_class)  # retry retrieving family populations later
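                # NOTE: appending to the list while it is being iterated extends the
                # tqdm loop, so this class is processed again on a later pass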

            continue  # go onto next CAZy class

        else:  # retrieved CAZy family populations
            cazy_fam_populations.update(family_populations)

    # log any errors that meant no family population could be retrieved
    for cazy_class in cazy_classes:
        for fam in list(cazy_class.failed_families.keys()):
            connection_failures_logger.warning(
                f"{fam}\t"
                "Retrieved no family population for data retrieval validation\n"
                f"Failed to conencted to CAZy after {(args.retries + 1)*(args.retries +1)} attempts"
            )

    return cazy_fam_populations
Code Example #4
def connect_to_new_db(args, time_stamp, start_time):
    """Build and connect to a new local CAZyme database.
    
    :param args: cmd-line args parser
    :param time_stamp: str, time cazy_webscraper was invoked
    :param start_time: pd date-time obj, time cazy_webscraper was invoked
    
    Return connection to the database, name of the logger, and path to the cache dir
    """
    logger = logging.getLogger(__name__)

    if args.db_output is not None:  # user defined target output for the NEW database
        
        if (os.path.isfile(args.db_output)):  # target file exists
            if args.force:
                logger.warning(
                    "Overwriting existing local CAZyme database at:\n"
                    f"{args.db_output}"
                )

            else:
                logger.warning(
                    "Target path for new database already exists.\n"
                    "Either enable forced overwriting (-f) or add data this data (-D).\n"
                    "Terminating program."
                )
                closing_message("cazy_webscraper", start_time, args)
                sys.exit(1)
        
        else:  # may need to build dirs
            logger.info(
                "Building new local CAZyme database\n"
                f"Output directory: {(args.db_output).parent}\n"
                f"Force overwriting exiting output file: {args.force}"
            )

        if str((args.db_output).parent) != '.':  # dirs defined in output path
            output_dir = (args.db_output).parent
            make_output_directory(output_dir, args.force, args.nodelete)
            cache_dir = Path(f"{str(output_dir)}/.cazy_webscraper_{time_stamp}")
            
        else:  # writing to cwd
            cache_dir = Path(f".cazy_webscraper_{time_stamp}")

        logger_name = str(args.db_output).split('.')[0]
        db_path = args.db_output
    
    else:
        logger.info("Using default database name and writing to cwd")
        db_path = Path(f"cazy_webscraper_{time_stamp}.db")
        cache_dir = Path(f".cazy_webscraper_{time_stamp}")
        logger_name = f'cazy_webscraper_{time_stamp}'
    
    try:
        connection = sql_orm.get_db_connection(db_path, args, new=True)
        logger.warning(f"Built new local CAZyme database at\n{db_path}")
    except Exception:
        logger.error(
            "Failed to build new SQL database\n."
            "Terminating program",
            exc_info=True,
        )
        closing_message("cazy_webscraper", start_time, args)
        sys.exit(1)

    return connection, logger_name, cache_dir
Code Example #5
def main(argv: Optional[List[str]] = None,
         logger: Optional[logging.Logger] = None):
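    """Coordinate calculating the local CAZyme database's coverage of GenBank."""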
    time_stamp = datetime.now().strftime(
        "%Y-%m-%d_%H-%M-%S")  # used in naming files
    start_time = datetime.now().strftime(
        "%Y-%m-%d %H:%M:%S")  # used in terminating message
    start_time = pd.to_datetime(start_time)

    # Program preparation
    if argv is None:
        parser = genbank_cov_parser.build_parser()
        args = parser.parse_args()
    else:
        parser = genbank_cov_parser.build_parser(argv)
        args = parser.parse_args()

    if logger is None:
        logger = logging.getLogger(__name__)
        config_logger(args)

    Entrez.email = args.email

    # check if need to build output dir
    if os.getcwd() != args.output_dir:
        make_output_directory(args.output_dir, args.force, args.nodelete)

    # connect to the local CAZyme database
    connection, logger_name, cache_dir = cazy_scraper.connect_existing_db(
        args, time_stamp, start_time)
    # make cache_dir
    make_output_directory(cache_dir, args.force_cache, args.nodelete_cache)

    no_accession_logger = cache_dir / f"no_genomic_accession_retrieved_{time_stamp}.log"

    # load Genbank and Kingdom records from the db
    logger.warning(
        "Retrieving Genbanks, Taxs and Kingdoms records from the local CAZyme db"
    )
    genbank_kingdom_dict = get_table_dicts.get_gbk_kingdom_dict(connection)
    # {kingdom: {genus: {species: {protein_accessions}}}}

    logger.warning(
        f"Retrieved Genbanks, Taxs and Kingdoms records from the local CAZyme db"
    )

    nucleotide_accessions_dict = get_nucleotide_accessions(
        genbank_kingdom_dict,
        no_accession_logger,
        args,
    )

    output_path = cache_dir / f"nucleotide_accessions_{time_stamp}.json"
    with open(output_path, 'w') as fh:
        json.dump(nucleotide_accessions_dict, fh)

    genomic_accession_dict = get_genomic_accessions(nucleotide_accessions_dict,
                                                    no_accession_logger, args)
    output_path = cache_dir / f"genomic_accession_numbers_{time_stamp}.json"
    with open(output_path, 'w') as fh:
        json.dump(genomic_accession_dict, fh)

    write_out_genomic_accessions(genomic_accession_dict, time_stamp, args)

    ncbi_genomes_totals = get_ncbi_counts(args)

    write_out_genome_coverage(ncbi_genomes_totals, genomic_accession_dict,
                              time_stamp, args)

    end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    end_time = pd.to_datetime(end_time)
    total_time = end_time - start_time

    if args.verbose:
        logger.info(
            "Finished calculating the local CAZyme db's coverage of GenBank\n"
            f"Scrape initiated at {start_time}\n"
            f"Scrape finished at {end_time}\n"
            f"Total run time: {total_time}\n"
            f"Version: {cazy_scraper.VERSION_INFO}\n"
            f"Citation: {cazy_scraper.CITATION_INFO}")
    else:
        print("=====================cazy_webscraper=====================\n"
              "Finished calculating the local CAZyme db's coverage of GenBank\n"
              f"Scrape initiated at {start_time}\n"
              f"Scrape finished at {end_time}\n"
              f"Total run time: {total_time}\n"
              f"Version: {cazy_scraper.VERSION_INFO}\n"
              f"Citation: {cazy_scraper.CITATION_INFO}")
Code Example #6
def main(argv: Optional[List[str]] = None,
         logger: Optional[logging.Logger] = None):
    """Set up parser, logger and coordinate overal scrapping of CAZy."""
    time_stamp = datetime.now().strftime(
        "%Y-%m-%d_%H-%M-%S")  # used in naming files
    start_time = datetime.now().strftime(
        "%Y-%m-%d %H:%M:%S")  # used in terminating message
    start_time = pd.to_datetime(start_time)

    # Program preparation
    if argv is None:
        parser = api_parser.build_parser()
        args = parser.parse_args()
    else:
        parser = api_parser.build_parser(argv)
        args = parser.parse_args()

    if logger is None:
        logger = logging.getLogger(__name__)
        config_logger(args)

    connection, logger_name, cache_dir = cazy_scraper.connect_existing_db(
        args, time_stamp, start_time)

    if args.output_dir is not None:
        file_io.make_output_directory(args.output_dir, args.force,
                                      args.nodelete)

    if args.cache_dir is not None:  # use user defined cache dir
        cache_dir = args.cache_dir
        file_io.make_output_directory(cache_dir, args.force,
                                      args.nodelete_cache)
    else:
        cache_dir = cache_dir / "uniprot_data_retrieval"
        file_io.make_output_directory(cache_dir, args.force,
                                      args.nodelete_cache)

    (
        config_dict,
        class_filters,
        family_filters,
        kingdom_filters,
        taxonomy_filter_dict,
        ec_filters,
    ) = get_expansion_configuration(args)

    output_path = compile_output_name(args)

    existing_files = ""
    if 'json' in args.file_types:
        json_output_path = output_path + ".json"
        logger.warning(f"JSON output path: {json_output_path}")
        if Path(json_output_path).exists():
            existing_files = existing_files + " " + f"{json_output_path}\n"

    if 'csv' in args.file_types:
        csv_output_path = output_path + ".csv"
        logger.warning(f"CSV output path: {csv_output_path}")
        if Path(csv_output_path).exists():
            existing_files = existing_files + " " + f"{csv_output_path}\n"

    existing_files = existing_files.strip()
    if len(existing_files) != 0:
        if args.overwrite:
            logger.warning(
                "The output files\n"
                f"{existing_files}\n"
                "exist already. Overwrite is True. Overwriting output files")
        else:
            logger.warning(
                "The output files\n"
                f"{existing_files}\n"
                "exist already. Overwrite is False\n"
                "To overwrite the files use the --overwrite flag, or "
                "change the output file prefix using the --prefix flag\n"
                "Terminating program")
            closing_message("cw_query_database", start_time, args)
            sys.exit(1)

    # get the records of GenBank accessions matching the criteria of interest
    # {gbk_acc: gbk_id}
    gbk_dict = get_selected_gbks.get_genbank_accessions(
        class_filters,
        family_filters,
        taxonomy_filter_dict,
        kingdom_filters,
        ec_filters,
        connection,
    )

    query_data = get_query_data(gbk_dict, connection, args)
    logger.warning(
        f"Retrieved {len(query_data)} proteins from the local db")

    if 'json' in args.file_types:
        write_json_output(json_output_path, query_data, args)

    if 'csv' in args.file_types:
        write_csv_output(query_data, args, csv_output_path)

    closing_message("cw_query_database", start_time, args)
Code Example #7
def main(argv: Optional[List[str]] = None,
         logger: Optional[logging.Logger] = None):
    """Set up programme and initate run."""
    time_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    start_time = datetime.now().strftime(
        "%Y-%m-%d %H:%M:%S")  # used in terminating message
    start_time = pd.to_datetime(start_time)
    # parse cmd-line arguments
    if argv is None:
        parser = pdb_strctre_parser.build_parser()
        args = parser.parse_args()
    else:
        args = pdb_strctre_parser.build_parser(argv).parse_args()

    if logger is None:
        logger = logging.getLogger(__package__)
        config_logger(args)

    connection, logger_name, cache_dir = connect_existing_db(
        args, time_stamp, start_time)

    (
        config_dict,
        class_filters,
        family_filters,
        kingdom_filters,
        taxonomy_filter_dict,
        ec_filters,
    ) = parse_configuration.get_expansion_configuration(args)

    gbk_dict = {}  # {gbk_acc: gbk_id}

    gbk_table_dict = get_table_dicts.get_gbk_table_dict(connection)
    # {genbank_accession: 'taxa_id': str, 'gbk_id': int}

    if args.genbank_accessions is not None:
        logger.warning(
            f"Retrieving PDB structures for GenBank accessions listed in {args.genbank_accessions}"
        )
        gbk_dict.update(get_user_genbank_sequences(gbk_table_dict, args))

    if args.uniprot_accessions is not None:
        logger.warning(
            f"Extracting protein sequences for UniProt accessions listed in {args.uniprot_accessions}"
        )
        uniprot_table_dict = get_table_dicts.get_uniprot_table_dict(connection)
        gbk_dict.update(
            get_user_uniprot_sequences(gbk_table_dict, uniprot_table_dict,
                                       args))

    pdb_accessions = get_selected_pdbs.get_pdb_accessions(
        class_filters,
        family_filters,
        taxonomy_filter_dict,
        kingdom_filters,
        ec_filters,
        gbk_table_dict,
        connection,
    )

    if len(pdb_accessions) == 0:
        logger.warning("No PDB accessions matched the criteria provided.\n"
                       "Retrieving no protein structure files from PDB")
    else:
        logger.warning(
            f"Retrieving {len(pdb_accessions)} structure files from PDB")

    # make output and cache dirs
    if args.cache_dir is not None:  # use user defined cache dir
        cache_dir = args.cache_dir
        make_output_directory(cache_dir, args.force, args.nodelete_cache)
    else:
        cache_dir = cache_dir / "pdb_retrieval"
        make_output_directory(cache_dir, args.force, args.nodelete_cache)

    download_pdb_structures(pdb_accessions, args)

    cache_path = cache_dir / f"pdb_retrieval_{time_stamp}.txt"
    with open(cache_path, 'a') as fh:
        for acc in pdb_accessions:
            fh.write(f"{acc}\n")

    closing_message("Get PDB structure files", start_time, args)
Code Example #8
def main(argv: Optional[List[str]] = None,
         logger: Optional[logging.Logger] = None):
    """Set up programme and initate run."""
    time_stamp = datetime.now().strftime(
        "%Y-%m-%d_%H-%M-%S")  # used in naming files
    start_time = datetime.now().strftime(
        "%Y-%m-%d %H:%M:%S")  # used in terminating message
    start_time = pd.to_datetime(start_time)
    date_today = datetime.now().strftime(
        "%Y-%m-%d")  # used as seq_update_date in the db

    # parse cmd-line arguments
    if argv is None:
        parser = build_parser()
        args = parser.parse_args()
    else:
        args = build_parser(argv).parse_args()

    if logger is None:
        logger = logging.getLogger(__package__)
        config_logger(args)

    logger.info("Providing user email address to NCBI.Entrez")
    Entrez.email = args.email

    if args.seq_update:
        logger.warning("Enabled updating sequences")

    connection, logger_name, cache_dir = connect_existing_db(
        args, time_stamp, start_time)

    if args.cache_dir is not None:  # use user defined cache dir
        cache_dir = args.cache_dir
        make_output_directory(cache_dir, args.force, args.nodelete_cache)
    else:
        cache_dir = cache_dir / "genbank_data_retrieval"
        make_output_directory(cache_dir, args.force, args.nodelete_cache)

    (
        config_dict,
        class_filters,
        family_filters,
        kingdom_filters,
        taxonomy_filter_dict,
        ec_filters,
    ) = get_expansion_configuration(args)

    # add log to the local CAZyme database
    logger.info("Adding log of scrape to the local CAZyme database")
    with sql_orm.Session(bind=connection) as session:
        retrieved_data = "GenBank protein sequences"
        sql_interface.log_scrape_in_db(
            time_stamp,
            config_dict,
            kingdom_filters,
            taxonomy_filter_dict,
            ec_filters,
            'GenBank',
            retrieved_data,
            session,
            args,
        )

    # retrieve dict of genbank accession and genbank db ids from the local CAZyme db
    if args.genbank_accessions is not None:
        logger.warning(
            f"Getting GenBank accessions from file: {args.genbank_accessions}")
        with open(args.genbank_accessions, "r") as fh:
            lines = fh.read().splitlines()

        accessions = [line.strip() for line in lines]
        accessions = set(accessions)

        gbk_dict = get_selected_gbks.get_ids(accessions, connection)

    else:
        gbk_dict = get_selected_gbks.get_genbank_accessions(
            class_filters,
            family_filters,
            taxonomy_filter_dict,
            kingdom_filters,
            ec_filters,
            connection,
        )

    genbank_accessions = list(gbk_dict.keys())
    logger.warning(f"Retrieving GenBank sequences for {len(gbk_dict.keys())}")

    if args.seq_dict:
        logger.warning(f"Getting sequences from cache: {args.seq_dict}")
        with open(args.seq_dict, "r") as fh:
            cache_dict = json.load(fh)

        # convert strs to SeqRecords
        seq_dict = {}
        for key in cache_dict:
            seq_dict[key] = Seq(cache_dict[key])

    else:
        seq_dict, no_seq = get_sequences(genbank_accessions,
                                         args)  # {gbk_accession: seq}

        # only cache the sequence. Seq obj is not JSON serializable
        cache_dict = {}
        for key in seq_dict:
            cache_dict[key] = str(seq_dict[key])

        # cache the retrieved sequences
        cache_path = cache_dir / f"genbank_seqs_{time_stamp}.json"
        with open(cache_path, "w") as fh:
            json.dump(cache_dict, fh)

        if len(no_seq) != 0:
            no_seq_cache = cache_dir / f"no_seq_retrieved_{time_stamp}.txt"
            logger.warning(
                f"No protein sequence retrieved for {len(no_seq)}\n"
                f"The GenBank accessions for these proteins have been written to: {no_seq_cache}"
            )
            with open(no_seq_cache, "a") as fh:
                for acc in no_seq:
                    fh.write(f"{acc}\n")

    logger.warning(
        f"Adding {len(seq_dict)} protein seqs to the db")

    add_genbank_data.add_gbk_seqs_to_db(seq_dict, date_today, gbk_dict,
                                        connection, args)

    closing_message("get_genbank_sequences", start_time, args)
Code Example #9
def main(argv: Optional[List[str]] = None,
         logger: Optional[logging.Logger] = None):
    """Set up programme and initate run."""
    time_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    start_time = datetime.now().strftime(
        "%Y-%m-%d %H:%M:%S")  # used in terminating message
    start_time = pd.to_datetime(start_time)
    # parse cmd-line arguments
    if argv is None:
        parser = build_parser()
        args = parser.parse_args()
    else:
        args = build_parser(argv).parse_args()

    if logger is None:
        logger = logging.getLogger(__package__)
        config_logger(args)

    validate_user_options(args)

    # make output directories
    if args.fasta_file:
        target_dir = args.fasta_file.parent
        make_output_directory(target_dir, args.force, args.nodelete)
    if args.fasta_dir:
        make_output_directory(args.fasta_dir, args.force, args.nodelete)

    connection, logger_name, cache_dir = connect_existing_db(
        args, time_stamp, start_time)

    if args.cache_dir is not None:  # use user defined cache dir
        cache_dir = args.cache_dir
        make_output_directory(cache_dir, args.force, args.nodelete_cache)
    else:
        cache_dir = cache_dir / "sequence_extraction"
        make_output_directory(cache_dir, args.force, args.nodelete_cache)

    (
        config_dict,
        class_filters,
        family_filters,
        kingdom_filters,
        taxonomy_filter_dict,
        ec_filters,
    ) = parse_configuration.get_expansion_configuration(args)

    gbk_table_dict = get_table_dicts.get_gbk_table_dict(connection)
    # {genbank_accession: 'taxa_id': str, 'gbk_id': int}

    # check which additional tables need to be loaded
    if (args.genbank_accessions is not None
            or args.uniprot_accessions is not None
            or 'genbank' in args.source):
        logger.info("Loading the GenBank table")
        gbk_seq_dict = get_table_dicts.get_gbk_table_seq_dict(connection)
        # {genbank_accession: 'sequence': str, 'seq_date': str}

    if args.uniprot_accessions is not None or 'uniprot' in args.source:
        logger.info("Loading the UniProt table")
        uniprot_table_dict = get_table_dicts.get_uniprot_table_dict(connection)
        # {acc: {name: str, gbk_id: int, seq: str, seq_date:str } }

    # build dict {gbk_acc: db_id} matching the user's specified criteria,
    # either via a list in a file or parameters provided via config file and/or command line

    gbk_dict = {}  # {gbk_acc: gbk_id}

    if args.genbank_accessions is not None:
        logger.warning(
            f"Extracting protein sequences for GenBank accessions listed in {args.genbank_accessions}"
        )
        gbk_dict.update(get_user_genbank_sequences(gbk_table_dict, args))

    if args.uniprot_accessions is not None:
        logger.warning(
            f"Extracting protein sequences for UniProt accessions listed in {args.uniprot_accessions}"
        )
        gbk_dict.update(
            get_user_uniprot_sequences(gbk_table_dict, uniprot_table_dict,
                                       args))

    if len(gbk_dict) == 0:
        gbk_dict = get_selected_gbks.get_genbank_accessions(
            class_filters,
            family_filters,
            taxonomy_filter_dict,
            kingdom_filters,
            ec_filters,
            connection,
        )

    # extract protein sequences from the database

    extracted_sequences = {}  # {accession: {'db': str, 'seq': str}}

    if 'genbank' in args.source:
        extracted_sequences.update(
            get_genbank_sequences(gbk_seq_dict, gbk_dict))

    if 'uniprot' in args.source:
        extracted_sequences.update(
            get_uniprot_sequences(uniprot_table_dict, gbk_dict))

    protein_records = []

    for protein_accession in tqdm(extracted_sequences, desc="Compiling SeqRecords"):
        try:
            new_record = SeqRecord(
                Seq(extracted_sequences[protein_accession]['seq']),
                id=protein_accession,
                description=extracted_sequences[protein_accession]['db'])
            protein_records.append(new_record)
        except TypeError:
            if extracted_sequences[protein_accession]['seq'] is not None:
                logger.warning(
                    f"Seq for {protein_accession} retrieved as type {type(extracted_sequences[protein_accession]['seq'])}\n"
                    "Not adding to FASTA file")
            pass  # passed when sequence is None

    # write out the sequences to the specified outputs

    logger.warning(f"Extracted {len(protein_records)}")

    write_output(protein_records, cache_dir, args)

    closing_message("extract_sequences", start_time, args)