Exemple #1
0
def main(unparsed_args_list):
    """Run main get_gb_records pipeline."""
    # Parse command line arguments
    args = parse_args(unparsed_args_list)
    date = time.strftime("%Y%m%d")
    args.output_folder = basic.set_path(args.output_folder,
                                        kind="dir",
                                        expect=True)

    working_dir = pathlib.Path(f"{date}_get_gb_records")
    working_path = basic.make_new_dir(args.output_folder,
                                      working_dir,
                                      attempt=10)

    if working_path is None:
        print(f"Invalid working directory '{working_dir}'")
        sys.exit(1)
    ncbi_cred_dict = ncbi.get_ncbi_creds(args.ncbi_credentials_file)

    # Verify database connection and schema compatibility.
    print("Connecting to the MySQL database...")
    engine = mysqldb.connect_to_db(args.database)
    mysqldb.check_schema_compatibility(engine, "the get_gb_records pipeline")

    # Create data sets
    print("Retrieving accessions from the database...")
    accessions = mysqldb.get_distinct_data(engine, "phage", "Accession")
    engine.dispose()
    if "" in accessions:
        accessions.remove("")
    if None in accessions:
        accessions.remove(None)

    get_genbank_data(working_path, accessions, ncbi_cred_dict)
Exemple #2
0
 def test_connect_to_db_2(self, get_engine_mock, sys_exit_mock):
     """Verify that sys exit is called when engine is None."""
     get_engine_mock.return_value = (None, "")
     engine = mysqldb.connect_to_db(db2)
     with self.subTest():
         self.assertTrue(get_engine_mock.called)
     with self.subTest():
         self.assertTrue(sys_exit_mock.called)
Exemple #3
0
 def test_connect_to_db_1(self, getpass_mock):
     """Verify that engine returned when valid info provided."""
     getpass_mock.side_effect = [user, pwd]
     engine = mysqldb.connect_to_db(db2)
     with self.subTest():
         self.assertTrue(getpass_mock.called)
     with self.subTest():
         self.assertIsNotNone(engine)
Exemple #4
0
def main(unparsed_args_list, engine1=None):
    """Run main conversion pipeline."""
    # Parse command line arguments
    args = parse_args(unparsed_args_list)
    if engine1 is None:
        engine1 = mysqldb.connect_to_db(args.database)
    target = args.schema_version
    actual = mysqldb.get_schema_version(engine1)
    steps, dir = get_conversion_direction(actual, target)

    # Iterate through list of versions and implement SQL files.
    if dir == "none":
        if args.verbose == True:
            print("No schema conversion is needed.")
        convert = False
    else:
        convert = True

    if convert == True:
        if (args.new_database_name is not None
                and args.new_database_name != args.database):
            result = mysqldb.drop_create_db(engine1, args.new_database_name)
            if result == 0:
                result = mysqldb.copy_db(engine1, args.new_database_name)
                if result == 0:
                    # Create a new connection to the new database.
                    engine2, msg = mysqldb.get_engine(
                        database=args.new_database_name,
                        username=engine1.url.username,
                        password=engine1.url.password,
                        echo=False)
                else:
                    print("Error: Unable to copy the database for conversion.")
                    convert = False
            else:
                print(
                    "Error: Unable to create the new database for conversion.")
                convert = False
        else:
            engine2 = engine1

        if convert == True:
            stop_step, summary = convert_schema(engine2,
                                                actual,
                                                dir,
                                                steps,
                                                verbose=args.verbose)
            engine2.dispose()
            if stop_step == target:
                if args.verbose == True:
                    print("\n\nThe database schema conversion was successful.")
            else:
                print("\n\nError: "
                      "The database schema conversion was not successful. "
                      f"Unable to proceed past schema version {stop_step}.")
            if args.verbose == True:
                print_summary(summary)
    engine1.dispose()
Exemple #5
0
def main(unparsed_args):
    """Runs the complete update pipeline."""
    args = parse_args(unparsed_args)

    # Verify database connection and schema compatibility.
    print("Connecting to the MySQL database...")
    engine = mysqldb.connect_to_db(args.database)
    mysqldb.check_schema_compatibility(engine, "the update pipeline")

    if args.version == True:
        mysqldb.change_version(engine)
        print("Database version updated.")

    if args.ticket_table is not None:
        update_table_path = basic.set_path(args.ticket_table,
                                           kind="file",
                                           expect=True)

        # Iterate through the tickets and process them sequentially.
        list_of_update_tickets = []
        with update_table_path.open(mode='r') as f:
            file_reader = csv.DictReader(f)
            for dict in file_reader:
                list_of_update_tickets.append(dict)

        # Variables to be used for end summary
        processed = 0
        succeeded = 0
        failed = 0

        for dict in list_of_update_tickets:
            conn = engine.connect()
            # Pass the raw db_api connection
            handler = RandomFieldUpdateHandler(conn.connection)
            handler.table = dict["table"]  # Which table will be updated?
            handler.field = dict["field"]  # Which field will be updated?
            handler.value = dict[
                "value"]  # What value will be put in that field?
            handler.key_name = dict[
                "key_name"]  # How will we know which row is the right one?
            handler.key_value = dict[
                "key_value"]  # How will we know which row is the right one?
            handler.validate_ticket(
            )  # Make sure all handler attributes are valid
            status = handler.execute_ticket()  # Do what was requested
            if status == 1:
                processed += 1
                succeeded += 1
            else:
                processed += 1
                failed += 1

        engine.dispose()
        print("\nDone iterating through tickets.")
        if succeeded > 0:
            print(f"{succeeded} / {processed} tickets successfully handled.")
        if failed > 0:
            print(f"{failed} / {processed} tickets failed to be handled.")
Exemple #6
0
def main(unparsed_args_list):
    """Run main freeze database pipeline."""
    args = parse_args(unparsed_args_list)
    # engine1, msg = mysqldb.get_engine(database=args.database, echo=False)

    # Verify database connection and schema compatibility.
    print("Connecting to the MySQL database...")
    engine1 = mysqldb.connect_to_db(args.database)
    mysqldb.check_schema_compatibility(engine1, "the freeze pipeline")

    # Get the number of draft genomes.
    query = "SELECT count(*) as count FROM phage WHERE Status != 'draft'"
    result = engine1.execute(query).fetchall()
    phage_count = result[0]["count"]

    # Create the new frozen database folder e.g. Actinobacteriophage_XYZ.
    prefix = get_prefix()
    new_database = f"{prefix}_{phage_count}"

    # Create the new database
    result = mysqldb.drop_create_db(engine1, new_database)

    # Copy database.
    if result == 0:
        result = mysqldb.copy_db(engine1, new_database)
        if result == 0:
            print(f"Deleting 'draft' genomes...")
            engine2, msg = mysqldb.get_engine(username=engine1.url.username,
                                              password=engine1.url.password,
                                              database=new_database)

            statement3 = "DELETE FROM phage WHERE Status = 'draft'"
            engine2.execute(statement3)
            statement4 = "UPDATE version SET Version = 0"
            engine2.execute(statement4)
            # Close up all connections in the connection pool.
            engine2.dispose()
        else:
            print("Unable to copy the database.")
        # Close up all connections in the connection pool.
        engine1.dispose()
    else:
        print(f"Error creating new database: {new_database}.")
    print("Freeze database script completed.")
Exemple #7
0
def main(argument_list):
    """
    :param argument_list:
    :return:
    """
    # Setup argument parser
    cdd_parser = setup_argparser()

    # Use argument parser to parse argument_list
    args = cdd_parser.parse_args(argument_list)

    # Store arguments in more easily accessible variables
    database = args.db
    cdd_dir = expand_path(args.dir)
    cdd_name = learn_cdd_name(cdd_dir)
    threads = args.threads
    evalue = args.evalue
    rpsblast = args.rpsblast
    tmp_dir = args.tmp_dir
    output_folder = args.output_folder
    log_file = args.log_file
    reset = args.reset

    # Set up directory.
    output_folder = basic.set_path(output_folder, kind="dir", expect=True)
    results_folder = pathlib.Path(RESULTS_FOLDER)
    results_path = basic.make_new_dir(output_folder,
                                      results_folder,
                                      attempt=10)
    if results_path is None:
        print("Unable to create output_folder.")
        sys.exit(1)

    log_file = pathlib.Path(results_path, log_file)

    # Set up root logger.
    logging.basicConfig(
        filename=log_file,
        filemode="w",
        level=logging.DEBUG,
        format="pdm_utils find_domains: %(levelname)s: %(message)s")
    logger.info(f"pdm_utils version: {VERSION}")
    logger.info(f"CDD run date: {constants.CURRENT_DATE}")
    logger.info(f"Command line arguments: {' '.join(argument_list)}")
    logger.info(f"Results directory: {results_path}")

    # Early exit if either 1) cdd_name == "" or 2) no rpsblast given and we are
    # unable to find one
    if cdd_name == "":
        msg = (f"Unable to learn CDD database name. Make sure the files in "
               f"{cdd_dir} all have the same basename.")
        logger.error(msg)
        print(msg)
        return

    # Get the rpsblast command and path.
    if rpsblast == "":
        command = get_rpsblast_command()
        rpsblast = get_rpsblast_path(command)

    # Verify database connection and schema compatibility.
    engine = mysqldb.connect_to_db(database)
    logger.info(f"Connected to database: {database}.")
    mysqldb.check_schema_compatibility(engine, "the find_domains pipeline")
    logger.info(f"Schema version is compatible.")
    logger.info("Command line arguments verified.")

    if reset:
        logger.info("Clearing all domain data currently in the database.")
        clear_domain_data(engine)

    # Get gene data that needs to be processed
    # in dict format where key = column name, value = stored value.
    # result = engine.execute(GET_GENES_FOR_CDD)
    cdd_genes = mysqldb.query_dict_list(engine, GET_GENES_FOR_CDD)
    msg = f"{len(cdd_genes)} genes to search for conserved domains..."
    logger.info(msg)
    print(msg)

    # Only run the pipeline if there are genes returned that need it
    if len(cdd_genes) > 0:
        log_gene_ids(cdd_genes)

        # Create temp_dir
        make_tempdir(tmp_dir)

        # TODO dev
        # translations = get_unique_translations(cdd_genes)

        # Build jobs list
        jobs = []

        # TODO dev
        # translation_id = 0
        # for translation in translations:
        #     translation_id += 1
        #     jobs.append((rpsblast, cdd_name, tmp_dir, evalue,
        #                  translation_id, translation))

        for cdd_gene in cdd_genes:
            jobs.append((rpsblast, cdd_name, tmp_dir, evalue,
                         cdd_gene["GeneID"], cdd_gene["Translation"]))

        results = parallelize(jobs, threads, search_and_process)
        print("\n")

        # TODO dev
        # results_dict = create_results_dict(results)
        # map_results_to_genes(cdd_genes, results_dict)

        insert_domain_data(engine, results)
        engine.dispose()
    return
Exemple #8
0
def main(argument_list):
    # Set up the argument parser
    phamerate_parser = setup_argparser()

    # Parse arguments
    args = phamerate_parser.parse_args(argument_list)
    program = args.program
    temp_dir = args.temp_dir

    # Initialize SQLAlchemy engine with database provided at CLI
    engine = mysqldb.connect_to_db(args.db)

    # If we made it past the above connection_status() check, database access
    # works (user at least has SELECT privileges on the indicated database).
    # We'll assume that they also have UPDATE, INSERT, and TRUNCATE privileges.

    # Record start time
    start = datetime.datetime.now()

    # Refresh temp_dir
    if os.path.exists(temp_dir):
        try:
            shutil.rmtree(temp_dir)
        except OSError:
            print(f"Failed to delete existing temp directory '{temp_dir}'")
            return
    try:
        os.makedirs(temp_dir)
    except OSError:
        print(f"Failed to create new temp directory '{temp_dir}")
        return

    # Get old pham data and un-phamerated genes
    old_phams = get_pham_geneids(engine)
    old_colors = get_pham_colors(engine)
    unphamerated = get_new_geneids(engine)

    # Get GeneIDs & translations, and translation groups
    genes_and_trans = map_geneids_to_translations(engine)
    translation_groups = map_translations_to_geneids(engine)

    # Write input fasta file
    write_fasta(translation_groups, temp_dir)

    # Create clusterdb and perform clustering
    program_params = get_program_params(program, args)
    create_clusterdb(program, temp_dir)
    phamerate(program_params, program, temp_dir)

    # Parse phameration output
    new_phams = parse_output(program, temp_dir)
    new_phams = reintroduce_duplicates(new_phams, translation_groups,
                                       genes_and_trans)

    # Preserve old pham names and colors
    new_phams, new_colors = preserve_phams(old_phams, new_phams, old_colors,
                                           unphamerated)

    # Early exit if we don't have new phams or new colors - avoids
    # overwriting the existing pham data with potentially incomplete new data
    if len(new_phams) == 0 or len(new_colors) == 0:
        print("Failed to parse new pham/color data... Terminating pipeline")
        return

    # If we got past the early exit, we are probably safe to truncate the
    # pham table, and insert the new pham data
    # Clear old pham data - auto commits at end of transaction - this will also
    # set all PhamID values in gene table to NULL
    commands = ["DELETE FROM pham"]
    mysqldb.execute_transaction(engine, commands)

    # Insert new pham/color data
    reinsert_pham_data(new_phams, new_colors, engine)

    # Fix miscolored phams/orphams
    fix_miscolored_phams(engine)

    # Close all connections in the connection pool.
    engine.dispose()

    # Record stop time
    stop = datetime.datetime.now()

    # Report phameration elapsed time
    print("Elapsed time: {}".format(str(stop - start)))
Exemple #9
0
def main(unparsed_args_list):
    """Run main retrieve_updates pipeline."""
    # Parse command line arguments
    args = parse_args(unparsed_args_list)
    date = time.strftime("%Y%m%d")

    args.output_folder = basic.set_path(args.output_folder,
                                        kind="dir",
                                        expect=True)

    working_dir = pathlib.Path(f"{date}_get_data")
    working_path = basic.make_new_dir(args.output_folder,
                                      working_dir,
                                      attempt=10)

    if working_path is None:
        print(f"Invalid working directory '{working_dir}'")
        sys.exit(1)

    ncbi_cred_dict = ncbi.get_ncbi_creds(args.ncbi_credentials_file)

    # Verify database connection and schema compatibility.
    print("Preparing genome data sets from the MySQL database...")
    engine = mysqldb.connect_to_db(args.database)
    mysqldb.check_schema_compatibility(engine, "the get_data pipeline")

    # Get existing data from MySQL to determine what needs to be updated.
    query = ("SELECT PhageID, Name, HostGenus, Status, Cluster, "
             "DateLastModified, Accession, RetrieveRecord, Subcluster, "
             "AnnotationAuthor FROM phage")

    mysqldb_genome_list = mysqldb.parse_genome_data(engine=engine,
                                                    phage_query=query,
                                                    gnm_type="mysqldb")
    engine.dispose()
    mysqldb_genome_dict = {}
    for gnm in mysqldb_genome_list:
        mysqldb_genome_dict[gnm.id] = gnm

    # Get data from PhagesDB
    if (args.updates or args.final or args.draft) is True:
        print("Retrieving data from PhagesDB...")
        phagesdb_phages = phagesdb.get_phagesdb_data(constants.API_SEQUENCED)
        phagesdb_phages_dict = basic.convert_list_to_dict(
            phagesdb_phages, "phage_name")
        phagesdb_genome_dict = phagesdb.parse_genomes_dict(
            phagesdb_phages_dict, gnm_type="phagesdb", seq=False)

        # Exit if all phage data wasn't retrieved.
        if len(phagesdb_genome_dict) == 0:
            sys.exit(1)

        # Returns a list of tuples.
        match_output = match_genomes(mysqldb_genome_dict, phagesdb_genome_dict)
        matched_genomes = match_output[0]
        unmatched_phagesdb_ids = match_output[1]

    if args.updates is True:
        get_update_data(working_path, matched_genomes)
    if args.final is True:
        get_final_data(working_path, matched_genomes)
    if args.genbank is True:
        get_genbank_data(working_path, mysqldb_genome_dict, ncbi_cred_dict,
                         args.genbank_results)
    if args.draft is True:
        get_draft_data(working_path, unmatched_phagesdb_ids)
    print("\n\n\nRetrieve updates script completed.")