def test_ErrorConfigParameter():
    '''Check the message of ErrorConfigParameter (raised for an incorrect configuration file parameter)'''
    # Arbitrary parameter name that should be echoed back in the message
    param_name = "TestParameter"
    # Expected text: a fixed header line followed by the parameter on its own line
    expected_msg = "\n".join([
        "\n\nA parameter name and/or value in the configuration file is set incorrectly:",
        param_name,
    ])
    # Instantiate the error and compare its string form
    raised = NCBImetaErrors.ErrorConfigParameter(param_name)
    assert str(raised) == expected_msg
def test_ErrorMaxFetchAttemptsExceeded():
    '''Check the message of ErrorMaxFetchAttemptsExceeded (raised when fetch attempts run out)'''
    # Arbitrary record identifier, given as a string
    record_id = '123456789'
    # Instantiate the error
    raised = NCBImetaErrors.ErrorMaxFetchAttemptsExceeded(record_id)
    # The message is a fixed header followed by the ID on the next line
    header = "\n\nThe Maximum number of fetch attempts was exceeded for ID:"
    assert str(raised) == header + "\n" + record_id
def test_ErrorConfigFileNotExists(tmpdir):
    '''Check the message of ErrorConfigFileNotExists (raised when a configuration file doesn't exist)'''
    # Only a path is built here; nothing is written to disk
    missing_config = os.path.join(tmpdir.strpath, "tmpfile")
    # Instantiate the error
    raised = NCBImetaErrors.ErrorConfigFileNotExists(missing_config)
    # Expected text: explanation line, then the location that was given
    expected_lines = (
        "\n\nConfig file does not exist in the specified location.",
        "Location specified: " + missing_config,
    )
    assert str(raised) == "\n".join(expected_lines)
def test_ErrorColumnsNotUnique():
    '''Check the message of ErrorColumnsNotUnique (raised when there are non-unique columns in a database)'''
    # Arbitrary column name that should be echoed back in the message
    column = "TestColumn"
    # Expected text: a fixed header line followed by the column name
    expected_msg = "\n".join([
        "\n\nThe following columns are not unique in the database:",
        column,
    ])
    # Instantiate the error and compare its string form
    raised = NCBImetaErrors.ErrorColumnsNotUnique(column)
    assert str(raised) == expected_msg
def test_ErrorEntryNotInDB():
    '''Check the message of ErrorEntryNotInDB (raised when an entry doesn't exist in a database)'''
    # Arbitrary entry name that should be echoed back in the message
    entry = "TestEntry"
    # Instantiate the error
    raised = NCBImetaErrors.ErrorEntryNotInDB(entry)
    # Expected text: explanation line, then the unknown entry
    explanation = "\n\nThe entry does not exist in the database."
    detail = "Unknown entry found: " + entry
    assert str(raised) == explanation + "\n" + detail
def test_ErrorEntryMultipleMatches():
    '''Check the message of ErrorEntryMultipleMatches (raised when an entry matches several database rows)'''
    # Arbitrary entry name that should be echoed back in the message
    entry = "TestEntry"
    # Expected text: explanation line, then the ambiguous entry
    expected_lines = (
        "\n\nThe entry has multiple matches in the database.",
        "Multiple matches for entry: " + entry,
    )
    # Instantiate the error and compare its string form
    raised = NCBImetaErrors.ErrorEntryMultipleMatches(entry)
    assert str(raised) == "\n".join(expected_lines)
def test_ErrorSQLNameSanitize():
    '''Check the message of ErrorSQLNameSanitize (raised when a table name is improperly formatted)'''
    # A name containing SQL punctuation, and the cleaned-up name to suggest
    unsafe_name = "); drop tables --"
    suggested_name = "droptables"
    # Expected text quotes the bad name and then the suggested replacement
    expected_msg = "".join([
        "\n\nThe name: ",
        unsafe_name,
        " contains problematic characters. Please rename it to: ",
        suggested_name,
    ])
    # Instantiate the error and compare its string form
    raised = NCBImetaErrors.ErrorSQLNameSanitize(unsafe_name, suggested_name)
    assert str(raised) == expected_msg
def test_ErrorTableNotInDB(tmpdir):
    '''Check the message of ErrorTableNotInDB (raised when a table doesn't exist in a database)'''
    # A path string stands in for the table name; nothing is written to disk
    table_stub = os.path.join(tmpdir.strpath, "tmpfile")
    # Instantiate the error
    raised = NCBImetaErrors.ErrorTableNotInDB(table_stub)
    # Expected text: explanation line, then the unknown table
    explanation = "\n\nThe table does not exist in the database."
    detail = "Unknown table found: " + table_stub
    assert str(raised) == explanation + "\n" + detail
# Example #9
def test_ErrorOutputDirNotExists(tmpdir):
    '''Check the message of ErrorOutputDirNotExists (raised when a directory doesn't exist)'''
    # Work with the plain string path of the temporary directory
    dir_path = tmpdir.strpath
    # Expected text: explanation line, then what the user entered
    expected_lines = (
        "\n\nOutput directory does not exist.",
        "User entered: " + dir_path,
    )
    # Instantiate the error and compare its string form
    raised = NCBImetaErrors.ErrorOutputDirNotExists(dir_path)
    assert str(raised) == "\n".join(expected_lines)
def test_ErrorMaxReadAttemptsExceeded():
    '''Check the message of ErrorMaxReadAttemptsExceeded (raised when read attempts run out)'''
    # Arbitrary table name that should be echoed back in the message
    table = 'TestTable'
    # Instantiate the error
    raised = NCBImetaErrors.ErrorMaxReadAttemptsExceeded(table)
    # The message is a fixed header followed by the table on the next line
    header = "\n\nThe Maximum number of read attempts was exceeded for table:"
    assert str(raised) == header + "\n" + table
def test_ErrorDBNotExists(tmpdir):
    '''Check the message of ErrorDBNotExists (raised when a database doesn't exist)'''
    # Only a path is built here; no database file is created
    missing_db = os.path.join(tmpdir.strpath, "tmpfile")
    # Expected text: explanation line, then the database path
    expected_msg = "\n".join(["\n\nDatabase does not exist.", missing_db])
    # Instantiate the error and compare its string form
    raised = NCBImetaErrors.ErrorDBNotExists(missing_db)
    assert str(raised) == expected_msg
def test_ErrorConfigYAMLFormat(tmpdir):
    '''Check the message of ErrorConfigYAMLFormat (raised when a configuration file is improperly formatted)'''
    # Only a path is built here; nothing is written to disk
    config_stub = os.path.join(tmpdir.strpath, "tmpfile")
    # Instantiate the error
    raised = NCBImetaErrors.ErrorConfigYAMLFormat(config_stub)
    # The message is a fixed header followed by the path on the next line
    header = "\n\nThe configuration file could not be loaded, please confirm that this is a proper YAML file: "
    assert str(raised) == header + "\n" + config_stub
# Example #13
def HTTPErrorCatch(http_method, max_fetch_attempts, sleep_time, **kwargs):
    '''
    Return result of http_method and check if an HTTP Error is generated

    Parameters:
    http_method (function): An http record-fetching or searching method.
    max_fetch_attempts (int): Maximum number of tries for fetching a record_dict.
    sleep_time (float): Number of seconds to wait in between fetch read_attempts.
    kwargs(dict): keyword arguments for the http_method function.

    Returns:
    Whatever http_method returns on its first successful call.

    Raises:
    NCBImetaErrors.ErrorMaxFetchAttemptsExceeded: every attempt failed with an
    HTTP or URL error (also raised when max_fetch_attempts is 0).
    '''
    # Local import keeps this helper self-contained; only the 429 back-off needs it
    import time

    fetch_attempts = 0
    # Remember the most recent failure so it can be chained to the final error
    last_error = None

    # Attempt the http_method function, wrapped in HTTP error checking
    while fetch_attempts < max_fetch_attempts:
        try:
            return http_method(**kwargs)
        # HTTP Errors (must be caught before URLError, which is its parent class)
        except urllib.error.HTTPError as error:
            fetch_attempts += 1
            last_error = error
            print("HTTP Error " + str(error.code) + ": " + str(error.reason))
            print("Fetch Attempt: " + str(fetch_attempts) + "/" +
                  str(max_fetch_attempts))
            # Error code 429: Too Many Requests, back off before the next try
            if error.code == 429:
                print("Sleeping for " + str(sleep_time) +
                      " seconds before retrying.")
                time.sleep(sleep_time)
            # General HTTP Error Code, non specific
            else:
                print("Retrying record fetching.")
        # URL Errors
        except urllib.error.URLError as error:
            fetch_attempts += 1
            last_error = error
            print("URL Error: " + str(error.reason))
            print("Fetch Attempt: " + str(fetch_attempts) + "/" +
                  str(max_fetch_attempts))
            print("Retrying record fetching.")

    # The maximum number of fetch attempts has been exceeded.
    # No "ID" name exists in this scope, so describe the request from kwargs:
    # fetch/summary calls carry "id", search calls carry "term".
    record_id = str(kwargs.get("id", kwargs.get("term", "unknown")))
    raise NCBImetaErrors.ErrorMaxFetchAttemptsExceeded(record_id) from last_error
# Example #14
# Retrieve user parameters as a plain dictionary
args = vars(parser.parse_args())

db_name = args['dbName']
output_dir = args['outputDir']

#                           Argument Checking                           #

# Check if database exists. This must happen before sqlite3.connect,
# which would otherwise create an empty database file at that path.
if not os.path.exists(db_name):
    raise NCBImetaErrors.ErrorDBNotExists(db_name)

# Check if output dir exists
if not os.path.exists(output_dir):
    raise NCBImetaErrors.ErrorOutputDirNotExists(output_dir)

# no errors were raised, safe to connect to db
conn = sqlite3.connect(db_name)
print('\nOpening database: ' + db_name, flush=True)
cur = conn.cursor()

#                         Process Database                              #

# Get a list of tables
cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
table_list = cur.fetchall()
db_name = args['dbName']
db_table = args['dbTable']
annot_file_name = args['annotFile']
# Separator used when several values are stored in one database field
db_value_sep = ";"

#                           Argument Checking                           #

#---------------------------Check Database------------------------------#

# The existence check must precede sqlite3.connect, which would otherwise
# create an empty database file at that path.
if not os.path.exists(db_name):
    raise NCBImetaErrors.ErrorDBNotExists(db_name)

if not os.path.exists(annot_file_name):
    raise NCBImetaErrors.ErrorAnnotFileNotExists(annot_file_name)

# no errors were raised, safe to connect to db
conn = sqlite3.connect(db_name)
print('\nOpening database: ' + db_name, flush=True)
cur = conn.cursor()

#---------------------------Check Table---------------------------------#

# Check table name: reject it when sanitizing would change it
table_name = db_table
table_name_sanitize = NCBImetaUtilities.sql_sanitize(table_name)
if table_name != table_name_sanitize:
    raise NCBImetaErrors.ErrorSQLNameSanitize(table_name, table_name_sanitize)
# Example #16
parser.add_argument('--version', action='version', version='%(prog)s v0.6.1')

# Retrieve user parameters
args = vars(parser.parse_args())

config_path = args['configPath']
flat_mode = args['flatMode']

#                            Error Catching                                    #

# Check if configuration file exists
if not os.path.exists(config_path):
    raise NCBImetaErrors.ErrorConfigFileNotExists(config_path)

# Load the YAML configuration file
# NOTE(review): FullLoader is not meant for untrusted input; if configuration
# files can come from an untrusted source, consider yaml.safe_load instead.
with open(config_path) as config_file:
    config_data = yaml.load(config_file, Loader=yaml.FullLoader)
    if config_data is None:
        # Report the path (a string), not the open file object: the error
        # message is built by concatenating this value onto a string.
        raise NCBImetaErrors.ErrorConfigYAMLFormat(config_path)

# Retrieve configuration file values and error catching
#--- Output Directory ---#
try:
    CONFIG_OUTPUT_DIR = config_data["OUTPUT_DIR"]
except KeyError:
    raise NCBImetaErrors.ErrorConfigParameter("OUTPUT_DIR")
#--- User Email ---#
# Example #17
def UpdateDB(table, output_dir, database, email, search_term, table_columns,
             log_path, db_dir, api_key, force_pause_seconds):
    # NOTE(review): the description and parameter list below read like the
    # function docstring, but the enclosing quotes are missing in this copy.
    Update the contents of a local sqlite database using records retrieved from NCBI as configured by the user.

    table (str): Name of the NCBI database to search.
    output_dir (str): Path to the directory where output is written.
    database (str): Filename of the local sqlite database.
    email (str): User email.
    search_term (str): Entrez search query.
    table_columns(dict): Dictionary of column name and API name as value, ex. {AssemblyGenbankID : GbUid}.
    log_path(str): Path to the directory where the logfile is stored in.
    db_dir(str): Path to the directory where the database is stored in.
    api_key(str): NCBI user account API Key.
    force_pause_seconds(float): Number of seconds to wait in between fetch read_attempts.

    # NOTE(review): the print call below ends in "flush=True) = email", so it
    # looks spliced with a following assignment (presumably Entrez.email = email
    # - confirm against upstream).
    print("\nCreating/Updating the " + table +
          " table using the following parameters: " + "\n" + "\t" +
          "Database: " + "\t\t" + database + "\n" + "\t" + "Search Term:" +
          "\t" + "\t" + search_term + "\n" + "\t" + "Email: " + "\t\t\t" +
          email + "\n" + "\t" + "API Key: " + "\t\t" + api_key + "\n" + "\t" +
          "Output Directory: " + "\t" + output_dir + "\n\n",
          flush=True) = email
    Entrez.api_key = api_key
    # Allow a maximum of 3 tries for error catching before exiting program
    Entrez.max_tries = 3
    # Sleep for 1 second after an error has been generated before retrying
    Entrez.sleep_between_tries = 1

    #                                File Setup                                 #
    # Name of Log File
    # NOTE(review): this uses the name LOG_PATH, not the log_path parameter -
    # confirm which one is intended.
    log_file_path = os.path.join(
        LOG_PATH, "",
        os.path.splitext(database)[0] + "_" + table + ".log")

    # Check if the file already exists, either write or append to it.
    # NOTE(review): both open() calls sit in the same branch here; an "else:"
    # between them appears to be missing.
    if os.path.exists(log_file_path):
        log_file = open(log_file_path, "a")
        log_file = open(log_file_path, "w")

    #                                SQL Setup                                 #

    # Check for problematic table name
    table_name = table
    table_name_sanitize = NCBImetaUtilities.sql_sanitize(table_name)
    if table_name != table_name_sanitize:
        # NOTE(review): this call is cut off after its first argument.
        raise NCBImetaErrors.ErrorSQLNameSanitize(table_name,

    # Connect to database and establish cursor for commands.
    conn = sqlite3.connect(os.path.join(db_dir, "", database))
    cur = conn.cursor()

    ## Create the database, with dynamic variables from config file
    # NOTE(review): as written the statement text has no opening parenthesis
    # although ")" is appended further down; part of the string appears lost.
    sql_query = ("Create TABLE IF NOT EXISTS " + table +
                 table + "_id TEXT")

    # Each item of table_columns is treated as a one-entry dict; only its
    # first key (the column name) is used in this loop.
    for column_name_dict in table_columns:
        column_name = list(column_name_dict.keys())[0]
        # Check for problematic column name
        col_name_sanitize = NCBImetaUtilities.sql_sanitize(column_name)
        if column_name != col_name_sanitize:
            # NOTE(review): this call is cut off after its first argument.
            raise NCBImetaErrors.ErrorSQLNameSanitize(column_name,

        # By default, every user-specified column is type TEXT
        sql_query += ", " + column_name + " TEXT"

    sql_query += ")"
    # NOTE(review): no cur.execute(sql_query) for the CREATE TABLE statement
    # is visible in this copy.


    #                          Entrez Search                                #
    # Read the record, check for http, url, and runtime errors
    read_succeed = False
    read_attempts = 0

    # Database reading and entrez searching occur in a while loop to catch errors
    # NOTE(review): inside this loop the kwargs dict is never closed, the
    # HTTPErrorCatch call and "record =" are cut off, and the "try:" that
    # should pair with "except RuntimeError" is missing.
    while not read_succeed and read_attempts < Entrez.max_tries:
        kwargs = {
            "db": table.lower(),
            "term": search_term,
            "retmax": "9999999"
        entrez_method = Entrez.esearch
        # Possible urllib error and RuntimeErrors occurring in the next line
        handle = NCBImetaUtilities.HTTPErrorCatch(entrez_method,
            record =
            read_succeed = True
        except RuntimeError:
            read_attempts += 1
            print("Runtime Error encountered. Sleeping for " +
                  str(Entrez.sleep_between_tries) +
                  " seconds before retrying.")

    # NOTE(review): the error class is referenced here without the
    # "NCBImetaErrors." prefix used elsewhere in this function.
    if read_attempts == Entrez.max_tries and not read_succeed:
        raise ErrorMaxReadAttemptsExceeded(table)

    # Count total number of entries, create counter
    num_records = int(record['Count'])
    num_processed = 0

    #                          Iterate Through ID List                      #

    for ID in record['IdList']:
        #-------------------Progress Log and Entry Counter-------------------#
        # Increment entry counter and record progress to screen
        num_processed += 1
        print("ID: " + ID, flush=True)
        print("Processing record: " +
               str(num_processed) + \
               "/" + str(num_records), flush = True)

        #------------Check if Record Already Exists in Database------------#
        # The ID value is bound through a "?" placeholder; only the table name
        # is concatenated into the statement text.
        sql_query = ("SELECT EXISTS(SELECT " + table + "_id FROM " + table +
                     " WHERE " + table + "_id=?)")
        cur.execute(sql_query, (ID, ))

        # 0 if not found, 1 if found
        record_exists = cur.fetchone()[0]

        # If the record_exists, skip the whole next part (ie. "continue" to next record)
        # NOTE(review): the body of this "if" is missing, and the two lines
        # after it are unquoted prose. force_pause_seconds is not referenced
        # anywhere in the visible body; the sleep mentioned below presumably
        # used it - confirm against upstream.
        if record_exists:
        The ID should not exists in the table UNLESS the record was fully parsed.
        ie. The database does not get updated until the end of each record.
        # This is the sleep command before implementing the HTTPerror catching in next section
        # This is controlled by the user configuration file

        #---------------If the table isn't in Database, Add it------------#
        # The Assembly table cannot be retrieved using efetch, only docsum esummary
        # NOTE(review): an "else:" appears to be missing before the efetch
        # branch; as written the esummary settings are overwritten right away.
        if table.lower() == "assembly":
            # Use the http function to return a record summary, but wrapped in HTTP error checking
            kwargs = {"db": table.lower(), "id": ID, "retmode": "xml"}
            entrez_method = Entrez.esummary
            # We're working with any other table instead, use efetch and get xml
            kwargs = {"db": table.lower(), "id": ID, "retmode": "xml"}
            if table.lower() == "nucleotide":
                kwargs["rettype"] = "gb"
            entrez_method = Entrez.efetch

        # ID_handle is an _io.TextIOWrapped object, which originally had utf-8 encoding
        # NOTE(review): this call is cut off before its closing arguments.
        ID_handle = NCBImetaUtilities.HTTPErrorCatch(
            entrez_method, Entrez.max_tries, Entrez.sleep_between_tries,

        # Ideal world: Pass an undecoded string to the xml parser
        # Could be accomplished by opening in binary ('rb')
        # tempfiles by default are opened as mode='w+b'
        # NOTE(review): the body of the "for line" loop is missing and the
        # first argument of open() has been lost.
        with tempfile.NamedTemporaryFile(delete=False) as temp_b:
            # Write the data from ID_handle to a temporary file (binary)
            for line in ID_handle:
            # Read the data as binary, into the XML parser. Avoids encoding issues
            with open(, 'rb') as xml_source:
                ID_root = etree.parse(xml_source, parser=LXML_CDATA_PARSER)

        #                         NCBI Record Parsing                          #


        # Maps column name -> list of values collected for the current record
        column_dict = {}
        # Add ID to the dictionary
        column_dict[table + "_id"] = [ID]
        # A special dictionary for gbseq annotations
        gbseq_dict = {}
        # Iterate through each column to search for metadata
        for column in table_columns:
            column_name = list(column.keys())[0]
            column_value = []
            # The first value of the entry is a ", "-separated string that is
            # split into a list of query terms
            column_payload = list(column.values())[0]
            column_payload = column_payload.split(", ")
            # Initialize with empty values
            column_dict[column_name] = column_value

            #   XML Parse for node or attribute
            working_root = ID_root
            # If there are special character, this query should not be used for xpath!!
            bool_special_char = False
            for char in XPATH_SPECIAL_CHAR:
                for xquery in column_payload:
                    if char in xquery:
                        bool_special_char = True
            # If no special characters, run xpath search Functions
            if not bool_special_char:
                # NOTE(review): this call is cut off before its final arguments.
                NCBImetaUtilities.xml_search(working_root, column_payload,
                                             column_payload[0], column_name,

            # Special parsing for GBSeq_comment
            # If we're on the GBSeq_comment element and the comment was added to the dictionary
            if "GBSeq_comment" in column_payload and len(
                    column_dict[column_name]) > 0:
                comment = column_dict[column_name][0]
                # Fix the CDS vs CDSs ambiguity
                comment = comment.replace("CDSs", "CDS")
                # comment is initialize subdivided by semi-colons
                split_comment = comment.split(";")
                for item in split_comment:
                    # Further subdivided by double colons
                    split_item = item.split("::")
                    # The elements we're interested have the :: otherwise skip
                    if len(split_item) < 2: continue
                    # Left side is the column name, right side is the metadata
                    split_key = split_item[0].lstrip(" ").rstrip(" ")
                    split_value = split_item[1].lstrip(" ").rstrip(" ")
                    gbseq_dict[split_key] = split_value

            # If the value was still empty, check for gbseq comment
            # NOTE(review): the body of this "if" is missing.
            if column_payload[0] in gbseq_dict:

        # Add quotations around each value for sql insertion
        for key in column_dict:
            # Remove empty string elements
            # NOTE(review): the body of this "while" is missing.
            while "" in column_dict[key]:

            # Concatenate multi elements
            column_dict[key] = DB_VALUE_SEP.join(column_dict[key])

        # Write the column values to the db with dynamic variables
        # One "?" placeholder per column; the values are bound at execute time
        sql_q_marks = ",".join(["?"] * len(column_dict.keys()))
        sql_q_marks = "(" + sql_q_marks + ")"
        sql_dynamic_colnames = "(" + ",".join(column_dict.keys()) + ")"
        # NOTE(review): the closing "]" of this list is missing.
        sql_values_placeholder = [
            column_dict[column] for column in column_dict.keys()

        sql_query = "INSERT INTO " + table + " " + sql_dynamic_colnames + " VALUES " + sql_q_marks

        cur.execute(sql_query, sql_values_placeholder)

        # Write to logfile
        # NOTE(review): the right-hand side of "now =" has been lost.
        now =
        log_file.write("[" + str(now) + "]" + "\t" +
                       "New entry added with ID:" + "\t" + ID + "." + "\n")

unique_header_list = unique_header_str.split(" ")

#                           Argument Checking                           #


#---------------------------Check Database------------------------------#

# The existence check must precede sqlite3.connect, which would otherwise
# create an empty database file at that path.
if not os.path.exists(db_name):
    raise NCBImetaErrors.ErrorDBNotExists(db_name)

conn = sqlite3.connect(db_name)
print('\nOpening database: ' + db_name + "\n", flush=True)

# no errors were raised, safe to connect to db
cur = conn.cursor()

#---------------------------Check Table Names----------------------------#
# Check the table names for problematic char
table_name_list = [db_anchor] + [db_final] + db_accessory_list
for table_name in table_name_list:
    table_name_sanitize = NCBImetaUtilities.sql_sanitize(table_name)
    if table_name != table_name_sanitize:
        raise NCBImetaErrors.ErrorSQLNameSanitize(table_name,
                                                  table_name_sanitize)

# Check the column names
# NOTE(review): the loop body was missing from this copy; it is restored by
# analogy with the table-name check above - confirm against upstream.
for col_name in unique_header_list:
    col_name_sanitize = NCBImetaUtilities.sql_sanitize(col_name)
    if col_name != col_name_sanitize:
        raise NCBImetaErrors.ErrorSQLNameSanitize(col_name, col_name_sanitize)
db_name = args['dbName']
db_table = args['dbTable']
annot_file_name = args['annotFile']

#                           Argument Checking                           #

#---------------------------Check Database------------------------------#

# The existence check must precede sqlite3.connect, which would otherwise
# create an empty database file at that path.
if not os.path.exists(db_name):
    raise NCBImetaErrors.ErrorDBNotExists(db_name)

if not os.path.exists(annot_file_name):
    raise NCBImetaErrors.ErrorAnnotFileNotExists(annot_file_name)

# no errors were raised, safe to connect to db
conn = sqlite3.connect(db_name)
print('\nOpening database: ' + db_name, flush=True)
cur = conn.cursor()

#---------------------------Check Table---------------------------------#

# The requested table must already be present in the database
if not table_exists(cur, db_table):
    raise NCBImetaErrors.ErrorTableNotInDB(db_table)

#                                File Setup                             #
# Example #20
# Retrieve user parameters as a plain dictionary
args = vars(parser.parse_args())

db_name = args['dbName']
output_dir = args['outputDir']

#                           Argument Checking                           #

# Check if database exists. This must happen before sqlite3.connect,
# which would otherwise create an empty database file at that path.
if not os.path.exists(db_name):
    raise NCBImetaErrors.ErrorDBNotExists(db_name)

# Check if output dir exists
if not os.path.exists(output_dir):
    raise NCBImetaErrors.ErrorOutputDirNotExists(output_dir)

# no errors were raised, safe to connect to db
conn = sqlite3.connect(db_name)
print('\nOpening database: ' + db_name, flush=True)
cur = conn.cursor()

#                         Process Database                              #

# Get a list of tables
cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
table_list = cur.fetchall()
# Example #21
# Anchor table first, then the accessory tables
db_all_tables = [db_anchor] + db_accessory_list
unique_header_str = args['dbUnique']
unique_header_list = unique_header_str.split(" ")

#                           Argument Checking                           #

#---------------------------Check Database------------------------------#

# The existence check must precede sqlite3.connect, which would otherwise
# create an empty database file at that path.
if not os.path.exists(db_name):
    raise NCBImetaErrors.ErrorDBNotExists(db_name)

conn = sqlite3.connect(db_name)
print('\nOpening database: ' + db_name + "\n", flush=True)

# no errors were raised, safe to connect to db
cur = conn.cursor()

#---------------------------Check Tables---------------------------------#

# Every table must already exist; the anchor is checked first because it is
# the first element of db_all_tables.
for table in db_all_tables:
    if not NCBImetaUtilities.table_exists(cur, table):
        raise NCBImetaErrors.ErrorTableNotInDB(table)

#                                File Setup                             #