def test_ErrorConfigParameter():
    '''Check the message of ErrorConfigParameter (error when a configuration file parameter is incorrect)'''
    # Placeholder parameter name used to build the error
    param_name = "TestParameter"
    # Expected str representation: fixed header, newline, then the parameter name
    expected_msg = ("\n\nA parameter name and/or value in the configuration file is set incorrectly:"
                    + "\n" + param_name)
    # Instantiate the error and compare its message with the expectation
    config_error = NCBImetaErrors.ErrorConfigParameter(param_name)
    assert str(config_error) == expected_msg
def test_ErrorMaxFetchAttemptsExceeded():
    '''Check the message of ErrorMaxFetchAttemptsExceeded (error when maximum fetch attempts has been exceeded)'''
    # Placeholder record ID, given as a string because the expected message is built by concatenation
    record_id = '123456789'
    # Expected str representation: fixed header, newline, then the ID
    expected_msg = ("\n\nThe Maximum number of fetch attempts was exceeded for ID:"
                    + "\n" + record_id)
    # Instantiate the error and compare its message with the expectation
    fetch_error = NCBImetaErrors.ErrorMaxFetchAttemptsExceeded(record_id)
    assert str(fetch_error) == expected_msg
def test_ErrorConfigFileNotExists(tmpdir):
    '''Check the message of ErrorConfigFileNotExists (error when a configuration file doesn't exist)'''
    # Only a path inside the pytest tmpdir; the file itself is never created
    missing_config = os.path.join(tmpdir.strpath, "tmpfile")
    # Expected str representation: fixed header plus the offending location
    expected_msg = ("\n\nConfig file does not exist in the specified location."
                    + "\n" + "Location specified: " + missing_config)
    # Instantiate the error and compare its message with the expectation
    config_error = NCBImetaErrors.ErrorConfigFileNotExists(missing_config)
    assert str(config_error) == expected_msg
def test_ErrorColumnsNotUnique():
    '''Check the message of ErrorColumnsNotUnique (error when there are non unique columns in a database)'''
    # Placeholder column name used to build the error
    column_name = "TestColumn"
    # Expected str representation: fixed header, newline, then the column name
    expected_msg = ("\n\nThe following columns are not unique in the database:"
                    + "\n" + column_name)
    # Instantiate the error and compare its message with the expectation
    column_error = NCBImetaErrors.ErrorColumnsNotUnique(column_name)
    assert str(column_error) == expected_msg
def test_ErrorEntryNotInDB():
    '''Check the message of ErrorEntryNotInDB (error when an entry doesn't exist in a database)'''
    # Placeholder entry name used to build the error
    entry_name = "TestEntry"
    # Expected str representation: fixed header plus the unknown entry
    expected_msg = ("\n\nThe entry does not exist in the database."
                    + "\n" + "Unknown entry found: " + entry_name)
    # Instantiate the error and compare its message with the expectation
    entry_error = NCBImetaErrors.ErrorEntryNotInDB(entry_name)
    assert str(entry_error) == expected_msg
def test_ErrorEntryMultipleMatches():
    '''Check the message of ErrorEntryMultipleMatches (error when there are multiple matching entries in a database)'''
    # Placeholder entry name used to build the error
    entry_name = "TestEntry"
    # Expected str representation: fixed header plus the ambiguous entry
    expected_msg = ("\n\nThe entry has multiple matches in the database."
                    + "\n" + "Multiple matches for entry: " + entry_name)
    # Instantiate the error and compare its message with the expectation
    match_error = NCBImetaErrors.ErrorEntryMultipleMatches(entry_name)
    assert str(match_error) == expected_msg
def test_ErrorSQLNameSanitize():
    '''Check the message of ErrorSQLNameSanitize (error when a table name is improperly formatted)'''
    # An improper table name together with the cleaned-up name suggested to the user
    bad_name = "); drop tables --"
    clean_name = "droptables"
    # Expected str representation mentions both the original and the suggested name
    expected_msg = ("\n\nThe name: " + bad_name + " contains problematic characters. Please rename it to: " + clean_name)
    # Instantiate the error and compare its message with the expectation
    sanitize_error = NCBImetaErrors.ErrorSQLNameSanitize(bad_name, clean_name)
    assert str(sanitize_error) == expected_msg
def test_ErrorTableNotInDB(tmpdir):
    '''Check the message of ErrorTableNotInDB (error when a table doesn't exist in a database)'''
    # A path inside the pytest tmpdir stands in for the table name; nothing is created on disk
    table_label = os.path.join(tmpdir.strpath, "tmpfile")
    # Expected str representation: fixed header plus the unknown table
    expected_msg = ("\n\nThe table does not exist in the database."
                    + "\n" + "Unknown table found: " + table_label)
    # Instantiate the error and compare its message with the expectation
    table_error = NCBImetaErrors.ErrorTableNotInDB(table_label)
    assert str(table_error) == expected_msg
def test_ErrorOutputDirNotExists(tmpdir):
    '''Check the message of ErrorOutputDirNotExists (error when a directory doesn't exist)'''
    # String path of the pytest tmpdir, used only as the value reported in the message
    dir_path = tmpdir.strpath
    # Expected str representation: fixed header plus the directory the user entered
    expected_msg = ("\n\nOutput directory does not exist."
                    + "\n" + "User entered: " + dir_path)
    # Instantiate the error and compare its message with the expectation
    dir_error = NCBImetaErrors.ErrorOutputDirNotExists(dir_path)
    assert str(dir_error) == expected_msg
def test_ErrorMaxReadAttemptsExceeded():
    '''Check the message of ErrorMaxReadAttemptsExceeded (error when maximum read attempts has been exceeded)'''
    # Placeholder table name used to build the error
    table_name = 'TestTable'
    # Expected str representation: fixed header, newline, then the table name
    expected_msg = ("\n\nThe Maximum number of read attempts was exceeded for table:"
                    + "\n" + table_name)
    # Instantiate the error and compare its message with the expectation
    read_error = NCBImetaErrors.ErrorMaxReadAttemptsExceeded(table_name)
    assert str(read_error) == expected_msg
def test_ErrorDBNotExists(tmpdir):
    '''Check the message of ErrorDBNotExists (error when a database doesn't exist)'''
    # Only a path inside the pytest tmpdir; the database file is never created
    missing_db = os.path.join(tmpdir.strpath, "tmpfile")
    # Expected str representation: fixed header, newline, then the database path
    expected_msg = ("\n\nDatabase does not exist."
                    + "\n" + missing_db)
    # Instantiate the error and compare its message with the expectation
    db_error = NCBImetaErrors.ErrorDBNotExists(missing_db)
    assert str(db_error) == expected_msg
def test_ErrorConfigYAMLFormat(tmpdir):
    '''Test the class ErrorConfigYAMLFormat (error when a configuration file is improperly formatted)'''
    # This file is not created, just a tmp path
    tmpfile = os.path.join(tmpdir.strpath, "tmpfile")
    # Test instantiation
    test_error = NCBImetaErrors.ErrorConfigYAMLFormat(tmpfile)
    # Test str representation (error message)
    error_output = str(test_error)
    error_expect = ("\n\nThe configuration file could not be loaded, please confirm that this is a proper YAML file: "
                    + "\n" + tmpfile)
    # The leftover print("TESTING") debug statement was removed; the assertion is the only check
    assert error_output == error_expect
def HTTPErrorCatch(http_method, max_fetch_attempts, sleep_time, **kwargs):
    '''
    Return result of http_method and check if an HTTP Error is generated

    Parameters:
    http_method (function): An http record-fetching or searching method.
    max_fetch_attempts (int): Maximum number of tries for fetching a record_dict.
    sleep_time (float): Number of seconds to wait in between fetch read_attempts.
    kwargs(dict): keyword arguments for the http_method function.

    Returns:
    Whatever http_method(**kwargs) returns on the first attempt that does not raise.

    Raises:
    NCBImetaErrors.ErrorMaxFetchAttemptsExceeded: If every one of the
        max_fetch_attempts attempts ended in an HTTPError or URLError.
    '''
    # Attempt the http_method function, wrapped in HTTP error checking
    fetch_attempts = 0
    while fetch_attempts < max_fetch_attempts:
        try:
            # Success ends the retry loop immediately
            return http_method(**kwargs)
        # HTTP Errors (HTTPError subclasses URLError, so it must be caught first)
        except urllib.error.HTTPError as error:
            fetch_attempts += 1
            print("HTTP Error " + str(error.code) + ": " + str(error.reason))
            print("Fetch Attempt: " + str(fetch_attempts) + "/" + str(max_fetch_attempts))
            # Error code 429: Too Many Requests, back off before the next try
            if error.code == 429:
                print("Sleeping for " + str(sleep_time) + " seconds before retrying.")
                time.sleep(sleep_time)
            # General HTTP Error Code, non specific: retry right away
            else:
                print("Retrying record fetching.")
        # URL Errors
        except urllib.error.URLError as error:
            fetch_attempts += 1
            print("URL Error: " + str(error.reason))
            print("Fetch Attempt: " + str(fetch_attempts) + "/" + str(max_fetch_attempts))
            print("Retrying record fetching.")

    # The maximum number of fetch attempts has been exceeded.
    # No record ID is passed to this function directly, so report the "id"
    # keyword given to http_method, falling back to the search "term".
    record_id = str(kwargs.get("id", kwargs.get("term", "")))
    raise NCBImetaErrors.ErrorMaxFetchAttemptsExceeded(record_id)
# Turn the parsed command-line arguments into a plain dictionary
args = vars(parser.parse_args())
db_name = args['dbName']
output_dir = args['outputDir']

#-----------------------------------------------------------------------#
#                           Argument Checking                           #
#-----------------------------------------------------------------------#

# The database file must already exist, otherwise stop before connecting
if not os.path.exists(db_name):
    raise NCBImetaErrors.ErrorDBNotExists(db_name)
conn = sqlite3.connect(db_name)
print('\nOpening database: ' + db_name, flush=True)

# The output directory must already exist as well
if not os.path.exists(output_dir):
    raise NCBImetaErrors.ErrorOutputDirNotExists(output_dir)

# Every check passed, obtain a cursor for the queries below
cur = conn.cursor()

#-----------------------------------------------------------------------#
#                            Process Database                           #
#-----------------------------------------------------------------------#

# Collect the name of every table stored in the database
cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
table_list = cur.fetchall()
# User-supplied parameters
db_name = args['dbName']
db_table = args['dbTable']
annot_file_name = args['annotFile']
db_value_sep = ";"

#-----------------------------------------------------------------------#
#                           Argument Checking                           #
#-----------------------------------------------------------------------#

#---------------------------Check Database------------------------------#

# The database file must already exist, otherwise stop before connecting
if not os.path.exists(db_name):
    raise NCBImetaErrors.ErrorDBNotExists(db_name)
conn = sqlite3.connect(db_name)
print('\nOpening database: ' + db_name, flush=True)

# The annotation file must exist too
if not os.path.exists(annot_file_name):
    raise NCBImetaErrors.ErrorAnnotFileNotExists(annot_file_name)

# Every check passed, obtain a cursor for the queries below
cur = conn.cursor()

#---------------------------Check Table---------------------------------#

# Reject the table name if sanitizing it would change it
table_name = db_table
table_name_sanitize = NCBImetaUtilities.sql_sanitize(table_name)
if table_name_sanitize != table_name:
    raise NCBImetaErrors.ErrorSQLNameSanitize(table_name, table_name_sanitize)
parser.add_argument('--version', action='version', version='%(prog)s v0.6.1') # Retrieve user parameters args = vars(parser.parse_args()) config_path = args['configPath'] flat_mode = args['flatMode'] #------------------------------------------------------------------------------# # Error Catching # #------------------------------------------------------------------------------# # Check if configuration file exists if not os.path.exists(config_path): raise NCBImetaErrors.ErrorConfigFileNotExists(config_path) # Load the YAML configuration file with open(config_path) as config_file: config_data = yaml.load(config_file, Loader=yaml.FullLoader) if config_data is None: raise NCBImetaErrors.ErrorConfigYAMLFormat(config_file) # Retrieve configuration file values and error catching #--- Output Directory ---# try: CONFIG_OUTPUT_DIR = config_data["OUTPUT_DIR"] except KeyError: raise NCBImetaErrors.ErrorConfigParameter("OUTPUT_DIR") #--- User Email ---# try:
def UpdateDB(table, output_dir, database, email, search_term, table_columns,
             log_path, db_dir, api_key, force_pause_seconds):
    '''
    Update the contents of a local sqlite database using records retrieved from NCBI as configured by the user.

    Parameters:
    table (str): Name of the NCBI database to search.
    output_dir (str): Path to the directory where output is written.
    database (str): Filename of the local sqlite database.
    email (str): User email.
    search_term (str): Entrez search query.
    table_columns(dict): Dictionary of column name and API name as value, ex. {AssemblyGenbankID : GbUid}.
        Iterated as a sequence of single-entry dictionaries; the value may list
        several names separated by ", ".
    log_path(str): Path to the directory where the logfile is stored in.
    db_dir(str): Path to the directory where the database is stored in.
    api_key(str): NCBI user account API Key.
    force_pause_seconds(float): Number of seconds to wait in between fetch read_attempts.

    Raises:
    NCBImetaErrors.ErrorSQLNameSanitize: If the table name or a column name
        differs from its sanitized form.
    NCBImetaErrors.ErrorMaxReadAttemptsExceeded: If the Entrez search result
        could not be read within Entrez.max_tries attempts.
    '''

    # NOTE(review): the API key is echoed to stdout in clear text - confirm this is intended
    print("\nCreating/Updating the " + table + " table using the following parameters: " + "\n" +
          "\t" + "Database: " + "\t\t" + database + "\n" +
          "\t" + "Search Term:" + "\t" + "\t" + search_term + "\n" +
          "\t" + "Email: " + "\t\t\t" + email + "\n" +
          "\t" + "API Key: " + "\t\t" + api_key + "\n" +
          "\t" + "Output Directory: " + "\t" + output_dir + "\n\n",
          flush=True)

    Entrez.email = email
    Entrez.api_key = api_key
    # Allow a maximum of 3 tries for error catching before exiting program
    Entrez.max_tries = 3
    # Sleep for 1 second after an error has been generated before retrying
    Entrez.sleep_between_tries = 1

    #---------------------------------------------------------------------------#
    #                                File Setup                                 #
    #---------------------------------------------------------------------------#

    # Name of Log File, built from the log_path argument
    log_file_path = os.path.join(
        log_path, "", os.path.splitext(database)[0] + "_" + table + ".log")

    # Append mode creates the file when it is missing and appends otherwise
    log_file = open(log_file_path, "a")
    conn = None
    try:
        #----------------------------------------------------------------------#
        #                              SQL Setup                               #
        #----------------------------------------------------------------------#

        # Check for problematic table name
        table_name_sanitize = NCBImetaUtilities.sql_sanitize(table)
        if table != table_name_sanitize:
            raise NCBImetaErrors.ErrorSQLNameSanitize(table, table_name_sanitize)

        # Connect to database and establish cursor for commands.
        conn = sqlite3.connect(os.path.join(db_dir, "", database))
        cur = conn.cursor()

        # Create the table, with dynamic variables from config file
        sql_query = ("Create TABLE IF NOT EXISTS " + table +
                     " (id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, " +
                     table + "_id TEXT")

        for column_name_dict in table_columns:
            column_name = list(column_name_dict.keys())[0]
            # Check for problematic column name
            col_name_sanitize = NCBImetaUtilities.sql_sanitize(column_name)
            if column_name != col_name_sanitize:
                raise NCBImetaErrors.ErrorSQLNameSanitize(column_name, col_name_sanitize)
            # By default, every user-specified column is type TEXT
            sql_query += ", " + column_name + " TEXT"
        sql_query += ")"

        cur.execute(sql_query)

        #-------------------------------------------------------------------#
        #                           Entrez Search                           #
        #-------------------------------------------------------------------#

        # Read the record, check for http, url, and runtime errors
        read_succeed = False
        read_attempts = 0

        # Database reading and entrez searching occur in a while loop to catch errors
        while not read_succeed and read_attempts < Entrez.max_tries:
            kwargs = {"db": table.lower(), "term": search_term, "retmax": "9999999"}
            entrez_method = Entrez.esearch
            # Possible urllib error and RuntimeErrors occurring in the next lines
            handle = NCBImetaUtilities.HTTPErrorCatch(
                entrez_method, Entrez.max_tries, Entrez.sleep_between_tries, **kwargs)
            try:
                record = Entrez.read(handle)
                read_succeed = True
            except RuntimeError:
                read_attempts += 1
                print("Runtime Error encountered. Sleeping for " +
                      str(Entrez.sleep_between_tries) + " seconds before retrying.")
                time.sleep(Entrez.sleep_between_tries)

        # The loop only ends without success once every attempt has been used
        if not read_succeed:
            raise NCBImetaErrors.ErrorMaxReadAttemptsExceeded(table)

        # Count total number of entries, create counter
        num_records = int(record['Count'])
        num_processed = 0

        #-------------------------------------------------------------------#
        #                      Iterate Through ID List                      #
        #-------------------------------------------------------------------#

        for ID in record['IdList']:
            #-----------------Progress Log and Entry Counter-----------------#
            # Increment entry counter and record progress to screen
            num_processed += 1
            print("ID: " + ID, flush=True)
            print("Processing record: " + str(num_processed) +
                  "/" + str(num_records), flush=True)

            #----------Check if Record Already Exists in Database----------#
            sql_query = ("SELECT EXISTS(SELECT " + table + "_id FROM " +
                         table + " WHERE " + table + "_id=?)")
            cur.execute(sql_query, (ID, ))

            # 0 if not found, 1 if found
            record_exists = cur.fetchone()[0]

            # If the record exists, skip the whole next part (ie. "continue" to next record)
            if record_exists:
                continue

            # IMPORTANT:
            # The ID should not exist in the table UNLESS the record was fully parsed.
            # ie. The database does not get updated until the end of each record.

            # This is the sleep command before implementing the HTTPerror catching in next section
            # This is controlled by the user configuration file
            time.sleep(force_pause_seconds)

            #-------------If the record isn't in Database, Add it-----------#
            kwargs = {"db": table.lower(), "id": ID, "retmode": "xml"}
            # The Assembly table cannot be retrieved using efetch, only docsum esummary
            if table.lower() == "assembly":
                entrez_method = Entrez.esummary
            else:
                # We're working with any other table instead, use efetch and get xml
                if table.lower() == "nucleotide":
                    kwargs["rettype"] = "gb"
                entrez_method = Entrez.efetch

            # ID_handle is an _io.TextIOWrapped object, which originally had utf-8 encoding
            ID_handle = NCBImetaUtilities.HTTPErrorCatch(
                entrez_method, Entrez.max_tries, Entrez.sleep_between_tries, **kwargs)

            # Ideal world: Pass an undecoded string to the xml parser
            # Could be accomplished by opening in binary ('rb')
            # tempfiles by default are opened as mode='w+b'
            with tempfile.NamedTemporaryFile(delete=False) as temp_b:
                # Write the data from ID_handle to a temporary file (binary)
                for line in ID_handle:
                    temp_b.write(str.encode(line))
            # Leaving the with block closes (and flushes) the temporary file.
            try:
                # Read the data as binary, into the XML parser. Avoids encoding issues
                with open(temp_b.name, 'rb') as xml_source:
                    ID_root = etree.parse(xml_source, parser=LXML_CDATA_PARSER)
            finally:
                # delete=False leaves removal to us; do not pile up one file per record
                os.remove(temp_b.name)

            #------------------------------------------------------------------#
            #                       NCBI Record Parsing                        #
            #------------------------------------------------------------------#

            # Add ID to the dictionary
            column_dict = {table + "_id": [ID]}

            # A special dictionary for gbseq annotations
            gbseq_dict = {}
            # Iterate through each column to search for metadata
            for column in table_columns:
                column_name = list(column.keys())[0]
                column_payload = list(column.values())[0].split(", ")
                # Initialize with empty values
                column_dict[column_name] = []

                #---------------------------------------------------#
                # XML Parse for node or attribute
                #---------------------------------------------------#
                working_root = ID_root

                # If there are special characters, this query should not be used for xpath!!
                bool_special_char = any(char in xquery
                                        for char in XPATH_SPECIAL_CHAR
                                        for xquery in column_payload)

                # If no special characters, run xpath search Functions
                if not bool_special_char:
                    NCBImetaUtilities.xml_search(working_root, column_payload,
                                                 column_payload[0], column_name, column_dict)

                # Special parsing for GBSeq_comment
                # If we're on the GBSeq_comment element and the comment was added to the dictionary
                if "GBSeq_comment" in column_payload and column_dict[column_name]:
                    # Fix the CDS vs CDSs ambiguity
                    comment = column_dict[column_name][0].replace("CDSs", "CDS")
                    # comment is initially subdivided by semi-colons
                    for item in comment.split(";"):
                        # Further subdivided by double colons
                        split_item = item.split("::")
                        # The elements we're interested in have the :: otherwise skip
                        if len(split_item) < 2:
                            continue
                        # Left side is the column name, right side is the metadata
                        split_key = split_item[0].strip(" ")
                        split_value = split_item[1].strip(" ")
                        gbseq_dict[split_key] = split_value

                # If the value was still empty, check for gbseq comment
                if column_payload[0] in gbseq_dict:
                    column_dict[column_name].append(gbseq_dict[column_payload[0]])

            # Collapse each column's list of values into a single string
            for key in column_dict:
                # Remove empty string elements, then concatenate multi elements
                column_dict[key] = DB_VALUE_SEP.join(
                    [value for value in column_dict[key] if value != ""])

            # Write the column values to the db with dynamic variables
            sql_q_marks = "(" + ",".join(["?"] * len(column_dict)) + ")"
            sql_dynamic_colnames = "(" + ",".join(column_dict.keys()) + ")"
            sql_values_placeholder = list(column_dict.values())
            sql_query = ("INSERT INTO " + table + " " + sql_dynamic_colnames +
                         " VALUES " + sql_q_marks)
            cur.execute(sql_query, sql_values_placeholder)

            # Write to logfile
            now = datetime.datetime.now()
            log_file.write("[" + str(now) + "]" + "\t" + "New entry added with ID:" +
                           "\t" + ID + "." + "\n")
            conn.commit()

        # CLEANUP
        conn.commit()
        cur.close()
    finally:
        # Release the connection and the log file on both success and failure
        if conn is not None:
            conn.close()
        log_file.close()
unique_header_list = unique_header_str.split(" ") DB_VALUE_SEP = ";" #-----------------------------------------------------------------------# # Argument Checking # #-----------------------------------------------------------------------# print("START") #---------------------------Check Database------------------------------# if os.path.exists(db_name): conn = sqlite3.connect(db_name) print('\nOpening database: ' + db_name + "\n", flush=True) else: raise NCBImetaErrors.ErrorDBNotExists(db_name) # no errors were raised, safe to connect to db cur = conn.cursor() #---------------------------Check Table Names----------------------------# # Check the table names for problematic char table_name_list = [db_anchor] + [db_final] + db_accessory_list for table_name in table_name_list: table_name_sanitize = NCBImetaUtilities.sql_sanitize(table_name) if table_name != table_name_sanitize: raise NCBImetaErrors.ErrorSQLNameSanitize(table_name, table_name_sanitize) # Check the column names for col_name in unique_header_list:
# User-supplied parameters
db_name = args['dbName']
db_table = args['dbTable']
annot_file_name = args['annotFile']

#-----------------------------------------------------------------------#
#                           Argument Checking                           #
#-----------------------------------------------------------------------#

#---------------------------Check Database------------------------------#

# The database file must already exist, otherwise stop before connecting
if not os.path.exists(db_name):
    raise NCBImetaErrors.ErrorDBNotExists(db_name)
conn = sqlite3.connect(db_name)
print('\nOpening database: ' + db_name, flush=True)

# The annotation file must exist too
if not os.path.exists(annot_file_name):
    raise NCBImetaErrors.ErrorAnnotFileNotExists(annot_file_name)

# Every check passed, obtain a cursor for the queries below
cur = conn.cursor()

#---------------------------Check Table---------------------------------#

# The requested table has to be present in the database
if not table_exists(cur, db_table):
    raise NCBImetaErrors.ErrorTableNotInDB(db_table)

#-----------------------------------------------------------------------#
#                               File Setup                              #
#-----------------------------------------------------------------------#
# Turn the parsed command-line arguments into a plain dictionary
args = vars(parser.parse_args())
db_name = args['dbName']
output_dir = args['outputDir']

#-----------------------------------------------------------------------#
#                           Argument Checking                           #
#-----------------------------------------------------------------------#

# Check if database exists
if os.path.exists(db_name):
    conn = sqlite3.connect(db_name)
    print('\nOpening database: ' + db_name, flush=True)
else:
    raise NCBImetaErrors.ErrorDBNotExists(db_name)

# Create the output directory when it is missing.
# The directory created must be the one that was checked (output_dir);
# CONFIG_OUTPUT_DIR is not defined among the names this script sets up here.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# no errors were raised, safe to connect to db
cur = conn.cursor()

#-----------------------------------------------------------------------#
#                            Process Database                           #
#-----------------------------------------------------------------------#

# Get a list of tables
cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
table_list = cur.fetchall()
# Anchor table first, followed by every accessory table
db_all_tables = [db_anchor] + db_accessory_list
# Space-separated header names given by the user
unique_header_str = args['dbUnique']
unique_header_list = unique_header_str.split(" ")
DB_VALUE_SEP = ";"

#-----------------------------------------------------------------------#
#                           Argument Checking                           #
#-----------------------------------------------------------------------#

#---------------------------Check Database------------------------------#

# The database file must already exist, otherwise stop before connecting
if not os.path.exists(db_name):
    raise NCBImetaErrors.ErrorDBNotExists(db_name)
conn = sqlite3.connect(db_name)
print('\nOpening database: ' + db_name + "\n", flush=True)

# Every check passed, obtain a cursor for the queries below
cur = conn.cursor()

#---------------------------Check Tables---------------------------------#

# The anchor table has to be present in the database
if not NCBImetaUtilities.table_exists(cur, db_anchor):
    raise NCBImetaErrors.ErrorTableNotInDB(db_anchor)

# So does each accessory table
for table in db_accessory_list:
    if not NCBImetaUtilities.table_exists(cur, table):
        raise NCBImetaErrors.ErrorTableNotInDB(table)

#-----------------------------------------------------------------------#
#                               File Setup                              #
#-----------------------------------------------------------------------#