def test_ErrorSQLNameSanitize():
    '''Test the class ErrorSQLNameSanitize (error when a table name is improperly formatted)'''
    # An unsafe name (SQL-injection style) and the sanitized form it should be renamed to
    unsafe_name = "); drop tables --"
    clean_name = "droptables"
    # Raise the error and capture its rendered message
    raised_error = NCBImetaErrors.ErrorSQLNameSanitize(unsafe_name, clean_name)
    rendered = str(raised_error)
    # The message must name both the offending and the sanitized identifier
    expected = (
        "\n\nThe name: "
        + unsafe_name
        + " contains problematic characters. Please rename it to: "
        + clean_name
    )
    assert rendered == expected
else: raise NCBImetaErrors.ErrorDBNotExists(db_name) if not os.path.exists(annot_file_name): raise NCBImetaErrors.ErrorAnnotFileNotExists(annot_file_name) # no errors were raised, safe to connect to db cur = conn.cursor() #---------------------------Check Table---------------------------------# # Check table name table_name = db_table table_name_sanitize = NCBImetaUtilities.sql_sanitize(table_name) if table_name != table_name_sanitize: raise NCBImetaErrors.ErrorSQLNameSanitize(table_name, table_name_sanitize) # Check table exists if not NCBImetaUtilities.table_exists(cur, db_table): raise NCBImetaErrors.ErrorTableNotInDB(db_table) #-----------------------------------------------------------------------# # File Setup # #-----------------------------------------------------------------------# # get list of column names in Table cur.execute('''SELECT * FROM {}'''.format(db_table)) db_col_names = [description[0] for description in cur.description] #-----------------------------------------------------------------------# # Annotation Setup #
def UpdateDB(table, output_dir, database, email, search_term, table_columns,
             log_path, db_dir, api_key, force_pause_seconds):
    '''
    Update the contents of a local sqlite database using records retrieved
    from NCBI as configured by the user.

    Parameters:
        table (str): Name of the NCBI database to search.
        output_dir (str): Path to the directory where output is written.
        database (str): Filename of the local sqlite database.
        email (str): User email.
        search_term (str): Entrez search query.
        table_columns (dict): Dictionary of column name and API name as value,
            ex. {AssemblyGenbankID : GbUid}.
        log_path (str): Path to the directory where the logfile is stored in.
        db_dir (str): Path to the directory where the database is stored in.
        api_key (str): NCBI user account API Key.
        force_pause_seconds (float): Number of seconds to wait in between
            fetch read_attempts.

    Raises:
        NCBImetaErrors.ErrorSQLNameSanitize: If the table or a column name
            contains characters unsafe for SQL identifiers.
        NCBImetaErrors.ErrorMaxReadAttemptsExceeded: If the Entrez search
            result cannot be read after Entrez.max_tries attempts.
    '''
    print("\nCreating/Updating the " + table +
          " table using the following parameters: " + "\n" +
          "\t" + "Database: " + "\t\t" + database + "\n" +
          "\t" + "Search Term:" + "\t" + "\t" + search_term + "\n" +
          "\t" + "Email: " + "\t\t\t" + email + "\n" +
          "\t" + "API Key: " + "\t\t" + api_key + "\n" +
          "\t" + "Output Directory: " + "\t" + output_dir + "\n\n", flush=True)

    Entrez.email = email
    Entrez.api_key = api_key
    # Allow a maximum of 3 tries for error catching before exiting program
    Entrez.max_tries = 3
    # Sleep for 1 second after an error has been generated before retrying
    Entrez.sleep_between_tries = 1

    #---------------------------------------------------------------------#
    #                            File Setup                               #
    #---------------------------------------------------------------------#
    # Name of the log file. BUGFIX: use the log_path parameter instead of a
    # module-level LOG_PATH global, which silently ignored the caller's value.
    log_file_path = os.path.join(
        log_path, "", os.path.splitext(database)[0] + "_" + table + ".log")
    # Check if the file already exists, either write or append to it.
    if os.path.exists(log_file_path):
        log_file = open(log_file_path, "a")
    else:
        log_file = open(log_file_path, "w")

    #---------------------------------------------------------------------#
    #                             SQL Setup                               #
    #---------------------------------------------------------------------#
    # Check for problematic table name before interpolating it into SQL
    table_name = table
    table_name_sanitize = NCBImetaUtilities.sql_sanitize(table_name)
    if table_name != table_name_sanitize:
        raise NCBImetaErrors.ErrorSQLNameSanitize(table_name, table_name_sanitize)

    # Connect to database and establish cursor for commands.
    conn = sqlite3.connect(os.path.join(db_dir, "", database))
    cur = conn.cursor()

    # Create the table, with dynamic column names from the config file.
    # Identifiers cannot be bound as ? parameters, hence the sanitize checks.
    sql_query = ("Create TABLE IF NOT EXISTS " + table +
                 " (id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, " +
                 table + "_id TEXT")
    for column_name_dict in table_columns:
        column_name = list(column_name_dict.keys())[0]
        # Check for problematic column name
        col_name_sanitize = NCBImetaUtilities.sql_sanitize(column_name)
        if column_name != col_name_sanitize:
            raise NCBImetaErrors.ErrorSQLNameSanitize(column_name, col_name_sanitize)
        # By default, every user-specified column is type TEXT
        sql_query += ", " + column_name + " TEXT"
    sql_query += ")"
    cur.execute(sql_query)

    #---------------------------------------------------------------------#
    #                           Entrez Search                             #
    #---------------------------------------------------------------------#
    # Read the record, check for http, url, and runtime errors
    read_succeed = False
    read_attempts = 0

    # Database reading and entrez searching occur in a while loop to catch errors
    while not read_succeed and read_attempts < Entrez.max_tries:
        kwargs = {"db": table.lower(), "term": search_term, "retmax": "9999999"}
        entrez_method = Entrez.esearch
        # Possible urllib error and RuntimeErrors occurring in the next line
        handle = NCBImetaUtilities.HTTPErrorCatch(entrez_method,
                                                  Entrez.max_tries,
                                                  Entrez.sleep_between_tries,
                                                  **kwargs)
        try:
            record = Entrez.read(handle)
            read_succeed = True
        except RuntimeError:
            read_attempts += 1
            print("Runtime Error encountered. Sleeping for " +
                  str(Entrez.sleep_between_tries) +
                  " seconds before retrying.")
            time.sleep(Entrez.sleep_between_tries)

    if read_attempts == Entrez.max_tries and not read_succeed:
        # BUGFIX: qualify with the NCBImetaErrors module (bare name was a
        # NameError at raise time; all other raises here are qualified).
        raise NCBImetaErrors.ErrorMaxReadAttemptsExceeded(table)

    # Count total number of entries, create counter
    num_records = int(record['Count'])
    num_processed = 0

    #---------------------------------------------------------------------#
    #                       Iterate Through ID List                       #
    #---------------------------------------------------------------------#
    for ID in record['IdList']:
        #-------------------Progress Log and Entry Counter----------------#
        # Increment entry counter and record progress to screen
        num_processed += 1
        print("ID: " + ID, flush=True)
        print("Processing record: " + str(num_processed) +
              "/" + str(num_records), flush=True)

        #------------Check if Record Already Exists in Database-----------#
        sql_query = ("SELECT EXISTS(SELECT " + table + "_id FROM " + table +
                     " WHERE " + table + "_id=?)")
        cur.execute(sql_query, (ID,))
        # 0 if not found, 1 if found
        record_exists = cur.fetchone()[0]
        # If the record exists, skip ahead to the next record.
        # IMPORTANT: The ID should not exist in the table UNLESS the record
        # was fully parsed; the database is only updated at the end of each
        # record, so a present ID implies complete prior processing.
        if record_exists:
            continue

        # User-configured pause before the HTTP-error-checked fetch below
        time.sleep(force_pause_seconds)

        #---------------If the record isn't in Database, fetch it---------#
        # The Assembly table cannot be retrieved using efetch, only docsum esummary
        if table.lower() == "assembly":
            kwargs = {"db": table.lower(), "id": ID, "retmode": "xml"}
            entrez_method = Entrez.esummary
        else:
            # Any other table: use efetch and get xml
            kwargs = {"db": table.lower(), "id": ID, "retmode": "xml"}
            if table.lower() == "nucleotide":
                kwargs["rettype"] = "gb"
            entrez_method = Entrez.efetch

        # ID_handle is an _io.TextIOWrapper object (utf-8), wrapped in HTTP
        # error checking.
        ID_handle = NCBImetaUtilities.HTTPErrorCatch(entrez_method,
                                                     Entrez.max_tries,
                                                     Entrez.sleep_between_tries,
                                                     **kwargs)

        # Pass an undecoded byte stream to the xml parser to avoid encoding
        # issues: spool the response to a temporary binary file, then re-open
        # it in 'rb' mode for lxml.
        with tempfile.NamedTemporaryFile(delete=False) as temp_b:
            for line in ID_handle:
                temp_b.write(str.encode(line))
        # The context manager closed temp_b; re-read it as raw bytes.
        with open(temp_b.name, 'rb') as xml_source:
            ID_root = etree.parse(xml_source, parser=LXML_CDATA_PARSER)

        #------------------------------------------------------------------#
        #                       NCBI Record Parsing                        #
        #------------------------------------------------------------------#
        column_dict = {}
        # Add ID to the dictionary
        column_dict[table + "_id"] = [ID]
        # A special dictionary for gbseq annotations
        gbseq_dict = {}

        # Iterate through each column to search for metadata
        for column in table_columns:
            column_name = list(column.keys())[0]
            column_value = []
            column_payload = list(column.values())[0]
            column_payload = column_payload.split(", ")
            # Initialize with empty values
            column_dict[column_name] = column_value

            #-------------------------------------------------------------#
            #              XML Parse for node or attribute                 #
            #-------------------------------------------------------------#
            working_root = ID_root
            # If there are special characters, this query should not be
            # used for xpath
            bool_special_char = False
            for char in XPATH_SPECIAL_CHAR:
                for xquery in column_payload:
                    if char in xquery:
                        bool_special_char = True
            # If no special characters, run xpath search functions
            if not bool_special_char:
                NCBImetaUtilities.xml_search(working_root, column_payload,
                                             column_payload[0], column_name,
                                             column_dict)

            # Special parsing for GBSeq_comment: if we're on the
            # GBSeq_comment element and the comment was added to the dict
            if "GBSeq_comment" in column_payload and len(column_dict[column_name]) > 0:
                comment = column_dict[column_name][0]
                # Fix the CDS vs CDSs ambiguity
                comment = comment.replace("CDSs", "CDS")
                # Comment is initially subdivided by semi-colons
                split_comment = comment.split(";")
                for item in split_comment:
                    # Further subdivided by double colons
                    split_item = item.split("::")
                    # The elements we're interested in have the '::' otherwise skip
                    if len(split_item) < 2:
                        continue
                    # Left side is the column name, right side is the metadata
                    split_key = split_item[0].lstrip(" ").rstrip(" ")
                    split_value = split_item[1].lstrip(" ").rstrip(" ")
                    gbseq_dict[split_key] = split_value

            # If the value was still empty, check for gbseq comment
            if column_payload[0] in gbseq_dict:
                column_dict[column_name].append(gbseq_dict[column_payload[0]])

        # Clean up values for sql insertion
        for key in column_dict:
            # Remove empty string elements
            while "" in column_dict[key]:
                column_dict[key].remove("")
            # Concatenate multi elements
            column_dict[key] = DB_VALUE_SEP.join(column_dict[key])

        # Write the column values to the db with dynamic variables;
        # values are bound as ? parameters, identifiers were sanitized above.
        sql_q_marks = ",".join(["?"] * len(column_dict.keys()))
        sql_q_marks = "(" + sql_q_marks + ")"
        sql_dynamic_colnames = "(" + ",".join(column_dict.keys()) + ")"
        sql_values_placeholder = [column_dict[column]
                                  for column in column_dict.keys()]
        sql_query = ("INSERT INTO " + table + " " + sql_dynamic_colnames +
                     " VALUES " + sql_q_marks)
        cur.execute(sql_query, sql_values_placeholder)

        # Write to logfile and commit so a crash never leaves a half record
        now = datetime.datetime.now()
        log_file.write("[" + str(now) + "]" + "\t" +
                       "New entry added with ID:" + "\t" + ID + "." + "\n")
        conn.commit()

    # CLEANUP (the per-record loop already committed; close handles)
    cur.close()
    log_file.close()