def test_HTTPErrorCatch(tmpdir):
    '''Test the utility function HTTPErrorCatch (catch HTTP Errors)'''
    # Parameters for an Assembly esummary query used to exercise the wrapper.
    query_id = '5025191'
    query_email = '*****@*****.**'
    Entrez.email = query_email
    query_table = 'Assembly'
    max_tries = 10
    total_tries = 10
    sleep_between_tries = 0
    query_kwargs = {"db": query_table.lower(), "id": query_id}
    query_method = Entrez.esummary

    # Repeatedly fetch the record summary; every attempt must produce a
    # handle that Entrez can read without error.
    for attempt in range(1, total_tries):
        handle = NCBImetaUtilities.HTTPErrorCatch(
            query_method, max_tries, sleep_between_tries, **query_kwargs
        )
        try:
            Entrez.read(handle, validate=False)
            assert True
        except RuntimeError:
            # The handle was not successfully retrieved or the data is malformed
            assert False
def UpdateDB(table, output_dir, database, email, search_term, table_columns,
             log_path, db_dir, api_key, force_pause_seconds):
    '''
    Update the contents of a local sqlite database using records retrieved
    from NCBI as configured by the user.

    Parameters:
        table (str): Name of the NCBI database to search.
        output_dir (str): Path to the directory where output is written.
        database (str): Filename of the local sqlite database.
        email (str): User email.
        search_term (str): Entrez search query.
        table_columns (dict): Dictionary of column name and API name as value,
            ex. {AssemblyGenbankID : GbUid}.
        log_path (str): Path to the directory where the logfile is stored in.
        db_dir (str): Path to the directory where the database is stored in.
        api_key (str): NCBI user account API Key.
        force_pause_seconds (float): Number of seconds to wait in between
            fetch read_attempts.

    Raises:
        ErrorMaxReadAttemptsExceeded: If the Entrez search result cannot be
            read after Entrez.max_tries attempts.
    '''
    print("\nCreating/Updating the " + table +
          " table using the following parameters: " + "\n" +
          "\t" + "Database: " + "\t\t" + database + "\n" +
          "\t" + "Search Term:" + "\t" + "\t" + search_term + "\n" +
          "\t" + "Email: " + "\t\t\t" + email + "\n" +
          "\t" + "API Key: " + "\t\t" + api_key + "\n" +
          "\t" + "Output Directory: " + "\t" + output_dir + "\n\n",
          flush=True)

    Entrez.email = email
    Entrez.api_key = api_key
    # Allow a maximum of 3 tries for error catching before exiting program
    Entrez.max_tries = 3
    # Sleep for 1 second after an error has been generated before retrying
    Entrez.sleep_between_tries = 1

    #------------------------------------------------------------------------#
    #                               File Setup                               #
    #------------------------------------------------------------------------#
    # Name of the log file, placed in the user-supplied log directory.
    # BUGFIX: previously built from the module global LOG_PATH, silently
    # ignoring the log_path parameter documented above.
    log_file_path = os.path.join(
        log_path, os.path.splitext(database)[0] + "_" + table + ".log")
    # Mode "a" appends when the file exists and creates it otherwise,
    # so no exists-check is needed.
    log_file = open(log_file_path, "a")

    #------------------------------------------------------------------------#
    #                                SQL Setup                               #
    #------------------------------------------------------------------------#
    # Connect to database and establish cursor for commands.
    conn = sqlite3.connect(os.path.join(db_dir, database))
    cur = conn.cursor()

    # Create the table, with dynamic columns from the config file.
    # NOTE: table and column names come from the user's own config and cannot
    # be bound as SQL parameters; only record VALUES are parameterized below.
    sql_query = ("Create TABLE IF NOT EXISTS " + table +
                 " (id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, " +
                 table + "_id TEXT")
    for column_name_dict in table_columns:
        column_name = list(column_name_dict.keys())[0]
        # By default, every user-specified column is type TEXT
        sql_query += ", " + column_name + " TEXT"
    sql_query += ")"
    cur.execute(sql_query)

    #------------------------------------------------------------------------#
    #                              Entrez Search                             #
    #------------------------------------------------------------------------#
    # Read the record, check for http, url, and runtime errors
    read_succeed = False
    read_attempts = 0

    # The esearch parameters are loop-invariant, so build them once.
    kwargs = {"db": table.lower(), "term": search_term, "retmax": "9999999"}
    entrez_method = Entrez.esearch

    # Database reading and entrez searching occur in a while loop to catch
    # errors
    while not read_succeed and read_attempts < Entrez.max_tries:
        # Possible urllib error and RuntimeErrors occurring in the next line
        handle = NCBImetaUtilities.HTTPErrorCatch(
            entrez_method, Entrez.max_tries, Entrez.sleep_between_tries,
            **kwargs)
        try:
            record = Entrez.read(handle)
            read_succeed = True
        except RuntimeError:
            read_attempts += 1
            print("Runtime Error encountered. Sleeping for " +
                  str(Entrez.sleep_between_tries) +
                  " seconds before retrying.")
            time.sleep(Entrez.sleep_between_tries)

    if read_attempts == Entrez.max_tries and not read_succeed:
        raise ErrorMaxReadAttemptsExceeded(table)

    # Count total number of entries, create counter
    num_records = int(record['Count'])
    num_processed = 0

    #------------------------------------------------------------------------#
    #                          Iterate Through ID List                       #
    #------------------------------------------------------------------------#
    for ID in record['IdList']:
        # Increment entry counter and record progress to screen
        num_processed += 1
        print("ID: " + ID, flush=True)
        print("Processing record: " + str(num_processed) +
              "/" + str(num_records), flush=True)

        # Check if the record already exists in the database
        sql_query = ("SELECT EXISTS(SELECT " + table + "_id FROM " + table +
                     " WHERE " + table + "_id=?)")
        cur.execute(sql_query, (ID,))
        # 0 if not found, 1 if found
        record_exists = cur.fetchone()[0]
        # IMPORTANT: the ID is only inserted once the record is fully parsed,
        # so a present ID implies a complete entry and can be skipped.
        if record_exists:
            continue

        # User-configured pause before fetching, to throttle API usage
        time.sleep(force_pause_seconds)

        # The Assembly table cannot be retrieved using efetch, only the
        # docsum esummary; every other table is fetched as XML with efetch.
        kwargs = {"db": table.lower(), "id": ID, "retmode": "xml"}
        if table.lower() == "assembly":
            entrez_method = Entrez.esummary
        else:
            if table.lower() == "nucleotide":
                kwargs["rettype"] = "gb"
            entrez_method = Entrez.efetch

        # ID_handle is a text handle whose content was originally utf-8
        ID_handle = NCBImetaUtilities.HTTPErrorCatch(
            entrez_method, Entrez.max_tries, Entrez.sleep_between_tries,
            **kwargs)

        # Write the fetched XML to a temporary file as raw bytes so the XML
        # parser reads undecoded data (avoids encoding issues).
        # delete=False is required so the file can be reopened by name
        # (notably on Windows); BUGFIX: the temp file was previously leaked,
        # it is now removed after parsing.
        with tempfile.NamedTemporaryFile(delete=False) as temp_b:
            for line in ID_handle:
                temp_b.write(str.encode(line))
        try:
            with open(temp_b.name, 'rb') as xml_source:
                ID_root = etree.parse(xml_source, parser=LXML_CDATA_PARSER)
        finally:
            os.remove(temp_b.name)

        #--------------------------------------------------------------------#
        #                         NCBI Record Parsing                        #
        #--------------------------------------------------------------------#
        column_dict = {}
        # Add ID to the dictionary
        column_dict[table + "_id"] = [ID]
        # A special dictionary for gbseq annotations
        gbseq_dict = {}

        # Iterate through each column to search for metadata
        for column in table_columns:
            column_name = list(column.keys())[0]
            column_payload = list(column.values())[0].split(", ")
            # Initialize with empty values
            column_dict[column_name] = []

            # Queries containing special characters must not be used for
            # xpath searches.
            bool_special_char = any(
                char in xquery
                for char in XPATH_SPECIAL_CHAR
                for xquery in column_payload)
            # If no special characters, run the xpath search functions
            if not bool_special_char:
                NCBImetaUtilities.xml_search(
                    ID_root, column_payload, column_payload[0], column_name,
                    column_dict)

            # Special parsing for GBSeq_comment: only when we're on the
            # GBSeq_comment element and a comment was found
            if ("GBSeq_comment" in column_payload
                    and len(column_dict[column_name]) > 0):
                comment = column_dict[column_name][0]
                # Fix the CDS vs CDSs ambiguity
                comment = comment.replace("CDSs", "CDS")
                # The comment is subdivided by semi-colons
                for item in comment.split(";"):
                    # Items of interest are "key :: value" pairs; skip others
                    split_item = item.split("::")
                    if len(split_item) < 2:
                        continue
                    # Left side is the column name, right side is the metadata
                    split_key = split_item[0].lstrip(" ").rstrip(" ")
                    split_value = split_item[1].lstrip(" ").rstrip(" ")
                    gbseq_dict[split_key] = split_value

            # If the value was still empty, check for a gbseq comment match
            if column_payload[0] in gbseq_dict:
                column_dict[column_name].append(gbseq_dict[column_payload[0]])

        # Flatten each column's value list to a single string for insertion.
        for key in column_dict:
            # Drop empty-string elements, then strip double quotes (retained
            # for output compatibility with the previous string-built SQL,
            # which removed them).
            cleaned = [value.replace("\"", "")
                       for value in column_dict[key] if value != ""]
            column_dict[key] = DB_VALUE_SEP.join(cleaned)

        # Write the column values to the db with a parameterized INSERT.
        # BUGFIX: values were previously interpolated directly into the SQL
        # string (a qmark variant was built but never executed), which was
        # injection-prone and broke on quoting edge cases.
        column_names = list(column_dict.keys())
        sql_query = ("INSERT INTO " + table + " (" + ",".join(column_names) +
                     ") VALUES (" + ",".join("?" * len(column_names)) + ")")
        cur.execute(sql_query,
                    tuple(column_dict[name] for name in column_names))

        # Write to logfile
        now = datetime.datetime.now()
        log_file.write("[" + str(now) + "]" + "\t" +
                       "New entry added with ID:" + "\t" + ID + "." + "\n")
        conn.commit()

    # CLEANUP
    conn.commit()
    cur.close()
    # BUGFIX: close the connection and log file handles (conn was never
    # closed before).
    conn.close()
    log_file.close()