Example #1
# Import assumes the ncbimeta package layout
from ncbimeta import NCBImetaErrors

def test_ErrorSQLNameSanitize():
    '''Test the class ErrorSQLNameSanitize (error when a table name is improperly formatted)'''
    # Use an improper table name
    test_name = "); drop tables --"
    test_sanitize_name = "droptables"
    # Raise the error
    test_error = NCBImetaErrors.ErrorSQLNameSanitize(test_name, test_sanitize_name)
    error_output = str(test_error)
    error_expect = ("\n\nThe name: " + test_name +
                    " contains problematic characters. Please rename it to: " +
                    test_sanitize_name)
    assert error_output == error_expect
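
The expected output above implies that sql_sanitize strips every character that is not alphanumeric or an underscore. A minimal sketch of such a sanitizer, inferred from the test's input and expected output rather than taken from the library's actual implementation:

import re

def sql_sanitize(name):
    # Drop everything except letters, digits, and underscores,
    # so "); drop tables --" collapses to "droptables".
    return re.sub(r"[^A-Za-z0-9_]", "", name)
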
Example #2

# Assumed reconstruction: the matching "if" for this "else" branch was cut off
# in the excerpt; opening the connection here keeps the fragment coherent.
if os.path.exists(db_name):
    conn = sqlite3.connect(db_name)
else:
    raise NCBImetaErrors.ErrorDBNotExists(db_name)

if not os.path.exists(annot_file_name):
    raise NCBImetaErrors.ErrorAnnotFileNotExists(annot_file_name)

# No errors were raised; safe to create a cursor on the database
cur = conn.cursor()

#---------------------------Check Table---------------------------------#

# Check table name
table_name = db_table
table_name_sanitize = NCBImetaUtilities.sql_sanitize(table_name)
if table_name != table_name_sanitize:
    raise NCBImetaErrors.ErrorSQLNameSanitize(table_name, table_name_sanitize)

# Check table exists
if not NCBImetaUtilities.table_exists(cur, db_table):
    raise NCBImetaErrors.ErrorTableNotInDB(db_table)

#-----------------------------------------------------------------------#
#                                File Setup                             #
#-----------------------------------------------------------------------#

# Get the list of column names in the table
cur.execute('''SELECT * FROM {}'''.format(db_table))
db_col_names = [description[0] for description in cur.description]

#-----------------------------------------------------------------------#
#                             Annotation Setup                          #
#-----------------------------------------------------------------------#
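
For context, the table_exists helper used above presumably checks sqlite's schema catalog. A plausible sketch, assuming the standard sqlite_master lookup (the real NCBImetaUtilities helper may differ):

def table_exists(db_cursor, table_name):
    # Look the table up in sqlite's built-in schema catalog
    query = "SELECT name FROM sqlite_master WHERE type='table' AND name=?"
    return db_cursor.execute(query, (table_name,)).fetchone() is not None
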
Example #3
def UpdateDB(table, output_dir, database, email, search_term, table_columns,
             log_path, db_dir, api_key, force_pause_seconds):
    '''
    Update the contents of a local sqlite database using records retrieved
    from NCBI, as configured by the user.

    Parameters:
    table (str): Name of the NCBI database to search.
    output_dir (str): Path to the directory where output is written.
    database (str): Filename of the local sqlite database.
    email (str): User email.
    search_term (str): Entrez search query.
    table_columns (list): List of single-key dictionaries mapping a column
        name to its API field name, e.g. [{AssemblyGenbankID: GbUid}].
    log_path (str): Path to the directory where the logfile is stored.
    db_dir (str): Path to the directory where the database is stored.
    api_key (str): NCBI user account API key.
    force_pause_seconds (float): Number of seconds to wait between fetch attempts.
    '''

    print("\nCreating/Updating the " + table +
          " table using the following parameters: " + "\n" + "\t" +
          "Database: " + "\t\t" + database + "\n" + "\t" + "Search Term:" +
          "\t" + "\t" + search_term + "\n" + "\t" + "Email: " + "\t\t\t" +
          email + "\n" + "\t" + "API Key: " + "\t\t" + api_key + "\n" + "\t" +
          "Output Directory: " + "\t" + output_dir + "\n\n",
          flush=True)

    Entrez.email = email
    Entrez.api_key = api_key
    # Allow a maximum of 3 tries for error catching before exiting program
    Entrez.max_tries = 3
    # Sleep for 1 second after an error has been generated before retrying
    Entrez.sleep_between_tries = 1

    #---------------------------------------------------------------------------#
    #                                File Setup                                 #
    #---------------------------------------------------------------------------#
    # Name of Log File
    log_file_path = os.path.join(
        log_path, "",
        os.path.splitext(database)[0] + "_" + table + ".log")

    # Append to the log file if it already exists; otherwise create it.
    if os.path.exists(log_file_path):
        log_file = open(log_file_path, "a")
    else:
        log_file = open(log_file_path, "w")

    #--------------------------------------------------------------------------#
    #                                SQL Setup                                 #
    #--------------------------------------------------------------------------#

    # Check for problematic table name
    table_name = table
    table_name_sanitize = NCBImetaUtilities.sql_sanitize(table_name)
    if table_name != table_name_sanitize:
        raise NCBImetaErrors.ErrorSQLNameSanitize(table_name,
                                                  table_name_sanitize)

    # Connect to database and establish cursor for commands.
    conn = sqlite3.connect(os.path.join(db_dir, "", database))
    cur = conn.cursor()

    # Create the table if it does not exist, with dynamic columns from the config file
    sql_query = ("CREATE TABLE IF NOT EXISTS " + table +
                 " (id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, " +
                 table + "_id TEXT")

    for column_name_dict in table_columns:
        column_name = list(column_name_dict.keys())[0]
        # Check for problematic column name
        col_name_sanitize = NCBImetaUtilities.sql_sanitize(column_name)
        if column_name != col_name_sanitize:
            raise NCBImetaErrors.ErrorSQLNameSanitize(column_name,
                                                      col_name_sanitize)

        # By default, every user-specified column is type TEXT
        sql_query += ", " + column_name + " TEXT"

    sql_query += ")"

    cur.execute(sql_query)

    #-----------------------------------------------------------------------#
    #                          Entrez Search                                #
    #-----------------------------------------------------------------------#
    # Read the record; check for HTTP, URL, and runtime errors
    read_succeed = False
    read_attempts = 0

    # Entrez searching and record reading occur in a while loop so errors can be caught and retried
    while not read_succeed and read_attempts < Entrez.max_tries:
        kwargs = {
            "db": table.lower(),
            "term": search_term,
            "retmax": "9999999"
        }
        entrez_method = Entrez.esearch
        # Possible urllib errors and RuntimeErrors may occur in the next line
        handle = NCBImetaUtilities.HTTPErrorCatch(entrez_method,
                                                  Entrez.max_tries,
                                                  Entrez.sleep_between_tries,
                                                  **kwargs)
        try:
            record = Entrez.read(handle)
            read_succeed = True
        except RuntimeError:
            read_attempts += 1
            print("Runtime Error encountered. Sleeping for " +
                  str(Entrez.sleep_between_tries) +
                  " seconds before retrying.")
            time.sleep(Entrez.sleep_between_tries)

    if read_attempts == Entrez.max_tries and not read_succeed:
        raise NCBImetaErrors.ErrorMaxReadAttemptsExceeded(table)

    # Count total number of entries, create counter
    num_records = int(record['Count'])
    num_processed = 0

    #-----------------------------------------------------------------------#
    #                          Iterate Through ID List                      #
    #-----------------------------------------------------------------------#

    for ID in record['IdList']:
        #-------------------Progress Log and Entry Counter-------------------#
        # Increment entry counter and record progress to screen
        num_processed += 1
        print("ID: " + ID, flush=True)
        print("Processing record: " +
               str(num_processed) + \
               "/" + str(num_records), flush = True)

        #------------Check if Record Already Exists in Database------------#
        sql_query = ("SELECT EXISTS(SELECT " + table + "_id FROM " + table +
                     " WHERE " + table + "_id=?)")
        cur.execute(sql_query, (ID, ))

        # 0 if not found, 1 if found
        record_exists = cur.fetchone()[0]

        # If the record exists, skip it (i.e. "continue" to the next record)
        if record_exists:
            continue
        '''
        IMPORTANT:
        The ID should not exist in the table UNLESS the record was fully parsed.
        i.e. The database does not get updated until the end of each record.
        '''
        # Sleep before the HTTP-error-catching fetch in the next section;
        # the pause duration is controlled by the user configuration file
        time.sleep(force_pause_seconds)

        #--------------Fetch the Record to Add to the Database-------------#
        # The Assembly table cannot be retrieved using efetch, only as a docsum via esummary
        if table.lower() == "assembly":
            # Use esummary to return a record summary, wrapped in HTTP error checking
            kwargs = {"db": table.lower(), "id": ID, "retmode": "xml"}
            entrez_method = Entrez.esummary
        else:
            # For any other table, use efetch and retrieve XML
            kwargs = {"db": table.lower(), "id": ID, "retmode": "xml"}
            if table.lower() == "nucleotide":
                kwargs["rettype"] = "gb"
            entrez_method = Entrez.efetch

        # ID_handle is an _io.TextIOWrapper object, which originally had utf-8 encoding
        ID_handle = NCBImetaUtilities.HTTPErrorCatch(
            entrez_method, Entrez.max_tries, Entrez.sleep_between_tries,
            **kwargs)

        # Ideal world: Pass an undecoded string to the xml parser
        # Could be accomplished by opening in binary ('rb')
        # tempfiles by default are opened as mode='w+b'
        with tempfile.NamedTemporaryFile(delete=False) as temp_b:
            # Write the data from ID_handle to a temporary file (binary)
            for line in ID_handle:
                temp_b.write(str.encode(line))
            temp_b.close()
            # Read the data as binary, into the XML parser. Avoids encoding issues
            with open(temp_b.name, 'rb') as xml_source:
                ID_root = etree.parse(xml_source, parser=LXML_CDATA_PARSER)

        #----------------------------------------------------------------------#
        #                         NCBI Record Parsing                          #
        #----------------------------------------------------------------------#

        # Debugging aid: print(etree.tostring(ID_root).decode())

        column_dict = {}
        # Add ID to the dictionary
        column_dict[table + "_id"] = [ID]
        # A special dictionary for gbseq annotations
        gbseq_dict = {}
        # Iterate through each column to search for metadata
        for column in table_columns:
            column_name = list(column.keys())[0]
            column_value = []
            column_payload = list(column.values())[0]
            column_payload = column_payload.split(", ")
            # Initialize with empty values
            column_dict[column_name] = column_value

            #-------------------------------------------------------#
            #   XML Parse for node or attribute
            #-------------------------------------------------------#
            working_root = ID_root
            # If there are special characters, this query should not be used for XPath
            bool_special_char = False
            for char in XPATH_SPECIAL_CHAR:
                for xquery in column_payload:
                    if char in xquery:
                        bool_special_char = True
            # If no special characters, run the XPath search function
            if not bool_special_char:
                NCBImetaUtilities.xml_search(working_root, column_payload,
                                             column_payload[0], column_name,
                                             column_dict)

            # Special parsing for GBSeq_comment
            # If we're on the GBSeq_comment element and the comment was added to the dictionary
            if "GBSeq_comment" in column_payload and len(
                    column_dict[column_name]) > 0:
                comment = column_dict[column_name][0]
                # Fix the CDS vs CDSs ambiguity
                comment = comment.replace("CDSs", "CDS")
                # The comment is initially subdivided by semicolons
                split_comment = comment.split(";")
                for item in split_comment:
                    # Further subdivided by double colons
                    split_item = item.split("::")
                    # The elements we're interested in contain "::"; otherwise skip
                    if len(split_item) < 2: continue
                    # Left side is the column name, right side is the metadata
                    split_key = split_item[0].lstrip(" ").rstrip(" ")
                    split_value = split_item[1].lstrip(" ").rstrip(" ")
                    gbseq_dict[split_key] = split_value

            # If the payload key appears in the gbseq comment dictionary, append its value
            if column_payload[0] in gbseq_dict:
                column_dict[column_name].append(gbseq_dict[column_payload[0]])

        # Clean up the collected values for SQL insertion
        for key in column_dict:
            # Remove empty string elements
            while "" in column_dict[key]:
                column_dict[key].remove("")

            # Concatenate multi-element values with the separator
            column_dict[key] = DB_VALUE_SEP.join(column_dict[key])

        # Write the column values to the db with dynamic variables
        sql_q_marks = ",".join(["?"] * len(column_dict.keys()))
        sql_q_marks = "(" + sql_q_marks + ")"
        sql_dynamic_colnames = "(" + ",".join(column_dict.keys()) + ")"
        sql_values_placeholder = [
            column_dict[column] for column in column_dict.keys()
        ]

        sql_query = "INSERT INTO " + table + " " + sql_dynamic_colnames + " VALUES " + sql_q_marks

        cur.execute(sql_query, sql_values_placeholder)

        # Write to logfile
        now = datetime.datetime.now()
        log_file.write("[" + str(now) + "]" + "\t" +
                       "New entry added with ID:" + "\t" + ID + "." + "\n")
        conn.commit()

    # CLEANUP
    conn.commit()
    cur.close()
    log_file.close()
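
A hypothetical invocation of UpdateDB is sketched below for reference. Every value is illustrative: the email, API key, paths, and search term are placeholders, not taken from the source.

UpdateDB(table="Assembly",
         output_dir="output",
         database="my_database.sqlite",
         email="user@example.com",                       # placeholder email
         search_term="Plasmodium falciparum[Organism]",  # placeholder query
         table_columns=[{"AssemblyGenbankID": "GbUid"}],
         log_path="output/log",
         db_dir="output/database",
         api_key="MY_NCBI_API_KEY",                      # placeholder key
         force_pause_seconds=0.5)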