def normalizeSingleFeatDict(self, fDict, minMaxDict, colMapObj):
    """Convenience function for running normalization of a single featDict"""
    newDict = {}
    for key, val in fDict.iteritems():
        if key in minMaxDict and key in colMapObj:
            (minVal, rangeVal) = minMaxDict[key]
            newKey = colMapObj[key]
            newDict[newKey] = (val - minVal) / rangeVal
        else:
            if self.logwarning and key not in minMaxDict:
                log.warning('key : %s not in minMaxDict' % str(key))
            if self.logwarning and key not in colMapObj:
                log.warning('key : %s not in colMapObj' % str(key))
    return newDict
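# A minimal usage sketch (not part of the original module): demonstrates the
# min-max scaling above, (val - minVal) / rangeVal, against a hand-built
# minMaxDict and column map. The instance parameter `normalizer` and the
# sample feature keys are hypothetical.
def _exampleNormalizeSingleFeatDict(normalizer):
    fDict = {"age": 50.0, "weight": 70.0}
    minMaxDict = {"age": (0.0, 100.0), "weight": (40.0, 80.0)}  # (min, range) per key
    colMapObj = {"age": 0, "weight": 1}  # Maps feature keys to new column keys
    normDict = normalizer.normalizeSingleFeatDict(fDict, minMaxDict, colMapObj)
    # Expect {0: 0.5, 1: 0.375}: each value scaled by its (min, range) pair
    return normDict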
def createCanonicalAtomMapSmiString_singleComponent(mol):
    """Return the canonical atom map SMILES string for a single component"""
    sym = symmetricAtomsExist(mol)
    if not sym:
        return createStdAtomMapSmiString(mol)
    isAtomMapped = hasAtomMaps(mol)
    if not isAtomMapped:
        return createStdAtomMapSmiString(mol)
    ## Otherwise, run through the possible atom map assignments
    possibleSmiles = createPossibleAtomMapSmiles(createStdAtomMapSmiString(mol))
    possibleSmiles.sort()
    if len(possibleSmiles) > 0:
        return possibleSmiles[0]
    log.warning('Possible SMILES = NONE, for %s' % createStdAtomMapSmiString(mol))
    return ''
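# Illustrative sketch (hypothetical strings, not from the original module):
# with symmetric, atom-mapped molecules, several equivalent atom map SMILES
# can describe the same component; the function above resolves the ambiguity
# by sorting the candidates and returning the lexicographically smallest.
def _exampleCanonicalTieBreak():
    possibleSmiles = ["[CH3:2][CH2:1][OH:3]", "[CH3:1][CH2:2][OH:3]"]
    possibleSmiles.sort()
    return possibleSmiles[0]  # "[CH3:1][CH2:2][OH:3]" -- smallest string wins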
def normalizeFeatDictList(self, featureReader, minMaxDict, colMapObj):
    """Given a feature reader, a normalization parameter dictionary, and a new
    colMapObj, yield out tuples of the form (id, newMappedFDict)"""
    for fDict in featureReader:
        newDict = {}
        idVal = featureReader.objDescriptions[-1]
        for key, val in fDict.iteritems():
            if key in minMaxDict and key in colMapObj:
                (minVal, rangeVal) = minMaxDict[key]
                newKey = colMapObj[key]
                newDict[newKey] = (val - minVal) / rangeVal
            else:
                if key not in minMaxDict and self.logwarning:
                    log.warning('key : %s not in minMaxDict' % str(key))
                if key not in colMapObj and self.logwarning:
                    log.warning('key : %s not in colMapObj' % str(key))
        yield (idVal, newDict)
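# Usage sketch for the generator above, assuming a feature reader over a
# single row. The stub class here is hypothetical, standing in for the real
# featureReader interface with its objDescriptions attribute.
def _exampleNormalizeFeatDictList(normalizer):
    class _StubReader(list):
        objDescriptions = ["id-001"]
    reader = _StubReader([{"age": 25.0}])
    minMaxDict = {"age": (0.0, 100.0)}
    colMapObj = {"age": 0}
    for idVal, normDict in normalizer.normalizeFeatDictList(reader, minMaxDict, colMapObj):
        print idVal, normDict  # e.g., id-001 {0: 0.25}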
def findOrInsertItem(tableName, searchDict, insertDict=None, retrieveCol=None, forceUpdate=False, autoCommit=True, conn=None, connFactory=None):
    """Search the named table in the database for a row whose attributes match
    the key-value pairs specified in searchDict.

    If one exists, then return the column (probably the primary key) named by
    retrieveCol. Otherwise, insert a row into the table with the data specified
    in the insertDict key-value pairs and try accessing the retrieveCol again
    (presumably for the row just inserted).

    If forceUpdate is specified as True, then even if the row already exists
    in the database, update the row to match the contents of the insertDict.

    The connection object to the database (conn) can be specified; otherwise it
    will just default to that returned by the connection() method. If no
    insertDict is specified, use the searchDict as necessary. If no retrieveCol
    is specified, then will attempt to find the default primary key column
    based on the table name.

    Returns a tuple (col, isNew) where col is the value of the retrieveCol and
    isNew is a boolean indicating whether this came from a row just inserted
    or was taken from an existing record.
    """
    extConn = ( conn is not None )
    if insertDict is None:
        insertDict = searchDict
    if conn is None:
        # If no specific connection object provided, look for a connection factory to produce one
        if connFactory is not None:
            conn = connFactory.connection()
        else:
            # No connection or factory specified, just fall back on default connection then
            conn = connection()
    try:
        cur = conn.cursor()

        # Create the query for checking if it's already in the database
        searchQuery = SQLQuery()
        if retrieveCol is None:
            searchQuery.addSelect(defaultIDColumn(tableName))
        else:
            searchQuery.addSelect(retrieveCol)
        searchQuery.addFrom(tableName)

        for i, (col, value) in enumerate(searchDict.iteritems()):
            if value is not None:
                searchQuery.addWhereEqual(col, value)
            else:
                # Equals operator doesn't work for null values
                searchQuery.addWhereOp(col, "is", value)

        # Convert query as a model into a single string
        searchParams = searchQuery.params
        searchQuery = str(searchQuery)
        log.debug("Before Select Query: " + parameterizeQueryString(searchQuery, searchParams))

        # Check if the retrieveCol is already in the database, by these search criteria
        cur.execute(searchQuery, searchParams)
        result = cur.fetchone()
        log.debug("After Select/fetchone Query: " + parameterizeQueryString(searchQuery, searchParams))

        rowExisted = result is not None
        if rowExisted:
            if forceUpdate:
                # Item already exists, but want to force an update with the insertDict contents
                updateRow(tableName, insertDict, searchDict.values(), searchDict.keys(), conn=conn)
                cur.execute(searchQuery, searchParams)
                result = cur.fetchone()
            return (result[0], not rowExisted)
        else:
            # Item does not yet exist. Insert it, then get the retrieveCol again.
            insertRow(tableName, insertDict, conn=conn, cursor=cur)
            # Allow user to not commit when providing his/her own connection
            if not extConn or autoCommit:
                conn.commit()

            # Now that insert or update has completed, try to retrieve the data again,
            # using sequences if possible
            #if retrieveCol is None:
            #    cur.execute(identityQuery(tableName));
            #else:
            # Comment out the above because it wasn't working for some tables.
            cur.execute(searchQuery, searchParams)
            result = cur.fetchone()
            if result is not None:
                # Returning data from the just inserted item
                return (result[0], not rowExisted)
            else:
                log.warning("For table " + tableName + ", could not find " + str(searchDict) + " even after inserting " + str(insertDict))
                return (None, None)
    finally:
        if not extConn:
            conn.close()  # If we opened the connection ourselves, then close it ourselves
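# Usage sketch (hypothetical table and columns): look up a row by name,
# inserting it with a default email if absent. The second element of the
# returned tuple reports whether the row was newly created.
def _exampleFindOrInsertItem():
    searchDict = {"name": "jdoe"}
    insertDict = {"name": "jdoe", "email": "jdoe@example.com"}
    (userId, isNew) = findOrInsertItem("users", searchDict, insertDict=insertDict)
    return (userId, isNew)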
def updateFromFile( sourceFile, tableName, columnNames=None, nIdCols=1, delim=None, skipErrors=False, connFactory=None ):
    """Update the database with the contents of a whitespace-delimited text file.

    Updates the contents of the <tableName> with the data from the <sourceFile>.
    One line is expected in the <sourceFile> per row in the database, with each
    item delimited by the <delim> character (specify None for any whitespace).
    These items will be inserted under the respective order of the given list
    of <columnNames>. If the columnNames parameter is not provided, assume the
    first line of the <sourceFile> contains the column names.

    To know which rows to update, assume the FIRST column listed in
    <columnNames> is the ID column to identify rows by. In that case, the data
    value there from the <sourceFile> will not be used to update the row, but
    will instead be used to identify the row to update the rest of the data by.
    If more than one column is necessary to identify a row (composite key),
    indicate how many of the first columns in <columnNames> should be used
    with <nIdCols>. Note that these key ID values must not be None / null.
    The query looks for rows where columnName = value, and the = operator
    always returns false when the value is null.

    Returns the total number of rows successfully updated.
    """
    if columnNames is None or len(columnNames) < 1:
        headerLine = sourceFile.readline()
        columnNames = headerLine.split(delim)

    conn = None
    if connFactory is not None:
        conn = connFactory.connection()
    else:
        conn = connection()
    cur = conn.cursor()

    nCols = len(columnNames)

    try:
        # Prepare the SQL statement
        sql = []
        sql.append("update")
        sql.append(tableName)
        sql.append("set")

        # Data columns
        for i in xrange(nIdCols, nCols):
            sql.append(columnNames[i])
            sql.append("=")
            sql.append(SQL_PLACEHOLDER)
            sql.append(",")
        sql.pop()  # Remove extra comma at end

        # ID columns
        sql.append("where")
        for i in xrange(nIdCols):
            sql.append(columnNames[i])
            sql.append("=")
            sql.append(SQL_PLACEHOLDER)
            sql.append("and")
        sql.pop()  # Remove extra "and" at end

        sql = str.join(" ", sql)
        log.debug(sql)

        # Loop through the file and execute the update statement for every line
        progress = ProgressDots()
        for iLine, line in enumerate(sourceFile):
            if not line.startswith(COMMENT_TAG):
                try:
                    line = line[:-1]  # Strip the newline character
                    params = line.split(delim)

                    # Special handling for null / None string
                    for iParam in xrange(len(params)):
                        if params[iParam] == "" or params[iParam] == NULL_STRING:
                            # Treat blank strings as NULL
                            params[iParam] = None

                    # Reposition ID columns to the end of the parameter list
                    idParams = params[:nIdCols]
                    dataParams = params[nIdCols:]
                    paramTuple = dataParams
                    paramTuple.extend(idParams)
                    paramTuple = tuple(paramTuple)

                    cur.execute(sql, paramTuple)

                    # Need to "auto-commit" after each command,
                    # otherwise a skipped error will rollback
                    # any previous commands as well
                    if skipErrors:
                        conn.commit()

                    progress.Update()
                except Exception, err:
                    conn.rollback()  # Reset changes and connection state
                    log.critical(sql)
                    log.critical(paramTuple)
                    log.warning("Error Executing in Script: %s", parameterizeQueryString(sql, paramTuple))
                    if skipErrors:
                        log.warning(err)
                    else:
                        raise err
        conn.commit()
        return progress.GetCounts()
    finally:
        conn.close()  # The bare try block above needs a closer; releasing the connection here matches the pattern of the surrounding functions
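# Usage sketch, assuming a tab-delimited file whose first column is the row
# ID. The file name, table name, and columns are hypothetical.
def _exampleUpdateFromFile():
    sourceFile = open("patient_updates.tsv")
    try:
        nUpdated = updateFromFile(sourceFile, "patients",
            columnNames=["patient_id", "status", "last_visit"],
            nIdCols=1, delim="\t", skipErrors=True)
    finally:
        sourceFile.close()
    return nUpdated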
def insertFile( sourceFile, tableName, columnNames=None, delim=None, idFile=None, skipErrors=False, dateColFormats=None, escapeStrings=False, estInput=None, connFactory=None ):
    """Insert the contents of a whitespace-delimited text file into the database.

    For PostgreSQL specifically, consider the alternative direct COPY command,
    which can run ~10x faster. E.g.,

        gzip -d -c TTeam_2014.tsv.gz | psql -U jonc101 -c "COPY tteamx ( pat_deid, enc_deid, relation, prov_id, prov_name, start_date, end_date ) FROM STDIN WITH (FORMAT csv, DELIMITER E'\t', HEADER, NULL 'None');" resident-access-log-2017

    Inserts the contents of the <sourceFile> into the database under the
    <tableName>. One line is expected in the <sourceFile> per row in the
    database, with each item delimited by the <delim> character. These items
    will be inserted under the respective order of the given list of
    columnNames.

    Use the built-in csv module for parsing out lines and managing quotes, etc.
    If the delimiter is not specified (None), then default to tab-delimited.

    If idFile is provided, then will try to run SQL from the identityQuery
    method after each insert, and write out the contents, one per line, to the
    idFile. Will bypass the above step if an insert column with the expected
    default ID column name ("tableName_id") can be found.

    If dateColFormats is provided, expect a dictionary keyed by the names of
    columns that should be interpreted as date strings, with values equal to
    the Python date format string to parse them by. If a format string is not
    provided, a series of standard date format strings will be attempted
    (but this is inefficient for repeated date text parsing and error handling).

    Returns the total number of rows successfully inserted.
    """
    if columnNames is not None and len(columnNames) < 1:
        columnNames = None  # If empty columnNames list, then reset to null and look for it in first line of data

    reader = TabDictReader(sourceFile, fieldnames=columnNames, delimiter=delim)
    columnNames = reader.fieldnames

    idCol = defaultIDColumn(tableName)
    iIdCol = None  # Index of manually specified ID column. May be null
    for iCol, colName in enumerate(columnNames):
        if colName == idCol:
            iIdCol = iCol

    if dateColFormats is not None:
        # Ensure column keys are normalized
        dateCols = dateColFormats.keys()
        for dateCol in dateCols:
            normalCol = normalizeColName(dateCol)
            dateColFormats[normalCol] = dateColFormats[dateCol]

    conn = None
    if connFactory is not None:
        conn = connFactory.connection()
    else:
        conn = connection()
    cur = conn.cursor()

    try:
        # Prepare the SQL statement
        sqlParts = []
        sqlParts.append("insert into")
        sqlParts.append(tableName)
        sqlParts.append("(")
        sqlParts.append(str.join(",", columnNames))
        sqlParts.append(")")
        sqlParts.append("values")
        sqlParts.append("(")
        for i in range(len(columnNames)):
            sqlParts.append(SQL_PLACEHOLDER)  # Parameter placeholder, depends on DB-API
            sqlParts.append(",")
        sqlParts.pop()  # Remove extra end comma
        sqlParts.append(")")
        sql = str.join(" ", sqlParts)
        log.debug(sql)

        # Loop through the file and execute the insert statement for every row read
        nInserts = 0
        nCols = len(columnNames)
        params = list()
        progress = ProgressDots(total=estInput)
        for iLine, rowModel in enumerate(reader):
            # Parse out data values from strings
            for iCol, colName in enumerate(columnNames):
                value = parseValue(rowModel[colName], colName, dateColFormats, escapeStrings)
                params.append(value)

            log.debug(params)
            try:
                cur.execute(sql, tuple(params))
                nInserts += cur.rowcount

                if idFile != None:
                    rowId = None
                    if iIdCol is not None:  # Look for manually assigned ID value first
                        rowId = params[iIdCol]
                    else:
                        cur.execute(identityQuery(tableName))
                        rowId = cur.fetchone()[0]
                    print >> idFile, rowId

                # Need to "auto-commit" after each command,
                # otherwise a skipped error will rollback
                # any previous commands as well
                if skipErrors:
                    conn.commit()

                progress.Update()
            except Exception, err:
                log.info(sql)
                log.info(tuple(params))
                conn.rollback()  # Reset any changes since the last commit
                if skipErrors:
                    log.warning("Error Executing in Script: " + sql)
                    log.warning(err)
                else:
                    raise
            params = list()

        conn.commit()
        return nInserts
    finally:
        conn.close()  # The bare try block above needs a closer; releasing the connection here matches the pattern of the surrounding functions
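# Usage sketch (hypothetical file and table): load a tab-delimited file,
# parsing the admit_date column with an explicit format, and capture the
# generated IDs one per line.
def _exampleInsertFile():
    sourceFile = open("encounters.tsv")
    idFile = open("encounter_ids.txt", "w")
    try:
        nInserted = insertFile(sourceFile, "encounters", delim="\t",
            idFile=idFile, dateColFormats={"admit_date": "%Y-%m-%d"},
            skipErrors=True)
    finally:
        idFile.close()
        sourceFile.close()
    return nInserted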
def querySourceItems(self, patientIds=None, progress=None, conn=None):
    """Query the database for a list of all patient demographics
    and yield the results one at a time. If patientIds is provided,
    only return items matching those IDs.
    """
    extConn = conn is not None
    if not extConn:
        conn = self.connFactory.connection()

    # Column headers to query for that map to respective fields in analysis table
    headers = ["pat_id", "birth_year", "gender", "death_date", "race", "ethnicity"]

    query = SQLQuery()
    for header in headers:
        query.addSelect(header)
    query.addFrom("stride_patient as sp")
    if patientIds is not None:
        query.addWhereIn("sp.pat_id", patientIds)

    # Query to get an estimate of how long the process will be
    if progress is not None:
        progress.total = DBUtil.execute(query.totalQuery(), conn=conn)[0][0]

    cursor = conn.cursor()
    # Do one massive query, but yield data for one item at a time
    cursor.execute(str(query), tuple(query.params))

    row = cursor.fetchone()
    while row is not None:
        rowModel = RowItemModel(row, headers)

        if rowModel["birth_year"] is None:
            # Blank values, doesn't make sense. Skip it
            log.warning(rowModel)
        else:
            # Record birth at resolution of year
            rowModel["itemDate"] = datetime(rowModel["birth_year"], 1, 1)
            rowModel["name"] = "Birth"
            rowModel["description"] = "Birth Year"
            yield rowModel

            # Record another at resolution of decade
            decade = (rowModel["birth_year"] / 10) * 10
            rowModel["itemDate"] = datetime(rowModel["birth_year"], 1, 1)
            rowModel["name"] = "Birth%ds" % decade
            rowModel["description"] = "Birth Decade %ds" % decade
            yield rowModel

            # Summarize race and ethnicity information into a single field of interest
            raceEthnicity = self.summarizeRaceEthnicity(rowModel)
            rowModel["itemDate"] = datetime(rowModel["birth_year"], 1, 1)
            rowModel["name"] = "Race" + (raceEthnicity.translate(None, " ()-/"))  # Strip off punctuation
            rowModel["description"] = "Race/Ethnicity: %s" % raceEthnicity
            yield rowModel

            gender = rowModel["gender"].title()
            rowModel["name"] = gender
            rowModel["description"] = "%s Gender" % gender
            yield rowModel

            if rowModel["death_date"] is not None:
                rowModel["name"] = "Death"
                rowModel["description"] = "Death Date"
                rowModel["itemDate"] = rowModel["death_date"]
                yield rowModel

        row = cursor.fetchone()
        if progress is not None:  # Guard against the default progress=None
            progress.Update()

    # Slight risk here. Normally DB connection closing should be in the finally of a try block,
    # but using the "yield" generator construct forbids us from using a try, finally construct.
    cursor.close()
    if not extConn:
        conn.close()
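# Usage sketch (hypothetical instance and patient IDs): stream demographic
# items for two patients; each yielded RowItemModel carries a name,
# description, and itemDate for one derived event (birth year, birth decade,
# race/ethnicity, gender, death).
def _exampleQuerySourceItems(converter):
    for rowModel in converter.querySourceItems(patientIds=[1001, 1002]):
        print rowModel["name"], rowModel["itemDate"]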