Example #1
def normalizeSingleFeatDict(self, fDict, minMaxDict, colMapObj):
    """Convenience function for running normalization of a single featDict"""
    newDict = {}
    for key, val in fDict.iteritems():
        if key in minMaxDict and key in colMapObj:
            (minVal, rangeVal) = minMaxDict[key]
            newKey = colMapObj[key]
            newDict[newKey] = (val - minVal) / rangeVal
        else:
            if self.logwarning and key not in minMaxDict:
                log.warning('key : %s not in minMaxDict' % str(key))
            if self.logwarning and key not in colMapObj:
                log.warning('key : %s not in colMapObj' % str(key))
    return newDict
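A hypothetical usage sketch; the instance, feature keys, and ranges below are invented, but the (val - min) / range arithmetic is exactly what the method above applies:

# minMaxDict maps feature key -> (min value, value range) from training data;
# colMapObj maps raw feature keys to normalized column identifiers
minMaxDict = {"age": (0.0, 100.0), "weight": (40.0, 80.0)}
colMapObj = {"age": 0, "weight": 1}
fDict = {"age": 25.0, "weight": 60.0, "mystery": 1.0}

normalized = normalizer.normalizeSingleFeatDict(fDict, minMaxDict, colMapObj)
# normalized == {0: 0.25, 1: 0.25}; "mystery" is dropped (with a warning logged)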
Example #2
def createCanonicalAtomMapSmiString_singleComponent(mol):
    """return the canon atm map string for a single component"""
    sym = symmetricAtomsExist(mol)
    if not sym:
        return createStdAtomMapSmiString(mol)
    isAtomMapped = hasAtomMaps(mol)
    if not isAtomMapped:
        return createStdAtomMapSmiString(mol)
    
    ## Otherwise, enumerate the possible atom-map assignments and pick the lexicographically smallest
    possibleSmiles = createPossibleAtomMapSmiles(createStdAtomMapSmiString(mol))
    possibleSmiles.sort()
    if len(possibleSmiles) > 0:
        return possibleSmiles[0]
    
    log.warning('Possible SMILES = NONE, for %s' % createStdAtomMapSmiString(mol))
    return ''
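The underlying idea is canonicalization by tie-breaking: when symmetry makes several atom-mapped SMILES equally valid, generate every candidate and keep the lexicographically smallest. A minimal sketch of that selection step, with an invented candidate list:

# Hypothetical candidates for one molecule with symmetric, atom-mapped atoms
possibleSmiles = ["[CH3:2][CH2:1]O", "[CH3:1][CH2:2]O"]
canonical = min(possibleSmiles) if possibleSmiles else ""
# min() picks the same string as sort-then-take-first, without mutating the list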
Example #3
def normalizeFeatDictList(self, featureReader, minMaxDict, colMapObj):
    """Given a feature reader, a normalization parameter dictionary, and a new colMapObj,
    yield tuples of the form (id, newMappedFDict)."""
    for fDict in featureReader:
        newDict = {}
        idVal = featureReader.objDescriptions[-1]
        for key, val in fDict.iteritems():
            if key in minMaxDict and key in colMapObj:
                (minVal, rangeVal) = minMaxDict[key]
                newKey = colMapObj[key]
                newDict[newKey] = (val - minVal) / rangeVal
            else:
                if key not in minMaxDict and self.logwarning:
                    log.warning('key : %s not in minMaxDict' % str(key))
                if key not in colMapObj and self.logwarning:
                    log.warning('key : %s not in colMapObj' % str(key))
        yield (idVal, newDict)
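Because normalizeFeatDictList is a generator, rows stream through one at a time; a hypothetical driver loop might look like:

# featureReader is assumed to yield feature dicts while exposing
# objDescriptions, from which the method above pulls the row ID
for idVal, normDict in normalizer.normalizeFeatDictList(featureReader, minMaxDict, colMapObj):
    print idVal, normDict   # One normalized row at a time, constant memory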
Example #4
def findOrInsertItem(tableName, searchDict, insertDict=None, retrieveCol=None, forceUpdate=False, autoCommit=True, conn=None, connFactory=None):
    """Search the named table in database for a row whose attributes match the key-value pairs specified in searchDict.  

    If one exists, then return the column (probably the primary key) named by retrieveCol.  
    Otherwise, insert a row into the table with the data specified in the insertDict key-value pairs
    and try accessing the retrieveCol again (presumably the one just inserted).  
    
    If forceUpdate is specified as True, then, even if the row already exists in the database, 
    update the row to match the contents of the insertDict.
    
    The connection object to the database (conn) can be specified, otherwise it will just default 
    to that returned by the connection() method.  If no insertDict is specified, use the searchDict 
    as necessary.  If no retrieveCol is specified, then will attempt to find the default primary 
    key column based on the table name.

    Returns a tuple (col, isNew) where col is the value of the retrieveCol and isNew is a boolean 
    indicating if this came from a new row just inserted or if it was just taken from an existing record.
    """
    extConn = (conn is not None)
    if insertDict is None:
        insertDict = searchDict
    if conn is None:
        # If no specific connection object provided, look for a connection factory to produce one
        if connFactory is not None:
            conn = connFactory.connection()
        else:
            # No connection or factory specified, just fall back on the default connection
            conn = connection()

    try:
        cur = conn.cursor()

        # Create the query for checking if it's already in the database
        searchQuery = SQLQuery()

        if retrieveCol is None:
            searchQuery.addSelect(defaultIDColumn(tableName))
        else:
            searchQuery.addSelect(retrieveCol)
        searchQuery.addFrom(tableName)

        for col, value in searchDict.iteritems():
            if value is not None:
                searchQuery.addWhereEqual(col, value)
            else:
                # Equals operator doesn't work for null values
                searchQuery.addWhereOp(col, "is", value)

        # Convert the query model into a single string
        searchParams = searchQuery.params
        searchQuery = str(searchQuery)
        log.debug("Before Select Query: " + parameterizeQueryString(searchQuery, searchParams))

        # Check if the retrieveCol is already in the database,
        #   by these search criteria
        cur.execute(searchQuery, searchParams)
        result = cur.fetchone()

        log.debug("After Select/fetchone Query: " + parameterizeQueryString(searchQuery, searchParams))

        rowExisted = result is not None

        if rowExisted:
            if forceUpdate:
                # Item already exists, but want to force an update with the insertDict contents
                updateRow(tableName, insertDict, searchDict.values(), searchDict.keys(), conn=conn)
                cur.execute(searchQuery, searchParams)
                result = cur.fetchone()
            return (result[0], not rowExisted)
        else:
            # Item does not yet exist.  Insert it, then get the retrieveCol again.
            insertRow(tableName, insertDict, conn=conn, cursor=cur)

            # Allow the user to defer commits when providing his/her own connection
            if not extConn or autoCommit:
                conn.commit()

        # Now that the insert has completed, try to retrieve the data again.
        # (An identityQuery(tableName) lookup would be an alternative here,
        #   but it wasn't working for some tables, so re-run the search query instead.)
        cur.execute(searchQuery, searchParams)
        result = cur.fetchone()
        if result is not None:
            # Returning data from the just inserted item
            return (result[0], not rowExisted)
        else:
            log.warning("For table " + tableName + ", could not find " + str(searchDict) + " even after inserting " + str(insertDict))
            return (None, None)
    finally:
        if not extConn:
            conn.close()    # If we opened the connection ourselves, then close it ourselves
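A hypothetical usage sketch; the "user" table, its columns, and the values are invented for illustration:

# Look up a user row by name, inserting it first if it doesn't exist yet
searchDict = {"name": "jdoe"}
insertDict = {"name": "jdoe", "status": "active"}
(userId, isNew) = findOrInsertItem("user", searchDict, insertDict, retrieveCol="user_id")
if isNew:
    print "Inserted new user with id", userId
else:
    print "Found existing user with id", userId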
Example #5
def updateFromFile( sourceFile, tableName, columnNames=None, nIdCols=1, delim=None, skipErrors=False, connFactory=None  ):
    """Update the database with the contents of a whitespace-delimited text file.
    
    Updates the contents of the <tableName> with the data from the <sourceFile>.  
    One line is expected in the <sourceFile> per row in the database, with each item 
    delimited by the <delim> character (specify None for any whitespace).  
    These items will be inserted under the respective order of the given list of 
    <columnNames>.  If the columnNames parameter is not provided, assume the
    first line of the <sourceFile> contains the column names.

    To know which rows to update, the FIRST column listed in <columnNames> is assumed
    to be the ID column that identifies rows.  Its value from the <sourceFile> will not
    be used to update the row, but instead identifies which row to update with the rest
    of the data.  If more than one column is necessary to identify a row (composite key),
    indicate how many of the first columns in <columnNames> should be used with <nIdCols>.
    Note that these key ID values must not be None / null: the query looks for rows where
    columnname = value, and the = operator always returns false when the value is null.

    Returns the total number of rows successfully updated.
    """
    if columnNames is None or len(columnNames) < 1:
        headerLine = sourceFile.readline()
        columnNames = headerLine.split(delim)

    conn = None
    if connFactory is not None:
        conn = connFactory.connection()
    else:
        conn = connection()
    cur = conn.cursor()

    nCols = len(columnNames)

    try:
        # Prepare the SQL statement
        sql = []
        sql.append("update")
        sql.append(tableName)
        sql.append("set")

        # Data columns
        for i in xrange(nIdCols, nCols):
            sql.append(columnNames[i])
            sql.append("=")
            sql.append(SQL_PLACEHOLDER)
            sql.append(",")
        sql.pop()   # Remove extra comma at end

        # ID columns
        sql.append("where")
        for i in xrange(nIdCols):
            sql.append(columnNames[i])
            sql.append("=")
            sql.append(SQL_PLACEHOLDER)
            sql.append("and")
        sql.pop()   # Remove extra "and" at end

        sql = " ".join(sql)

        log.debug(sql)

        # Loop through the file and execute the update statement for every line
        progress = ProgressDots()
        for iLine, line in enumerate(sourceFile):
            if not line.startswith(COMMENT_TAG):
                try:
                    line = line[:-1]    # Strip the newline character
                    params = line.split(delim)

                    # Special handling for null / None string
                    for iParam in xrange(len(params)):
                        if params[iParam] == "" or params[iParam] == NULL_STRING:   # Treat blank strings as NULL
                            params[iParam] = None

                    # Reposition ID columns to the end of the parameter list,
                    #   matching the "set ... where ..." order of the statement
                    idParams = params[:nIdCols]
                    dataParams = params[nIdCols:]
                    paramTuple = dataParams
                    paramTuple.extend(idParams)
                    paramTuple = tuple(paramTuple)

                    cur.execute(sql, paramTuple)

                    # Need to "auto-commit" after each command,
                    #   otherwise a skipped error will rollback
                    #   any previous commands as well
                    if skipErrors:
                        conn.commit()

                    progress.Update()

                except Exception as err:
                    conn.rollback()    # Reset changes and connection state
                    log.critical(sql)
                    log.critical(paramTuple)
                    log.warning("Error Executing in Script: %s", parameterizeQueryString(sql, paramTuple))
                    if skipErrors:
                        log.warning(err)
                    else:
                        raise err

        conn.commit()

        return progress.GetCounts()
    finally:
        conn.close()    # Assumed cleanup: close the locally opened connection (as in findOrInsertItem)
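A hypothetical usage sketch with an in-memory source file; the "patient" table and its columns are made up for illustration:

from StringIO import StringIO   # Python 2 in-memory file object

# First line supplies the column names; the first nIdCols=1 column (pat_id)
# identifies which row to update, and the remaining columns are the data to set
sourceFile = StringIO("pat_id\tgender\tbirth_year\n123\tFemale\t1980\n")
nUpdated = updateFromFile(sourceFile, "patient", nIdCols=1, delim="\t")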
Example #6
def insertFile( sourceFile, tableName, columnNames=None, delim=None, idFile=None, skipErrors=False, dateColFormats=None, escapeStrings=False, estInput=None, connFactory=None ):
    """Insert the contents of a whitespace-delimited text file into the database.
    
    For PostgreSQL specifically, consider the alternative direct COPY command that can run ~10x faster:
    E.g., gzip -d -c TTeam_2014.tsv.gz | psql -U jonc101 -c "COPY tteamx (  pat_deid,  enc_deid,  relation,  prov_id,  prov_name,  start_date,  end_date ) FROM STDIN WITH (FORMAT csv, DELIMITER E'\t', HEADER, NULL 'None');" resident-access-log-2017

    Inserts the contents of the <sourceFile> into the database
    under the <tableName>.  One line is expected in the <sourceFile>
    per row in the database, with each item delimited by the <delim>
    character.  These items will be inserted under the respective
    order of the given list of columnNames.

    Use the built-in csv module for parsing out lines and managing quotes, etc.
    If delimiter is not specified (None), then default to tab-delimited
    
    If idFile is provided, then try to run the SQL from the identityQuery method
    after each insert, and write out the resulting IDs, one per line, to the idFile.
    This step is bypassed if an insert column matching the expected default ID column ("tableName_id") is found.
    
    If dateColFormats is provided, expect a dictionary keyed by the names of columns
    that should be interpreted as date strings, with values equal to the
    Python date format string to parse them by.  
    If a format string is not provided, a series of standard date format strings will be attempted 
    (but this is inefficient for repeated date text parsing and error handling).
    
    Returns the total number of rows successfully inserted.
    """
    if columnNames is not None and len(columnNames) < 1:
        columnNames = None  # If empty columnNames list, then reset to null and look for it in the first line of data

    reader = TabDictReader(sourceFile, fieldnames=columnNames, delimiter=delim)
    columnNames = reader.fieldnames

    idCol = defaultIDColumn(tableName)
    iIdCol = None   # Index of manually specified ID column. May be null
    for iCol, colName in enumerate(columnNames):
        if colName == idCol:
            iIdCol = iCol

    if dateColFormats is not None:
        # Ensure column keys are normalized
        dateCols = dateColFormats.keys()
        for dateCol in dateCols:
            normalCol = normalizeColName(dateCol)
            dateColFormats[normalCol] = dateColFormats[dateCol]

    conn = None
    if connFactory is not None:
        conn = connFactory.connection()
    else:
        conn = connection()
    cur = conn.cursor()

    try:
        # Prepare the SQL statement
        sqlParts = []
        sqlParts.append("insert into")
        sqlParts.append(tableName)

        sqlParts.append("(")
        sqlParts.append(",".join(columnNames))
        sqlParts.append(")")

        sqlParts.append("values")
        sqlParts.append("(")
        for i in range(len(columnNames)):
            sqlParts.append(SQL_PLACEHOLDER)    # Parameter placeholder, depends on DB-API
            sqlParts.append(",")
        sqlParts.pop()  # Remove extra end comma
        sqlParts.append(")")

        sql = " ".join(sqlParts)

        log.debug(sql)

        # Loop through the file and execute the insert statement every time enough delimited parameters are found
        nInserts = 0
        nCols = len(columnNames)
        params = list()
        progress = ProgressDots(total=estInput)
        for iLine, rowModel in enumerate(reader):
            # Parse out data values from strings
            for iCol, colName in enumerate(columnNames):
                value = parseValue(rowModel[colName], colName, dateColFormats, escapeStrings)
                params.append(value)

            log.debug(params)
            try:
                cur.execute(sql, tuple(params))
                nInserts += cur.rowcount

                if idFile is not None:
                    rowId = None
                    if iIdCol is not None:  # Look for a manually assigned ID value first
                        rowId = params[iIdCol]
                    else:
                        cur.execute(identityQuery(tableName))
                        rowId = cur.fetchone()[0]
                    print >> idFile, rowId

                # Need to "auto-commit" after each command,
                #   otherwise a skipped error will rollback
                #   any previous commands as well
                if skipErrors:
                    conn.commit()

                progress.Update()

            except Exception as err:
                log.info(sql)
                log.info(tuple(params))
                conn.rollback()    # Reset any changes since the last commit
                if skipErrors:
                    log.warning("Error Executing in Script: " + sql)
                    log.warning(err)
                else:
                    raise
            params = list()

        conn.commit()

        return nInserts
    finally:
        conn.close()    # Assumed cleanup: close the locally opened connection (as in findOrInsertItem)
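A hypothetical usage sketch; the table and date column are made up, but note how dateColFormats maps a column name to a Python strptime format string:

from StringIO import StringIO

# Header row supplies the column names; rows follow, tab-delimited by default
sourceFile = StringIO("pat_id\tstart_date\n123\t2014-06-01\n")
nInserted = insertFile(sourceFile, "tteamx",
                       dateColFormats={"start_date": "%Y-%m-%d"})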
Example #7
    def querySourceItems(self, patientIds=None, progress=None, conn=None):
        """Query the database for list of all patient demographics
        and yield the results one at a time.  If patientIds provided, only return items
        matching those IDs.
        """
        extConn = conn is not None
        if not extConn:
            conn = self.connFactory.connection()

        # Column headers to query for that map to respective fields in analysis table
        headers = [
            "pat_id", "birth_year", "gender", "death_date", "race", "ethnicity"
        ]

        query = SQLQuery()
        for header in headers:
            query.addSelect(header)
        query.addFrom("stride_patient as sp")
        if patientIds is not None:
            query.addWhereIn("sp.pat_id", patientIds)

        # Query to get an estimate of how long the process will be
        if progress is not None:
            progress.total = DBUtil.execute(query.totalQuery(),
                                            conn=conn)[0][0]

        cursor = conn.cursor()
        # Do one massive query, but yield data for one item at a time.
        cursor.execute(str(query), tuple(query.params))

        row = cursor.fetchone()
        while row is not None:
            rowModel = RowItemModel(row, headers)

            if rowModel["birth_year"] is None:
                # Blank values, doesn't make sense.  Skip it
                log.warning(rowModel)
            else:
                # Record birth at resolution of year
                rowModel["itemDate"] = datetime(rowModel["birth_year"], 1, 1)
                rowModel["name"] = "Birth"
                rowModel["description"] = "Birth Year"
                yield rowModel

                # Record another at resolution of decade
                decade = (rowModel["birth_year"] / 10) * 10
                rowModel["itemDate"] = datetime(rowModel["birth_year"], 1, 1)
                rowModel["name"] = "Birth%ds" % decade
                rowModel["description"] = "Birth Decade %ds" % decade
                yield rowModel

                # Summarize race and ethnicity information into single field of interest
                raceEthnicity = self.summarizeRaceEthnicity(rowModel)
                rowModel["itemDate"] = datetime(rowModel["birth_year"], 1, 1)
                rowModel["name"] = "Race" + (raceEthnicity.translate(
                    None, " ()-/"))
                # Strip off punctuation
                rowModel["description"] = "Race/Ethnicity: %s" % raceEthnicity
                yield rowModel

                gender = rowModel["gender"].title()
                rowModel["name"] = gender
                rowModel["description"] = "%s Gender" % gender
                yield rowModel

                if rowModel["death_date"] is not None:
                    rowModel["name"] = "Death"
                    rowModel["description"] = "Death Date"
                    rowModel["itemDate"] = rowModel["death_date"]
                    yield rowModel

            row = cursor.fetchone()
            if progress is not None:
                progress.Update()

        # Slight risk here.  Normally DB connection closing should be in finally of a try block,
        #   but using the "yield" generator construct forbids us from using a try, finally construct.
        cursor.close()

        if not extConn:
            conn.close()
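A hypothetical consumer of this generator; the owning class name is assumed for illustration, and each patient row fans out into several yielded item models (birth year, birth decade, race/ethnicity, gender, and possibly death):

converter = StridePatientConversion()   # Assumed class name, not confirmed by the source
for rowModel in converter.querySourceItems(patientIds=[123, 456]):
    print rowModel["name"], rowModel["itemDate"]   # One line per clinical item event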