def mapFDictToArray(self, fd, arr, iRow):
    """Convenience to map a feature dict (fd) to a particular row in an array (the dict keys are the column indices)."""
    # Assumes module-level: import pprint, and a configured myLog logger
    for colIdx, val in fd.items():
        try:
            arr[iRow, colIdx] = val
        except Exception:
            myLog.critical('iRow: %s, colIdx: %s, val: %s' %
                           (pprint.pformat(iRow), pprint.pformat(colIdx),
                            pprint.pformat(val)))
            raise
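A minimal usage sketch with hypothetical data (in the original this runs inside a class, with myLog configured elsewhere):

import numpy as np

arr = np.zeros((3, 5))
fd = {0: 1.5, 3: -2.0}          # sparse feature dict: column index -> value
for colIdx, val in fd.items():  # the same loop body as mapFDictToArray
    arr[1, colIdx] = val
print(arr[1])                   # [ 1.5  0.   0.  -2.   0. ]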
Example No. 2
def gradChunkDataIterator(self, fDictList, targetArr, idxArr):
    """Convenience generator yielding small chunks of data at a time, to minimize memory usage.
    Yields (subData, subTarg).

    Converts fDictList entries into self.dataArr (the dict keys are the column indices).
    """
    for rowStart in range(0, len(idxArr), self.gradientChunkSize):
        self.dataArr *= 0.0  # zero the reusable buffer in place
        rowEnd = rowStart + self.gradientChunkSize
        subIdxArr = idxArr[rowStart:rowEnd]
        subTargArr = targetArr[rowStart:rowEnd]
        for iRow, fDictIdx in enumerate(subIdxArr):
            for colIdx, val in fDictList[fDictIdx].items():
                try:
                    self.dataArr[iRow, colIdx] = val
                except Exception:
                    myLog.critical('iRow: %s, colIdx: %s, val: %s' %
                                   (pprint.pformat(iRow), pprint.pformat(colIdx),
                                    pprint.pformat(val)))
                    raise
        numRows = len(subIdxArr)  # the last chunk may be short
        yield self.dataArr[:numRows, :].T, subTargArr[:numRows].T
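The same chunking pattern, as a standalone sketch with made-up data (one reusable buffer, zeroed and refilled per chunk):

import numpy as np

def chunk_iterator(fDictList, targetArr, idxArr, chunkSize, nCols):
    dataArr = np.zeros((chunkSize, nCols))  # reusable buffer
    for rowStart in range(0, len(idxArr), chunkSize):
        dataArr *= 0.0
        subIdx = idxArr[rowStart:rowStart + chunkSize]
        subTarg = targetArr[rowStart:rowStart + chunkSize]
        for iRow, fDictIdx in enumerate(subIdx):
            for colIdx, val in fDictList[fDictIdx].items():
                dataArr[iRow, colIdx] = val
        yield dataArr[:len(subIdx), :].T, subTarg[:len(subIdx)].T

fDictList = [{0: 1.0}, {1: 2.0}, {0: 3.0, 1: 4.0}]
targets = np.array([1, 0, 1])
for subData, subTarg in chunk_iterator(fDictList, targets, np.arange(3), 2, 2):
    print(subData.shape, subTarg)   # (2, 2) [1 0], then (2, 1) [1]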
    def train(self):
        """Run through all the data and train away."""
        # Assumes: from numpy import arange, concatenate, std; from numpy.random import shuffle
        self.costTrajectory = []

        # Set some things up if doing online learning
        c1Idx = None
        c0Idx = None
        if not self.batch:
            shuffIdx = arange(0, len(self.idxArr))
            c1Idx = shuffIdx[self.targetArr == 1]
            c0Idx = shuffIdx[self.targetArr == 0]
            #myLog.debug('len(c1Idx) : %d, len(c0Idx) : %d' % (len(c1Idx), len(c0Idx)))
            numOnlineRuns = int(len(self.idxArr) / float(self.onlineChunkSize)) + 1
            c1Step = int(len(c1Idx) / numOnlineRuns) + 1
            c0Step = int(len(c0Idx) / numOnlineRuns) + 1

        try:
            for iEpoch in range(self.numEpochs):
                if self.batch:
                    self.trainer.step(self.fDictList, self.targetArr, self.idxArr)
                else:
                    # Balance the classes within each chunk used in the online learning
                    shuffle(c1Idx)
                    shuffle(c0Idx)

                    for iOnlineRun in range(numOnlineRuns):
                        c1RowStart = iOnlineRun * c1Step
                        c1RowEnd = c1RowStart + c1Step
                        c0RowStart = iOnlineRun * c0Step
                        c0RowEnd = c0RowStart + c0Step
                        theInds = concatenate((c1Idx[c1RowStart:c1RowEnd],
                                               c0Idx[c0RowStart:c0RowEnd]))
                        shuffle(theInds)
                        #myLog.debug('minshuffidx : %d, maxshuffidx: %d' % (min(theInds), max(theInds)))
                        subTargets = self.targetArr[theInds]
                        subIdx = self.idxArr[theInds]
                        self.trainer.step(self.fDictList, subTargets, subIdx)

                if self.epochlog:
                    myLog.debug('About to call cost in postEpoch call')
                self.postEpochCall(iEpoch)

                # Test for convergence: flat cost over the last convergeEpochs epochs
                if self.checkconverge and len(self.costTrajectory) > self.convergeEpochs + 1:
                    if std(self.costTrajectory[-self.convergeEpochs:]) < self.costEpsilon:
                        myLog.critical('Convergence after Epoch %d!!' % iEpoch)
                        return self.costTrajectory

                if self.callback is not None:
                    self.callback(self)

            myLog.critical('Never completely converged after %d epochs!' % self.numEpochs)
        except KeyboardInterrupt:
            myLog.critical('Interrupted with Keyboard after %d epochs, stopping here, currCost = %f'
                           % (iEpoch, self.costTrajectory[-1]))
            return self.costTrajectory
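The class-balancing trick in the online branch, isolated as a sketch with made-up targets (the step sizes are the same ceilings computed in train above):

import numpy as np
from numpy.random import shuffle

targets = np.array([1, 0, 0, 1, 0, 0, 1, 0])
idx = np.arange(len(targets))
c1Idx = idx[targets == 1]   # positive examples: [0 3 6]
c0Idx = idx[targets == 0]   # negative examples: [1 2 4 5 7]
shuffle(c1Idx)
shuffle(c0Idx)

numRuns, c1Step, c0Step = 2, 2, 3
for i in range(numRuns):
    chunk = np.concatenate((c1Idx[i * c1Step:(i + 1) * c1Step],
                            c0Idx[i * c0Step:(i + 1) * c0Step]))
    shuffle(chunk)
    print(chunk)   # each chunk mixes both classes in roughly fixed proportion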
Example No. 6
def updateFromFile( sourceFile, tableName, columnNames=None, nIdCols=1, delim=None, skipErrors=False, connFactory=None ):
    """Update the database with the contents of a whitespace-delimited text file.

    Updates the contents of <tableName> with the data from <sourceFile>.
    One line is expected in <sourceFile> per row in the database, with each item
    delimited by the <delim> character (specify None for any whitespace).
    These items will be inserted under the respective order of the given list of
    <columnNames>.  If the columnNames parameter is not provided, assume the
    first line of <sourceFile> contains the column names.

    To know which rows to update, assume the FIRST column listed in <columnNames> is
    the ID column to identify rows by.  The data value there from
    <sourceFile> will not be used to update the row, but will instead be used to
    identify the row whose remaining data is updated.  If more than one column
    is necessary to identify a row (composite key), indicate how many of the
    first columns in <columnNames> should be used with <nIdCols>.  Note that these key ID
    values must not be None / null: the query looks for rows where columnname = value,
    and the = operator always returns false when the value is null.

    Returns the total number of rows successfully updated.
    """
    if columnNames is None or len(columnNames) < 1:
        headerLine = sourceFile.readline()
        columnNames = headerLine.split(delim)

    conn = None
    if connFactory is not None:
        conn = connFactory.connection()
    else:
        conn = connection()
    cur = conn.cursor()

    nCols = len(columnNames)

    try:
        # Prepare the SQL statement
        sql = []
        sql.append("update")
        sql.append(tableName)
        sql.append("set")

        # Data columns
        for i in range(nIdCols, nCols):
            sql.append(columnNames[i])
            sql.append("=")
            sql.append(Env.SQL_PLACEHOLDER)
            sql.append(",")
        sql.pop()  # Remove extra comma at end

        # ID columns
        sql.append("where")
        for i in range(nIdCols):
            sql.append(columnNames[i])
            sql.append("=")
            sql.append(Env.SQL_PLACEHOLDER)
            sql.append("and")
        sql.pop()  # Remove extra "and" at end

        sql = " ".join(sql)

        log.debug(sql)

        # Loop through the file and execute the update statement for every line
        progress = ProgressDots()
        for iLine, line in enumerate(sourceFile):
            if not line.startswith(COMMENT_TAG):
                try:
                    line = line[:-1]    # Strip the trailing newline character
                    params = line.split(delim)

                    # Special handling for null / None string: treat blanks as NULL
                    for iParam in range(len(params)):
                        if params[iParam] == "" or params[iParam] == NULL_STRING:
                            params[iParam] = None

                    # Reposition ID columns to the end of the parameter list,
                    # to match the "set ... where ..." placeholder order
                    idParams = params[:nIdCols]
                    dataParams = params[nIdCols:]
                    paramTuple = dataParams
                    paramTuple.extend(idParams)
                    paramTuple = tuple(paramTuple)

                    cur.execute(sql, paramTuple)

                    # Need to "auto-commit" after each command,
                    #   otherwise a skipped error will roll back
                    #   any previous commands as well
                    if skipErrors:
                        conn.commit()

                    progress.Update()

                except Exception as err:
                    conn.rollback()    # Reset changes and connection state
                    log.critical(sql)
                    log.critical(paramTuple)
                    log.warning("Error Executing in Script: %s", parameterizeQueryString(sql, paramTuple))
                    if skipErrors:
                        log.warning(err)
                    else:
                        raise

        conn.commit()

        return progress.GetCounts()
    finally:
        conn.close()  # Assumed cleanup; the original snippet is truncated after the return
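To make the statement assembly concrete, here is a sketch for a hypothetical "patient" table with columnNames = ["patient_id", "name", "age"] and nIdCols = 1, assuming Env.SQL_PLACEHOLDER is "%s":

columnNames = ["patient_id", "name", "age"]   # hypothetical; first column is the ID
nIdCols, placeholder = 1, "%s"
setPart = " , ".join("%s = %s" % (col, placeholder) for col in columnNames[nIdCols:])
wherePart = " and ".join("%s = %s" % (col, placeholder) for col in columnNames[:nIdCols])
sql = "update patient set %s where %s" % (setPart, wherePart)
print(sql)   # update patient set name = %s , age = %s where patient_id = %s

line = "42\tAlice\t31"
params = line.split("\t")
paramTuple = tuple(params[nIdCols:] + params[:nIdCols])
print(paramTuple)   # ('Alice', '31', '42') -- the ID value moves to the end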
Example No. 8
    def train(self):
        """Run through all the data and train away."""
        # Set some things up if doing online learning.
        # Only the gradient on the left-hand side is calculated at a time,
        # so make a target array and make sure we look at each ordered pair in both orders.
        numOnlineRuns = int(self.problemArr.shape[0] / float(self.onlineChunkSize)) + 1
        theIdx = arange(0, self.problemArr.shape[0])
        theStep = int(len(theIdx) / numOnlineRuns) + 1

        ## Check whether we have some null rows.  If we do, stratify the rows trained on
        ## in each online step: we don't want a complete step of left -1's or right -1's.
        isNullRows = (self.problemArr[:, 0] == -1) | (self.problemArr[:, 1] == -1)
        isNullRows = any(isNullRows)
        if isNullRows:
            myLog.info('Have -1 in problemArr, stratifying the online step data')
            nNullIdx = theIdx[(self.problemArr[:, 0] != -1) & (self.problemArr[:, 1] != -1)]
            lNullIdx = theIdx[self.problemArr[:, 0] == -1]
            rNullIdx = theIdx[self.problemArr[:, 1] == -1]

            nNullStep = int(len(nNullIdx) / numOnlineRuns) + 1
            lNullStep = int(len(lNullIdx) / numOnlineRuns) + 1
            rNullStep = int(len(rNullIdx) / numOnlineRuns) + 1

        try:
            for iEpoch in range(self.numEpochs):
                if self.batch:
                    self.trainer.step(self.fDictList, self.problemArr)
                else:
                    # Balance each of the chunks used in the online learning.
                    if isNullRows:
                        shuffle(nNullIdx)
                        shuffle(lNullIdx)
                        shuffle(rNullIdx)
                    else:
                        shuffle(theIdx)

                    for iOnlineRun in range(numOnlineRuns):
                        if isNullRows:
                            nNullStart = iOnlineRun * nNullStep
                            nNullEnd = nNullStart + nNullStep
                            lNullStart = iOnlineRun * lNullStep
                            lNullEnd = lNullStart + lNullStep
                            rNullStart = iOnlineRun * rNullStep
                            rNullEnd = rNullStart + rNullStep

                            subProbArr = concatenate((self.problemArr[nNullIdx[nNullStart:nNullEnd], :],
                                                      self.problemArr[lNullIdx[lNullStart:lNullEnd], :],
                                                      self.problemArr[rNullIdx[rNullStart:rNullEnd], :]))
                        else:
                            rowStart = iOnlineRun * theStep
                            rowEnd = rowStart + theStep
                            subIdx = theIdx[rowStart:rowEnd]
                            subProbArr = self.problemArr[subIdx, :]
                        self.trainer.step(self.fDictList, subProbArr)

                myLog.debug('About to call cost in postEpoch call')
                self.postEpochCall(iEpoch)

                # Test for convergence: flat cost over the last nconvergesteps epochs
                if self.checkconverge and len(self.costTrajectory) > self.nconvergesteps:
                    if std(self.costTrajectory[-self.nconvergesteps:]) < self.costEpsilon:
                        myLog.critical('Convergence after Epoch %d!!' % iEpoch)
                        return self.costTrajectory

                if self.callback is not None:
                    self.callback(self)

            myLog.critical('Never completely converged after %d epochs!' % self.numEpochs)
        except KeyboardInterrupt:
            myLog.critical('Interrupted with Keyboard after %d epochs, stopping here, currCost = %f'
                           % (iEpoch, self.costTrajectory[-1]))
            return self.costTrajectory
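The three-way stratification by null sides can be seen in isolation (a minimal sketch with a made-up problemArr):

import numpy as np

# problemArr rows are (left, right) index pairs; -1 marks a missing side
problemArr = np.array([[0, 1], [2, -1], [-1, 3], [4, 5], [-1, 6]])
theIdx = np.arange(problemArr.shape[0])
nNullIdx = theIdx[(problemArr[:, 0] != -1) & (problemArr[:, 1] != -1)]
lNullIdx = theIdx[problemArr[:, 0] == -1]
rNullIdx = theIdx[problemArr[:, 1] == -1]
print(nNullIdx, lNullIdx, rNullIdx)   # [0 3] [2 4] [1]
# Each online chunk then draws from all three groups, so no chunk
# consists entirely of left-null or right-null rows.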
Example No. 10
    def train(self):
        """Run through all the data and train away."""
        self.costTrajectory = []

        # Set some things up if doing online learning
        c1Idx = None
        c0Idx = None
        if not self.batch:
            shuffIdx = arange(0, self.dataArr.shape[0])
            c1Idx = shuffIdx[self.targetArr == 1]
            c0Idx = shuffIdx[self.targetArr == 0]
            numOnlineRuns = int(self.dataArr.shape[0] / self.onlineChunkSize) + 1
            c1Step = int(len(c1Idx) / numOnlineRuns) + 1
            c0Step = int(len(c0Idx) / numOnlineRuns) + 1

        try:
            for iEpoch in range(self.numEpochs):
                if self.batch:
                    self.trainer.step(self.dataArr, self.targetArr)
                else:
                    # Balance the classes within each chunk used in the online learning
                    shuffle(c1Idx)
                    shuffle(c0Idx)

                    # Amount to adjust the gradient by because of online learning;
                    # currently disabled, previously:
                    #   float(self.onlineChunkSize) / float(self.dataArr.shape[0])
                    onlineAdjustmentFactor = None

                    for iOnlineRun in range(numOnlineRuns):
                        c1RowStart = iOnlineRun * c1Step
                        c1RowEnd = c1RowStart + c1Step
                        c0RowStart = iOnlineRun * c0Step
                        c0RowEnd = c0RowStart + c0Step
                        theInds = concatenate((c1Idx[c1RowStart:c1RowEnd],
                                               c0Idx[c0RowStart:c0RowEnd]))
                        subData = self.dataArr[theInds, :]
                        subTargets = self.targetArr[theInds]
                        self.trainer.step(subData, subTargets, onlineAdjustmentFactor)

                if self.epochlog:
                    myLog.debug('About to call cost in postEpoch call')
                currCost = self.cost(self.dataArr, self.targetArr, dolog=self.epochlog)

                if self.epochlog:
                    myLog.info('Epoch %d, curr Cost : %f' % (iEpoch, currCost))
                self.costTrajectory.append(currCost)

                # Test for convergence: cost change below costEpsilon between epochs
                if len(self.costTrajectory) > 1:
                    if abs(self.costTrajectory[-1] - self.costTrajectory[-2]) < self.costEpsilon:
                        myLog.critical('Convergence after Epoch %d!!' % iEpoch)
                        return self.costTrajectory

                if self.callback is not None:
                    self.callback(self)

            myLog.critical('Never completely converged after %d epochs!' % self.numEpochs)
        except KeyboardInterrupt:
            myLog.critical('Interrupted with Keyboard after %d epochs, stopping here, currCost = %f'
                           % (iEpoch, self.costTrajectory[-1]))
            return self.costTrajectory
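The two convergence tests used across these examples, compared in isolation (a minimal sketch; the trajectory and threshold are illustrative):

import numpy as np

costTrajectory = [10.0, 4.0, 2.1, 2.05, 2.04, 2.04]
costEpsilon, convergeEpochs = 0.05, 3

# Variant 1: flat standard deviation over a window of recent costs
if len(costTrajectory) > convergeEpochs + 1:
    print(np.std(costTrajectory[-convergeEpochs:]) < costEpsilon)       # True

# Variant 2: small change between consecutive epoch costs
if len(costTrajectory) > 1:
    print(abs(costTrajectory[-1] - costTrajectory[-2]) < costEpsilon)   # True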