Example 1
 def patientItemFromSourceItem(self, sourceItem, clinicalItem, conn):
     # Produce a patient_item record model for the given sourceItem
     patientItem = \
         RowItemModel \
         (   {   "external_id":  sourceItem["order_med_id"],
                 "patient_id":  sourceItem["pat_id"],
                 "encounter_id":  sourceItem["pat_enc_csn_id"],
                 "clinical_item_id":  clinicalItem["clinical_item_id"],
                 "item_date":  sourceItem["ordering_date"],
             }
         )
     insertQuery = DBUtil.buildInsertQuery("patient_item",
                                           patientItem.keys())
     insertParams = patientItem.values()
     try:
         # Optimistic insert of a new unique item
         DBUtil.execute(insertQuery, insertParams, conn=conn)
         patientItem["patient_item_id"] = DBUtil.execute(
             DBUtil.identityQuery("patient_item"), conn=conn)[0][0]
     except IntegrityError, err:
         # If it turns out to be a duplicate, that's okay; pull out the existing ID and continue inserting whatever else is possible
         log.info(err)
         # Lookup just by the composite key components to avoid attempting duplicate insertion again
         searchPatientItem = \
             {   "patient_id":       patientItem["patient_id"],
                 "clinical_item_id": patientItem["clinical_item_id"],
                 "item_date":        patientItem["item_date"],
             }
         (patientItem["patient_item_id"],
          isNew) = DBUtil.findOrInsertItem("patient_item",
                                           searchPatientItem,
                                           conn=conn)
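
A minimal driver sketch for the optimistic-insert pattern above; the connection factory follows Example 25, while sourceItems and lookupClinicalItem are hypothetical stand-ins:

    # Hypothetical conversion loop exercising patientItemFromSourceItem.
    conn = self.connFactory.connection()  # connFactory as in Example 25
    try:
        for sourceItem in sourceItems:  # source rows, e.g., order_med records
            clinicalItem = self.lookupClinicalItem(sourceItem)  # hypothetical lookup
            self.patientItemFromSourceItem(sourceItem, clinicalItem, conn)
        conn.commit()
    finally:
        conn.close()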
Example 2
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr = "usage: %prog [options]\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-s",
            "--startDate",
            dest="startDate",
            metavar="<startDate>",
            help=
            "Date string (e.g., 2011-12-15), if provided, will only run conversion on items with start time on or after this date."
        )
        parser.add_option(
            "-e",
            "--endDate",
            dest="endDate",
            metavar="<endDate>",
            help=
            "Date string (e.g., 2011-12-15), if provided, will only run conversion on items with start time before this date."
        )
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()

        convOptions = ConversionOptions()
        convOptions.extractParserOptions(options)

        self.convertSourceItems(convOptions)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Example 3
    def _query_patient_episodes(self,
                                query,
                                pat_id_col=None,
                                index_time_col=None):
        # Initialize DB cursor.
        cursor = self._connection.cursor()

        # Fetch and return results.
        log.info('query: %s' % str(query))

        if isinstance(query, basestring):
            cursor.execute(query)
        else:
            log.info('query.params: %s' % str(query.params))
            cursor.execute(str(query), query.params)

        # Parse arguments.
        if pat_id_col is None:
            pat_id_col = 'pat_id'
        if index_time_col is None:
            index_time_col = 'index_time'

        self._factory.setPatientEpisodeInput(cursor, pat_id_col,
                                             index_time_col)
        num_episodes = self._factory.processPatientEpisodeInput()

        return num_episodes
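
A usage sketch, assuming a plain SQL string (the table name is hypothetical; omitted column names fall back to 'pat_id' and 'index_time'):

    num_episodes = self._query_patient_episodes(
        "SELECT pat_id, index_time FROM patient_episode_table"  # hypothetical table
    )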
Example 4
 def _add_sex_features(self):
     log.info('Adding sex features...')
     SEX_FEATURES = ["Male", "Female"]
     for feature in SEX_FEATURES:
         self._factory.addClinicalItemFeatures([feature],
                                               dayBins=[],
                                               features="pre")
Example 5
    def tearDown(self):
        """Restore state from any setUp or test steps"""
        log.info("Purge test records from the database")

        ###### FIX THIS TO CLEAN UP YOUR TEST DATA ... Maybe we don't have to, since the superclass will just drop the whole database anyway
        DBUtil.execute \
        (   """delete from patient_item 
            where clinical_item_id in 
            (   select clinical_item_id
                from clinical_item as ci, clinical_item_category as cic
                where ci.clinical_item_category_id = cic.clinical_item_category_id
                and cic.source_table = 'stride_culture_micro'
            );
            """
        )
        DBUtil.execute \
        (   """delete from clinical_item 
            where clinical_item_category_id in 
            (   select clinical_item_category_id 
                from clinical_item_category 
                where source_table = 'stride_culture_micro'
            );
            """
        )
        DBUtil.execute(
            "delete from clinical_item_category where source_table = 'stride_culture_micro ';"
        )

        DBUtil.execute(
            "delete from stride_culture_micro where order_proc_anon_id < 0")

        DBTestCase.tearDown(self)
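
Note that the deletes above run child tables first: patient_item before the clinical_item rows it references, and clinical_item before clinical_item_category, so the foreign-key relationships are never violated mid-cleanup.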
Example 6
 def loadUpdateBufferFromFile(self, filename):
     updateBuffer = None
     try:
         #print >> sys.stderr, filename
         log.info("Loading: %s" % filename)
         ifs = stdOpen(filename, "r")
         updateBuffer = json.load(ifs)
         updateBuffer["analyzedPatientItemIds"] = set(
             updateBuffer["analyzedPatientItemIds"])
         ifs.close()
     except IOError, exc:
         # Apparently could not find the named filename. See if instead it's a prefix
         #    for a series of enumerated files and then merge them into one mass buffer
         dirname = os.path.dirname(filename)
         if dirname == "":
             dirname = "."
             # Implicitly the current working directory
         basename = os.path.basename(filename)
         for nextFilename in os.listdir(dirname):
             if nextFilename.startswith(basename):
                 nextFilepath = os.path.join(dirname, nextFilename)
                 nextUpdateBuffer = self.loadUpdateBufferFromFile(
                     nextFilepath)
                 if updateBuffer is None:  # First update buffer, use it as base
                     updateBuffer = nextUpdateBuffer
                 else:  # Have an existing update buffer; just merge the next one's contents into it
                     updateBuffer = self.mergeBuffers(
                         updateBuffer, nextUpdateBuffer)
                     del nextUpdateBuffer
     return updateBuffer
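
A usage sketch: given the fallback logic above, a single call can load one JSON buffer or transparently merge a series of enumerated files sharing the name as a prefix (filenames hypothetical):

    # Loads "updateBuffer.json" directly if it exists; otherwise merges any
    # files in the same directory whose names start with "updateBuffer.json".
    updateBuffer = self.loadUpdateBufferFromFile("updateBuffer.json")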
Example 7
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr = "usage: %prog [options]\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-p",
            "--patientIds",
            dest="patientIds",
            metavar="<patientIds>",
            help=
            "Comma separated list of patient IDs to convert demographics data for.  Leave blank to attempt conversion for all available"
        )
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()

        patientIds = None
        if options.patientIds is not None:
            patientIds = options.patientIds.split(",")

        self.convertSourceItems(patientIds)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Example 8
    def tearDown(self):
        """Restore state from any setUp or test steps"""
        log.info("Purge test records from the database")

        DBUtil.execute \
        (   """delete from patient_item 
            where clinical_item_id in 
            (   select clinical_item_id
                from clinical_item as ci, clinical_item_category as cic
                where ci.clinical_item_category_id = cic.clinical_item_category_id
                and cic.source_table = 'stride_preadmit_med'
            );
            """
        )
        DBUtil.execute \
        (   """delete from clinical_item 
            where clinical_item_category_id in 
            (   select clinical_item_category_id 
                from clinical_item_category 
                where source_table = 'stride_preadmit_med'
            );
            """
        )
        DBUtil.execute(
            "delete from clinical_item_category where source_table = 'stride_preadmit_med';"
        )

        DBUtil.execute("delete from stride_mapped_meds where rxcui < 0")
        DBUtil.execute(
            "delete from stride_preadmit_med where stride_preadmit_med_id < 0")

        DBTestCase.tearDown(self)
Example 9
    def setUp(self):
        """Prepare state for test cases"""
        DBTestCase.setUp(self);

        self.manager = SimManager();  # Instance to test on

        from stride.clinical_item.ClinicalItemDataLoader import ClinicalItemDataLoader; 
        ClinicalItemDataLoader.build_clinical_item_psql_schemata();
        self.manager.buildCPOESimSchema();

        log.info("Populate the database with test data")

        #### Basically import a bunch of rigged CSV or TSV files containing realistic simulated case and grading data
        # Get that data into the test database

        dataTextStr = \
"""sim_result_id;name;description;group_string;priority
-10;Temp;Temperature (F);Flowsheet>Vitals;10
-20;Pulse;Pulse / Heart Rate (HR);Flowsheet>Vitals;20
-30;SBP;Blood Pressure, Systolic (SBP);Flowsheet>Vitals;30
-40;DBP;Blood Pressure, Diastolic (DBP);Flowsheet>Vitals;40
-50;Resp;Respirations (RR);Flowsheet>Vitals;50
-60;FiO2;Fraction Inspired Oxygen;Flowsheet>Vitals;60
-70;Urine;Urine Output (UOP);Flowsheet>Vitals;70
"""     # Parse into DB insertion object
        DBUtil.insertFile( StringIO(dataTextStr), "sim_result", delim=";");
Example 10
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog <inputFile> <outputFile>\n"+\
                    "   <inputFile>     Tab-delimited input file taken from schedule Excel file. Example data format as seen in test case examples. See support/extractExcelSheets.py for help on pulling out Excel sheets into tab-delimited data files.\n"+\
                    "   <outputFile>    File to output results to.  Designate '-' for stdout.";
        parser = OptionParser(usage=usageStr)
        parser.add_option("-i", "--providerIdFilename",  dest="providerIdFilename", help="Name of provider ID CSV file. If provided, then add column for prov_id based on resident first_name and last_name, match within first "+DEFAULT_INDEX_PREFIX_LENGTH+" characters, or generate ID value if no match found");
        parser.add_option("-y", "--baseYear",  dest="baseYear", help="Year expect dates to start in.");
        parser.add_option("-t", "--changeTime",  dest="changeTime", default=CHANGE_TIME, help="Hour of day that count as delimiter between rotations. Likely should NOT be midnight = 0, because night shifts span midnight. Default to 7 = 7am.");
        (options, args) = parser.parse_args(argv[1:])

        if len(args) >= 2 and options.baseYear:
            log.info("Starting: "+str.join(" ", argv))
            timer = time.time();

            baseYear = int(options.baseYear);

            if options.providerIdFilename is not None:
                providerReader = csv.DictReader(open(options.providerIdFilename));
                self.loadProviderModels( providerReader );

            inFile = stdOpen(args[0]);
            scheduleItems = self.parseScheduleItems(inFile, baseYear);

            outFile = stdOpen(args[1],"w");
            formatter = TextResultsFormatter(outFile);
            formatter.formatResultDicts(scheduleItems);
        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer;
        log.info("%.3f seconds to complete",timer);
Example 11
    def tearDown(self):
        """Restore state from any setUp or test steps"""
        log.info("Purge test records from the database")

        DBUtil.execute \
        (   """delete from patient_item 
            where clinical_item_id in 
            (   select clinical_item_id
                from clinical_item as ci, clinical_item_category as cic
                where ci.clinical_item_category_id = cic.clinical_item_category_id
                and cic.source_table = '%s'
            );
            """ % TEST_SOURCE_TABLE
        )
        DBUtil.execute \
        (   """delete from clinical_item 
            where clinical_item_category_id in 
            (   select clinical_item_category_id 
                from clinical_item_category 
                where source_table = '%s'
            );
            """ % TEST_SOURCE_TABLE
        )
        DBUtil.execute(
            "delete from clinical_item_category where source_table = '%s';" %
            TEST_SOURCE_TABLE)

        query = SQLQuery()
        query.delete = True
        query.addFrom("stride_patient")
        query.addWhere("pat_id < 0")
        DBUtil.execute(query)

        DBTestCase.tearDown(self)
Example 12
def main(argv):
    """Main method, callable from command line"""
    usageStr =  "usage: %prog [options] <cgiFile1> <cgiFile2> ...\n"+\
                "   <cgiFileX>    CGI template files to convert to PSP\n"
    parser = OptionParser(usage=usageStr)
    parser.add_option(
        "-l",
        "--noLinkConversion",
        dest="noLinkConversion",
        action="store_true",
        help="If set, will NOT swap all .py (apparent links) to .psp.")
    (options, args) = parser.parse_args(argv[1:])

    timer = time.time()

    if len(args) > 0:
        for cgiTemplateFilename in args:
            log.info("Converting %s" % cgiTemplateFilename)
            generatePSPfromCGITemplate(cgiTemplateFilename,
                                       not options.noLinkConversion)
    else:
        parser.print_help()
        sys.exit(-1)

    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)
Example 13
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options]\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option("-i", "--userSIDs", dest="userSIDs", metavar="<userSIDs>",  help="Comma separated list of user SIDs to convert AccessLog data for.  Leave blank to attempt conversion for all available");
        parser.add_option("-l", "--limit", dest="limit", metavar="<limit>",  help="Number of records to process before stopping");
        parser.add_option("-o", "--offset", dest="offset", metavar="<offset>",  help="Number of records to skip before start converting");
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: "+str.join(" ", argv))
        timer = time.time();

        userSIDs = None;
        if options.userSIDs is not None:
            userSIDs = options.userSIDs.split(",");

        limit = None;
        if options.limit is not None:
            limit = int(options.limit);
        offset = None;
        if options.offset is not None:
            offset = int(options.offset);

        self.convertSourceItems(userSIDs, limit, offset);

        timer = time.time() - timer;
        log.info("%.3f seconds to complete",timer);
Example 14
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options]\n"+\
            "Beware that this module is intended to be run only ONCE ever on a database. Currently will end up with duplicate clinical item keys if you try to run it in parallel or even serially."
        parser = OptionParser(usage=usageStr)
        parser.add_option("-s", "--startDate", dest="startDate", metavar="<startDate>",  help="Date string (e.g., 2011-12-15), if provided, will only run conversion on items with ordering time on or after this date.");
        parser.add_option("-e", "--endDate", dest="endDate", metavar="<endDate>",  help="Date string (e.g., 2011-12-15), if provided, will only run conversion on items with ordering time before this date.");
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: "+str.join(" ", argv))
        timer = time.time();
        startDate = None;
        if options.startDate is not None:
            # Parse out the start date parameter
            timeTuple = time.strptime(options.startDate, DATE_FORMAT);
            startDate = datetime(*timeTuple[0:3]);
        endDate = None;
        if options.endDate is not None:
            # Parse out the end date parameter
            timeTuple = time.strptime(options.endDate, DATE_FORMAT);
            endDate = datetime(*timeTuple[0:3]);
        self.convertSourceItems(startDate,endDate);

        timer = time.time() - timer;
        log.info("%.3f seconds to complete",timer);
Example 15
    def purgeTestRecords(self):
        log.info("Purge test records from the database")
        if self.testPatientId is not None:
            # Delete test generated data
            DBUtil.execute(
                "delete from sim_patient_order where sim_patient_id = %s",
                (self.testPatientId, ))
            DBUtil.execute(
                "delete from sim_patient_state where sim_patient_id = %s",
                (self.testPatientId, ))
            DBUtil.execute("delete from sim_patient where sim_patient_id = %s",
                           (self.testPatientId, ))

        DBUtil.execute("delete from sim_note where sim_note_id < 0")
        DBUtil.execute(
            "delete from sim_state_result where sim_state_result_id < 0")
        DBUtil.execute(
            "delete from sim_patient_order where sim_state_id < 0 or sim_patient_order_id < 0"
        )
        DBUtil.execute(
            "delete from sim_order_result_map where sim_order_result_map_id < 0"
        )
        DBUtil.execute("delete from sim_result where sim_result_id < 0")
        DBUtil.execute(
            "delete from sim_patient_state where sim_state_id < 0 or sim_patient_state_id < 0"
        )
        DBUtil.execute(
            "delete from sim_state_transition where pre_state_id < 0")
        DBUtil.execute("delete from sim_state where sim_state_id <= 0")
        DBUtil.execute("delete from sim_user where sim_user_id < 0")
        DBUtil.execute("delete from sim_patient where sim_patient_id < 0")
        DBUtil.execute("delete from clinical_item where clinical_item_id < 0")
Example 16
 def banHost(self, host, duration=None):
     """
     Bans a host from connecting.
     """
     #TODO: Timed bans?
     log.info("Banning IP {0}".format(host))
     banlist.append(host)
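
The TODO hints at timed bans; a minimal sketch, assuming Twisted's reactor is available and banlist is the module-level list used above (unbanHost is a hypothetical helper):

    from twisted.internet import reactor

    def banHost(self, host, duration=None):
        """Bans a host from connecting, optionally only for `duration` seconds."""
        log.info("Banning IP {0}".format(host))
        banlist.append(host)
        if duration is not None:
            # Lift the ban again after the requested number of seconds.
            reactor.callLater(duration, self.unbanHost, host)

    def unbanHost(self, host):
        """Hypothetical helper: lift a ban placed by banHost."""
        if host in banlist:
            banlist.remove(host)
            log.info("Unbanning IP {0}".format(host))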
Example 17
    def grad(self, fDictList, problemArr):
        """Evaluate the gradient of the error function wrt the params.
        
        The first error to pass back to the NN will be the 
        (target - sigmoid(lOut - rOut)) * (abs(lOut - rOut))
        
        NOTE: all targets are 1, so calculate the gradient with left/right and then with right/left.
        """
        currGrad = zeros(self.params.shape)

        meanDOut = []
        minDOut = []
        maxDOut = []
        for lSubData, rSubData, subProbArr in self.gradChunkDataIterator(
                fDictList, problemArr):
            # First calc the contribution when looking at the left to right
            ## Only include data that comes from non-NULL on the left
            nonLNullRows = subProbArr[:, 0] > -1
            rd = rSubData[:, nonLNullRows]
            ld = lSubData[:, nonLNullRows]
            rOut = self.layerModel.fprop(rd)
            lOut = self.layerModel.fprop(ld)
            sigOut = sigmoid(lOut - rOut)
            absDiffOut = abs(lOut - rOut)

            d_outputs = (sigOut - 1)
            if len(d_outputs) > 0 and d_outputs.shape[1] > 0:
                try:
                    meanDOut.append(mean(abs(d_outputs)))
                    minDOut.append(min(d_outputs))
                    maxDOut.append(max(d_outputs))
                except Exception, e:
                    myLog.error(e)
                    myLog.info('Some error, d_outputs : %s' %
                               pprint.pformat(d_outputs))
                    myLog.info('Some error, len(d_outputs) : %s' %
                               len(d_outputs))
                    raise e

                self.layerModel.bprop(d_outputs, ld)
                currGradContrib = self.layerModel.grad(d_outputs, ld)
                currGradContrib = currGradContrib.sum(1)
                currGrad += currGradContrib

            ## Then do the same, but going to the right.
            # need to fprop on the right side again
            # Only use rows that are non-null on the right
            nonRNullRows = subProbArr[:, 1] > -1
            rd = rSubData[:, nonRNullRows]
            ld = lSubData[:, nonRNullRows]

            lOut = self.layerModel.fprop(ld)
            rOut = self.layerModel.fprop(rd)
            sigOut = sigmoid(rOut - lOut)
            d_outputs = sigOut
            if len(d_outputs) > 0 and d_outputs.shape[1] > 0:
                self.layerModel.bprop(d_outputs, rd)
                currGradContrib = self.layerModel.grad(d_outputs, rd)
                currGradContrib = currGradContrib.sum(1)
                currGrad += currGradContrib
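
These gradient examples lean on a sigmoid helper; in case it is not imported from elsewhere, a minimal numpy version (an assumption, not the original definition):

    from numpy import exp

    def sigmoid(x):
        """Elementwise logistic function, 1 / (1 + e**-x)."""
        return 1.0 / (1.0 + exp(-x))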
Example 18
    def postEpochCall(self, epoch):
        """Convenience to run some stats at the end of each epoch."""
        if self.saveparams:
            self.paramHistory.append(self.params[:])
        outputs = self.apply(self.fDictList)
        lOut = outputs[self.problemArr[:, 0]]
        rOut = outputs[self.problemArr[:, 1]]
        sigOut = sigmoid(lOut - rOut)
        sigOut = where(sigOut < OFFSET_EPSILON, OFFSET_EPSILON, sigOut)
        sigOut = where(sigOut > 1 - OFFSET_EPSILON, 1 - OFFSET_EPSILON, sigOut)
        # Cross-Entropy
        # NOTE here that all targs are 1.
        error = log(sigOut)
        currCost = -error.sum()

        decayContrib = self.totalL2Decay * (self.params**2).sum()
        theAcc = accuracy(sigOut, 1)
        theRMSE = rmse(sigOut, 1)

        if self.epochlog:
            myLog.info('Epoch %d, curr Cost: %f, decayCont: %f ' %
                       (epoch, currCost, decayContrib))
            myLog.info('Epoch %d, theAcc: %f, theRMSE: %f' %
                       (epoch, theAcc, theRMSE))
        self.costTrajectory.append(currCost)
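
The paired where() calls that clamp sigOut away from 0 and 1 before taking the log are equivalent to a single numpy clip, if a more compact form is preferred:

    from numpy import clip

    sigOut = clip(sigOut, OFFSET_EPSILON, 1 - OFFSET_EPSILON)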
Example 19
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <inputFile1> <inputFile2> ... <inputFileN>\n"+\
                    "   <inputFileX>    Tab-delimited file of data.  Initial comment lines will be scanned for list of argv parameters to add as data columns.\n"+\
                    "                   If only a single input is given, interpret this as an index file which lists the names of the other files to concatenate (e.g., obtained with dir * /b or ls).\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-o",
            "--outputFile",
            dest="outputFile",
            help=
            "Tab-delimited file matching concatenated contents of input files.  Specify \"-\" to send to stdout."
        )
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) > 0:
            inputFiles = list()
            if len(args) > 1:
                for inputFilename in args:
                    inputFiles.append(stdOpen(inputFilename))
            else:  # len(args) == 1, single index file rather than a list of all files on the command line
                indexFile = stdOpen(args[0])
                for line in indexFile:
                    inputFilename = line.strip()
                    inputFiles.append(stdOpen(inputFilename))

            # Format the results for output
            outputFile = stdOpen(options.outputFile, "w")

            # Print comment line with arguments to allow for deconstruction later as well as extra results
            summaryData = {
                "argv": argv
            }
            print >> outputFile, COMMENT_TAG, json.dumps(summaryData)

            # Tab-delimited output formatting
            formatter = TextResultsFormatter(outputFile)

            # Begin the file parsing so can at least get the total list of column headers
            rowGenerator = self(inputFiles)
            firstRow = rowGenerator.next()

            # Insert a mock record to get a header / label row
            colNames = self.resultHeaders()
            formatter.formatTuple(colNames)

            # Stream the concatenated data rows to the output to avoid storing all in memory
            formatter.formatResultDict(firstRow, colNames)
            for outputDict in rowGenerator:
                formatter.formatResultDict(outputDict, colNames)

        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Example 20
 def _add_demographic_features(self):
     log.info('Adding demographic features...')
     # Add birth and death.
     self._add_lifespan_features()
     # Add sex features.
     self._add_sex_features()
     # Add race features.
     self._add_race_features()
Example 21
 def runStats(self):
     """Convenience method to print out some stats about the training."""
     # And log some stats about accuracy, etc.
     theOut = self.classifier.apply(self.fDictList)
     theOut = theOut[self.idxArr]
     theAcc = accuracy(theOut, self.targArr)
     theRMSE = rmse(theOut, self.targArr)
     log.info('theAcc : %.4f, theRMSE : %.4f' % (theAcc, theRMSE))
Example 22
 def _add_lifespan_features(self):
     log.info('Adding lifespan features...')
     self._factory.addClinicalItemFeatures(['Birth'],
                                           dayBins=[],
                                           features="pre")
     self._factory.addClinicalItemFeatures(['Death'],
                                           dayBins=[],
                                           features="post")
Example 23
 def isBadUsername(self, user):
      is_bad = user.lower() in ("root", "admin", "ubnt", "support", "user", "pi")
     if is_bad:
         log.info('Disconnecting "{0}" from {1}: blacklisted username'.format(user, self.ip))
         self.transport.sendDisconnect(
             transport.DISCONNECT_ILLEGAL_USER_NAME,
             "You can't use that username ({0}) here. Please reconnect using a different one.".format(user))
     return is_bad
Example 24
 def buildProtocol(self, addr):
     # Reject this connection if the IP is banned.
     if addr.host in banlist:
         log.info("Rejecting connection from banned IP {0}".format(addr.host))
         return None
     # otherwise all good; let superclass do the rest
     log.info("Incoming SSH connection from {0}".format(addr.host))
     return conch_factory.SSHFactory.buildProtocol(self, addr)
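
Returning None from buildProtocol is the standard Twisted way to refuse a connection: the reactor simply closes the transport instead of attaching a protocol to it.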
Example 25
    def generatePatientItemsForCompositeId(self,
                                           clinicalItemIds,
                                           compositeId,
                                           conn=None):
        """Create patient_item records for the composite to match the given clinical item ID patient items.
        """
        extConn = True
        if conn is None:
            conn = self.connFactory.connection()
            extConn = False
        try:
            # Record linking information
            for componentId in clinicalItemIds:
                linkModel = RowItemModel()
                linkModel["clinical_item_id"] = compositeId
                linkModel["linked_item_id"] = componentId

                insertQuery = DBUtil.buildInsertQuery("clinical_item_link",
                                                      linkModel.keys())
                insertParams = linkModel.values()
                DBUtil.execute(insertQuery, insertParams, conn=conn)

            # Extract back link information, which will also flatten out any potential inherited links
            linkedItemIdsByBaseId = self.loadLinkedItemIdsByBaseId(conn=conn)
            linkedItemIds = linkedItemIdsByBaseId[compositeId]

            # Create patient_item records for the composite clinical item to overlap existing component ones
            # First query for the existing component records
            query = SQLQuery()
            query.addSelect("*")
            query.addFrom("patient_item")
            query.addWhereIn("clinical_item_id", linkedItemIds)
            results = DBUtil.execute(query, includeColumnNames=True, conn=conn)
            patientItems = modelListFromTable(results)

            # Patch component records to instead become composite item records then insert back into database
            progress = ProgressDots(total=len(patientItems))
            for patientItem in patientItems:
                del patientItem["patient_item_id"]
                patientItem["clinical_item_id"] = compositeId
                patientItem["analyze_date"] = None

                insertQuery = DBUtil.buildInsertQuery("patient_item",
                                                      patientItem.keys())
                insertParams = patientItem.values()

                try:
                    # Optimistic insert of a new unique item
                    DBUtil.execute(insertQuery, insertParams, conn=conn)
                except conn.IntegrityError, err:
                    # If it turns out to be a duplicate, that's okay; just note it and continue inserting whatever else is possible
                    log.info(err)
                progress.Update()

            # progress.PrintStatus();
        finally:
            if not extConn:
                conn.close()
Example 26
    def tearDown(self):
        """Restore state from any setUp or test steps"""
        log.info("Purge test records from the database")

        DBUtil.execute \
        (   """delete from patient_item_collection_link
            where item_collection_item_id in 
            (   select item_collection_item_id
                from item_collection_item as ici, item_collection as ic
                where ici.item_collection_id = ic.item_collection_id
                and ic.external_id < 0
            );
            """
        )
        DBUtil.execute \
        (   """delete from item_collection_item
            where item_collection_id in 
            (   select item_collection_id
                from item_collection as ic
                where ic.external_id < 0
            );
            """
        )
        DBUtil.execute("delete from item_collection where external_id < 0;")


        DBUtil.execute \
        (   """delete from patient_item 
            where clinical_item_id in 
            (   select clinical_item_id
                from clinical_item as ci, clinical_item_category as cic
                where ci.clinical_item_category_id = cic.clinical_item_category_id
                and cic.source_table = 'stride_order_proc'
            );
            """
        )
        DBUtil.execute \
        (   """delete from clinical_item 
            where clinical_item_category_id in 
            (   select clinical_item_category_id 
                from clinical_item_category 
                where source_table = 'stride_order_proc'
            );
            """
        )
        DBUtil.execute(
            "delete from clinical_item_category where source_table = 'stride_order_proc';"
        )

        DBUtil.execute(
            "delete from stride_orderset_order_proc where order_proc_id in (%s)"
            % str.join(",", self.orderProcIdStrList))
        DBUtil.execute(
            "delete from stride_order_proc where order_proc_id in (%s)" %
            str.join(",", self.orderProcIdStrList))

        DBTestCase.tearDown(self)
Example 27
 def _add_race_features(self):
     log.info('Adding race features...')
     for feature in self._factory.queryAllRaces():
         if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':  # TODO
             self._factory.addClinicalItemFeatures([feature], dayBins=[], features="pre")
         else:
         #elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
             self._factory.addClinicalItemFeatures_UMich([feature], dayBins=[], features="pre",
                                               clinicalItemType='RaceName', clinicalItemTime=None, tableName='demographics')
Example 28
    def grad(self, fDictList, problemArr):
        """Evaluate the gradient of the error function wrt the params.
        
        The first error to pass back to the NN will be the 
        (target - sigmoid(lOut - rOut)) * (abs(lOut - rOut))
        
        NOTE: all targets are 1, so calculate the gradient with left/right and then with right/left.
        """
        currGrad = zeros(self.params.shape);
        
        meanDOut = [];
        minDOut = [];
        maxDOut = [];
        for lSubData, rSubData, subProbArr in self.gradChunkDataIterator(fDictList, problemArr):
            # First calc the contribution when looking at the left to right
            ## Only include data that comes from non-NULL on the left
            nonLNullRows = subProbArr[:,0]>-1 
            rd = rSubData[:, nonLNullRows]
            ld = lSubData[:, nonLNullRows]
            rOut = self.layerModel.fprop(rd)
            lOut = self.layerModel.fprop(ld)
            sigOut = sigmoid(lOut - rOut);
            absDiffOut = abs(lOut - rOut);

            d_outputs = (sigOut - 1);
            if len(d_outputs) > 0 and d_outputs.shape[1] > 0:
                try:
                    meanDOut.append(mean(abs(d_outputs)));
                    minDOut.append(min(d_outputs));
                    maxDOut.append(max(d_outputs));
                except Exception, e:
                    myLog.error(e)
                    myLog.info('Some error, d_outputs : %s' % pprint.pformat(d_outputs))
                    myLog.info('Some error, len(d_outputs) : %s' % len(d_outputs))
                    raise e
            
                self.layerModel.bprop(d_outputs, ld);
                currGradContrib = self.layerModel.grad(d_outputs, ld)
                currGradContrib = currGradContrib.sum(1);
                currGrad += currGradContrib;
            
            ## Then do the same, but going to the right.
            # need to fprop on the right side again
            # Only use rows that are non-null on the right
            nonRNullRows = subProbArr[:,1]>-1 
            rd = rSubData[:, nonRNullRows]
            ld = lSubData[:, nonRNullRows]

            lOut = self.layerModel.fprop(ld)
            rOut = self.layerModel.fprop(rd)
            sigOut = sigmoid(rOut - lOut);
            d_outputs = sigOut;
            if len(d_outputs) > 0 and d_outputs.shape[1] > 0:
                self.layerModel.bprop(d_outputs, rd)
                currGradContrib = self.layerModel.grad(d_outputs, rd)
                currGradContrib = currGradContrib.sum(1)
                currGrad += currGradContrib;
Example 29
 def _add_lifespan_features(self):
     log.info('Adding lifespan features...')
     self._factory.addClinicalItemFeatures(['Birth'],
                                           dayBins=[],
                                           features="pre",
                                           is_Michigan_data=True,
                                           clinicalItemType=None,
                                           clinicalItemTime='Birth',
                                           tableName='pt_info')
Example 30
    def setUp(self):
        """Prepare state for test cases"""
        DBTestCase.setUp(self)

        log.info(
            "Populate the database with test data (Assumes MySQL data structure)"
        )
        DBUtil.execute \
        ("""create table %s
            (
                USER_ID varchar(255),
                USER_NAME varchar(255),
                DE_PAT_ID bigint,
                ACCESS_DATETIME datetime,
                METRIC_ID integer,
                METRIC_NAME text,
                LINE_COUNT integer,
                DESCRIPTION text,
                METRIC_GROUP_NUM integer,
                METRIC_GROUP_NAME text
            );
         """ % TEST_SOURCE_TABLE
        )

        self.testUserIDs = list()
        headers = [
            "user_id", "user_name", "de_pat_id", "access_datetime",
            "metric_id", "metric_name", "line_count", "description",
            "metric_group_num", "metric_group_name"
        ]
        dataModels = \
            [
                RowItemModel( ['S-7', 'CJ', None, '2013-10-14 08:44:47', '33006', 'ME_IBGLANCE', '1', 'IN BASKET GLANCE PLUGIN ACCESSED IN RADAR', '33000', 'Radar'], headers ),
                RowItemModel( ['S-7', 'CJ', '3289034', '2014-03-20 00:40:18', '34127', 'IP_ORDERSSECTION', '1', 'Inpatient Orders section opened ', '17001', 'PATIENT CLINICAL INFO'], headers ),
                RowItemModel( ['S-7', 'CJ', None, '2014-01-01 10:10:56', '20008', 'AC_IB_CREATEMSG', '1', 'In Basket message of any type created.', '20000', 'In Basket Report'], headers ),
                RowItemModel( ['S-7', 'CJ', None, '2014-01-01 10:10:56', '20008', 'AC_IB_CREATEMSG', '2', '(Created messages counted.) ', '20000', 'In Basket Report'], headers ),
                RowItemModel( ['S-7', 'CJ', '1853397', '2013-06-29 11:25:02', '20075', 'AC_DOCUMENTLIST_SUBC', '1', 'Prelude Documents list accessed for patient.', None, None], headers ),
                RowItemModel( ['S-4', 'AB', '3133593', '2013-10-22 06:46:29', '17008', 'MR_REPORTS', '1', 'A report with patient data accessed.', '17001', 'PATIENT CLINICAL INFO'], headers ),
                RowItemModel( ['S-4', 'AB', '3047429', '2014-03-16 20:56:54', '17016', 'MR_RESULTS_REVIEW', '1', 'Results Review activity accessed.', '17002', 'Patient Chart Review'], headers ),
                RowItemModel( ['S-4', 'AB', '3408732', '2014-04-08 08:47:38', '17016', 'MR_RESULTS_REVIEW', '1', 'Results Review activity accessed.', '17002', 'Patient Chart Review'], headers ),
                RowItemModel( ['S-4', 'AB', None, '2014-02-26 19:27:48', '34140', 'IP_SYSTEM_LIST', '1', 'Inpatient system list accessed.', '20001', 'PATIENT DEMOGRAPHICS'], headers ),
                RowItemModel( ['S-4', 'AB', '2487184', '2013-10-11 08:45:46', '17008', 'MR_REPORTS', '1', 'A report with patient data accessed.', '17001', 'PATIENT CLINICAL INFO'], headers ),

            ]
        for dataModel in dataModels:
            (dataItemId,
             isNew) = DBUtil.findOrInsertItem(TEST_SOURCE_TABLE,
                                              dataModel,
                                              retrieveCol="user_id")
            # Trim the leading "S" and parse the remainder as an integer
            userID = int(dataItemId[1:])
            self.testUserIDs.append(userID)

        self.converter = ProviderRotationConversion()  # Instance to test on
        self.converter.sourceTableName = TEST_SOURCE_TABLE
Example 31
 def _add_race_features(self):
     log.info('Adding race features...')
     RACE_FEATURES = [
         "RaceWhiteHispanicLatino", "RaceWhiteNonHispanicLatino",
         "RaceHispanicLatino", "RaceBlack", "RaceAsian",
         "RacePacificIslander", "RaceNativeAmerican",
         "RaceOther", "RaceUnknown"
     ]
     for feature in RACE_FEATURES:
         self._factory.addClinicalItemFeatures([feature], dayBins=[], features="pre")
Example 32
    def disconnect_hostNotAllowed(self, msg):
        """
        Sends a "Host Not Allowed To Connect" disconnect packet to the client.

        The parameter is a string to send as the disconnection message.
        """
        log.info("Disconnecting {0}".format(self.ip))
        self.transport.sendDisconnect(
            transport.DISCONNECT_HOST_NOT_ALLOWED_TO_CONNECT,
            msg)
Example 33
 def runStats(self):
     """Convenience method to print out some stats about the training."""
     # And log some stats about accuracy, etc.
     theOut = self.classifier.apply(self.fDictList)
     lOut = theOut[self.probArr[:, 0]]
     rOut = theOut[self.probArr[:, 1]]
     sigOut = sigmoid(lOut - rOut)
     theAcc = accuracy(sigOut, 1)
     theRMSE = rmse(sigOut, 1)
     log.info('theAcc : %.4f, theRMSE : %.4f' % (theAcc, theRMSE))
Example 34
 def _add_sex_features(self):
     log.info('Adding sex features...')
     SEX_FEATURES = ["Male", "Female"]
     for feature in SEX_FEATURES:
         if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':  # TODO
             self._factory.addClinicalItemFeatures([feature], dayBins=[], features="pre")
         else:
         #elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
             self._factory.addClinicalItemFeatures_UMich([feature], dayBins=[], features="pre",
                                                clinicalItemType='GenderName', clinicalItemTime=None, tableName="demographics")
Example 35
 def _add_race_features(self):
     log.info('Adding race features...')
     for feature in self._factory.RACE_FEATURES:
         self._factory.addClinicalItemFeatures([feature],
                                               dayBins=[],
                                               features="pre",
                                               is_Michigan_data=True,
                                               clinicalItemType='RaceName',
                                               clinicalItemTime=None,
                                               tableName='demographics')
Example 37
    def processFeatureDictList(self, featureReader):
        """Given a feature dict reader, go through and calculate a minMax dict for each feature."""
        log.info('Beginning to process a featDict');
        minMaxDict = {};
        progress = ProgressDots();
        for iRow, fDict in enumerate(featureReader):
            minMaxDict = self.updateMinMaxDict(minMaxDict, fDict);
            progress.Update();

        progress.PrintStatus();
        
        minRangeDict = self.convertMinMaxDictToMinRange(minMaxDict)
        return minRangeDict;
Example 38
 def __generalTest(self, featDictList, probArr, archModel):
     """Convenience to test things multiple times"""
     classifier = PairMonteFeatDictClassifier(archModel, featDictList, probArr);
     classifier.setupModels();
     
     currOut = classifier.apply(featDictList);
     
     lData = currOut[probArr[:, 0]]
     rData = currOut[probArr[:, 1]]
     
     outputs = sigmoid(lData - rData);
     outputs = where(outputs < OFFSET_EPSILON, OFFSET_EPSILON, outputs);
     outputs = where(outputs > 1-OFFSET_EPSILON, 1-OFFSET_EPSILON, outputs);
     
     # Assuming all is 1
     currCost =  2 * -(log(outputs)).sum();
     decayContrib = classifier.l2decay * (classifier.params**2).sum();
     currCost += decayContrib;
     currRMSE = rmse(outputs, 1);
     currAcc = accuracy(outputs, 1);
     
     beginRMSE = currRMSE;
     
     myLogger.info('(Cost, Acc, RMSE, decay) before training: (%.4f, %.4f, %.4f, %.4f)' % (currCost, currAcc, currRMSE, decayContrib));
     
     classifier.train();
     
     currOut = classifier.apply(featDictList);
     
     lData = currOut[probArr[:, 0]]
     rData = currOut[probArr[:, 1]]
     
     outputs = sigmoid(lData - rData);
     outputs = where(outputs < OFFSET_EPSILON, OFFSET_EPSILON, outputs);
     outputs = where(outputs > 1-OFFSET_EPSILON, 1-OFFSET_EPSILON, outputs);
     
     # Assuming all is 1
     currCost =  -(log(outputs)).sum();
     decayContrib = classifier.l2decay * (classifier.params**2).sum();
     currCost += decayContrib;
     currRMSE = rmse(outputs, 1);
     currAcc = accuracy(outputs, 1);
     
     myLogger.info('(Cost, Acc, RMSE, decay) after training: (%.4f, %.4f, %.4f, %.4f)' % (currCost, currAcc, currRMSE, decayContrib));
     
     self.assert_(currRMSE < beginRMSE, 'RMSE did not decrease.');
Example 39
    def postEpochCall(self, epoch):
        """Convenience to run some stats at the end of each epoch."""
        outputs = self.apply(self.fDictList);
        outputs = outputs[self.idxArr];
        outputs = where(outputs < OFFSET_EPSILON, OFFSET_EPSILON, outputs);
        outputs = where(outputs > 1-OFFSET_EPSILON, 1-OFFSET_EPSILON, outputs);
        
        error = multiply(self.targetArr, log(outputs)) + multiply(1 - self.targetArr, log(1-outputs));
        currCost = -error.sum();
        decayContrib = self.totalL2Decay * (self.params**2).sum();
        theAcc = accuracy(outputs, self.targetArr);
        theRMSE = rmse(outputs, self.targetArr);

        if self.epochlog:
            myLog.info('Epoch %d, curr Cost: %f, decayCont: %f ' % (epoch, currCost, decayContrib));
            myLog.info('Epoch %d, theAcc: %f, theRMSE: %f' % (epoch, theAcc, theRMSE));
        self.costTrajectory.append(currCost);
Example 40
    def ssh_USERAUTH_REQUEST(self, packet):
        """
        This method is called when a packet is received.
        The client has requested authentication.  Payload:
            string user
            string next service
            string method
            [authentication specific data]
        """
        self.packet_count += 1
        user, nextService, method, rest = getNS(packet, 3)
        if self.isBadUsername(user): return
        first = False
        if self.state is None or self.state.is_invalid( user, nextService ):
            # If username or desired service has changed during auth,
            # the RFC says we must discard all state.
            self.state = UserAuthState( self, user, nextService )
            # We do keep track of how many state changes there have been.
            # This is used to thwart bots.
            self.state_changes += 1
            #log.debug(dir(self.transport.factory.portal))
            self.firstContact()
            first = True
        log.debug( "Auth request for user {0}, service {1}, method {2}.".format(user, nextService, method) )
        if self.state_changes > 3 or self.packet_count > 20:
            log.info("Disconnecting user: too many attempts")
            self.disconnect_hostNotAllowed("You are doing that too much!")

        if first and self.state.user_is_known:
            self.supportedAuthentications.append("password")

        if method == "none":
            # We want to push the user through keyboard-interactive.
            # This lets the client know what methods we do support.
            return self.send_authFail()

        if self.state.user_is_known:
            # Username is known to us! Do normal login.
            return self.handle_known_user( method, rest )

        else:
            # This user is not known to us.
            return self.handle_new_user(method, rest)
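
getNS here is presumably twisted.conch.ssh.common.getNS, which peels the requested number of length-prefixed strings off the packet and returns them along with the unparsed remainder:

    from twisted.conch.ssh.common import getNS

    user, nextService, method, rest = getNS(packet, 3)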
Example 41
 def __generalTest(self, featDictList, targArr, archModel, idxArr):
     """Convenience to test things multiple times"""
     classifier = MonteFeatDictClassifier(archModel, featDictList, targArr, idxArr);
     classifier.setupModels();
     
     currOut = classifier.apply(featDictList);
     currCost =  -(multiply(targArr, log(currOut)) + multiply(1 - targArr, log(1-currOut))).sum();
     decayContrib = classifier.l2decay * (classifier.params**2).sum();
     currCost += decayContrib;
     currRMSE = rmse(currOut, targArr);
     currAcc = accuracy(currOut, targArr);
     
     myLogger.info('(Cost, Acc, RMSE, decay) before training: (%.4f, %.4f, %.4f, %.4f)' % (currCost, currAcc, currRMSE, decayContrib));
     #myLogger.info('head(targArr) : %s, tail(targArr) : %s' % (pprint.pformat(targArr[:5]), pprint.pformat(targArr[-5:])))
     #myLogger.info('head(dataArr) : %s, tail(dataArr) : %s' % (pprint.pformat(featDictList[:5]), pprint.pformat(featDictList[-5:])))        
     #myLogger.info('head(currOut) : %s, tail(currOut) : %s' % (pprint.pformat(currOut[:5]), pprint.pformat(currOut[-5:])))
     #myLogger.info('Starting params : %s' % pprint.pformat(classifier.params))
     
     classifier.train();
     
     fOut = classifier.apply(featDictList);
     fCost =  -(multiply(targArr, log(fOut)) + multiply(1 - targArr, log(1-fOut))).sum();
     decayContrib = classifier.l2decay * (classifier.params**2).sum();
     fCost += decayContrib;
     fRMSE = rmse(fOut, targArr);
     fAcc = accuracy(fOut, targArr);
     
     myLogger.info('(Cost, Acc, RMSE, decay) after training: (%.4f, %.4f, %.4f, %.4f)' % (fCost, fAcc, fRMSE, decayContrib));
     #myLogger.info('head(fOut) : %s, tail(fOut) : %s' % (pprint.pformat(fOut[:5]), pprint.pformat(fOut[-5:])))
     myLogger.info('Final params : %s' % pprint.pformat(classifier.params))
     
     self.assert_(fRMSE < currRMSE, 'RMSE did not decrease.')
Example 42
    def postEpochCall(self, epoch):
        """Convenience to run some stats at the end of each epoch."""
        if self.saveparams:
            self.paramHistory.append(self.params[:])
        outputs = self.apply(self.fDictList);
        lOut = outputs[self.problemArr[:, 0]]
        rOut = outputs[self.problemArr[:, 1]]
        sigOut = sigmoid(lOut - rOut);
        sigOut = where(sigOut < OFFSET_EPSILON, OFFSET_EPSILON, sigOut);
        sigOut = where(sigOut > 1-OFFSET_EPSILON, 1-OFFSET_EPSILON, sigOut);
        # Cross-Entropy
        # NOTE here that all targs are 1.
        error = log(sigOut)
        currCost = -error.sum();
        
        decayContrib = self.totalL2Decay * (self.params**2).sum();
        theAcc = accuracy(sigOut, 1);
        theRMSE = rmse(sigOut, 1);

        if self.epochlog:
            myLog.info('Epoch %d, curr Cost: %f, decayCont: %f ' % (epoch, currCost, decayContrib));
            myLog.info('Epoch %d, theAcc: %f, theRMSE: %f' % (epoch, theAcc, theRMSE));
        self.costTrajectory.append(currCost);
Example 43
 def handle_new_user(self, method, rest):
     """
     Handles incoming auth from a new, unknown username.
     """
     if method == "publickey":
         # Store their pubkeys so they can use one to register with us.
         log.debug( "Pubkey attempt" )
         self.store_pubkey( rest )
     elif method == "keyboard-interactive":
         log.debug( "Interactive attempt")
         # Start up the keyboard-interactive state machine.
         # This will take care of asking questions.
         self.state.begin_interactive()
     elif method == "password":
         # We told this client we don't support passwords
         # but they are ignoring us. Probably a bot.
         log.info("Disconnecting user: illegal password attempt")
         self.disconnect_noAuthAllowed("This auth method is not allowed")
         self.transport.factory.banHost(self.ip)
     else:
         # No idea what this is, but we don't support it.
         log.debug( "Unknown {0} attempt".format(method) )
         self.send_authFail()
Example 44
 def balanceIdList(self, idList, balanceByCol=2):
     """Given an IdList object, repeat the entries labeled with 1 so that we have close to the same number
     of entries with label 0."""
     numZeros = 0;
     numOnes = 0;
     
     for row in idList:
         if int(row[balanceByCol]) == 0:
             numZeros += 1;
         else:
             numOnes += 1;
     
     numTimesToRepeat = int(float(numZeros)/numOnes) - 1;
      ## Artificially bump this up a little bit
     numTimesToRepeat += 3;
     log.info('In balance, will repeat %d times' % numTimesToRepeat);
     
     for row in idList:
         yield row;
         if int(row[balanceByCol]) != 0:
             for iRepeat in range(numTimesToRepeat):
                 yield row;
     
     return;
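
A usage sketch: rows are sequences whose label sits in column balanceByCol, and the generator yields a rebalanced stream (the consumer is hypothetical):

    for row in self.balanceIdList(idList, balanceByCol=2):
        handleRow(row)  # hypothetical consumer; 1-labeled rows arrive repeated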
Example 45
 def main(self, argv):
     """Callable from Command line"""
     if argv is None:
         argv = sys.argv
     
     usageStr = \
         """usage: %prog [options] mapFile inFile outFile
         """
     
     parser = OptionParser(usage = usageStr);
     parser.add_option('-p', '--pklfile', dest='pklfile', default=None, 
         help='Set to save the normalization params');
     parser.add_option('-t', '--testFile', dest='testFile', default=None,
         help='Set to load in this test file to normalize as well.')
     parser.add_option('-o', '--outTestFile', dest='outTestFile', default=None,
         help='Where to save the outTestFile')
     parser.add_option('-w', '--logwarning', dest='logwarning', default=False,
                       action='store_true',
                       help='Log warnings about features missing from maps/data')
     
     (options, args) = parser.parse_args(argv[1:])
     
     if len(args) == 3:
         mapFile = args[0];
         inFile = args[1];
         outFile = args[2];
         self.logwarning = options.logwarning
         
         self.loadFeatKeyToColMap(mapFile);
         
         # First read through the original file and calc the params
         log.info('Calculating the norm params on %s' % inFile)
         ifs = gzip.open(inFile)
         reader = FeatureDictReader(ifs);
         normParamsDict = self.processFeatureDictList(reader)
         ifs.close();
         
         #Save the params
         if options.pklfile is not None:
             self.saveNormalizationParams(normParamsDict, options.pklfile)
         
         # Then process the inFile
         log.info('About to normalize : %s' % outFile)
         ofs = gzip.open(outFile, 'w')
         writer = FeatureDictWriter(ofs);
         ifs = gzip.open(inFile)
         reader = FeatureDictReader(ifs)
         progress = ProgressDots();
         for idx, fDict in self.normalizeFeatDictList(reader, normParamsDict, self.featKeyToColMap):
             writer.update(fDict, str(idx));
             progress.Update();
         progress.PrintStatus();
         ifs.close();
         ofs.close();
         
         # then if the -t and -o options were set do the same on the test Data
         if options.testFile is not None and options.outTestFile is not None:
             ifs = gzip.open(options.testFile)
             ofs = gzip.open(options.outTestFile, 'w')
             reader = FeatureDictReader(ifs)
             writer = FeatureDictWriter(ofs)
              log.info('About to normalize : %s' % options.testFile);
             progress = ProgressDots();
             for idx, fDict in self.normalizeFeatDictList(reader, normParamsDict, self.featKeyToColMap):
                 writer.update(fDict, str(idx));
                 progress.Update();
             progress.PrintStatus();
             ifs.close();
             ofs.close();
         
         
     else:
         parser.print_help();
         sys.exit(2);
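
The scaling itself happens inside normalizeFeatDictList, which is not shown; a plausible per-feature min-range transform consistent with the minRangeDict naming (an assumption about the internals):

    def normalizeValue(value, minVal, rangeVal):
        """Hypothetical helper: scale a raw feature toward [0, 1] given its
        observed minimum and range (max - min)."""
        if rangeVal == 0:
            return 0.0
        return (value - minVal) / float(rangeVal)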
Example 46
    def train(self):
        """Method to run through all the data and train away"""
        # Set some things up if doing online learning:
        # we only calculate the gradient on one side at a time,
        # so make a target array and make sure we look at each ordered pair in both orders
        numOnlineRuns = int((self.problemArr.shape[0])/float(self.onlineChunkSize)) + 1;
        theIdx = arange(0, self.problemArr.shape[0])
        theStep = int(len(theIdx) / numOnlineRuns) + 1;

        ## Check if we have some null rows.  If we do, then stratify the rows trained on in each 
        ## online step
        ## We don't want a complete step of left -1's or right -1's.
        isNullRows = (self.problemArr[:,0] == -1) |  (self.problemArr[:, 1] == -1)
        isNullRows = any(isNullRows)
        if isNullRows:
            myLog.info('Have -1 in problemArr, stratifying the online step data')
            nNullIdx = theIdx[(self.problemArr[:,0] != -1) & (self.problemArr[:,1] != -1)]
            lNullIdx = theIdx[self.problemArr[:,0] == -1]
            rNullIdx = theIdx[self.problemArr[:,1] == -1]

            nNullStep = int(len(nNullIdx)/numOnlineRuns) + 1
            lNullStep = int(len(lNullIdx)/numOnlineRuns) + 1
            rNullStep = int(len(rNullIdx)/numOnlineRuns) + 1
        
        try:
            for iEpoch in range(self.numEpochs):
                if self.batch:
                    self.trainer.step(self.fDictList, self.problemArr);
                else:
                    # Want to balance each of the chunks used in the online learning.
                    if isNullRows:
                        shuffle(nNullIdx);
                        shuffle(lNullIdx)
                        shuffle(rNullIdx)
                    else:
                        shuffle(theIdx);
                                        
                    for iOnlineRun in range(numOnlineRuns):
                        if isNullRows:
                            nNullStart = iOnlineRun * nNullStep
                            nNullEnd = nNullStart + nNullStep
                            lNullStart = iOnlineRun * lNullStep
                            lNullEnd = lNullStart + lNullStep
                            rNullStart = iOnlineRun * rNullStep
                            rNullEnd = rNullStart + rNullStep

                            subProbArr = concatenate((self.problemArr[nNullIdx[nNullStart:nNullEnd], :],
                                                      self.problemArr[lNullIdx[lNullStart:lNullEnd], :],
                                                      self.problemArr[rNullIdx[rNullStart:rNullEnd], :]))
                            
                        else:
                            rowStart = iOnlineRun * theStep;
                            rowEnd = rowStart + theStep;
                            subIdx = theIdx[rowStart:rowEnd];
                            subProbArr = self.problemArr[subIdx, :]
                        self.trainer.step(self.fDictList, subProbArr);
                
                myLog.debug('About to call cost in postEpoch call')
                self.postEpochCall(iEpoch)
                
                # Test for convergence
                if self.checkconverge and len(self.costTrajectory) > self.nconvergesteps:
                    if std(self.costTrajectory[-self.nconvergesteps:]) < self.costEpsilon:
                        myLog.critical('Convergence after Epoch %d!!' % iEpoch);
                        return self.costTrajectory;
                
                if self.callback is not None:
                    self.callback(self);
            
            myLog.critical('Never completely converged after %d epochs!' % self.numEpochs);
        except KeyboardInterrupt, e:
            myLog.critical('Interrupted with Keyboard after %d epochs, stopping here, currCost = %f' % \
                           (iEpoch, self.costTrajectory[-1]))
            return self.costTrajectory;
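
Convergence here means the standard deviation of the last nconvergesteps epoch costs fell below costEpsilon; KeyboardInterrupt is caught so a long run can be stopped early and still return the cost trajectory accumulated so far.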
Example 47
 def firstContact(self):
     """
     Called the first time a user sends us a userauth request.
     """
     known_text = "Known" if self.state.user_is_known else "Unknown"
     log.info("{0} user {1} is authenticating".format(known_text, self.state.username))