def patientItemFromSourceItem(self, sourceItem, clinicalItem, conn):
    """Produce (and persist) a patient_item record model for the given sourceItem.

    Args:
        sourceItem: Mapping for the source order row; the keys "order_med_id",
            "pat_id", "pat_enc_csn_id" and "ordering_date" are read.
        clinicalItem: Mapping that provides "clinical_item_id".
        conn: Database connection handed to every DBUtil call.

    Returns:
        The RowItemModel for the patient_item row, with "patient_item_id"
        filled in from either the fresh insert or the pre-existing duplicate.
    """
    patientItem = RowItemModel(
        {
            "external_id": sourceItem["order_med_id"],
            "patient_id": sourceItem["pat_id"],
            "encounter_id": sourceItem["pat_enc_csn_id"],
            "clinical_item_id": clinicalItem["clinical_item_id"],
            "item_date": sourceItem["ordering_date"],
        }
    )
    insertQuery = DBUtil.buildInsertQuery("patient_item", patientItem.keys())
    insertParams = patientItem.values()
    try:
        # Optimistic insert of a new unique item
        DBUtil.execute(insertQuery, insertParams, conn=conn)
        patientItem["patient_item_id"] = DBUtil.execute(
            DBUtil.identityQuery("patient_item"), conn=conn)[0][0]
    except IntegrityError as err:
        # Turned out to be a duplicate: okay, pull out the existing ID and
        # continue to insert whatever else is possible
        log.info(err)
        # Look up just by the composite key components to avoid attempting
        # the duplicate insertion again
        searchPatientItem = {
            "patient_id": patientItem["patient_id"],
            "clinical_item_id": patientItem["clinical_item_id"],
            "item_date": patientItem["item_date"],
        }
        (patientItem["patient_item_id"], isNew) = DBUtil.findOrInsertItem(
            "patient_item", searchPatientItem, conn=conn)
    # The visible original fell off the end here, so callers got None
    return patientItem
def main(self, argv):
    """Main method, callable from command line.

    Parses the optional start / end date switches from argv, packs them into
    a ConversionOptions object and runs self.convertSourceItems on it,
    logging the elapsed wall-clock time.
    """
    parser = OptionParser(usage="usage: %prog [options]\n")
    # (short flag, long flag, dest, metavar, help) for every supported option
    optionSpecs = (
        ("-s", "--startDate", "startDate", "<startDate>",
         "Date string (e.g., 2011-12-15), if provided, will only run conversion on items with start time on or after this date."),
        ("-e", "--endDate", "endDate", "<endDate>",
         "Date string (e.g., 2011-12-15), if provided, will only run conversion on items with start time before this date."),
    )
    for shortFlag, longFlag, destName, metaName, helpText in optionSpecs:
        parser.add_option(shortFlag, longFlag, dest=destName,
                          metavar=metaName, help=helpText)
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: " + " ".join(argv))
    startTime = time.time()

    convOptions = ConversionOptions()
    convOptions.extractParserOptions(options)
    self.convertSourceItems(convOptions)

    elapsed = time.time() - startTime
    log.info("%.3f seconds to complete", elapsed)
def _query_patient_episodes(self, query, pat_id_col=None, index_time_col=None):
    """Execute the episode query and hand its cursor to the factory.

    Args:
        query: Either a plain SQL string, or an object that renders to SQL
            via str() and carries its bind values in a `params` attribute.
        pat_id_col: Name of the patient ID column; 'pat_id' when None.
        index_time_col: Name of the index time column; 'index_time' when None.

    Returns:
        Whatever self._factory.processPatientEpisodeInput() returns.
    """
    cursor = self._connection.cursor()

    log.info('query: %s' % str(query))
    if not isinstance(query, basestring):
        # Query object: log and bind its parameters separately
        log.info('query.params: %s' % str(query.params))
        cursor.execute(str(query), query.params)
    else:
        cursor.execute(query)

    # Fall back to the default column names when none were supplied
    patient_col = 'pat_id' if pat_id_col is None else pat_id_col
    time_col = 'index_time' if index_time_col is None else index_time_col

    self._factory.setPatientEpisodeInput(cursor, patient_col, time_col)
    return self._factory.processPatientEpisodeInput()
def _add_sex_features(self):
    """Add one clinical item feature per sex label ("Male", then "Female")."""
    log.info('Adding sex features...')
    for sex_label in ("Male", "Female"):
        # One call per label, each with a single-element item list
        self._factory.addClinicalItemFeatures(
            [sex_label], dayBins=[], features="pre")
def tearDown(self):
    """Restore state from any setUp or test steps.

    Deletes, child tables first, every patient_item / clinical_item /
    clinical_item_category row tied to the test source table, then the
    negative-ID rows of the source table itself, and finally defers to
    DBTestCase.tearDown.
    """
    log.info("Purge test records from the database")

    # NOTE: may be redundant, since the superclass tearDown is expected to
    # drop the whole test database anyway.

    # Single spelling of the source table name. The original repeated the
    # literal and misspelled it twice ('strid_culture_micro' and
    # 'stride_culture_micro ' with a trailing space), so the clinical_item
    # and clinical_item_category deletes silently matched nothing.
    sourceTable = 'stride_culture_micro'

    DBUtil.execute(
        ("delete from patient_item "
         "where clinical_item_id in "
         "( select clinical_item_id "
         "from clinical_item as ci, clinical_item_category as cic "
         "where ci.clinical_item_category_id = cic.clinical_item_category_id "
         "and cic.source_table = '%s' "
         "); ") % sourceTable
    )
    DBUtil.execute(
        ("delete from clinical_item "
         "where clinical_item_category_id in "
         "( select clinical_item_category_id "
         "from clinical_item_category "
         "where source_table = '%s' "
         "); ") % sourceTable
    )
    DBUtil.execute(
        "delete from clinical_item_category where source_table = '%s';"
        % sourceTable)
    DBUtil.execute(
        "delete from stride_culture_micro where order_proc_anon_id < 0")

    DBTestCase.tearDown(self)
def loadUpdateBufferFromFile(self, filename):
    """Load an update buffer from a JSON file, or from a family of files.

    If `filename` cannot be opened (IOError), it is treated as a prefix:
    every entry of its directory whose name starts with the base name is
    loaded recursively and the results are combined with self.mergeBuffers.

    Args:
        filename: Path of the JSON file, or common prefix of several files.

    Returns:
        The loaded (or merged) update buffer, whose "analyzedPatientItemIds"
        entry has been converted to a set; None when nothing could be loaded.
    """
    updateBuffer = None
    try:
        log.info("Loading: %s" % filename)
        ifs = stdOpen(filename, "r")
        try:
            updateBuffer = json.load(ifs)
        finally:
            # Release the handle even when the JSON is malformed
            ifs.close()
        updateBuffer["analyzedPatientItemIds"] = set(
            updateBuffer["analyzedPatientItemIds"])
    except IOError:
        # Apparently could not find the named filename. See if instead it's
        # a prefix for a series of enumerated files and then merge them into
        # one mass buffer
        dirname = os.path.dirname(filename)
        if dirname == "":
            dirname = "."  # Implicitly the current working directory
        basename = os.path.basename(filename)
        for nextFilename in os.listdir(dirname):
            if not nextFilename.startswith(basename):
                continue
            nextFilepath = os.path.join(dirname, nextFilename)
            nextUpdateBuffer = self.loadUpdateBufferFromFile(nextFilepath)
            if updateBuffer is None:
                # First update buffer, use it as base
                updateBuffer = nextUpdateBuffer
            else:
                # Have an existing update buffer: fold the next one into it
                updateBuffer = self.mergeBuffers(
                    updateBuffer, nextUpdateBuffer)
            del nextUpdateBuffer
    # The recursive call above depends on this return value; without it every
    # nested load yielded None.
    return updateBuffer
def main(self, argv):
    """Main method, callable from command line.

    Reads the optional comma separated patient ID list from argv and passes
    it (or None when absent) to self.convertSourceItems, logging the elapsed
    wall-clock time.
    """
    parser = OptionParser(usage="usage: %prog [options]\n")
    parser.add_option(
        "-p", "--patientIds",
        dest="patientIds",
        metavar="<patientIds>",
        help="Comma separated list of patient IDs to convert demographics data for. Leave blank to attempt conversion for all available")
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: " + " ".join(argv))
    startTime = time.time()

    # None means "no restriction"; otherwise split the raw option text
    rawIds = options.patientIds
    patientIds = rawIds.split(",") if rawIds is not None else None
    self.convertSourceItems(patientIds)

    elapsed = time.time() - startTime
    log.info("%.3f seconds to complete", elapsed)
def tearDown(self):
    """Restore state from any setUp or test steps.

    Runs the cleanup statements in dependency order (child tables first) and
    then defers to DBTestCase.tearDown.
    """
    log.info("Purge test records from the database")

    cleanupStatements = (
        ("delete from patient_item "
         "where clinical_item_id in "
         "( select clinical_item_id "
         "from clinical_item as ci, clinical_item_category as cic "
         "where ci.clinical_item_category_id = cic.clinical_item_category_id "
         "and cic.source_table = 'stride_preadmit_med' "
         "); "),
        ("delete from clinical_item "
         "where clinical_item_category_id in "
         "( select clinical_item_category_id "
         "from clinical_item_category "
         "where source_table = 'stride_preadmit_med' "
         "); "),
        "delete from clinical_item_category where source_table = 'stride_preadmit_med';",
        "delete from stride_mapped_meds where rxcui < 0",
        "delete from stride_preadmit_med where stride_preadmit_med_id < 0",
    )
    for statement in cleanupStatements:
        DBUtil.execute(statement)

    DBTestCase.tearDown(self)
def setUp(self):
    """Prepare state for test cases.

    Builds the clinical item and CPOE simulation schemata, then loads the
    semicolon-delimited sim_result fixture rows into the database.
    """
    DBTestCase.setUp(self)
    self.manager = SimManager()  # Instance to test on

    from stride.clinical_item.ClinicalItemDataLoader import ClinicalItemDataLoader
    ClinicalItemDataLoader.build_clinical_item_psql_schemata()
    self.manager.buildCPOESimSchema()

    log.info("Populate the database with test data")

    # Rigged delimited text standing in for realistic simulation data.
    # First entry is the header row; each later entry is one sim_result row.
    simResultLines = (
        "sim_result_id;name;description;group_string;priority",
        "-10;Temp;Temperature (F);Flowsheet>Vitals;10",
        "-20;Pulse;Pulse / Heart Rate (HR);Flowsheet>Vitals;20",
        "-30;SBP;Blood Pressure, Systolic (SBP);Flowsheet>Vitals;30",
        "-40;DBP;Blood Pressure, Diastolic (DBP);Flowsheet>Vitals;40",
        "-50;Resp;Respirations (RR);Flowsheet>Vitals;50",
        "-60;FiO2;Fraction Inspired Oxygen;Flowsheet>Vitals;60",
        "-70;Urine;Urine Output (UOP);Flowsheet>Vitals;70",
    )
    dataTextStr = "\n".join(simResultLines) + "\n"

    # Parse into DB insertion object
    DBUtil.insertFile(StringIO(dataTextStr), "sim_result", delim=";")
def main(self, argv):
    """Main method, callable from command line.

    Expects two positional arguments (input file, output file) plus the
    --baseYear option; prints the usage text and exits with status -1
    otherwise. Optionally loads provider models from a CSV file first, then
    parses the schedule items and writes them through a TextResultsFormatter.
    """
    usageStr = (
        "usage: %prog <inputFile> <outputFile>\n"
        " <inputFile> Tab-delimited input file taken from schedule Excel file. Example data format as seen in test case examples. See support/extractExcelSheets.py for help on pulling out Excel sheets into tab-delimited data files.\n"
        " <outputFile> File to output results to. Designate '-' for stdout."
    )
    parser = OptionParser(usage=usageStr)
    # str(): the constant is spliced into the help text; concatenating it
    # directly raises TypeError if it is numeric (its name suggests a count).
    parser.add_option(
        "-i", "--providerIdFilename", dest="providerIdFilename",
        help="Name of provider ID CSV file. If provided, then add column for prov_id based on resident first_name and last_name, match within first " + str(DEFAULT_INDEX_PREFIX_LENGTH) + " characters, or generate ID value if no match found")
    parser.add_option(
        "-y", "--baseYear", dest="baseYear",
        help="Year expect dates to start in.")
    # NOTE(review): options.changeTime is parsed but not read in this method
    parser.add_option(
        "-t", "--changeTime", dest="changeTime", default=CHANGE_TIME,
        help="Hour of day that count as delimiter between rotations. Likely should NOT be midnight = 0, because night shifts span midnight. Default to 7 = 7am.")
    (options, args) = parser.parse_args(argv[1:])

    if len(args) >= 2 and options.baseYear:
        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()

        baseYear = int(options.baseYear)

        if options.providerIdFilename is not None:
            providerReader = csv.DictReader(open(options.providerIdFilename))
            self.loadProviderModels(providerReader)

        inFile = stdOpen(args[0])
        scheduleItems = self.parseScheduleItems(inFile, baseYear)

        outFile = stdOpen(args[1], "w")
        formatter = TextResultsFormatter(outFile)
        formatter.formatResultDicts(scheduleItems)
    else:
        parser.print_help()
        sys.exit(-1)

    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)
def tearDown(self):
    """Restore state from any setUp or test steps.

    Removes the rows linked to TEST_SOURCE_TABLE (child tables first), then
    the negative-ID stride_patient rows, then defers to DBTestCase.tearDown.
    """
    log.info("Purge test records from the database")

    # Each template is formatted with TEST_SOURCE_TABLE right before it runs
    sourceTableTemplates = (
        ("delete from patient_item "
         "where clinical_item_id in "
         "( select clinical_item_id "
         "from clinical_item as ci, clinical_item_category as cic "
         "where ci.clinical_item_category_id = cic.clinical_item_category_id "
         "and cic.source_table = '%s' "
         "); "),
        ("delete from clinical_item "
         "where clinical_item_category_id in "
         "( select clinical_item_category_id "
         "from clinical_item_category "
         "where source_table = '%s' "
         "); "),
        "delete from clinical_item_category where source_table = '%s';",
    )
    for template in sourceTableTemplates:
        DBUtil.execute(template % TEST_SOURCE_TABLE)

    # Test patients are the ones with negative IDs
    patientPurge = SQLQuery()
    patientPurge.delete = True
    patientPurge.addFrom("stride_patient")
    patientPurge.addWhere("pat_id < 0")
    DBUtil.execute(patientPurge)

    DBTestCase.tearDown(self)
def main(argv):
    """Main method, callable from command line.

    Converts every CGI template file named on the command line through
    generatePSPfromCGITemplate. With no file arguments, prints the usage
    text and exits with status -1.
    """
    usageStr = (
        "usage: %prog [options] <cgiFile1> <cgiFile2> ...\n"
        " <cgiFileX> CGI template files to convert to PSP\n"
    )
    parser = OptionParser(usage=usageStr)
    parser.add_option(
        "-l", "--noLinkConversion",
        dest="noLinkConversion",
        action="store_true",
        help="If set, will NOT swap all .py (apparent links) to .psp.")
    (options, args) = parser.parse_args(argv[1:])

    startTime = time.time()
    if not len(args) > 0:
        parser.print_help()
        sys.exit(-1)

    for cgiTemplateFilename in args:
        log.info("Converting %s" % cgiTemplateFilename)
        # Second argument: whether to convert links (option negated)
        generatePSPfromCGITemplate(cgiTemplateFilename,
                                   not options.noLinkConversion)

    elapsed = time.time() - startTime
    log.info("%.3f seconds to complete", elapsed)
def main(self, argv):
    """Main method, callable from command line.

    Parses the optional user SID list, record limit and record offset from
    argv and passes them (None for any that were not given) to
    self.convertSourceItems, logging the elapsed wall-clock time.
    """
    parser = OptionParser(usage="usage: %prog [options]\n")
    # (short flag, long flag, dest, metavar, help) for every supported option
    optionSpecs = (
        ("-i", "--userSIDs", "userSIDs", "<userSIDs>",
         "Comma separated list of user SIDs to convert AccessLog data for. Leave blank to attempt conversion for all available"),
        ("-l", "--limit", "limit", "<limit>",
         "Number of records to process before stopping"),
        ("-o", "--offset", "offset", "<offset>",
         "Number of records to skip before start converting"),
    )
    for shortFlag, longFlag, destName, metaName, helpText in optionSpecs:
        parser.add_option(shortFlag, longFlag, dest=destName,
                          metavar=metaName, help=helpText)
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: " + " ".join(argv))
    startTime = time.time()

    # Options that were not supplied stay None
    userSIDs = options.userSIDs.split(",") if options.userSIDs is not None else None
    limit = int(options.limit) if options.limit is not None else None
    offset = int(options.offset) if options.offset is not None else None

    self.convertSourceItems(userSIDs, limit, offset)

    elapsed = time.time() - startTime
    log.info("%.3f seconds to complete", elapsed)
def main(self, argv):
    """Main method, callable from command line.

    Parses the optional start / end date strings with DATE_FORMAT, keeps only
    their year, month and day, and passes the resulting datetime objects (or
    None) to self.convertSourceItems, logging the elapsed wall-clock time.
    """
    usageStr = (
        "usage: %prog [options]\n"
        "Beware that this module is intended to be run only ONCE ever on a database. Currently will end up with duplicate clinical item keys if you try to run it in parallel or even serially."
    )
    parser = OptionParser(usage=usageStr)
    parser.add_option(
        "-s", "--startDate", dest="startDate", metavar="<startDate>",
        help="Date string (e.g., 2011-12-15), if provided, will only run conversion on items with ordering time on or after this date.")
    parser.add_option(
        "-e", "--endDate", dest="endDate", metavar="<endDate>",
        help="Date string (e.g., 2011-12-15), if provided, will only run conversion on items with ordering time before this date.")
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: " + " ".join(argv))
    startTime = time.time()

    def parseDateOption(dateText):
        # An absent option stays None; otherwise keep year, month, day only
        if dateText is None:
            return None
        timeTuple = time.strptime(dateText, DATE_FORMAT)
        return datetime(*timeTuple[0:3])

    startDate = parseDateOption(options.startDate)
    endDate = parseDateOption(options.endDate)

    self.convertSourceItems(startDate, endDate)

    elapsed = time.time() - startTime
    log.info("%.3f seconds to complete", elapsed)
def purgeTestRecords(self):
    """Delete test records from the simulation tables.

    Rows belonging to self.testPatientId are removed first (only when that
    ID is set); afterwards the fixture rows, identified by negative (or, for
    sim_state, non-positive) IDs, are removed table by table.
    """
    log.info("Purge test records from the database")

    if self.testPatientId is not None:
        # Delete test generated data
        perPatientStatements = (
            "delete from sim_patient_order where sim_patient_id = %s",
            "delete from sim_patient_state where sim_patient_id = %s",
            "delete from sim_patient where sim_patient_id = %s",
        )
        for statement in perPatientStatements:
            DBUtil.execute(statement, (self.testPatientId, ))

    # Order matters: referencing tables are cleared before referenced ones
    fixtureStatements = (
        "delete from sim_note where sim_note_id < 0",
        "delete from sim_state_result where sim_state_result_id < 0",
        "delete from sim_patient_order where sim_state_id < 0 or sim_patient_order_id < 0",
        "delete from sim_order_result_map where sim_order_result_map_id < 0",
        "delete from sim_result where sim_result_id < 0",
        "delete from sim_patient_state where sim_state_id < 0 or sim_patient_state_id < 0",
        "delete from sim_state_transition where pre_state_id < 0",
        "delete from sim_state where sim_state_id <= 0",
        "delete from sim_user where sim_user_id < 0",
        "delete from sim_patient where sim_patient_id < 0",
        "delete from clinical_item where clinical_item_id < 0",
    )
    for statement in fixtureStatements:
        DBUtil.execute(statement)
def banHost(self, host, duration=None):
    """Ban a host from connecting.

    Args:
        host: Host address to append to the module-level banlist.
        duration: Accepted but currently unused.
    """
    # TODO: Timed bans?
    message = "Banning IP {0}".format(host)
    log.info(message)
    banlist.append(host)
def grad(self, fDictList, problemArr):
    """Evaluate the gradient of the error function wrt the params.

    All targets are 1, so every chunk contributes twice: once comparing left
    against right, with output error sigmoid(lOut - rOut) - 1, and once
    comparing right against left, with output error sigmoid(rOut - lOut).
    (The abs(lOut - rOut) weighting mentioned in older notes is not applied.)

    Args:
        fDictList: Passed through to self.gradChunkDataIterator.
        problemArr: Passed through to self.gradChunkDataIterator.

    Returns:
        The accumulated gradient, an array shaped like self.params.
    """
    currGrad = zeros(self.params.shape)
    # Per-chunk statistics of the left-to-right output error.
    # NOTE(review): nothing in this block reads these back - confirm whether
    # a summary log was intended.
    meanDOut = []
    minDOut = []
    maxDOut = []
    for lSubData, rSubData, subProbArr in self.gradChunkDataIterator(
            fDictList, problemArr):
        # First calc the contribution when looking at the left to right.
        # Only include data that comes from non-NULL (> -1) on the left.
        nonLNullRows = subProbArr[:, 0] > -1
        rd = rSubData[:, nonLNullRows]
        ld = lSubData[:, nonLNullRows]
        # Keep this order: the left side is fprop'ed last, right before it is
        # back-propagated (presumably layerModel caches its latest forward
        # pass - confirm before reordering).
        rOut = self.layerModel.fprop(rd)
        lOut = self.layerModel.fprop(ld)
        sigOut = sigmoid(lOut - rOut)
        d_outputs = (sigOut - 1)
        if len(d_outputs) > 0 and d_outputs.shape[1] > 0:
            try:
                meanDOut.append(mean(abs(d_outputs)))
                minDOut.append(min(d_outputs))
                maxDOut.append(max(d_outputs))
            except Exception as e:
                myLog.error(e)
                myLog.info('Some error, d_outputs : %s' % pprint.pformat(d_outputs))
                myLog.info('Some error, len(d_outputs) : %s' % len(d_outputs))
                # Bare raise keeps the original traceback
                raise
            self.layerModel.bprop(d_outputs, ld)
            currGradContrib = self.layerModel.grad(d_outputs, ld)
            currGradContrib = currGradContrib.sum(1)
            currGrad += currGradContrib

        # Then do the same, but going to the right.
        # Need to fprop on the right side again (last this time).
        # Only use rows that are non-null on the right.
        nonRNullRows = subProbArr[:, 1] > -1
        rd = rSubData[:, nonRNullRows]
        ld = lSubData[:, nonRNullRows]
        lOut = self.layerModel.fprop(ld)
        rOut = self.layerModel.fprop(rd)
        sigOut = sigmoid(rOut - lOut)
        d_outputs = sigOut
        # Was `len(d_outputs > 0)`: same truth value, misplaced parenthesis
        if len(d_outputs) > 0 and d_outputs.shape[1] > 0:
            self.layerModel.bprop(d_outputs, rd)
            currGradContrib = self.layerModel.grad(d_outputs, rd)
            currGradContrib = currGradContrib.sum(1)
            currGrad += currGradContrib

    # NOTE(review): the visible original ended without returning the
    # accumulated gradient - confirm no further terms (e.g. weight decay)
    # were meant to be added before this point.
    return currGrad
def postEpochCall(self, epoch):
    """Convenience to run some stats at the end of each epoch.

    Optionally snapshots the parameters, then computes the cross-entropy
    cost (all targets are 1) of the pairwise sigmoid outputs, the L2 decay
    contribution, accuracy and RMSE. The stats are logged when
    self.epochlog is set, and the cost is appended to self.costTrajectory.

    Args:
        epoch: Epoch number, used only in the log messages.
    """
    if self.saveparams:
        # Store a real copy. self.params is used as a numpy array below
        # (**2 and .sum()), and slicing such an array with [:] yields a view
        # that keeps tracking later in-place updates, so every history entry
        # would end up showing the latest values.
        self.paramHistory.append(self.params.copy())

    outputs = self.apply(self.fDictList)
    lOut = outputs[self.problemArr[:, 0]]
    rOut = outputs[self.problemArr[:, 1]]
    sigOut = sigmoid(lOut - rOut)
    # Clamp away from 0 and 1 so the log below stays finite
    sigOut = where(sigOut < OFFSET_EPSILON, OFFSET_EPSILON, sigOut)
    sigOut = where(sigOut > 1 - OFFSET_EPSILON, 1 - OFFSET_EPSILON, sigOut)

    # Cross-Entropy
    # NOTE here that all targs are 1.
    error = log(sigOut)
    currCost = -error.sum()

    # Reported separately; not added to currCost
    decayContrib = self.totalL2Decay * (self.params**2).sum()
    theAcc = accuracy(sigOut, 1)
    theRMSE = rmse(sigOut, 1)

    if self.epochlog:
        myLog.info('Epoch %d, curr Cost: %f, decayCont: %f ' %
                   (epoch, currCost, decayContrib))
        myLog.info('Epoch %d, theAcc: %f, theRMSE: %f' %
                   (epoch, theAcc, theRMSE))

    self.costTrajectory.append(currCost)
def main(self, argv):
    """Main method, callable from command line.

    Opens the input files (named directly, or listed one per line in a single
    index file), writes a comment line recording argv, then a header row, and
    then streams every concatenated data row to the output file. With no
    positional arguments, prints the usage text and exits with status -1.
    """
    usageStr = (
        "usage: %prog [options] <inputFile1> <inputFile2> ... <inputFileN>\n"
        " <inputFileX> Tab-delimited file of data. Initial comment lines will be scanned for list of argv parameters to add as data columns.\n"
        " If only a single input is given, interpret this as an index file which lists the names of the other files to concatenate (e.g., obtained with dir * /b or ls).\n"
    )
    parser = OptionParser(usage=usageStr)
    parser.add_option(
        "-o", "--outputFile", dest="outputFile",
        help="Tab-delimited file matching concatenated contents of input files. Specify \"-\" to send to stdout.")
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: " + " ".join(argv))
    startTime = time.time()

    if not len(args) > 0:
        parser.print_help()
        sys.exit(-1)

    if len(args) > 1:
        # Every positional argument is itself an input file
        inputFiles = [stdOpen(inputFilename) for inputFilename in args]
    else:
        # Single index file rather than list of all files on command-line
        indexFile = stdOpen(args[0])
        inputFiles = [stdOpen(line.strip()) for line in indexFile]

    # Format the results for output
    outputFile = stdOpen(options.outputFile, "w")

    # Print comment line with arguments to allow for deconstruction later
    # as well as extra results
    summaryData = {"argv": argv}
    print >> outputFile, COMMENT_TAG, json.dumps(summaryData)

    # Tab-delimited output formatting
    formatter = TextResultsFormatter(outputFile)

    # Begin the file parsing so can at least get the total list of column
    # headers; the first row is pulled before the headers are requested
    rowGenerator = self(inputFiles)
    firstRow = rowGenerator.next()

    colNames = self.resultHeaders()
    formatter.formatTuple(colNames)

    # Stream the concatenated data rows to the output to avoid storing all
    # in memory
    formatter.formatResultDict(firstRow, colNames)
    for outputDict in rowGenerator:
        formatter.formatResultDict(outputDict, colNames)

    elapsed = time.time() - startTime
    log.info("%.3f seconds to complete", elapsed)
def _add_demographic_features(self):
    """Add the demographic features: lifespan, then sex, then race."""
    log.info('Adding demographic features...')

    self._add_lifespan_features()  # birth and death
    self._add_sex_features()
    self._add_race_features()
def runStats(self):
    """Convenience method to log accuracy and RMSE of the trained classifier.

    The classifier is applied to self.fDictList, the rows selected by
    self.idxArr are compared against self.targArr, and both metrics are
    written to the log.
    """
    predictions = self.classifier.apply(self.fDictList)[self.idxArr]
    stats = (accuracy(predictions, self.targArr),
             rmse(predictions, self.targArr))
    log.info('theAcc : %.4f, theRMSE : %.4f' % stats)
def _add_lifespan_features(self):
    """Add the 'Birth' feature ("pre") and then the 'Death' feature ("post")."""
    log.info('Adding lifespan features...')
    for item_name, feature_kind in (('Birth', "pre"), ('Death', "post")):
        self._factory.addClinicalItemFeatures(
            [item_name], dayBins=[], features=feature_kind)
def isBadUsername(self, user):
    """Tell whether `user` is a blacklisted username, disconnecting if so.

    The comparison is case-insensitive. For a blacklisted name the event is
    logged and a DISCONNECT_ILLEGAL_USER_NAME packet is sent through
    self.transport.

    Args:
        user: Username supplied by the client.

    Returns:
        True when the name is blacklisted, False otherwise.
    """
    # NOTE(review): "asdmin" looks like a typo for "admin" - confirm before
    # changing, since it alters who is allowed to log in.
    blacklisted_names = ("root", "asdmin", "ubnt", "support", "user", "pi")
    if user.lower() not in blacklisted_names:
        return False

    log.info('Disconnecting "{0}" from {1}: blacklisted username'.format(user, self.ip))
    self.transport.sendDisconnect(
        transport.DISCONNECT_ILLEGAL_USER_NAME,
        "You can't use that username ({0}) here. Please reconnect using a different one.".format(user))
    return True
def buildProtocol(self, addr):
    """Build the protocol for an incoming connection unless its IP is banned.

    Args:
        addr: Address object of the connecting peer; only `addr.host` is read.

    Returns:
        None for a banned host, otherwise the result of
        conch_factory.SSHFactory.buildProtocol.
    """
    peer_host = addr.host
    if peer_host not in banlist:
        # All good; let the superclass do the rest
        log.info("Incoming SSH connection from {0}".format(addr.host))
        return conch_factory.SSHFactory.buildProtocol(self, addr)

    # Reject this connection: the IP is banned
    log.info("Rejecting connection from banned IP {0}".format(addr.host))
    return None
def generatePatientItemsForCompositeId(self, clinicalItemIds, compositeId, conn=None):
    """Create patient_item records for the composite to match the given
    clinical item ID patient items.

    Args:
        clinicalItemIds: Component clinical item IDs to link to the composite.
        compositeId: Clinical item ID of the composite item.
        conn: Optional open connection. When omitted, one is obtained from
            self.connFactory and closed again before returning.
    """
    ownsConn = conn is None
    if ownsConn:
        conn = self.connFactory.connection()

    try:
        # Record linking information
        for componentId in clinicalItemIds:
            linkRow = RowItemModel()
            linkRow["clinical_item_id"] = compositeId
            linkRow["linked_item_id"] = componentId
            linkInsert = DBUtil.buildInsertQuery("clinical_item_link", linkRow.keys())
            DBUtil.execute(linkInsert, linkRow.values(), conn=conn)

        # Extract back link information, which will also flatten out any
        # potential inherited links
        linkedItemIds = self.loadLinkedItemIdsByBaseId(conn=conn)[compositeId]

        # Create patient_item records for the composite clinical item to
        # overlap existing component ones.
        # First query for the existing component records
        componentQuery = SQLQuery()
        componentQuery.addSelect("*")
        componentQuery.addFrom("patient_item")
        componentQuery.addWhereIn("clinical_item_id", linkedItemIds)
        componentTable = DBUtil.execute(componentQuery, includeColumnNames=True, conn=conn)
        patientItems = modelListFromTable(componentTable)

        # Patch component records to instead become composite item records,
        # then insert back into the database
        progress = ProgressDots(total=len(patientItems))
        for patientItem in patientItems:
            del patientItem["patient_item_id"]
            patientItem["clinical_item_id"] = compositeId
            patientItem["analyze_date"] = None

            itemInsert = DBUtil.buildInsertQuery("patient_item", patientItem.keys())
            itemParams = patientItem.values()
            try:
                # Optimistic insert of a new unique item
                DBUtil.execute(itemInsert, itemParams, conn=conn)
            except conn.IntegrityError as err:
                # Duplicate: okay, just note it and continue to insert
                # whatever else is possible
                log.info(err)
            progress.Update()
    finally:
        if ownsConn:
            conn.close()
def tearDown(self):
    """Restore state from any setUp or test steps.

    Clears the item collection fixtures, the rows derived from
    stride_order_proc, and finally the source rows whose IDs are listed in
    self.orderProcIdStrList, before deferring to DBTestCase.tearDown.
    """
    log.info("Purge test records from the database")

    fixedStatements = (
        ("delete from patient_item_collection_link "
         "where item_collection_item_id in "
         "( select item_collection_item_id "
         "from item_collection_item as ici, item_collection as ic "
         "where ici.item_collection_id = ic.item_collection_id "
         "and ic.external_id < 0 "
         "); "),
        ("delete from item_collection_item "
         "where item_collection_id in "
         "( select item_collection_id "
         "from item_collection as ic "
         "where ic.external_id < 0 "
         "); "),
        "delete from item_collection where external_id < 0;",
        ("delete from patient_item "
         "where clinical_item_id in "
         "( select clinical_item_id "
         "from clinical_item as ci, clinical_item_category as cic "
         "where ci.clinical_item_category_id = cic.clinical_item_category_id "
         "and cic.source_table = 'stride_order_proc' "
         "); "),
        ("delete from clinical_item "
         "where clinical_item_category_id in "
         "( select clinical_item_category_id "
         "from clinical_item_category "
         "where source_table = 'stride_order_proc' "
         "); "),
        "delete from clinical_item_category where source_table = 'stride_order_proc';",
    )
    for statement in fixedStatements:
        DBUtil.execute(statement)

    # Source rows are identified by the explicit ID list kept on the test
    for template in (
            "delete from stride_orderset_order_proc where order_proc_id in (%s)",
            "delete from stride_order_proc where order_proc_id in (%s)"):
        DBUtil.execute(template % ",".join(self.orderProcIdStrList))

    DBTestCase.tearDown(self)
def _add_race_features(self):
    """Add one feature per race returned by self._factory.queryAllRaces().

    The factory method used depends on LocalEnv.DATASET_SOURCE_NAME:
    'STRIDE' uses addClinicalItemFeatures, anything else uses the UMich
    variant.
    """
    log.info('Adding race features...')
    for race_name in self._factory.queryAllRaces():
        if LocalEnv.DATASET_SOURCE_NAME != 'STRIDE':
            # e.g. LocalEnv.DATASET_SOURCE_NAME == 'UMich'
            self._factory.addClinicalItemFeatures_UMich(
                [race_name], dayBins=[], features="pre",
                clinicalItemType='RaceName', clinicalItemTime=None,
                tableName='demographics')
            continue
        # TODO (carried over from the original STRIDE branch)
        self._factory.addClinicalItemFeatures(
            [race_name], dayBins=[], features="pre")
def grad(self, fDictList, problemArr):
    """Evaluate the gradient of the error function wrt the params.

    All targets are 1, so every chunk contributes twice: once comparing left
    against right, with output error sigmoid(lOut - rOut) - 1, and once
    comparing right against left, with output error sigmoid(rOut - lOut).
    (The abs(lOut - rOut) weighting mentioned in older notes is not applied.)

    Args:
        fDictList: Passed through to self.gradChunkDataIterator.
        problemArr: Passed through to self.gradChunkDataIterator.

    Returns:
        The accumulated gradient, an array shaped like self.params.
    """
    currGrad = zeros(self.params.shape)
    # Per-chunk statistics of the left-to-right output error.
    # NOTE(review): nothing in this block reads these back - confirm whether
    # a summary log was intended.
    meanDOut = []
    minDOut = []
    maxDOut = []
    for lSubData, rSubData, subProbArr in self.gradChunkDataIterator(
            fDictList, problemArr):
        # First calc the contribution when looking at the left to right.
        # Only include data that comes from non-NULL (> -1) on the left.
        nonLNullRows = subProbArr[:, 0] > -1
        rd = rSubData[:, nonLNullRows]
        ld = lSubData[:, nonLNullRows]
        # Keep this order: the left side is fprop'ed last, right before it is
        # back-propagated (presumably layerModel caches its latest forward
        # pass - confirm before reordering).
        rOut = self.layerModel.fprop(rd)
        lOut = self.layerModel.fprop(ld)
        sigOut = sigmoid(lOut - rOut)
        d_outputs = (sigOut - 1)
        if len(d_outputs) > 0 and d_outputs.shape[1] > 0:
            try:
                meanDOut.append(mean(abs(d_outputs)))
                minDOut.append(min(d_outputs))
                maxDOut.append(max(d_outputs))
            except Exception as e:
                myLog.error(e)
                myLog.info('Some error, d_outputs : %s' % pprint.pformat(d_outputs))
                myLog.info('Some error, len(d_outputs) : %s' % len(d_outputs))
                # Bare raise keeps the original traceback
                raise
            self.layerModel.bprop(d_outputs, ld)
            currGradContrib = self.layerModel.grad(d_outputs, ld)
            currGradContrib = currGradContrib.sum(1)
            currGrad += currGradContrib

        # Then do the same, but going to the right.
        # Need to fprop on the right side again (last this time).
        # Only use rows that are non-null on the right.
        nonRNullRows = subProbArr[:, 1] > -1
        rd = rSubData[:, nonRNullRows]
        ld = lSubData[:, nonRNullRows]
        lOut = self.layerModel.fprop(ld)
        rOut = self.layerModel.fprop(rd)
        sigOut = sigmoid(rOut - lOut)
        d_outputs = sigOut
        # Was `len(d_outputs > 0)`: same truth value, misplaced parenthesis
        if len(d_outputs) > 0 and d_outputs.shape[1] > 0:
            self.layerModel.bprop(d_outputs, rd)
            currGradContrib = self.layerModel.grad(d_outputs, rd)
            currGradContrib = currGradContrib.sum(1)
            currGrad += currGradContrib

    # NOTE(review): the visible original ended without returning the
    # accumulated gradient - confirm no further terms (e.g. weight decay)
    # were meant to be added before this point.
    return currGrad
def _add_lifespan_features(self):
    """Add the 'Birth' lifespan feature using the Michigan data options."""
    log.info('Adding lifespan features...')
    # Keyword options for the Michigan variant of the feature lookup
    birth_options = {
        'dayBins': [],
        'features': "pre",
        'is_Michigan_data': True,
        'clinicalItemType': None,
        'clinicalItemTime': 'Birth',
        'tableName': 'pt_info',
    }
    self._factory.addClinicalItemFeatures(['Birth'], **birth_options)
def setUp(self):
    """Prepare state for test cases.

    Creates the test source table, inserts ten access log fixture rows for
    the users 'S-7' and 'S-4', records the integer form of each inserted
    user ID in self.testUserIDs, and builds the converter under test.
    """
    DBTestCase.setUp(self)

    log.info(
        "Populate the database with test data (Assumes MySQL data structure)"
    )
    createStatement = (
        "create table %s "
        "( USER_ID varchar(255), "
        "USER_NAME varchar(255), "
        "DE_PAT_ID bigint, "
        "ACCESS_DATETIME datetime, "
        "METRIC_ID integer, "
        "METRIC_NAME text, "
        "LINE_COUNT integer, "
        "DESCRIPTION text, "
        "METRIC_GROUP_NUM integer, "
        "METRIC_GROUP_NAME text "
        "); "
    )
    DBUtil.execute(createStatement % TEST_SOURCE_TABLE)

    self.testUserIDs = list()
    headers = [
        "user_id", "user_name", "de_pat_id", "access_datetime", "metric_id",
        "metric_name", "line_count", "description", "metric_group_num",
        "metric_group_name"
    ]
    # One list per fixture row, in header order
    fixtureRows = [
        ['S-7', 'CJ', None, '2013-10-14 08:44:47', '33006', 'ME_IBGLANCE', '1', 'IN BASKET GLANCE PLUGIN ACCESSED IN RADAR', '33000', 'Radar'],
        ['S-7', 'CJ', '3289034', '2014-03-20 00:40:18', '34127', 'IP_ORDERSSECTION', '1', 'Inpatient Orders section opened ', '17001', 'PATIENT CLINICAL INFO'],
        ['S-7', 'CJ', None, '2014-01-01 10:10:56', '20008', 'AC_IB_CREATEMSG', '1', 'In Basket message of any type created.', '20000', 'In Basket Report'],
        ['S-7', 'CJ', None, '2014-01-01 10:10:56', '20008', 'AC_IB_CREATEMSG', '2', '(Created messages counted.) ', '20000', 'In Basket Report'],
        ['S-7', 'CJ', '1853397', '2013-06-29 11:25:02', '20075', 'AC_DOCUMENTLIST_SUBC', '1', 'Prelude Documents list accessed for patient.', None, None],
        ['S-4', 'AB', '3133593', '2013-10-22 06:46:29', '17008', 'MR_REPORTS', '1', 'A report with patient data accessed.', '17001', 'PATIENT CLINICAL INFO'],
        ['S-4', 'AB', '3047429', '2014-03-16 20:56:54', '17016', 'MR_RESULTS_REVIEW', '1', 'Results Review activity accessed.', '17002', 'Patient Chart Review'],
        ['S-4', 'AB', '3408732', '2014-04-08 08:47:38', '17016', 'MR_RESULTS_REVIEW', '1', 'Results Review activity accessed.', '17002', 'Patient Chart Review'],
        ['S-4', 'AB', None, '2014-02-26 19:27:48', '34140', 'IP_SYSTEM_LIST', '1', 'Inpatient system list accessed.', '20001', 'PATIENT DEMOGRAPHICS'],
        ['S-4', 'AB', '2487184', '2013-10-11 08:45:46', '17008', 'MR_REPORTS', '1', 'A report with patient data accessed.', '17001', 'PATIENT CLINICAL INFO'],
    ]
    dataModels = [RowItemModel(rowValues, headers) for rowValues in fixtureRows]

    for dataModel in dataModels:
        (dataItemId, isNew) = DBUtil.findOrInsertItem(
            TEST_SOURCE_TABLE, dataModel, retrieveCol="user_id")
        # Trim leading S and parse remainder as an integer
        self.testUserIDs.append(int(dataItemId[1:]))

    self.converter = ProviderRotationConversion()  # Instance to test on
    self.converter.sourceTableName = TEST_SOURCE_TABLE
def _add_race_features(self):
    """Add one clinical item feature for each of the fixed race labels."""
    log.info('Adding race features...')
    race_labels = (
        "RaceWhiteHispanicLatino",
        "RaceWhiteNonHispanicLatino",
        "RaceHispanicLatino",
        "RaceBlack",
        "RaceAsian",
        "RacePacificIslander",
        "RaceNativeAmerican",
        "RaceOther",
        "RaceUnknown",
    )
    for race_label in race_labels:
        self._factory.addClinicalItemFeatures(
            [race_label], dayBins=[], features="pre")
def disconnect_hostNotAllowed(self, msg):
    """Send a "Host Not Allowed To Connect" disconnect packet to the client.

    Args:
        msg: String sent along as the disconnection message.
    """
    notice = "Disconnecting {0}".format(self.ip)
    log.info(notice)
    reason = transport.DISCONNECT_HOST_NOT_ALLOWED_TO_CONNECT
    self.transport.sendDisconnect(reason, msg)
def runStats(self):
    """Convenience method to log accuracy and RMSE of the pair classifier.

    Applies the classifier, takes the sigmoid of the difference between the
    left (column 0) and right (column 1) outputs of each pair in
    self.probArr, and scores it against a target of 1.
    """
    scores = self.classifier.apply(self.fDictList)
    leftScores = scores[self.probArr[:, 0]]
    rightScores = scores[self.probArr[:, 1]]
    pairProb = sigmoid(leftScores - rightScores)

    # All targets are 1
    stats = (accuracy(pairProb, 1), rmse(pairProb, 1))
    log.info('theAcc : %.4f, theRMSE : %.4f' % stats)
def _add_sex_features(self):
    """Add one feature per sex label ("Male", then "Female").

    The factory method used depends on LocalEnv.DATASET_SOURCE_NAME:
    'STRIDE' uses addClinicalItemFeatures, anything else uses the UMich
    variant.
    """
    log.info('Adding sex features...')
    for sex_label in ("Male", "Female"):
        if LocalEnv.DATASET_SOURCE_NAME != 'STRIDE':
            # e.g. LocalEnv.DATASET_SOURCE_NAME == 'UMich'
            self._factory.addClinicalItemFeatures_UMich(
                [sex_label], dayBins=[], features="pre",
                clinicalItemType='GenderName', clinicalItemTime=None,
                tableName="demographics")
            continue
        # TODO (carried over from the original STRIDE branch)
        self._factory.addClinicalItemFeatures(
            [sex_label], dayBins=[], features="pre")
def _add_race_features(self):
    """Add one feature per entry of self._factory.RACE_FEATURES, using the
    Michigan data options."""
    log.info('Adding race features...')
    # Keyword options shared by every call
    race_options = {
        'dayBins': [],
        'features': "pre",
        'is_Michigan_data': True,
        'clinicalItemType': 'RaceName',
        'clinicalItemTime': None,
        'tableName': 'demographics',
    }
    for race_name in self._factory.RACE_FEATURES:
        # Fresh dayBins list per call, as in a literal `dayBins=[]`
        call_options = dict(race_options, dayBins=[])
        self._factory.addClinicalItemFeatures([race_name], **call_options)
def runStats(self):
    """Convenience method to log accuracy and RMSE of the pair classifier.

    Each pair in self.probArr is scored as the sigmoid of (left output minus
    right output), where column 0 indexes the left item and column 1 the
    right one; the target for every pair is 1.
    """
    allOutputs = self.classifier.apply(self.fDictList)
    margin = allOutputs[self.probArr[:, 0]] - allOutputs[self.probArr[:, 1]]
    pairProb = sigmoid(margin)

    pairAccuracy = accuracy(pairProb, 1)
    pairRMSE = rmse(pairProb, 1)
    log.info('theAcc : %.4f, theRMSE : %.4f' % (pairAccuracy, pairRMSE))
def processFeatureDictList(self, featureReader):
    """Given a feature dict reader, calculate a min/range dict over its features.

    Every feature dict from the reader is folded into a running min/max
    dict via self.updateMinMaxDict; the result is then converted with
    self.convertMinMaxDictToMinRange.

    Args:
        featureReader: Iterable yielding one feature dict per row.

    Returns:
        The value returned by self.convertMinMaxDictToMinRange.
    """
    log.info('Beginning processing a featDict')
    minMaxDict = {}
    progress = ProgressDots()
    # The row index and the idList / fDictList accumulators of the original
    # were never read, so the loop iterates the reader directly.
    for fDict in featureReader:
        minMaxDict = self.updateMinMaxDict(minMaxDict, fDict)
        progress.Update()
    progress.PrintStatus()

    return self.convertMinMaxDictToMinRange(minMaxDict)
def __generalTest(self, featDictList, probArr, archModel):
    """Train a pairwise classifier and assert that training lowers the RMSE.

    Builds a ``PairMonteFeatDictClassifier`` from the arguments, logs
    (cost, accuracy, RMSE, decay) before and after ``classifier.train()``,
    and fails the test unless the post-training RMSE is strictly smaller.

    featDictList -- feature dicts handed to the classifier and to apply().
    probArr      -- 2-column index array; column 0 / column 1 select the
                    left / right output of each pair.
    archModel    -- architecture/model argument passed to the classifier.
    """
    classifier = PairMonteFeatDictClassifier(archModel, featDictList, probArr)
    classifier.setupModels()

    def evaluate(costScale):
        """Return (cost, accuracy, rmse, decayContrib) for the current params."""
        currOut = classifier.apply(featDictList)
        lData = currOut[probArr[:, 0]]
        rData = currOut[probArr[:, 1]]
        outputs = sigmoid(lData - rData)
        # Clamp into [OFFSET_EPSILON, 1 - OFFSET_EPSILON] before taking log().
        outputs = where(outputs < OFFSET_EPSILON, OFFSET_EPSILON, outputs)
        outputs = where(outputs > 1-OFFSET_EPSILON, 1-OFFSET_EPSILON, outputs)
        # Assuming every target is 1, the cross-entropy is -sum(log(outputs)).
        decayContrib = classifier.l2decay * (classifier.params**2).sum()
        cost = costScale * -(log(outputs)).sum() + decayContrib
        return cost, accuracy(outputs, 1), rmse(outputs, 1), decayContrib

    # NOTE(review): the pre-training cost is scaled by 2 while the
    # post-training cost is not, so the two logged costs are not directly
    # comparable. Kept as in the original; confirm whether the 2x is intended.
    currCost, currAcc, beginRMSE, decayContrib = evaluate(2)
    myLogger.info('(Cost, Acc, RMSE, decay) before training: (%.4f, %.4f, %.4f, %.4f)' % (currCost, currAcc, beginRMSE, decayContrib))

    classifier.train()

    currCost, currAcc, currRMSE, decayContrib = evaluate(1)
    myLogger.info('(Cost, Acc, RMSE, decay) after training: (%.4f, %.4f, %.4f, %.4f)' % (currCost, currAcc, currRMSE, decayContrib))

    # assertTrue replaces the deprecated assert_ alias (removed in Python 3.12).
    self.assertTrue(currRMSE < beginRMSE, 'RMSE did not decrease.')
def postEpochCall(self, epoch):
    """Compute end-of-epoch statistics and record the epoch's cost.

    The cross-entropy cost of the clamped outputs against ``self.targetArr``
    is appended to ``self.costTrajectory``. When ``self.epochlog`` is set,
    the cost, L2 decay contribution, accuracy and RMSE are also logged. The
    decay contribution is only reported, not added to the recorded cost.
    """
    preds = self.apply(self.fDictList)[self.idxArr]
    # Clamp into [OFFSET_EPSILON, 1 - OFFSET_EPSILON] before taking logs.
    preds = where(preds < OFFSET_EPSILON, OFFSET_EPSILON, preds)
    preds = where(preds > 1-OFFSET_EPSILON, 1-OFFSET_EPSILON, preds)

    targets = self.targetArr
    logLik = multiply(targets, log(preds)) + multiply(1 - targets, log(1-preds))
    epochCost = -logLik.sum()
    l2Term = self.totalL2Decay * (self.params**2).sum()

    epochAcc = accuracy(preds, targets)
    epochRMSE = rmse(preds, targets)
    if self.epochlog:
        myLog.info('Epoch %d, curr Cost: %f, decayCont: %f ' % (epoch, epochCost, l2Term))
        myLog.info('Epoch %d, theAcc: %f, theRMSE: %f' % (epoch, epochAcc, epochRMSE))

    self.costTrajectory.append(epochCost)
def ssh_USERAUTH_REQUEST(self, packet):
    """
    This method is called when a packet is received.
    The client has requested authentication.  Payload:
        string user
        string next service
        string method
        [authentication specific data]

    Flow: count the packet, drop bad usernames, reset the per-user auth
    state when the user or service changed, throttle clients that retry
    too often, then dispatch to the known-user or new-user handler.
    Returns whatever the chosen handler returns, or None when the request
    is dropped (bad username or throttled).
    """
    self.packet_count += 1
    user, nextService, method, rest = getNS(packet, 3)
    if self.isBadUsername(user):
        return

    first = False
    if self.state is None or self.state.is_invalid(user, nextService):
        # If username or desired service has changed during auth,
        # the RFC says we must discard all state.
        self.state = UserAuthState(self, user, nextService)
        # We do keep track of how many state changes there have been.
        # This is used to thwart bots.
        self.state_changes += 1
        self.firstContact()
        first = True

    log.debug("Auth request for user {0}, service {1}, method {2}.".format(user, nextService, method))

    if self.state_changes > 3 or self.packet_count > 20:
        log.info("Disconnecting user: too many attempts")
        self.disconnect_hostNotAllowed("You are doing that too much!")
        # Stop here: a throttled request must not go on to be evaluated
        # by the auth handlers below.
        return

    if first and self.state.user_is_known:
        # Offer password auth to known users, but only list it once even
        # if the auth state is reset several times on this connection.
        if "password" not in self.supportedAuthentications:
            self.supportedAuthentications.append("password")

    if method == "none":
        # We want to push the user through keyboard-interactive.
        # This lets the client know what methods we do support.
        return self.send_authFail()

    if self.state.user_is_known:
        # Username is known to us! Do normal login.
        return self.handle_known_user(method, rest)
    # This user is not known to us.
    return self.handle_new_user(method, rest)
def __generalTest(self, featDictList, targArr, archModel, idxArr):
    """Train a classifier and assert that training lowers the RMSE.

    Builds a ``MonteFeatDictClassifier`` from the arguments, logs
    (cost, accuracy, RMSE, decay) before and after ``classifier.train()``,
    logs the final parameters, and fails the test unless the post-training
    RMSE is strictly smaller than the pre-training RMSE.

    featDictList -- feature dicts handed to the classifier and to apply().
    targArr      -- target values the outputs are scored against.
    archModel    -- architecture/model argument passed to the classifier.
    idxArr       -- index array passed through to the classifier.
    """
    classifier = MonteFeatDictClassifier(archModel, featDictList, targArr, idxArr)
    classifier.setupModels()

    def evaluate():
        """Return (cost, accuracy, rmse, decayContrib) for the current params."""
        out = classifier.apply(featDictList)
        # Cross-entropy against targArr plus the L2 decay term.
        crossEntropy = -(multiply(targArr, log(out)) + multiply(1 - targArr, log(1-out))).sum()
        decayContrib = classifier.l2decay * (classifier.params**2).sum()
        return crossEntropy + decayContrib, accuracy(out, targArr), rmse(out, targArr), decayContrib

    currCost, currAcc, currRMSE, decayContrib = evaluate()
    myLogger.info('(Cost, Acc, RMSE, decay) before training: (%.4f, %.4f, %.4f, %.4f)' % (currCost, currAcc, currRMSE, decayContrib))

    classifier.train()

    fCost, fAcc, fRMSE, decayContrib = evaluate()
    myLogger.info('(Cost, Acc, RMSE, decay) after training: (%.4f, %.4f, %.4f, %.4f)' % (fCost, fAcc, fRMSE, decayContrib))
    myLogger.info('Final params : %s' % pprint.pformat(classifier.params))

    # assertTrue replaces the deprecated assert_ alias (removed in Python 3.12).
    self.assertTrue(fRMSE < currRMSE, 'RMSE did not decrease.')
def postEpochCall(self, epoch):
    """Compute end-of-epoch statistics and record the epoch's cost.

    When ``self.saveparams`` is set, a snapshot of the parameters is
    appended to ``self.paramHistory``. The pairwise cross-entropy cost
    (all targets taken as 1) is appended to ``self.costTrajectory``. When
    ``self.epochlog`` is set, the cost, L2 decay contribution, accuracy and
    RMSE are also logged. The decay contribution is only reported, not
    added to the recorded cost.
    """
    if self.saveparams:
        # Slicing an ndarray with [:] yields a view, so the history entries
        # would all track the live parameters; store an independent copy.
        self.paramHistory.append(self.params.copy())

    outputs = self.apply(self.fDictList)
    lOut = outputs[self.problemArr[:, 0]]
    rOut = outputs[self.problemArr[:, 1]]
    sigOut = sigmoid(lOut - rOut)
    # Clamp into [OFFSET_EPSILON, 1 - OFFSET_EPSILON] before taking log().
    sigOut = where(sigOut < OFFSET_EPSILON, OFFSET_EPSILON, sigOut)
    sigOut = where(sigOut > 1-OFFSET_EPSILON, 1-OFFSET_EPSILON, sigOut)

    # Cross-Entropy
    # NOTE here that all targs are 1.
    error = log(sigOut)
    currCost = -error.sum()
    decayContrib = self.totalL2Decay * (self.params**2).sum()

    theAcc = accuracy(sigOut, 1)
    theRMSE = rmse(sigOut, 1)
    if self.epochlog:
        myLog.info('Epoch %d, curr Cost: %f, decayCont: %f ' % (epoch, currCost, decayContrib))
        myLog.info('Epoch %d, theAcc: %f, theRMSE: %f' % (epoch, theAcc, theRMSE))

    self.costTrajectory.append(currCost)
def handle_new_user(self, method, rest):
    """
    Dispatch an auth attempt from a username we have no record of.

    publickey            -> remember the offered key via store_pubkey
    keyboard-interactive -> start the interactive state machine
    password             -> disconnect and ban the host
    anything else        -> reply with an auth failure

    NOTE(review): only the fallback case sends an auth failure from here;
    presumably store_pubkey / begin_interactive produce their own replies --
    confirm against those methods.
    """
    if method == "publickey":
        # Store their pubkeys so they can use one to register with us.
        log.debug( "Pubkey attempt" )
        self.store_pubkey(rest)
        return

    if method == "keyboard-interactive":
        log.debug( "Interactive attempt")
        # Start up the keyboard-interactive state machine.
        # This will take care of asking questions.
        self.state.begin_interactive()
        return

    if method == "password":
        # We told this client we don't support passwords
        # but they are ignoring us. Probably a bot.
        log.info("Disconnecting user: illegal password attempt")
        self.disconnect_noAuthAllowed("This auth method is not allowed")
        self.transport.factory.banHost(self.ip)
        return

    # No idea what this is, but we don't support it.
    log.debug( "Unknown {0} attempt".format(method) )
    self.send_authFail()
def balanceIdList(self, idList, balanceByCol=2):
    """Yield the rows of idList, repeating non-zero-labelled rows to offset imbalance.

    Generator. Rows whose ``int(row[balanceByCol])`` is 0 are yielded once.
    Every other row is yielded once plus ``numTimesToRepeat`` extra times,
    where ``numTimesToRepeat = int(numZeros / numOnes) - 1 + 3`` (the +3 is
    a deliberate over-weighting kept from the original).

    idList       -- iterable of indexable rows; one-shot iterators are
                    buffered because two passes are needed.
    balanceByCol -- index of the label column within each row.

    If there are no non-zero rows at all, the rows are passed through
    unchanged instead of raising ZeroDivisionError.
    """
    # A one-shot iterator would be exhausted by the counting pass below and
    # the yielding pass would then silently produce nothing, so buffer it.
    if iter(idList) is idList:
        idList = list(idList)

    numZeros = 0
    numOnes = 0
    for row in idList:
        if int(row[balanceByCol]) == 0:
            numZeros += 1
        else:
            numOnes += 1

    if numOnes:
        numTimesToRepeat = int(float(numZeros) / numOnes) - 1
        # Artificially up this a little bit.
        numTimesToRepeat += 3
    else:
        # Nothing to repeat, and avoids dividing by zero.
        numTimesToRepeat = 0
    log.info('In balance, will repeat %d times' % numTimesToRepeat)

    for row in idList:
        yield row
        if int(row[balanceByCol]) != 0:
            for _ in range(numTimesToRepeat):
                yield row
def main(self, argv):
    """Callable from Command line.

    argv -- full argument vector including the program name; ``sys.argv``
            is used when None. Expects three positional arguments
            (mapFile inFile outFile); otherwise prints the help text and
            exits with status 2.

    Steps: load the feature-key map, make one pass over the gzipped inFile
    to compute normalization params (optionally saved via --pklfile), then
    write the normalized inFile to outFile. If both --testFile and
    --outTestFile are given, the test file is normalized with the same
    params.
    """
    if argv is None:
        argv = sys.argv
    usageStr = "usage: %prog [options] mapFile inFile outFile\n"
    parser = OptionParser(usage=usageStr)
    parser.add_option('-p', '--pklfile', dest='pklfile', default=None,
                      help='Set to save the normalization params')
    parser.add_option('-t', '--testFile', dest='testFile', default=None,
                      help='Set to load in this test file to normalize as well.')
    parser.add_option('-o', '--outTestFile', dest='outTestFile', default=None,
                      help='Where to save the outTestFile')
    parser.add_option('-w', '--logwarning', dest='logwarning', default=False, action='store_true',
                      help='Log warnings about features missing from maps/data')
    (options, args) = parser.parse_args(argv[1:])

    if len(args) != 3:
        parser.print_help()
        sys.exit(2)

    mapFile, inFile, outFile = args
    self.logwarning = options.logwarning
    self.loadFeatKeyToColMap(mapFile)

    # First read through the original file and calc the params
    log.info('Calculating the norm params on %s' % inFile)
    ifs = gzip.open(inFile)
    try:
        normParamsDict = self.processFeatureDictList(FeatureDictReader(ifs))
    finally:
        ifs.close()

    # Save the params
    if options.pklfile is not None:
        self.saveNormalizationParams(normParamsDict, options.pklfile)

    # Then process the inFile
    log.info('About to normalize : %s' % outFile)
    self._normalizeFile(inFile, outFile, normParamsDict)

    # then if the -t and -o options were set do the same on the test Data
    if options.testFile is not None and options.outTestFile is not None:
        log.info('About normalize : %s' % options.testFile)
        self._normalizeFile(options.testFile, options.outTestFile, normParamsDict)


def _normalizeFile(self, inPath, outPath, normParamsDict):
    """Normalize each feature dict read from gzipped inPath and write it to gzipped outPath.

    Both files are closed even if reading, normalizing or writing raises.
    """
    ifs = gzip.open(inPath)
    try:
        ofs = gzip.open(outPath, 'w')
        try:
            reader = FeatureDictReader(ifs)
            writer = FeatureDictWriter(ofs)
            progress = ProgressDots()
            for idx, fDict in self.normalizeFeatDictList(reader, normParamsDict, self.featKeyToColMap):
                writer.update(fDict, str(idx))
                progress.Update()
            progress.PrintStatus()
        finally:
            ofs.close()
    finally:
        ifs.close()
def train(self):
    """Run up to ``self.numEpochs`` training epochs and return ``self.costTrajectory``.

    In batch mode each epoch is a single trainer step over all of
    ``self.problemArr``. Otherwise the rows are shuffled and fed to the
    trainer in ``numOnlineRuns`` chunks. If any row has -1 in either
    column, each chunk is stratified so it mixes rows with no -1, rows
    with a left -1 and rows with a right -1.

    After every epoch ``postEpochCall`` runs, convergence is tested when
    ``self.checkconverge`` is set (std of the last ``nconvergesteps`` costs
    below ``costEpsilon`` returns early), and ``self.callback`` is invoked
    if present. A KeyboardInterrupt stops training and returns the
    trajectory collected so far.
    """
    # Set some things up if doing online
    # will only calculate the gradient on the left hand side at a time
    # So make a target array and make sure we look at each ordered pair in both orders
    numOnlineRuns = int((self.problemArr.shape[0])/float(self.onlineChunkSize)) + 1
    theIdx = arange(0, self.problemArr.shape[0])
    theStep = int(len(theIdx) / numOnlineRuns) + 1

    ## Check if we have some null rows.  If we do, then stratify the rows trained on in each
    ## online step
    ## We don't want a complete step of left -1's or right -1's.
    isNullRows = (self.problemArr[:,0] == -1) | (self.problemArr[:, 1] == -1)
    isNullRows = any(isNullRows)
    if isNullRows:
        myLog.info('Have -1 in problemArr, stratifying the online step data')
        nNullIdx = theIdx[(self.problemArr[:,0] != -1) & (self.problemArr[:,1] != -1)]
        lNullIdx = theIdx[self.problemArr[:,0] == -1]
        rNullIdx = theIdx[self.problemArr[:,1] == -1]

        nNullStep = int(len(nNullIdx)/numOnlineRuns) + 1
        lNullStep = int(len(lNullIdx)/numOnlineRuns) + 1
        rNullStep = int(len(rNullIdx)/numOnlineRuns) + 1

    # Bound up front so the interrupt handler can always report it.
    iEpoch = 0
    try:
        for iEpoch in range(self.numEpochs):
            if self.batch:
                self.trainer.step(self.fDictList, self.problemArr)
            else:
                # Want to balance each of the chunks used in the online learning.
                if isNullRows:
                    shuffle(nNullIdx)
                    shuffle(lNullIdx)
                    shuffle(rNullIdx)
                else:
                    shuffle(theIdx)

                for iOnlineRun in range(numOnlineRuns):
                    if isNullRows:
                        nNullStart = iOnlineRun * nNullStep
                        nNullEnd = nNullStart + nNullStep
                        lNullStart = iOnlineRun * lNullStep
                        lNullEnd = lNullStart + lNullStep
                        rNullStart = iOnlineRun * rNullStep
                        rNullEnd = rNullStart + rNullStep

                        subProbArr = concatenate((self.problemArr[nNullIdx[nNullStart:nNullEnd], :],
                                                  self.problemArr[lNullIdx[lNullStart:lNullEnd], :],
                                                  self.problemArr[rNullIdx[rNullStart:rNullEnd], :]))
                    else:
                        rowStart = iOnlineRun * theStep
                        rowEnd = rowStart + theStep
                        subIdx = theIdx[rowStart:rowEnd]
                        subProbArr = self.problemArr[subIdx, :]
                    self.trainer.step(self.fDictList, subProbArr)

            myLog.debug('About to call cost in postEpoch call')
            self.postEpochCall(iEpoch)

            # Test for convergence
            if self.checkconverge and len(self.costTrajectory) > self.nconvergesteps:
                if std(self.costTrajectory[-self.nconvergesteps:]) < self.costEpsilon:
                    myLog.critical('Convergence after Epoch %d!!' % iEpoch)
                    return self.costTrajectory

            if self.callback is not None:
                self.callback(self)

        myLog.critical('Never completely converged after %d epochs!' % self.numEpochs)
    except KeyboardInterrupt:
        # The trajectory may still be empty if the interrupt arrives during
        # the first epoch; report NaN rather than raising IndexError from
        # inside the handler and masking the interrupt.
        if len(self.costTrajectory) > 0:
            lastCost = self.costTrajectory[-1]
        else:
            lastCost = float('nan')
        myLog.critical('Interrupted with Keyboard after %d epochs, stopping here, currCost = %f' %
                       (iEpoch, lastCost))
    return self.costTrajectory
def firstContact(self):
    """
    Called the first time a user sends us a userauth request.

    Logs the username from the current auth state, prefixed with "Known"
    or "Unknown" according to ``self.state.user_is_known``.
    """
    if self.state.user_is_known:
        familiarity = "Known"
    else:
        familiarity = "Unknown"
    message = "{0} user {1} is authenticating".format(familiarity, self.state.username)
    log.info(message)