def createCurrentState(self, previousState, currentState, row, buildType):
    """Return the state label to use for the current row.

    ``self.createRxState`` is consulted first; if it returns something
    other than ``currentState`` that value is used as-is.  Otherwise the
    result is ``currentState`` prefixed with whatever
    ``utils.buildTransition`` builds from ``buildType``, ``row[7]`` and
    ``row[6]`` (presumably gender and year columns -- confirm against the
    input file layout).
    """
    rxCandidate = self.createRxState(previousState, currentState)
    if rxCandidate != currentState:
        # createRxState rewrote the state, so its answer wins.
        return rxCandidate

    prefix = utils.buildTransition(buildType, row[7], row[6], '', '')
    return prefix + currentState
def predictNextState(goldFileList, transitionDictionary, emissionDictionary, filterOption):
    """Yield one prediction row for every transition in a gold-standard sequence.

    The first tuple of ``goldFileList`` only seeds the walk: its ``tup[0]``
    becomes the initial "previous" state and nothing is yielded for it.
    For each later tuple the current state is
    ``utils.buildTransition(filterOption, tup[3], tup[2], tup[4], tup[5]) + tup[0]``.

    Yields lists of nine items:
        ``[previous, current, predictedNext, goldAmount, randomAmount,
        frequentAmount, highestAmount, lowestAmount, predictedCorrectly]``
    where ``predictedNext`` is the top entry ``getHighestProb`` returns for
    ``previous`` and ``goldAmount`` is ``float(tup[1])``.  When there is no
    emission entry for ``previous + "_" + predictedNext`` the four amount
    columns are ``0``.  When ``previous`` is not in ``transitionDictionary``
    the row is ``["error " + previous, current, "error", ...]``.
    """
    previous = ""
    for tup in goldFileList:
        # The first entry is the start state; there is nothing to predict yet.
        if previous == "":
            previous = tup[0]
            continue

        current = utils.buildTransition(filterOption, tup[3], tup[2], tup[4], tup[5]) + tup[0]

        # `in` replaces dict.has_key(), which no longer exists in Python 3.
        if previous not in transitionDictionary:
            # NOTE(review): `previous` is deliberately left untouched here to
            # match the existing behaviour, so every later row is compared
            # against the same unknown state -- confirm this is intended.
            yield ["error " + previous, current, "error", "error", "error",
                   "error", "error", "error", "error"]
            continue

        # get most frequent from previous
        highestProbNext = getHighestProb(transitionDictionary, previous, 1)[0][0]
        emissionKey = previous + "_" + highestProbNext

        if emissionKey in emissionDictionary:
            frequentAmount = getHighestProb(emissionDictionary, emissionKey, 1)[0][0]
            randomAmount = getRandomAmount(emissionDictionary, emissionKey)
            highestAmount = getHighestAmount(emissionDictionary, emissionKey)
            lowestAmount = getLowestAmount(emissionDictionary, emissionKey)
            yield [previous, current, highestProbNext, float(tup[1]),
                   randomAmount, frequentAmount, highestAmount, lowestAmount,
                   (current == highestProbNext)]
        else:
            # No cost information for the predicted transition.
            yield [previous, current, highestProbNext, float(tup[1]),
                   0, 0, 0, 0, (current == highestProbNext)]

        previous = current
def goldFileCheck(goldFileList, transitionDictionary, emissionDictionary, filterOption, maxToTake=100):
    """Walk one gold-standard sequence and compare its cost with the model's.

    The first tuple of ``goldFileList`` supplies the start state
    (``tup[0]``).  For each later tuple the current state is
    ``utils.buildTransition(filterOption, tup[3], tup[2], tup[4], tup[5]) + tup[0]``
    and ``float(tup[1])`` is its gold amount.

    Yields exactly one list:

    * on success ``[path, goldAmount, randomAmount, trainAmount,
      trainLowest, trainHighest]``, where ``path`` is the visited states,
      each preceded by a space, cut to at most 7500 characters, and the
      amounts are sums over every transition in the sequence;
    * on the first transition that is missing from
      ``transitionDictionary`` or whose ``previous + "_" + current`` key is
      missing from ``emissionDictionary``:
      ``["<previous> <current>", "Error", "Error", "Error", "Error", "Error"]``
      and nothing else.

    ``maxToTake`` is not used by this function; it is kept so existing
    callers that pass it keep working.
    """
    previous = ""
    path = ""
    goldAmount = 0
    trainAmount = 0
    trainHighest = 0
    trainLowest = 0
    randomAmount = 0

    for tup in goldFileList:
        # handle start state
        if previous == "":
            previous = tup[0]
            path += " " + previous
            continue

        current = utils.buildTransition(filterOption, tup[3], tup[2], tup[4], tup[5]) + tup[0]
        path += " " + current
        amount = float(tup[1])
        emissionKey = previous + "_" + current

        # `in` replaces dict.has_key(), which no longer exists in Python 3.
        known = (previous in transitionDictionary
                 and current in transitionDictionary[previous]
                 and emissionKey in emissionDictionary)
        if not known:
            # Report the offending pair and stop: no totals are emitted for
            # a sequence that contains an unknown transition.
            yield ["%s %s" % (previous, current), "Error", "Error", "Error", "Error", "Error"]
            return

        goldAmount += amount
        trainAmount += float(getHighestProb(emissionDictionary, emissionKey, 1)[0][0])
        trainHighest += getHighestAmount(emissionDictionary, emissionKey)
        trainLowest += getLowestAmount(emissionDictionary, emissionKey)
        randomAmount += getRandomAmount(emissionDictionary, emissionKey)
        previous = current

    # Slicing is a no-op for short paths, so one yield covers both cases.
    yield [path[:7500], goldAmount, randomAmount, trainAmount, trainLowest, trainHighest]
def predictTrans(goldFileList, transitionDictionary, emissionDictionary, filterOption):
    """Yield, for each gold transition, how it compares with the most likely one.

    The first tuple of ``goldFileList`` only provides the initial state
    (``tup[0]``).  For each later tuple the current state is
    ``utils.buildTransition(filterOption, tup[3], tup[2], tup[4], tup[5]) + tup[0]``.

    Yields lists of six items:
        ``[previous, current, predictedNext,
        transitionDictionary[previous][current],
        transitionDictionary[previous][predictedNext],
        current == predictedNext]``
    where ``predictedNext`` is the top entry ``getHighestProb`` returns for
    ``previous``.  If the ``previous -> current`` transition is not in
    ``transitionDictionary`` the row is
    ``[previous, current, "error", "error", "error", "error"]``.

    ``emissionDictionary`` is accepted but not read by this function.
    """
    previous = ""
    for tup in goldFileList:
        # The first entry is the start state; there is nothing to compare yet.
        if previous == "":
            previous = tup[0]
            continue

        current = utils.buildTransition(filterOption, tup[3], tup[2], tup[4], tup[5]) + tup[0]

        # `in` replaces dict.has_key(), which no longer exists in Python 3.
        seen = previous in transitionDictionary and current in transitionDictionary[previous]
        if not seen:
            # NOTE(review): `previous` is not advanced on this path, matching
            # the existing behaviour -- confirm this is intended.
            yield [previous, current, "error", "error", "error", "error"]
            continue

        # get most frequent from previous
        highestProbNext = getHighestProb(transitionDictionary, previous, 1)[0][0]
        outgoing = transitionDictionary[previous]
        yield [previous, current, highestProbNext, outgoing[current],
               outgoing[highestProbNext], (current == highestProbNext)]
        previous = current
def build(self, buildType):
    """Read the claims CSV at ``self.fileName`` and build the model tables.

    The first CSV row is skipped as a header.  Rows are expected to be
    grouped by member: a change in ``(row[0], row[1])`` (member id,
    dependent id) starts a new sequence.  Columns read per row:

    * ``row[3]`` -- raw code; the state label is
      ``utils.buildTransition(buildType, row[7], row[6], '', '') + row[3]``
      (``row[7]``/``row[6]`` are presumably gender and year -- confirm
      against the file layout);
    * ``row[4]`` -- patient amount, parsed with ``float`` and stored as a
      string.

    For every new member ``self.determineDictionary`` chooses whether the
    member's transitions are recorded in the test or the train dictionary.
    Emissions (keyed ``"<from>_<to>"``) always go into one shared
    dictionary.  Members flagged as test also get a gold-standard list,
    keyed by ``row[0] + row[1]``, that starts with ``(utils.startState, 0)``
    followed by ``(state, amount, row[6], row[7], '', '')`` per row.

    Returns:
        ``(emissionsProb, trainTransitionsProb, testTransitionsProb,
        goldStandard)`` where the first three are the results of
        ``self.buildDict`` on the raw dictionaries.
    """
    # CPT -> cost
    emissions = {}
    # CPT -> CPT
    testTransitions = {}
    trainTransitions = {}
    # gold standard for testing
    goldStandard = {}
    # initial one will be thrown away
    transitions = {}
    isTest = False

    # read all of a member's claims in at once
    # NewMemberID, CPTCode
    currentMemberId = ""
    currentDependentId = ""
    previousCptCode = utils.startState

    # The context manager closes the file even if a row fails to parse.
    # NOTE: 'rb' is the csv convention for Python 2; under Python 3 the csv
    # module needs a text-mode file opened with newline=''.
    with open(self.fileName, 'rb') as csvFile:
        csv_file_object = csv.reader(csvFile)
        # Skip the header row; the default keeps an empty file from raising.
        next(csv_file_object, None)

        for row in csv_file_object:
            rowMemberId = row[0]
            dependentId = row[1]
            rawCode = row[3]
            currentCptCode = utils.buildTransition(buildType, row[7], row[6], '', '') + rawCode
            patientAmount = float(row[4])
            totalAmount = str(patientAmount)
            memberKey = rowMemberId + dependentId

            if rowMemberId != currentMemberId or dependentId != currentDependentId:
                # set final state
                # NOTE(review): the end-state emission uses the amount of the
                # NEW member's first row, the very first row records a
                # start->end emission, and the last member in the file never
                # gets an end-state transition.  Kept as-is to leave the
                # model unchanged -- confirm whether this is intended.
                self.setDict(transitions, previousCptCode, utils.endState)
                self.setDict(emissions, previousCptCode + "_" + utils.endState, totalAmount)

                (transitions, isTest) = self.determineDictionary(testTransitions, trainTransitions)

                # set start state: the shared recording code below then
                # stores the start -> first-code transition for this member.
                previousCptCode = utils.startState
                if isTest:
                    goldStandard[memberKey] = [(utils.startState, 0)]
                currentMemberId = rowMemberId
                currentDependentId = dependentId

            self.setDict(transitions, previousCptCode, currentCptCode)
            self.setDict(emissions, previousCptCode + "_" + currentCptCode, totalAmount)
            if isTest:
                goldStandard[memberKey].append((currentCptCode, totalAmount, row[6], row[7], '', ''))
            previousCptCode = currentCptCode

    # create probabilities out of these now
    emissionsProb = self.buildDict(emissions)
    trainTransitionsProb = self.buildDict(trainTransitions)
    testTransitionsProb = self.buildDict(testTransitions)
    return (emissionsProb, trainTransitionsProb, testTransitionsProb, goldStandard)
def buildTransitionWrapper(gender, year, state):
    """Return ``state`` prefixed with the ``utils.ageGender`` transition.

    The prefix is whatever ``utils.buildTransition`` produces for
    ``utils.ageGender`` with the given ``gender`` and ``year``; the two
    remaining arguments are passed as empty strings.
    """
    prefix = utils.buildTransition(utils.ageGender, gender, year, "", "")
    return prefix + state