def main():
    """Build the ordered-claims HMM tables in memory and write them to buildResults/.

    Steps, in order:
      1. extract the claims CSV from its zip archive (utils decides whether
         extraction is needed),
      2. create the CPT -> CCS dictionary,
      3. build the model dictionaries for the ageGender filtering type,
      4. write the emission table, the training transition table and the
         gold-standard test file.
    """
    # locations of the input data
    extractionDir = "transformed"
    zippedClaimsPath = "transformed/ClaimDetailDependent.zip"
    claimsCsvPath = "transformed/ClaimDetailDependent.csv"

    # output file name templates; %s is filled with the filtering type
    trainTransitionsTemplate = "buildResults/%strainTransitions.csv"
    # NOTE(review): no test-transition file is written in this function
    testTransitionsTemplate = "buildResults/%stestTransitions.csv"
    emissionsTemplate = "buildResults/%sEmissions.csv"
    goldStandardTemplate = "buildResults/%sGoldStandard.json"

    # CSV header rows
    transitionHeader = ["From_CPT", "To_CPT", "Probability"]
    emissionHeader = ["CPT", "Total_Amount", "Probability"]

    # extract data, if it doesn't exist
    print("extracting claims data - if it doesn't exist already")
    utils.extractFileIfNotExists(zippedClaimsPath, claimsCsvPath, extractionDir)

    # ensure the cptToCcs dictionary exists
    print("buildint cptToCss dictionary - if needed")
    cptLookup = cptToCcs.createCptToCcsDictionary()

    # To upload the data to the database again, re-enable:
    #   utils.loadClaimData(claimsCsvPath, cptLookup)
    #   utils.saveClaimDetailDependent()

    hmmBuilder = orderedClaimsHmmBuilder.OrderedClaimsHmmBuilder(claimsCsvPath, cptLookup)

    # Only one filtering type is built; looping over utils.filteringTypes
    # was disabled in favour of ageGender.
    filteringType = utils.ageGender
    print("building the models and gold standard test file with %s" % (filteringType))
    builtModels = hmmBuilder.build(filteringType)

    # save to file: index 0 goes to the emissions CSV, index 1 to the
    # training transitions CSV, index 3 to the gold standard file
    print("saving the models and gold standard test file to buildResults/")
    csvOutputs = (
        (0, emissionHeader, emissionsTemplate),
        (1, transitionHeader, trainTransitionsTemplate),
    )
    for position, header, template in csvOutputs:
        utils.createCsvFromMarkovDict(builtModels[position], header, (template % (filteringType)))
    utils.createGoldStandardFile(builtModels[3], (goldStandardTemplate % (filteringType)))
def main():
    """Build HMM transition/emission tables and gold-standard paths with Spark.

    Pipeline:
      1. extract the claims CSV from its zip archive (utils decides whether
         extraction is needed),
      2. map every CSV row to a record keyed by (memberId, dependentId) and
         group the records per person,
      3. walk each person's records in (serviceDate, ccsCode) order, emitting
         transition and emission counts for people kept in the model and a
         gold-standard path for people held out (decided by utils.isTest()),
      4. turn the counts into conditional probabilities and save the
         transition table, the emission table and the gold-standard paths
         under sparkBuildResults/.

    Each person's records are actually sorted before the walk, and rows whose
    service date cannot be read fall back to a datetime (midnight today) so
    they stay comparable with the parsed datetimes during that sort.
    """
    # NOTE(review): currentYear is not used by any code visible here
    currentYear = int(time.strftime("%Y"))
    currentDir = os.getcwd() + "/"
    resultsDir = "sparkBuildResults/"
    transformedDir = "transformed"
    utilModuleFile = currentDir + "spark/utils.py"
    claimsDetailsOrderedMemberIDDateZipFile = transformedDir + "/ClaimDetailDependent.csv.zip"
    claimsDetailsOrderedMemberIDDateFile = transformedDir + "/ClaimDetailDependent.csv"
    claimDataLocation = currentDir + claimsDetailsOrderedMemberIDDateFile

    # tags stored as the third element of every key so the three kinds of
    # output can be separated again with filter()
    transitionType = "transition"
    emissionType = "emission"
    goldType = "gold"

    # extract zip if it doesn't exist already
    utils.extractFileIfNotExists(
        claimsDetailsOrderedMemberIDDateZipFile,
        claimsDetailsOrderedMemberIDDateFile,
        transformedDir,
    )

    # CSV columns we care about:
    #   0 memberId, 1 dependentId, 2 cptCode, 3 ccsCode, 4 patientAmount,
    #   5 totalAmount, 6 Year, 7 Gender, 11 service date

    def buildTransitionWrapper(gender, year, state):
        """Return the state name: the utils.buildTransition prefix plus `state`."""
        return utils.buildTransition(utils.ageGender, gender, year, "", "") + state

    def mapTransition(row):
        """Map one CSV line to ((memberId, dependentId), record).

        record is (memberId, dependentId, ccsCode, patientAmount, year,
        gender, serviceDate).
        """
        columns = row.split(',')
        memberId = columns[0]
        dependentId = columns[1]
        ccsCode = columns[3]
        patientAmount = columns[4]
        year = columns[6]
        gender = columns[7]

        # Column 11 is expected to hold "<YYYY-MM-DD> <time>". Anything else
        # falls back to today. The fallback must be a datetime, not a date:
        # ordering a date against a datetime raises TypeError, which would
        # break the per-person sort in mapSequences.
        serviceDate = datetime.datetime.combine(date.today(), datetime.time())
        serviceDateList = columns[11].split(" ")
        if len(serviceDateList) == 2:
            serviceDate = datetime.datetime.strptime(serviceDateList[0], "%Y-%m-%d")

        # natural key, value
        return (
            (memberId, dependentId),
            (memberId, dependentId, ccsCode, patientAmount, year, gender, serviceDate),
        )

    def mapSequences(rows):
        """Walk each person's ordered records and emit model counts or a gold path.

        rows yields ((memberId, dependentId), records). For a person kept in
        the model this yields ((previous, current, "transition"), 1) and
        ((previous_current, patientAmount, "emission"), 1) per record;
        otherwise it yields one ((memberId, dependentId, "gold"), path).
        """
        for row in rows:
            # sorted() returns a new list, so the result has to be kept;
            # order by service date, then CCS code
            personRecords = sorted(row[1], key=lambda x: (x[6], x[2]))
            startState = "START"
            previous = startState
            # presumably utils.isTest() picks the held-out people; verify in utils
            keepInModel = not utils.isTest()
            goldStandard = []

            for item in personRecords:
                # handle transition: state built from gender, year and CCS code
                current = buildTransitionWrapper(item[5], item[4], item[2])

                # handle emission
                emissionKey = previous + "_" + current
                patientAmount = item[3]

                if keepInModel:
                    yield ((previous, current, transitionType), 1)
                    yield ((emissionKey, patientAmount, emissionType), 1)
                else:
                    goldStandard.append(
                        (previous, current, emissionKey, patientAmount, item[4], item[5])
                    )

                previous = current

            # push gold standard, if it exists
            if not keepInModel:
                yield ((row[0][0], row[0][1], goldType), goldStandard)

    # should only be processing transition and emission
    def sumConditionalTotals(rows):
        """Turn ((condition, outcome, type), ones) into (condition, (outcome, count))."""
        for row in rows:
            # every grouped value is a 1, so the count is the group size
            total = len(row[1])
            yield (row[0][0], (row[0][1], total))

    # should only be processing transition and emission
    def createProbabilities(rows):
        """Turn (condition, [(outcome, count), ...]) into (condition, (outcome, probability))."""
        for row in rows:
            # denominator of the conditional probability
            total = sum(item[1] for item in row[1])

            # yield each new probability
            for item in row[1]:
                yield (row[0], (item[0], float(item[1]) / total))

    sc = SparkContext("local", "Healthcare Hidden Markov Models", pyFiles=[utilModuleFile])

    claimData = sc.textFile(claimDataLocation)
    claimDataMappedByMemberIdDependentId = claimData.map(mapTransition).groupByKey()
    transitionEmissionGoldData = (
        claimDataMappedByMemberIdDependentId.mapPartitions(mapSequences).groupByKey()
    )

    # transition
    transitionTotals = (
        transitionEmissionGoldData
        .filter(lambda row: row[0][2] == transitionType)
        .mapPartitions(sumConditionalTotals)
        .groupByKey()
    )
    transitionProbs = transitionTotals.mapPartitions(createProbabilities).groupByKey()
    transitions = transitionProbs.flatMapValues(lambda row: row)
    transitions.saveAsTextFile(resultsDir + "transitionDictionary")

    # emission
    emissionTotals = (
        transitionEmissionGoldData
        .filter(lambda row: row[0][2] == emissionType)
        .mapPartitions(sumConditionalTotals)
        .groupByKey()
    )
    emissionProbs = emissionTotals.mapPartitions(createProbabilities).groupByKey()
    emissions = emissionProbs.flatMapValues(lambda row: row)
    emissions.saveAsTextFile(resultsDir + "emissionDictionary")

    # gold test
    goldFiles = (
        transitionEmissionGoldData
        .filter(lambda row: row[0][2] == goldType)
        .flatMapValues(lambda row: row)
    )
    goldFiles.saveAsTextFile(resultsDir + "goldFiles")

    # TODO: finish prediction
    def handleGoldFileRow(row):
        """Print every step of one gold-standard path (prediction not implemented yet)."""
        print("processing gold files")
        # each step is (previous, current, emissionKey, patientAmount, year, gender)
        for transition in row[1]:
            print(transition)

    goldFiles.foreach(handleGoldFileRow)