def dataProcessing(self, inputFilePath, outputFilePath):
    """
    Given function:
    1. reads the csv file
       (Note: data from https://archive.ics.uci.edu/ml/datasets does not
       provide headers, so we are providing our own headers)
    2. pre-processes it by
       1. removing null values,
       2. scaling data if required,
       3. converting categorical / nominal values into numerical values
    3. writes the data to the csv file and returns the refined dataframe
    """
    #reading csv from url into dataframe
    myIO = MyIO()
    inputDataFrame = myIO.inputCSVFromURL(filePath=inputFilePath)

    # #debug
    # print ('inputDataFrame = {} '.format(inputDataFrame))
    # #debug -ends

    dataProcess = DataPreprocess()

    #creating dummy headers and adding them to dataframe
    headerList = dataProcess.provideHeaders(inputDataFrame=inputDataFrame)
    inputDataFrame.columns = headerList

    #removing null values
    nullRemovedDataFrame = dataProcess.removeNullValues(inputDataFrame = \
                                                            inputDataFrame)

    #converting categorical values into integer values
    numericalDataFrame = dataProcess.categoricalToNumericalConversion(\
                                        dataFrame = nullRemovedDataFrame)

    #scaling integer and float values
    refinedDataFrame = dataProcess.scaleData(inputDataFrame = \
                                                 numericalDataFrame)

    # #debug
    # print ('refinedDataFrame =\n {} '.format(refinedDataFrame))
    # #debug -ends

    #writing refined csv file
    myIO.writeCSV(inputDataFrame = refinedDataFrame, outputFilePath = \
                                                        outputFilePath)

    return refinedDataFrame
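
# A minimal standalone sketch of the three pre-processing steps above,
# assuming plain pandas semantics. The actual DataPreprocess methods are
# not shown in this section, so dropna / factorize / min-max scaling
# stand in for removeNullValues, categoricalToNumericalConversion, and
# scaleData; the real implementations may differ.
import pandas as pd

def preprocessSketch(inputDataFrame):
    #1. removing null values: drop any row containing a NaN
    df = inputDataFrame.dropna().copy()
    #2. converting categorical / nominal columns into integer codes
    for col in df.select_dtypes(include='object').columns:
        df[col] = pd.factorize(df[col])[0]
    #3. min-max scaling of numeric columns into [0, 1]
    #   (constant columns are left at 0 by replacing a 0 range with 1)
    numericCols = df.select_dtypes(include='number').columns
    colMin = df[numericCols].min()
    colRange = (df[numericCols].max() - colMin).replace(0, 1)
    df[numericCols] = (df[numericCols] - colMin) / colRange
    return df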
def preprocessTestingData(self, testingDirPath):
    """
    input: testingDirPath
    output: fileTokenDict, fileActualClassDict

    Given function walks through all files in testingDirPath, finds the
    tokens of each file, and stores them in the form
    key = fileName, value = tokenList
    """
    fileTokenDict = {}
    fileActualClassDict = {}
    for currentRoot, dirs, files in os.walk(testingDirPath):
        #walking through all files in the currentDir
        for currentFile in files:
            #finding file path in current directory and reading its content
            currentFilePath = os.path.join(currentRoot, currentFile)
            myIO = MyIO()
            currentInputStr = myIO.readDoc(docPath=currentFilePath)

            # #debug
            # print("currentInputStr : {}".format(currentInputStr))
            # #debug -ends

            #finding tokens of given file
            fileTokenList = self._tokenizationFilter(
                                                rowStr=currentInputStr)

            #adding given file token list to the file token dict
            # fileTokenDict[currentFile] = list(set(fileTokenList))
            fileTokenDict[currentFile] = fileTokenList

            #assigning actual class value (name of the containing dir)
            fileActualClassDict[currentFile] = currentRoot.split(\
                                                    os.path.sep)[-1]
        #for currentFile -ends
    #for currentRoot, dirs, files -ends
    return fileTokenDict, fileActualClassDict
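
# The _tokenizationFilter used above is not shown in this section; below
# is a minimal sketch of what such a filter might do (lowercasing,
# splitting on non-letter runs, dropping very short tokens). This is an
# assumption — the real filter may differ, e.g. in stop-word handling.
import re

def tokenizationFilterSketch(rowStr):
    #lowercase the raw document text
    rowStr = rowStr.lower()
    #split on any run of non-alphabetic characters
    tokenList = re.split(r'[^a-z]+', rowStr)
    #drop empty strings and one-letter tokens
    return [token for token in tokenList if len(token) > 1]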
def preprocessTrainingData(self, dirPath):
    """
    input: dirPath
    output: classTokenList, uniqueTokenList, nDocsInClassArr, dirNameList

    classTokenList is a list which has a sublist of all tokens for each
        class of the given directory
    uniqueTokenList is a list which has all unique tokens of all classes
        of the given directory
    nDocsInClassArr is a numpy array with the number of documents of
        each class
    dirNameList provides all directory names (which are class names
        here) from the current dirPath (training or testing)

    Given function takes dirPath, walks through all its directories,
    reads their files, tokenizes them, and returns the combined tokens
    of all classes, the unique token list, and the number of documents
    in each class.
    """
    #variables
    classTokenList = []
    generalTokenList = []
    nDocsInClassList = []
    dirNameList = next(os.walk(dirPath))[1]

    # #debug
    # print("dirNameList : {}".format(dirNameList))
    # #debug -ends

    #walking through all internal directories, reading files, finding tokens
    for currentRoot, dirs, files in os.walk(dirPath):
        #finding number of files in given directory and appending to list
        nFiles = len(files)
        nDocsInClassList.append(nFiles)

        #walking through all files in the currentDir
        currentClassTokenList = []
        for currentFile in files:
            #finding file path in current directory and reading its content
            currentFilePath = os.path.join(currentRoot, currentFile)
            myIO = MyIO()
            currentInputStr = myIO.readDoc(docPath=currentFilePath)

            # #debug
            # print("currentInputStr : {}".format(currentInputStr))
            # #debug -ends

            #finding tokens of given file
            fileTokenList = self._tokenizationFilter(
                                                rowStr=currentInputStr)

            #adding given file token list to class token list
            currentClassTokenList.extend(fileTokenList)
            generalTokenList.extend(fileTokenList)
        #for currentFile -ends

        #appending currentClassTokenList to classTokenList
        classTokenList.append(currentClassTokenList)

        # #putting all tokens in one token list
        # generalTokenList.extend(classTokenList)
    #for currentRoot, dirs, files -ends

    # #debug
    # print("generalTokenList : {}".format(generalTokenList))
    # #debug -ends

    uniqueTokenList = list(set(generalTokenList))

    #Assuming that dirPath is a train/test path which contains only the
    #classDirs (no files), and the classDirs contain all the files.
    #The root directory therefore carries no class info, so removing
    #its (empty) entry.
    classTokenList.pop(0)
    # uniqueTokenList.pop(0)
    nDocsInClassList.pop(0)
    nDocsInClassArr = np.array(nDocsInClassList)

    #returning outputs
    return classTokenList, uniqueTokenList, nDocsInClassArr, dirNameList
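
# The outputs above are the usual ingredients of a naive Bayes text
# classifier; as one small illustration (an assumption — the trainer
# itself is not shown in this section), nDocsInClassArr can be turned
# into log class priors like this:
import numpy as np

def logClassPriorsSketch(nDocsInClassArr):
    #P(class) = docs in class / total docs, kept in log space for
    #numerical stability (assumes every class has at least one document)
    return np.log(nDocsInClassArr) - np.log(nDocsInClassArr.sum())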
def myUI(self, w1, h1, w2, h2, name_input, name_output):
    '''
    given function performs the following tasks:
    1. read the image in BGR format
    2. convert the w1, h1, w2, h2 window coordinates into their
       respective pixel coordinates
    3. convert the BGR image into a Luv image
    4. find the histogram of the entire image in the Luv domain, where
       L is in the range of the given window
    5. convert the Luv image back into a BGR image
    6. write the output image
    '''
    # 1. read the image in BGR format
    myIO = MyIO()
    bgrImg = myIO.readImage(name_input)

    # debug
    print("bgrImg =\n {}".format(bgrImg))
    # debug -ends

    # debug
    myIO.showImage(bgrImg, "BGR Image")
    # debug -ends

    # 2. convert w1, h1, w2, h2 window coordinates into pixel coordinates
    W1, H1, W2, H2 = myIO.windowsSizeMapping(inputImage = bgrImg,\
                                             w1 = w1, h1 = h1,\
                                             w2 = w2, h2 = h2)

    # debug
    print("W1 = {}, H1={}, W2={}, H2={}".format(W1, H1, W2, H2))
    # debug -ends

    # 3. convert BGR image into Luv image
    colorProcess = ColorProcess()
    LuvImg = colorProcess.bgrToLuv(bgrImg = bgrImg)

    # debug
    print("-----------------------------------------------------")
    print("\nLuvImg = \n{}".format(LuvImg))
    # debug -ends

    # 4. find histogram of entire image in Luv domain,
    #    where L is in range of given window
    imageProcess = ImageProcess()
    HELuvImg = imageProcess.histogramEqualizationInLuv(LuvImg, W1, H1,
                                                       W2, H2)

    # debug
    print("-----------------------------------------------------")
    print("HELuvImg = \n{}".format(HELuvImg))
    # debug -ends

    # 5. convert Luv image into BGR image
    HEBGRImage = colorProcess.LuvToBGR(LuvImage = HELuvImg)

    # debug
    myIO.showImage(HEBGRImage, "Histogram Equalized BGR Image")
    cv2.waitKey(0)
    # debug -ends

    # debug
    print("-----------------------------------------------------")
    print("HEBGRImage =\n {}".format(HEBGRImage))
    # debug -ends

    # 6. write output image
    myIO.writeImage(outputImage = HEBGRImage, name_output = name_output)
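
# A minimal sketch of L-channel histogram equalization of the kind
# histogramEqualizationInLuv performs. This is an assumption, not the
# actual implementation: it presumes L has been discretized to integers
# in [0, 100] and builds the equalization lookup table from the window
# [H1:H2, W1:W2] only, then applies it to the whole image. The exact
# windowing rules of the real method may differ.
import numpy as np

def equalizeLSketch(LuvImg, W1, H1, W2, H2):
    L = LuvImg[:, :, 0].astype(np.int32)
    #histogram of L values inside the window (rows = H, cols = W)
    hist = np.bincount(L[H1:H2 + 1, W1:W2 + 1].ravel(), minlength=101)
    #cumulative distribution, mapped back onto the L range [0, 100]
    cdf = np.cumsum(hist)
    lut = np.floor(cdf * 101.0 / cdf[-1]).clip(0, 100).astype(np.int32)
    #apply the lookup table to the L channel of the entire image
    out = LuvImg.copy()
    out[:, :, 0] = lut[L]
    return out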
def createNeuralNetwork(self, inputFilePath, trainingPercent, maxItr,
                        nHiddenLayers, nNeurons):
    """
    given function creates a neural network, and displays its weights
    at each level and the accuracy of the network
    0. read dataset
    1. split dataset into training and testing datasets
    2. initialize network
        a. take weights randomly for hidden and output layers
    3. forward propagation
        a. neuron activation: sigma(wi*xi)
        b. neuron transfer: sigmoid function 1/(1+e^(-x))
    4. back propagation
        a. transfer derivative
        b. error back propagation
    5. train network
        a. update weights
    6. predict
    7. find mean square errors
    """
    #0. read dataset
    myIO = MyIO()
    inputDataFrame = myIO.inputProcessedCSV(filePath=inputFilePath)
    headerList = inputDataFrame.columns.values

    #1. split dataset into training and testing datasets
    myUtility = MyUtility()
    trainingDataFrame, testingDataFrame = myUtility.splitDataset(\
                                    inputDataFrame = inputDataFrame,\
                                    trainingPercent = trainingPercent)

    #2. initialize neural network
    uniqueClasses = inputDataFrame['class'].unique()
    numOfUniqueClasses = uniqueClasses.size
    trainingDataArr = trainingDataFrame.values
    testingDataArr = testingDataFrame.values
    trainingAtrArr, trainingClassArr, trainingAtrHeader = \
                            myUtility.segregateAttributesAndClass(\
                                            inputArr = trainingDataArr,\
                                            inputHeader = headerList)
    testingAtrArr, testingClassArr, testingAtrHeader = \
                            myUtility.segregateAttributesAndClass(\
                                            inputArr = testingDataArr,\
                                            inputHeader = headerList)
    nRows, nCols = trainingDataArr.shape
    neuralNetwork = NeuralNetwork(nInputs = nCols-1,\
                                  nHiddenLayers = nHiddenLayers,\
                                  nNeurons = nNeurons,\
                                  nOutputs = numOfUniqueClasses)

    #3-5. forward propagation, back propagation, and training
    # neuralNetwork.findBackwardPropagationError(targetValue = [1,0,0])
    trainingError = neuralNetwork.trainNetwork(\
                                trainingDataArr = trainingDataArr,\
                                nIteration = maxItr,\
                                numOfUniqueClasses = numOfUniqueClasses,\
                                learningRate = 0.5)

    #6. predict
    trainingPredictedOPArr = neuralNetwork.predictDataset(\
                                    testingDataSet = trainingDataArr)
    testingPredictedOPArr = neuralNetwork.predictDataset(\
                                    testingDataSet = testingDataArr)

    #7. find mean square errors
    trainingError = neuralNetwork.meanSquareError(\
                            targetArr = trainingClassArr,\
                            predictedOutputArr = trainingPredictedOPArr)
    testingError = neuralNetwork.meanSquareError(\
                            targetArr = testingClassArr,\
                            predictedOutputArr = testingPredictedOPArr)

    #debug
    print("\nAfter training neural network:\n")
    neuralNetwork.printNeuralNetworkWeights(headerList=trainingAtrHeader)
    print('\ntrainingError = {}'.format(trainingError))
    print('testingError = {} '.format(testingError))
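
# The activation, transfer, and transfer-derivative steps named in the
# docstring above, as standalone sketches. The NeuralNetwork class
# itself is defined elsewhere, so these are illustrations of the stated
# formulas rather than its actual methods.
import numpy as np

def activateSketch(weights, inputs, bias):
    #neuron activation: weighted sum of inputs plus bias, sigma(wi*xi)
    return np.dot(weights, inputs) + bias

def transferSketch(x):
    #neuron transfer: sigmoid function 1/(1+e^(-x))
    return 1.0 / (1.0 + np.exp(-x))

def transferDerivativeSketch(output):
    #derivative of the sigmoid, expressed in terms of its own output,
    #as used when back propagating the error
    return output * (1.0 - output)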
def createAlarmBayes(self, inputParam, queryParam):
    """
    Given function is a UI function which takes inputParam (the
    evidence) and queryParam (the query variables) and, for each query,
    prints the result of exact enumeration, sampling, rejection
    sampling, and likelihood weighting on the alarm Bayes network.
    """
    myIO = MyIO()
    evidences_input = myIO.parse_evidence_input(input_value=inputParam)
    query_params = myIO.parse_query_input(input_value=queryParam)

    # #debug
    # print ('evidences_input = {} '.format(evidences_input))
    print('query_params = {}'.format(query_params))
    # #debug -ends

    alarmBayes = AlarmBayes()

    # work on each query param turn by turn
    for query in query_params:
        # result for query
        print("\n###################################################")
        print("\t RESULT FOR QUERY: {}".format(alarmBayes.find_node(\
                                                query).name.upper()))
        print("###################################################")

        # enumeration result
        enumerationUtil = EnumerationUtil()
        enumerationUtil.result_for_enumeration(query, evidences_input,\
                                               alarmBayes)

        # sampling
        sample_list = [10, 50, 100, 200, 500, 1000, 10000, 100000]
        sample_output = \
            enumerationUtil.result_for_sampling(query, evidences_input,\
                                                alarmBayes, sample_list)

        # rejection sampling
        result_with_sample_rejection = \
            enumerationUtil.result_for_sampling_rejection(query,\
                                evidences_input, alarmBayes, sample_list)

        nSamples = len(sample_list)
        print("\n------------------- sampling (positive samples / total samples) ---------------------------")
        myIO.print_sample_output(sample_output, nSamples)

        print("\n------------------ sample - rejection (positive samples / total samples)-------------------")
        # myIO.print_sample_output(sample_rejection_output, nSamples)
        myIO.print_sample_rejection_output(result_with_sample_rejection)

        #finding likelihood weighting result
        likelihoodUtil = LikelihoodUtil()
        likelihood_result = enumerationUtil.result_for_likelihood_weight(
                        query, evidences_input, alarmBayes, sample_list)
        print("\n------------- likelihood (query sample weight / total weight) ----------------------------")
        myIO.print_likelihood_output(likelihood_result, nSamples)
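
# A minimal sketch of the "positive samples / total samples" estimate
# printed above, in its rejection-sampling form. drawSample is a
# hypothetical helper (the real sampling code lives in EnumerationUtil)
# assumed to return one full joint assignment of the network as a dict
# mapping variable name -> boolean value.
def rejectionEstimateSketch(drawSample, query, evidences, nSamples):
    accepted, positive = 0, 0
    for _ in range(nSamples):
        sample = drawSample()       #one full joint assignment (dict)
        #rejection: discard samples inconsistent with the evidence
        if any(sample[var] != val for var, val in evidences.items()):
            continue
        accepted += 1
        if sample[query]:
            positive += 1
    #P(query = true | evidence) ~ positive / accepted
    return positive / accepted if accepted else float('nan')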
def decisionTreeUI(self, trainingPath, validationPath, testingPath,\
                   pruningFactor):
    """
    given UI method performs the following tasks:
    1. takes input
    2. finds the decision tree using the ID3 algorithm
    3. performs pruning
    4. provides output
    """
    #taking input
    myIO = MyIO()
    trainingData, trainingHeader, trainingClassArr = \
                                        myIO.inputCSV(trainingPath)
    validationData, validationHeader, validationClassArr = \
                                        myIO.inputCSV(validationPath)
    testingData, testingHeader, testingClassArr = \
                                        myIO.inputCSV(testingPath)

    #finding entropy of the class
    treeGeneration = TreeGeneration()
    trainingEntropyOfClass = \
                treeGeneration.findEntropyOfClass(trainingClassArr)

    # #debug
    # print ('entropyOfClass = {} '.format(trainingEntropyOfClass))
    # #debug -ends

    #calling createDecisionTree() to get treeNodeList
    treeNodeList = treeGeneration.createDecisionTree(\
                                dataArr = trainingData,\
                                headerList = trainingHeader,\
                                classArr = trainingClassArr,\
                                classEntropy = trainingEntropyOfClass,\
                                treeNode = [],\
                                rootNodeCounter = 0,\
                                parentNode = None)

    # #debug
    # print(RenderTree(node = treeNodeList[0], style=AsciiStyle()))
    # #debug -ends

    #printing tree
    myIO.printTree(treeNodeList)

    accuracyCalculation = AccuracyCalculation()
    prePruningTrainingAccuracy = accuracyCalculation.findAccuracy(\
                                    dataArr = trainingData,\
                                    headerList = trainingHeader,\
                                    classArr = trainingClassArr,\
                                    treeNodeList = treeNodeList)
    prePruningValidationAccuracy = accuracyCalculation.findAccuracy(\
                                    dataArr = validationData,\
                                    headerList = validationHeader,\
                                    classArr = validationClassArr,\
                                    treeNodeList = treeNodeList)
    prePruningTestingAccuracy = accuracyCalculation.findAccuracy(\
                                    dataArr = testingData,\
                                    headerList = testingHeader,\
                                    classArr = testingClassArr,\
                                    treeNodeList = treeNodeList)

    #printing accuracy report
    print("-------------------------")
    print("pre-pruning accuracy")
    print("-------------------------")
    myIO.printAccuracyReport(dataArr = trainingData,\
                             accuracy = prePruningTrainingAccuracy,\
                             dataTypeStr = "training",\
                             treeNodeList = treeNodeList)
    myIO.printAccuracyReport(dataArr = validationData,\
                             accuracy = prePruningValidationAccuracy,\
                             dataTypeStr = "validation")
    myIO.printAccuracyReport(dataArr = testingData,\
                             accuracy = prePruningTestingAccuracy,\
                             dataTypeStr = "testing")

    #pruning the tree using the validation set
    pruningTree = PruningTree()
    prunedTreeNodeList = pruningTree.findPrunedTree(\
                            pruningFactor = pruningFactor,\
                            treeNodeList = treeNodeList,\
                            validationData = validationData,\
                            validationHeader = validationHeader,\
                            validationClassArr = validationClassArr,\
                            initialvalidationAccuracy = \
                                        prePruningValidationAccuracy)

    postPruningTrainingAccuracy = accuracyCalculation.findAccuracy(\
                                    dataArr = trainingData,\
                                    headerList = trainingHeader,\
                                    classArr = trainingClassArr,\
                                    treeNodeList = prunedTreeNodeList)
    postPruningValidationAccuracy = accuracyCalculation.findAccuracy(\
                                    dataArr = validationData,\
                                    headerList = validationHeader,\
                                    classArr = validationClassArr,\
                                    treeNodeList = prunedTreeNodeList)
    postPruningTestingAccuracy = accuracyCalculation.findAccuracy(\
                                    dataArr = testingData,\
                                    headerList = testingHeader,\
                                    classArr = testingClassArr,\
                                    treeNodeList = prunedTreeNodeList)

    #printing accuracy report
    print("-------------------------")
    print("post-pruning accuracy")
    print("-------------------------")
    myIO.printAccuracyReport(dataArr = trainingData,\
                             accuracy = postPruningTrainingAccuracy,\
                             dataTypeStr = "training",\
                             treeNodeList = prunedTreeNodeList)
    myIO.printAccuracyReport(dataArr = validationData,\
                             accuracy = postPruningValidationAccuracy,\
                             dataTypeStr = "validation")
    myIO.printAccuracyReport(dataArr = testingData,\
                             accuracy = postPruningTestingAccuracy,\
                             dataTypeStr = "testing")

    return treeNodeList, prunedTreeNodeList
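
# The class entropy used by ID3 above, as a standalone sketch of what
# findEntropyOfClass computes (the real implementation is elsewhere):
# H(C) = -sum_i p_i * log2(p_i) over the class value frequencies.
import numpy as np

def entropyOfClassSketch(classArr):
    #count occurrences of each distinct class value
    _, counts = np.unique(classArr, return_counts=True)
    probabilities = counts / counts.sum()
    #entropy in bits; all probabilities here are strictly positive
    return -np.sum(probabilities * np.log2(probabilities))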