def compute_information_gain(
        vectorizer: CountVectorizer,
        word: str,
        dataTrain: csr_matrix,
        targetTrain: [int],
) -> float:
    """Return the information gain of splitting the training rows on ``word``.

    The rows of ``dataTrain`` are partitioned into those in which ``word``
    (lower-cased) has a non-zero count and those in which it does not. The
    result is the entropy of ``targetTrain`` minus the size-weighted entropy
    of the two partitions, with every entropy obtained from ``computeEntropy``
    (defined elsewhere in this module).

    Args:
        vectorizer: Fitted vectorizer whose ``vocabulary_`` maps terms to the
            column indices of ``dataTrain``.
        word: Term to split on; it is lower-cased before the lookup.
        dataTrain: Document-term matrix, one row per training sample.
        targetTrain: Label of each row, indexable by row number. Only the
            labels ``0`` and ``1`` are supported.

    Returns:
        The information gain as a float; ``0.0`` when ``dataTrain`` has no
        rows.

    Raises:
        KeyError: If a label other than ``0`` or ``1`` is encountered.
    """
    num_rows = dataTrain.get_shape()[0]
    if num_rows == 0:
        # Nothing to split: avoids dividing by zero in the weights below.
        return 0.0

    word = word.lower()
    parent_entropy = computeEntropy(targetTrain)

    # A row "contains" the word exactly when its count in the word's column is
    # non-zero, so one vocabulary lookup and one column slice replace calling
    # inverse_transform on every single row.
    column = vectorizer.vocabulary_.get(word)
    if column is None:
        # The word is not in the vocabulary, so no row can contain it.
        rows_with_word = frozenset()
    else:
        rows_with_word = frozenset(dataTrain[:, column].nonzero()[0].tolist())

    # Per-partition label counts, keyed by label.
    with_word = {0: 0, 1: 0}
    without_word = {0: 0, 1: 0}
    for row in range(num_rows):
        partition = with_word if row in rows_with_word else without_word
        partition[targetTrain[row]] += 1

    weighted_child_entropy = 0.0
    for partition in (with_word, without_word):
        # computeEntropy takes a list of labels, so expand the counts again.
        labels = partition[0] * [0] + partition[1] * [1]
        if labels:
            # An empty partition has weight 0 and contributes nothing, so
            # computeEntropy is never invoked on an empty list.
            weighted_child_entropy += computeEntropy(labels) * (len(labels) / num_rows)

    return parent_entropy - weighted_child_entropy