コード例 #1
0
def compute_information_gain(vectorizer: CountVectorizer, word: str, dataTrain: csr_matrix, targetTrain: [int]) \
        -> float:
    """Return the information gain of splitting the training rows on ``word``.

    Rows are partitioned by whether the lower-cased word has a non-zero count
    in that row. The gain is the entropy of all labels minus the size-weighted
    entropies of the two partitions.

    Args:
        vectorizer: Fitted vectorizer whose ``vocabulary_`` maps each term to
            its column index in ``dataTrain``.
        word: Term to split on. It is lower-cased before the lookup
            (assumes the vectorizer lower-cases its vocabulary -- confirm).
        dataTrain: Sparse document-term matrix, one row per training sample.
        targetTrain: Label of each row of ``dataTrain``; every label must be
            0 or 1.

    Returns:
        The information gain, or 0.0 when ``dataTrain`` has no rows.

    Raises:
        KeyError: If a label other than 0 or 1 is encountered.
    """
    num_rows = dataTrain.get_shape()[0]
    if num_rows == 0:
        # Nothing to split, and the split weights below would divide by zero.
        return 0.0

    word = word.lower()
    parent_entropy = computeEntropy(targetTrain)

    # Resolve the term to its column once instead of calling
    # inverse_transform() on every row; a row "contains" the word exactly
    # when its entry in that column is non-zero.
    column = vectorizer.vocabulary_.get(word)
    if column is None:
        rows_with_word = frozenset()
    else:
        rows_with_word = frozenset(dataTrain[:, [column]].nonzero()[0].tolist())

    # label_counts[has_word][label] -> number of rows
    label_counts = {True: {0: 0, 1: 0}, False: {0: 0, 1: 0}}
    for row in range(num_rows):
        label_counts[row in rows_with_word][targetTrain[row]] += 1

    weighted_child_entropy = 0.0
    for counts in (label_counts[True], label_counts[False]):
        split_size = counts[0] + counts[1]
        if split_size == 0:
            # An empty split has weight 0; skip it so computeEntropy is never
            # asked for the entropy of an empty label list.
            continue
        split_labels = counts[0] * [0] + counts[1] * [1]
        weighted_child_entropy += computeEntropy(split_labels) * (split_size / num_rows)

    return parent_entropy - weighted_child_entropy