Esempio n. 1
0
    def __getdKlArray__(self):
        """Score every word vector with a KL-style measure and rank them.

        For each word vector the loop over its ``groupVector`` accumulates:

        * ``p(word)``: sum over all classes of ``p(c) * weights[c]``, where
          ``p(c) = totalClassDocuments[c] / totalDocuments``;
        * the class term: minus the sum, over classes whose group entry is
          not ``None``, of
          ``p(c) * weights[c] * log(entry.documents / totalClassDocuments[c])``;
        * the log argument: sum of ``entry.documents / totalDocuments`` over
          the same non-``None`` entries.

        The score is ``-p(word) * log(log argument) - class term``.

        Returns:
            A list of ``[score, wordVector]`` pairs sorted by descending
            score.

        Raises:
            ValueError: from ``math.log`` when a logarithm argument is zero,
                e.g. when every entry of a word's ``groupVector`` is ``None``.
        """
        print(f"Calculating kl information for {self.__strTypeDictionary__()}")
        progress = defaultProgress(len(self.words)).start()

        ranking = []
        for done, vector in enumerate(self.words, start=1):
            wordProbability = 0
            classTerm = 0
            logArgument = 0

            for classIndex, entry in enumerate(vector.groupVector):
                classPrior = (self.totalClassDocuments[classIndex] /
                              self.totalDocuments)
                weighted = classPrior * vector.weights[classIndex]
                wordProbability += weighted

                # Classes in which the word never appears contribute only to
                # p(word), not to the logarithmic terms.
                if entry is None:
                    continue

                classTerm -= weighted * math.log(
                    entry.documents / self.totalClassDocuments[classIndex])
                logArgument += entry.documents / self.totalDocuments

            wordTerm = -wordProbability * math.log(logArgument)
            ranking.append([wordTerm - classTerm, vector])
            progress.update(done)
        progress.finish()

        ranking.sort(key=lambda pair: pair[0], reverse=True)
        return ranking
Esempio n. 2
0
def copyFiles(datasetPath, files, destinationPath):
    """Copy each entry of ``files`` from ``datasetPath`` to ``destinationPath``.

    Source and target paths are built by joining the directory and the file
    name with a ``/``. A progress bar sized to ``len(files)`` is advanced
    after every copy.

    Args:
        datasetPath: Directory the files are read from.
        files: Sized iterable of file names, relative to ``datasetPath``.
        destinationPath: Directory the files are written to.
    """
    progress = defaultProgress(len(files)).start()
    for copied, name in enumerate(files, start=1):
        source = "{}/{}".format(datasetPath, name)
        target = "{}/{}".format(destinationPath, name)
        copyfile(source, target)
        progress.update(copied)

    progress.finish()
Esempio n. 3
0
    def startTest(self, maxLength=-1, kl=False):
        """Classify every test document with both models and record accuracy.

        The two dictionaries returned by ``self.__setSelectFeature__`` are
        asked to classify each document of every group in ``self.testGroups``.
        For each model the predicted class is the position of the smallest
        weight (the first one on ties); the prediction counts as correct when
        the train group at that position has the same name as the test group.

        Args:
            maxLength: Forwarded unchanged to ``__setSelectFeature__``.
            kl: Forwarded unchanged to ``__setSelectFeature__``.

        Returns:
            A ``(mbmTest, mmTest)`` tuple of ``Test`` objects; each one is
            also appended to ``self.resultMBMTest`` / ``self.resultMMTest``.

        Raises:
            ZeroDivisionError: if no document was tested at all.
        """
        mbmDictionary, mmDictionary = self.__setSelectFeature__(maxLength, kl)

        testedTotal = 0
        mbmHits = 0
        mmHits = 0

        for testGroup in self.testGroups:
            print(f"Testing file in group {testGroup.name}")
            progress = defaultProgress(len(testGroup.documents)).start()

            for testedInGroup, document in enumerate(testGroup.documents,
                                                     start=1):
                mbmWeights = mbmDictionary.classifyDictionary(
                    document.dictionary)
                mmWeights = mmDictionary.classifyDictionary(
                    document.dictionary)

                # Arg-min over both weight lists in a single pass; the scan
                # length is driven by the MBM weights for both models.
                mbmBest, mmBest = 0, 0
                lowestMBM, lowestMM = mbmWeights[0], mmWeights[0]
                for position, mbmWeight in enumerate(mbmWeights):
                    if lowestMBM > mbmWeight:
                        lowestMBM, mbmBest = mbmWeight, position
                    mmWeight = mmWeights[position]
                    if lowestMM > mmWeight:
                        lowestMM, mmBest = mmWeight, position

                expectedName = testGroup.name
                if self.trainGroups[mbmBest].name == expectedName:
                    mbmHits += 1
                if self.trainGroups[mmBest].name == expectedName:
                    mmHits += 1

                testedTotal += 1
                progress.update(testedInGroup)
            progress.finish()
            print(f"Done testing group {testGroup.name}")

        accuracyMBM = mbmHits / testedTotal
        accuracyMM = mmHits / testedTotal

        mbmTest = Test("MBM", len(mbmDictionary.words), accuracyMBM)
        self.resultMBMTest.append(mbmTest)

        mmTest = Test("MM", len(mmDictionary.words), accuracyMM)
        self.resultMMTest.append(mmTest)

        return mbmTest, mmTest
Esempio n. 4
0
    def featureSelection(self, maxLength):
        """Keep only the first ``maxLength`` entries of ``activeInformation``.

        The second element of each of the first ``maxLength`` entries of
        ``self.activeInformation`` is collected and handed to
        ``self.__setNewWordsVectors__``. ``maxLength`` is clamped to the
        number of available entries.

        Args:
            maxLength: Maximum number of entries to keep.
        """
        print(f"Selecting feature for {self.__strTypeDictionary__()}")
        limit = min(maxLength, len(self.activeInformation))

        progress = defaultProgress(limit).start()
        selected = []
        for position, entry in zip(range(limit), self.activeInformation):
            selected.append(entry[1])
            progress.update(position)
        progress.finish()

        print("Done selecting feature")
        self.__setNewWordsVectors__(selected)
Esempio n. 5
0
 def createParameters(self):
     """Recompute the weights of every word and the start weights.

     After refreshing the total word counts and resetting the start
     weights, each word in ``self.words`` has ``updateWeights()`` called on
     it and is then passed to ``self.__updateStartWeights__``. Progress is
     reported on a bar sized to ``len(self.words)``.
     """
     self.__setUpTotalWordsCount__()
     announcement = f"Starting calculating parameters for {self.__strTypeDictionary__()}"
     print(announcement)

     progress = defaultProgress(len(self.words)).start()
     self.resetStartWeight()
     # NOTE(review): second call to __setUpTotalWordsCount__; it looks
     # redundant with the one above, kept because its effects are not
     # visible from here.
     self.__setUpTotalWordsCount__()

     for processed, entry in enumerate(self.words, start=1):
         entry.updateWeights()
         self.__updateStartWeights__(entry)
         progress.update(processed)
     progress.finish()

     print("Parameters created")
Esempio n. 6
0
    def cleanDictionary(self):
        """Remove words whose summed clean value across groups is at most 1.

        For every word vector in ``self.words`` the results of
        ``self.__cleanValueWord__`` are summed over the entries of its
        ``groupVector`` that are not ``None``. Only words whose sum is
        greater than 1 are kept; ``self.words`` is replaced by the filtered
        list and the number of removed words is printed.
        """
        print(f"Cleaning {self.__strTypeDictionary__()}")
        bar = defaultProgress(len(self.words)).start()
        cleanedWords = []
        for processed, wordVector in enumerate(self.words, start=1):
            # Identity check: `!= None` would dispatch to a user-defined
            # __ne__/__eq__ on the grouped word instead of testing for None.
            wordInDocuments = sum(
                self.__cleanValueWord__(groupedWord)
                for groupedWord in wordVector.groupVector
                if groupedWord is not None)

            if wordInDocuments > 1:
                cleanedWords.append(wordVector)
            # Report the number of words already processed (1-based) so the
            # bar does not lag one item behind.
            bar.update(processed)
        bar.finish()
        removedWords = len(self.words) - len(cleanedWords)
        self.words = cleanedWords
        print(f"Removed {removedWords} words")
Esempio n. 7
0
    def readDocuments(self, stopWords=None, headers=None, fastReading=False):
        """Read every document of the group and merge its words.

        The group dictionary is cleaned first. Each document then reads its
        words, every word of the document's dictionary is added to the group
        dictionary as a ``GroupedWord`` (with the word's count and a document
        count of 1), and the document's read words are cleared. Finally the
        group's total counted words are refreshed.

        Args:
            stopWords: Forwarded to ``document.readWords``; defaults to a new
                empty list.
            headers: Forwarded to ``document.readWords``; defaults to a new
                empty list.
            fastReading: Forwarded to ``document.readWords``.
        """
        # Build the defaults per call: a literal `[]` default is created once
        # and shared by every call, so any mutation by a callee would leak
        # into later calls.
        if stopWords is None:
            stopWords = []
        if headers is None:
            headers = []

        self.dictionary.clean()

        print(f"Start reading group {self.name}, type: {self.type}")
        bar = defaultProgress(len(self.documents)).start()
        for processed, document in enumerate(self.documents, start=1):
            document.readWords(stopWords, headers, fastReading)

            for word in document.dictionary.words:
                self.dictionary.searchAndAddWord(
                    GroupedWord(word.text, self, word.counted, 1))

            document.clearReadedWords()
            bar.update(processed)
        self.setTotalCountedWords()
        bar.finish()
        print(f"Done reading group {self.name}")
Esempio n. 8
0
    def createDictionary(self):
        """Build the MBM and MM weighted dictionaries from the train groups.

        The dataset is read first when ``self.datasetReaded`` is ``False``.
        Every word of every train group's dictionary is added to both
        weighted dictionaries. Both are then cleaned, have their parameters
        created and their feature information set up, in that order, and the
        resulting word counts are printed.
        """
        if self.datasetReaded is False:
            self.readDataset()

        self.mbmWeightedDictionary = MBMWeightedDictionary(self.trainGroups)
        self.mmWeightedDictionary = MMWeightedDictionary(self.trainGroups)
        weightedDictionaries = (self.mbmWeightedDictionary,
                                self.mmWeightedDictionary)

        print("Creating weight")
        for group in self.trainGroups:
            print(f"Adding weight from group {group.name}")
            groupWords = group.dictionary.words
            progress = defaultProgress(len(groupWords)).start()
            for added, word in enumerate(groupWords, start=1):
                for weighted in weightedDictionaries:
                    weighted.searchAndAddWord(word)
                progress.update(added)
            progress.finish()
            print(f"Done adding weight from group {group.name}")

        # Each phase runs on both dictionaries (MBM first) before the next
        # phase starts.
        for weighted in weightedDictionaries:
            weighted.cleanDictionary()
        for weighted in weightedDictionaries:
            weighted.createParameters()
        for weighted in weightedDictionaries:
            weighted.setUpFeatureInformation()

        mbmSummary = f"Dictionary MBM created with {len(self.mbmWeightedDictionary.words)} words"
        print(bcolors.OKGREEN + mbmSummary + bcolors.ENDC)
        mmSummary = f"Dictionary MM created with {len(self.mmWeightedDictionary.words)} words"
        print(bcolors.OKGREEN + mmSummary + bcolors.ENDC)
Esempio n. 9
0
    def __getMutualInformationArray__(self):
        """Score every word vector with a mutual-information sum and rank them.

        With ``N = self.totalWords``, ``B = wordVector.getSumOfCounted()``
        and, for each class ``c``, ``A`` the ``counted`` value of the word's
        group entry (0 when the entry is ``None``) and
        ``C = self.totalClassWords[c]``, the score adds:

        * ``(A / N) * log(A * N / (B * C))`` when ``A`` is non-zero;
        * ``((C - A) / N) * log((C - A) * N / ((N - B) * C))`` when ``C - A``
          is non-zero.

        Returns:
            A list of ``[score, wordVector]`` pairs sorted by descending
            score.
        """
        print(
            f"Calculating mutual information for {self.__strTypeDictionary__()}"
        )
        progress = defaultProgress(len(self.words)).start()

        scored = []
        for done, vector in enumerate(self.words, start=1):
            score = 0
            wordCount = vector.getSumOfCounted()

            for classIndex, entry in enumerate(vector.groupVector):
                inClass = entry.counted if entry is not None else 0
                classWords = self.totalClassWords[classIndex]

                # Occurrences of this word inside the class.
                if inClass != 0:
                    numerator = inClass * self.totalWords
                    denominator = wordCount * classWords
                    score += (inClass / self.totalWords) * math.log(
                        numerator / denominator)

                # Class words that are not this word.
                if classWords - inClass != 0:
                    numerator = (classWords - inClass) * self.totalWords
                    denominator = (self.totalWords - wordCount) * classWords
                    score += ((classWords - inClass) /
                              self.totalWords) * math.log(
                                  numerator / denominator)

            scored.append([score, vector])
            progress.update(done)
        progress.finish()

        scored.sort(key=lambda pair: pair[0], reverse=True)
        return scored