コード例 #1
0
    def getWordFirstBop_DiscretizedTss(self,
                                       discretizedTss,
                                       numReduct=True,
                                       indictBi=-1):
        '''
        Build a word-first bag-of-patterns index for a collection of
        discretized time series.

        The returned dict is keyed by word. In binary mode (self.binary) each
        value is the set of series indices that contain the word; otherwise
        each value is a dict {series index: number of counted occurrences}.

        discretizedTss: iterable of word sequences, one per time series.
        numReduct: non-binary mode only. When True, a word identical to the
            word immediately before it in the same series is not counted
            (numerosity reduction).
        indictBi: -1: No bigrams. 0: Current words are unigrams, but bigrams
            are added afterwards. 1: Current words are bigrams, and no other
            words are needed. When >= 0, the value is appended to every word
            as a one-bit field with bu.appendBits.

        The idea of using bigrams comes from

        Patrick Schafer, Ulf Leser:
        Fast and Accurate Time Series Classification with WEASEL. CIKM 2017: 637-646

        This idea is not used in the final version of HBOP for efficiency concerns.
        '''
        tagWords = indictBi >= 0
        wordIndex = {}
        bigramSeqs = []
        for tsIdx, wordSeq in enumerate(discretizedTss):
            if self.binary:
                # Presence only: every distinct word registers the series once.
                for token in np.unique(wordSeq):
                    if tagWords:
                        token = bu.appendBits(token, indictBi, 1)
                    wordIndex.setdefault(token, set()).add(tsIdx)
            else:
                last = None
                for raw in wordSeq:
                    if numReduct:
                        # The comparison uses the untagged word.
                        if raw != last:
                            last = raw
                        else:
                            continue
                    token = bu.appendBits(raw, indictBi, 1) if tagWords else raw
                    perSeries = wordIndex.setdefault(token, {})
                    perSeries[tsIdx] = perSeries.get(tsIdx, 0) + 1

            if indictBi != 0:
                continue

            # Bigrams pair each word with the word one window length later.
            gap = self.discretizer.winLen
            pairCount = len(wordSeq) - gap
            if pairCount <= 0:
                continue
            width = bu.numBits(
                self.discretizer.card) * self.discretizer.wordSize
            pairs = np.empty(pairCount, dtype='int64')
            for k in range(pairCount):
                pairs[k] = bu.appendBits(wordSeq[k], wordSeq[k + gap], width)
            bigramSeqs.append(pairs)

        if bigramSeqs:
            # The bigram sequences are indexed by the same routine (flag 1)
            # and merged into the unigram index.
            wordIndex.update(
                self.getWordFirstBop_DiscretizedTss(bigramSeqs, numReduct, 1))
        return wordIndex
コード例 #2
0
 def createMethodId(self, wordSizeInd, numBitsWordSize, wordType,
                    numBitsWordType):
     '''
     Pack the word-size index and the word type into a single method id.

     Starting from 0, each (value, bit width) pair is appended in order with
     bu.appendBits: first wordSizeInd using numBitsWordSize bits, then
     wordType using numBitsWordType bits. The packed id is returned.
     '''
     fields = (
         (wordSizeInd, numBitsWordSize),
         (wordType, numBitsWordType),
     )
     packed = 0
     for value, width in fields:
         packed = bu.appendBits(packed, value, width)
     return packed
コード例 #3
0
    def getBOP_DiscretizedTs(
            self,
            discretizedTs,
            numReduct=True,
            indictBi=-1
    ):
        '''
        Build the bag-of-patterns of a single discretized time series.

        In binary mode (self.binary) the bag is the set of distinct words;
        otherwise it is a dict {word: number of counted occurrences}.

        discretizedTs: sequence of words of one time series.
        numReduct: non-binary mode only. When True, a word identical to the
            word immediately before it is not counted (numerosity reduction).
        indictBi: -1: No bigrams. 0: Current words are unigrams, but bigrams
            are added afterwards. 1: Current words are bigrams, and no other
            words are needed. When >= 0, the value is appended to every word
            as a one-bit field with bu.appendBits.

        The idea of using bigrams comes from

        Patrick Schafer, Ulf Leser:
        Fast and Accurate Time Series Classification with WEASEL. CIKM 2017: 637-646

        This idea is not used in the final version of HBOP for efficiency concerns.
        '''
        tagWords = indictBi >= 0

        # Unigrams.
        if self.binary:
            distinct = np.unique(discretizedTs)
            if tagWords:
                # Tagged words are written back into the array returned by
                # np.unique before it is turned into a set.
                for pos, token in enumerate(distinct):
                    distinct[pos] = bu.appendBits(token, indictBi, 1)
            bag = set(distinct)
        else:
            bag = {}
            last = None
            for raw in discretizedTs:
                if numReduct:
                    # The comparison uses the untagged word.
                    if raw != last:
                        last = raw
                    else:
                        continue
                token = bu.appendBits(raw, indictBi, 1) if tagWords else raw
                bag[token] = bag.get(token, 0) + 1

        if indictBi != 0:
            return bag

        # Bigrams: pair each word with the word one window length later, then
        # bag the pairs with the same routine (flag 1) and merge.
        gap = self.discretizer.winLen
        pairCount = len(discretizedTs) - gap
        if pairCount > 0:
            width = bu.numBits(
                self.discretizer.card) * self.discretizer.wordSize
            pairs = np.empty(pairCount, dtype='int64')
            for k in range(pairCount):
                pairs[k] = bu.appendBits(discretizedTs[k],
                                         discretizedTs[k + gap], width)
            pairBag = self.getBOP_DiscretizedTs(pairs, numReduct, 1)
            if self.binary:
                bag = bag | pairBag
            else:
                bag.update(pairBag)
        return bag
コード例 #4
0
 def calcDistanceBetweenWords(self, word_0, word_1, sfxLen=0):
     '''
     Return the sum of absolute letter differences between two words.

     Both words are first passed through bu.trimBits with sfxLen. Then, for
     each of the self.wordSize letter positions, a field of
     bu.numBits(self.card) bits is read from each word with bu.getBits and
     the absolute difference of the two fields is added to the total.
     '''
     left = bu.trimBits(word_0, sfxLen)
     right = bu.trimBits(word_1, sfxLen)
     letterBits = bu.numBits(self.card)
     offsets = (pos * letterBits for pos in range(self.wordSize))
     return sum(
         np.abs(
             bu.getBits(left, offset, letterBits) -
             bu.getBits(right, offset, letterBits)) for offset in offsets)
コード例 #5
0
 def createBOPId(self,
                 winLenInd,
                 numBitsWinLen,
                 wordSizeInd,
                 numBitsWordSize,
                 wordType,
                 numBitsWordType=1):
     '''
     Pack window-length index, word-size index and word type into one BOP id.

     Starting from 0, the three (value, bit width) pairs are appended in that
     order with bu.appendBits and the packed id is returned.
     '''
     fields = (
         (winLenInd, numBitsWinLen),
         (wordSizeInd, numBitsWordSize),
         (wordType, numBitsWordType),
     )
     packed = 0
     for value, width in fields:
         packed = bu.appendBits(packed, value, width)
     return packed
コード例 #6
0
 def discretizeTransformed(self, transformed):
     '''
     Turn one transformed window into a packed word.

     The first min(self.wordSize, len(self.cValOrder)) entries of
     self.cValOrder select which coefficients of `transformed` are used.
     For each selected coefficient, the index of the first half-open
     interval [boundaries[j], boundaries[j + 1]) containing its value is
     appended to the word with bu.appendBits, using bu.numBits(self.card)
     bits per letter.

     NOTE: a value that falls in none of the intervals appends nothing, so
     the resulting word then holds fewer letters.
     '''
     letterBits = bu.numBits(self.card)
     numLetters = min(self.wordSize, len(self.cValOrder))
     code = 0
     for pos in range(numLetters):
         coeff = self.cValOrder[pos]
         value = transformed[coeff]
         edges = self.allBoundaries[coeff]
         for letter in range(len(edges) - 1):
             if edges[letter] <= value < edges[letter + 1]:
                 code = bu.appendBits(code, letter, letterBits)
                 break
     return code
コード例 #7
0
    def predict(self, allBagsTs, allInfo, fineMetId, numBitsXSimId,
                numBitsWordType, numBitsSLAId, numBitsSAXId):
        '''
        Classify one time series with a single fine-grained method.

        allBagsTs: dict {bopId: bag} holding the bags of the series to
            classify; each bag maps a word (without its BOP id) to a count.
        allInfo: dict {methodId: 8-tuple} as unpacked below.
        fineMetId: method id with the similarity id (xSimId) packed into it;
            the two are separated with bu.trimBits / bu.getBits.
        numBitsXSimId, numBitsWordType, numBitsSLAId, numBitsSAXId: bit
            widths used to split the packed ids. The word type read from the
            method id selects numBitsSLAId (type 0) or numBitsSAXId.

        xSimId 0 and 1 use the "_nx" centroid data and no relabel map; 2 and
        3 use the "_x" data and the relabel map. xSimId 0 and 2 score classes
        by squared Euclidean distance of counts; the others use
        1 - (dot product)^2 / (product of squared norms) over log-scaled
        term frequencies.

        Returns (predicted label, cv1 score stored for this xSimId).
        '''
        methodId = bu.trimBits(fineMetId, numBitsXSimId)
        xSimId = bu.getBits(fineMetId, 0, numBitsXSimId)
        if bu.getBits(methodId, 0, numBitsWordType) == 0:
            numBitsId = numBitsSLAId
        else:
            numBitsId = numBitsSAXId

        (_, selectedWordInfo, _, selectedWords, sigmas2Centroids_nx,
         sigmas2Centroids_x, cv1_scores, relabelMap) = allInfo[methodId]
        wordsToUse = selectedWords[xSimId]
        cv1_score = cv1_scores[xSimId]

        withoutXMeans = xSimId in (0, 1)
        cosineBased = xSimId in (1, 3)
        euclidean = xSimId in (0, 2)

        if withoutXMeans:
            sigma2Centroids = sigmas2Centroids_nx
            relabelMap = None
        else:
            sigma2Centroids = sigmas2Centroids_x
        if relabelMap is None:
            numScored = self.numCls
        else:
            numScored = len(relabelMap)

        dists = np.zeros(numScored)
        if cosineBased:
            sigma2Ts = 0
            sigmaProd = np.zeros(numScored)

        for word in wordsToUse:
            infoByCls = selectedWordInfo[word][xSimId]
            word_nid = bu.trimBits(word, numBitsId)
            bopId = bu.getBits(word, 0, numBitsId)

            # A missing bag or a missing word both count as zero occurrences.
            cnt = allBagsTs.get(bopId, {}).get(word_nid, 0)

            if euclidean:
                dists += (cnt - infoByCls)**2
            else:
                tf = 1 + np.log10(cnt) if cnt != 0 else 0
                sigma2Ts += tf**2
                sigmaProd += tf * infoByCls

        if cosineBased:
            divide = sigma2Ts * sigma2Centroids
            # Zero denominators are replaced by -1 so the division is defined.
            divide[divide == 0] = -1
            dists = 1 - sigmaProd**2 / divide

        winner = np.argmin(dists)
        preLabel = winner if relabelMap is None else relabelMap[winner]
        return preLabel, cv1_score
コード例 #8
0
    def crossValidation(self,
                        numWords,
                        words,
                        wordRanks,
                        curBopsAndBags,
                        numBitsId,
                        XMeansTrainLabels,
                        XMeansNumCls,
                        XMeansNumTrainByCls,
                        XMeansRelabelMap=None):
        '''
        Pick how many top-ranked words to keep by evaluating training
        accuracy after adding the words one at a time in rank order.

        For every prefix of the ranking, two centroid classifiers are
        evaluated on the training series: squared Euclidean distance on
        counts and squared cosine similarity on tf-idf weights. When a series
        is scored against its own class, its own counts are first removed
        from that class's totals (leave-one-out style).

        numWords: number of ranked words to iterate over.
        words: list of words with the BOP id packed in (split with bu.getBits
            / bu.trimBits using numBitsId).
        wordRanks: indices into `words`, in the order they are to be added.
        curBopsAndBags: dict {bopId: (bop, bagWord)}; bagWord[word] is a dict
            {training series id: count}.
        XMeansTrainLabels: label of every training series used for building
            the centroids.
        XMeansNumCls: number of distinct labels in XMeansTrainLabels.
        XMeansNumTrainByCls: number of training series per label.
        XMeansRelabelMap: optional mapping applied to predicted labels before
            they are compared with self.trainLabels.

        Returns the tuple (bopIds, bestAcc_ed, bestAcc_cos, bestPreLabels_ed,
        bestPreLabels_cos, numSelected_ed, numSelected_cos, meanCntsByCls,
        tfIdfsByCls, bestSigmas2Centroids).

        NOTE: when numWords == 0 the loop body never runs, so the bestPreLabels_*,
        numSelected_* and bestSigmas2Centroids names are unbound and the
        return statement raises UnboundLocalError.
        '''
        bopIds = []
        # Per-word, per-class statistics; rows are indexed by the word's
        # position in `words` (idx), not by its rank.
        tfIdfsByCls = np.zeros((numWords, XMeansNumCls))
        meanCntsByCls = np.zeros((numWords, XMeansNumCls))
        # Running sum of squared tf-idf weights per class over the words
        # added so far.
        sigmas2Centroids = np.zeros(XMeansNumCls)
        # Running sums used by the leave-one-out cosine similarity.
        tmpSigma2Ts = np.zeros(self.numTrain)
        tmpSigma2Centroids = np.zeros((self.numTrain, XMeansNumCls))
        tmpsigmaProd = np.zeros((self.numTrain, XMeansNumCls))
        # Running squared Euclidean distance of every series to every class.
        eds = np.zeros((self.numTrain, XMeansNumCls))

        # Start below any possible accuracy so the first word always wins.
        bestAcc_ed = -1
        bestAcc_cos = -1

        for i in range(numWords):

            idx = wordRanks[i]
            word = words[idx]
            bopId = bu.getBits(word, 0, numBitsId)
            (bop, bagWord) = curBopsAndBags[bopId]
            bopIds.append(bopId)

            cntByCls = np.zeros(XMeansNumCls, dtype='uint32')
            cntByTs = np.zeros(self.numTrain, dtype='uint32')
            word_nid = bu.trimBits(word, numBitsId)
            for tsId, cnt in bagWord[word_nid].items():  #count by class
                label = XMeansTrainLabels[tsId]
                cntByCls[label] += cnt
                cntByTs[tsId] += cnt
            meanCntsByCls[idx][:] = cntByCls / XMeansNumTrainByCls

            # tf-idf of the word per class, using all training series.
            nonZeroCls = (np.where(cntByCls != 0))[0]
            numNonZero = len(nonZeroCls)
            idf = np.log10(1 + XMeansNumCls / numNonZero)
            tfIdfsByCls[idx][nonZeroCls] = (
                1 + np.log10(cntByCls[nonZeroCls])) * idf
            sigmas2Centroids += tfIdfsByCls[idx][:]**2

            # Log-scaled term frequency of the word in every training series.
            tfByTs = np.zeros(self.numTrain)
            nonZeroTs = np.where(cntByTs != 0)
            tfByTs[nonZeroTs] = 1 + np.log10(cntByTs[nonZeroTs])
            tmpSigma2Ts += tfByTs**2

            addToEd = np.zeros((self.numTrain, XMeansNumCls))
            tmpTfIdfsByCls = np.zeros((self.numTrain, XMeansNumCls))
            for j in range(self.numTrain):
                # Rebuild the class statistics with series j left out of its
                # own class.
                label = XMeansTrainLabels[j]
                tmpCntByCls = deepcopy(cntByCls)
                tmpNumTrainByCls = deepcopy(XMeansNumTrainByCls)
                tmpCntByCls[label] -= cntByTs[j]
                # The class size is only reduced when the class has more than
                # one member, which keeps the divisor below non-zero.
                if tmpNumTrainByCls[label] > 1:
                    tmpNumTrainByCls[label] -= 1
                addToEd[j][:] = (tmpCntByCls / tmpNumTrainByCls -
                                 cntByTs[j])**2

                nonZeroCls = (np.where(tmpCntByCls != 0))[0]
                numNonZero = len(nonZeroCls)
                # No class contains the word once series j is removed: the
                # tf-idf row for j stays all zeros.
                if numNonZero == 0:
                    continue
                idf = np.log10(1 + XMeansNumCls / numNonZero)
                tmpTfIdfsByCls[j][nonZeroCls] = (
                    1 + np.log10(tmpCntByCls[nonZeroCls])) * idf
            eds += addToEd
            tmpSigma2Centroids += tmpTfIdfsByCls**2
            tmpsigmaProd += tmpTfIdfsByCls * tfByTs[:, np.newaxis]

            # Squared cosine similarity; zero denominators are replaced by -1
            # so the division is defined.
            divide = tmpSigma2Centroids * tmpSigma2Ts[:, np.newaxis]
            divide[np.where(divide == 0)] = -1
            cosSims = tmpsigmaProd**2 / divide

            # Euclidean variant: nearest centroid, mapped through the relabel
            # map when one is given. ">=" means ties keep the larger word count.
            preLabels_ed = np.argmin(eds, axis=1)
            for j, preLabel in enumerate(preLabels_ed):
                preLabels_ed[j] = XMeansRelabelMap[
                    preLabel] if XMeansRelabelMap is not None else preLabel
            acc_ed = accuracy_score(self.trainLabels, preLabels_ed)
            if acc_ed >= bestAcc_ed:
                bestAcc_ed = acc_ed
                bestPreLabels_ed = preLabels_ed
                numSelected_ed = i + 1
            # Cosine variant: most similar centroid, same relabeling and
            # tie-breaking; the matching centroid norms are snapshotted.
            preLabels_cos = np.argmax(cosSims, axis=1)
            for j, preLabel in enumerate(preLabels_cos):
                preLabels_cos[j] = XMeansRelabelMap[
                    preLabel] if XMeansRelabelMap is not None else preLabel
            acc_cos = accuracy_score(self.trainLabels, preLabels_cos)
            if acc_cos >= bestAcc_cos:
                bestAcc_cos = acc_cos
                bestPreLabels_cos = preLabels_cos
                bestSigmas2Centroids = deepcopy(sigmas2Centroids)
                numSelected_cos = i + 1

        return bopIds, bestAcc_ed, bestAcc_cos, bestPreLabels_ed, bestPreLabels_cos, \
            numSelected_ed, numSelected_cos, meanCntsByCls, tfIdfsByCls, bestSigmas2Centroids
コード例 #9
0
    def test(self):
        '''
        Classify every test series and record predictions and accuracies.

        For each test series the bags needed by the hybrid model
        (self.allInfo_hy) are built first and the hybrid label is chosen by a
        vote over self.fineMetIds_hy weighted with each method's cv1 score.
        The bags still missing for the full model (self.allInfo) are then
        added, and for every entry of self.allFineMetIds an unweighted vote
        (first allBestNumMethods[case] methods) and a weighted vote (first
        allBestNumMethods_weighted[case] methods) are stored.

        Writes self.preLabels_hy, self.preLabels (unweighted cases first,
        weighted cases after them), self.accuracy and self.accuracies.
        Progress is printed to stdout every 10 series.
        '''
        numSets = len(self.allFineMetIds)
        numCases = 2 * numSets
        self.preLabels_hy = np.zeros(self.numTest, dtype='uint32')
        self.preLabels = np.zeros((numCases, self.numTest), dtype='uint32')

        # Bit widths of the fields packed into the ids.
        numBitsWinLen = bu.numBits(
            np.ceil((self.maxWinLen - self.minWinLen) / self.winLenStep) + 1)
        numBitsWordType = 1
        numBitsXSimId = 2
        numBitsSLASize = bu.numBits(
            np.ceil((self.maxSLASize - self.minSLASize) / self.SLASizeStep) +
            1)
        numBitsSLAId = numBitsWinLen + numBitsSLASize + numBitsWordType
        numBitsSAXSize = bu.numBits(
            np.ceil((self.maxSAXSize - self.minSAXSize) / self.SAXSizeStep) +
            1)
        numBitsSAXId = numBitsWinLen + numBitsSAXSize + numBitsWordType

        for tsId in range(len(self.testTss)):

            if tsId % 10 == 0:
                print(tsId, end=', ')
                sys.stdout.flush()
            if tsId % 100 == 0:
                print()
                sys.stdout.flush()

            ts = np.array(self.testTss[tsId])
            tsLen = self.testLens[tsId]
            (cumSums, cumSums_2, wCumSums) = Discretizer.getAllCumSums(ts)

            # Bags required by the hybrid model.
            allBagsTs = {}
            self._addMissingBagsForTs(allBagsTs, self.allInfo_hy, cumSums,
                                      cumSums_2, wCumSums, tsLen,
                                      numBitsWordType)

            votesByLabel_weighted = np.zeros(self.numCls)
            for fineMetId in self.fineMetIds_hy:
                preLabel, cv1_score = self.predict(allBagsTs, self.allInfo_hy,
                                                   fineMetId, numBitsXSimId,
                                                   numBitsWordType,
                                                   numBitsSLAId, numBitsSAXId)
                votesByLabel_weighted[preLabel] += cv1_score
            self.preLabels_hy[tsId] = np.argmax(votesByLabel_weighted)

            # Not timed: bags that only the full model needs; bags already
            # built above are reused.
            self._addMissingBagsForTs(allBagsTs, self.allInfo, cumSums,
                                      cumSums_2, wCumSums, tsLen,
                                      numBitsWordType)

            for case, fineMetIds in enumerate(self.allFineMetIds):
                bestNumMethods = self.allBestNumMethods[case]
                bestNumMethods_weighted = self.allBestNumMethods_weighted[case]
                votesByLabel = np.zeros(self.numCls)
                votesByLabel_weighted = np.zeros(self.numCls)
                for i, fineMetId in enumerate(fineMetIds):
                    preLabel, cv1_score = self.predict(
                        allBagsTs, self.allInfo, fineMetId, numBitsXSimId,
                        numBitsWordType, numBitsSLAId, numBitsSAXId)
                    if i < bestNumMethods:
                        votesByLabel[preLabel] += 1
                    if i < bestNumMethods_weighted:
                        votesByLabel_weighted[preLabel] += cv1_score
                self.preLabels[case][tsId] = np.argmax(votesByLabel)
                self.preLabels[case + numSets][tsId] = np.argmax(
                    votesByLabel_weighted)

        self.accuracy = accuracy_score(self.testLabels, self.preLabels_hy)
        self.accuracies = np.zeros(numCases)
        for i in range(numCases):
            self.accuracies[i] = accuracy_score(self.testLabels,
                                                self.preLabels[i][:])

    def _addMissingBagsForTs(self, allBagsTs, allInfo, cumSums, cumSums_2,
                             wCumSums, tsLen, numBitsWordType):
        '''
        Add to allBagsTs (in place) the bag of one series for every BOP
        listed in allInfo that is not present yet.

        allInfo: dict {methodId: tuple}; the first tuple element is a dict
            {bopId: bop}. The word type is read from methodId with bu.getBits.
        cumSums, cumSums_2, wCumSums: cumulative sums of the series as
            returned by Discretizer.getAllCumSums.
        tsLen: length of the series.
        '''
        for methodId, info in allInfo.items():
            bops = info[0]
            wordType = bu.getBits(methodId, 0, numBitsWordType)
            for bopId, bop in bops.items():
                if bopId in allBagsTs:
                    continue

                discretizer = bop.discretizer
                winLen = discretizer.winLen
                if winLen > tsLen:
                    # Series shorter than the window: extend each cumulative
                    # sum by repeating its last value up to the window length.
                    pad = np.ones(winLen - tsLen)
                    curCumSums = np.concatenate((cumSums, cumSums[-1] * pad))
                    curCumSums_2 = np.concatenate(
                        (cumSums_2, cumSums_2[-1] * pad))
                    curWCumSums = np.concatenate(
                        (wCumSums, wCumSums[-1] * pad))
                    curTsLen = winLen
                else:
                    curCumSums = cumSums
                    curCumSums_2 = cumSums_2
                    curWCumSums = wCumSums
                    curTsLen = tsLen

                # Word type 0 additionally passes the weighted cumulative sums.
                if wordType == 0:
                    discretizedTs = discretizer.discretizeTsFromCumSums_LNR(
                        curCumSums, curCumSums_2, curWCumSums, curTsLen)
                else:
                    discretizedTs = discretizer.discretizeTsFromCumSums_LNR(
                        curCumSums, curCumSums_2, curTsLen)
                allBagsTs[bopId] = bop.getBOP_DiscretizedTs(discretizedTs)
コード例 #10
0
    def train(self):

        trainTss_padded = []
        maxTsLen = max(self.trainLens)
        for i in range(self.numTrain):
            ts = np.array(self.trainTss[i])
            tsLen = self.trainLens[i]
            #             zTs = scale(np.array(ts))
            #             zTs = np.concatenate((zTs, np.zeros(maxTsLen - tsLen)))
            #             trainTss_padded.append(zTs)
            ts = np.concatenate((ts, np.zeros(maxTsLen - tsLen)))
            trainTss_padded.append(ts)
        trainTss_padded = np.array(trainTss_padded)

        self.minWinLen = np.maximum(int(
            np.around(self.minTrainLen * self.minWinRatio)),
                                    10,
                                    dtype='int32')
        self.maxWinLen = np.minimum(int(
            np.around(self.minTrainLen * self.maxWinRatio)),
                                    self.minTrainLen,
                                    dtype='int32')
        self.winLenStep = np.maximum(int(
            np.around(self.minTrainLen * self.winRatioStep)),
                                     1,
                                     dtype='int32')
        if self.minTrainLen < self.minWinLen:
            self.minWinLen = self.minTrainLen
        if self.minTrainLen < self.maxWinLen:
            self.maxWinLen = self.minTrainLen
        numBitsWinLen = bu.numBits(
            np.ceil((self.maxWinLen - self.minWinLen) / self.winLenStep) + 1)
        numBitsWordType = 1
        numBitsXSimId = 2

        SLASizes = np.arange(self.maxSLASize,
                             self.minSLASize - 1,
                             -self.SLASizeStep,
                             dtype='uint32')
        numBitsSLASize = bu.numBits(
            np.ceil((self.maxSLASize - self.minSLASize) / self.SLASizeStep) +
            1)
        numBitsSLAId = numBitsWinLen + numBitsSLASize + numBitsWordType
        SAXSizes = np.arange(self.maxSAXSize,
                             self.minSAXSize - 1,
                             -self.SAXSizeStep,
                             dtype='uint32')
        numBitsSAXSize = bu.numBits(
            np.ceil((self.maxSAXSize - self.minSAXSize) / self.SAXSizeStep) +
            1)
        numBitsSAXId = numBitsWinLen + numBitsSAXSize + numBitsWordType

        self.XMeansTime = 0
        self.trainTime = 0
        tic = perf_counter()

        (allCumSums, allCumSums_2,
         allWCumSums) = Discretizer.getAllCumSums(trainTss_padded)
        all_cv1_scores_hy = []
        all_cv1_scoresMap_hy = {}
        self.trainTime += perf_counter() - tic

        all_cv1_scores = []
        all_cv1_scoresMap = {}
        allInfo = {}
        transferMetIdMap = {}

        for wordType in range(2):

            if wordType == 0:
                wordSizes = SLASizes
                numBitsId = numBitsSLAId
                numBitsWordSize = numBitsSLASize
            else:
                wordSizes = SAXSizes
                numBitsId = numBitsSAXId
                numBitsWordSize = numBitsSAXSize

            for wordSize in wordSizes:
                print('WordType = ', wordType, ' WordSize = ', wordSize)
                sys.stdout.flush()

                flg = wordType == 0 or (wordType == 1
                                        and wordSize in {6, 7, 8})
                if flg:
                    tic = perf_counter()

                if wordType == 0:
                    wordSizeInd = (self.maxSLASize -
                                   wordSize) / self.SLASizeStep
                else:
                    wordSizeInd = (self.maxSAXSize -
                                   wordSize) / self.SAXSizeStep

                words = []
                chi2s = []
                curBopsAndBags = {}  #bopId + (bop, bagWord)

                for winLen in range(self.minWinLen, self.maxWinLen + 1,
                                    self.winLenStep):
                    winLenInd = (winLen - self.minWinLen) / self.winLenStep

                    if wordType == 0:
                        discretizer = SLA.SLA(winLen, wordSize, self.SLACard,
                                              True, True, True, self.binSizeTh)
                        discretizedTss = discretizer.discretizeTssFromCumSums_LNR(
                            allCumSums, allCumSums_2, allWCumSums,
                            self.trainLens)
#                         transformedTss = discretizer.transfromTssFromCumSums(allCumSums, allCumSums_2, allWCumSums, self.trainLens)
#                         discretizedTss = discretizer.discretizeTransformedDataset_(transformedTss, self.trainLens, self.trainLabels, 'ED', 'Default')
                    else:
                        discretizer = SAX.SAX(winLen, wordSize, self.SAXCard,
                                              True, True, self.binSizeTh)
                        discretizedTss = discretizer.discretizeTssFromCumSums_LNR(
                            allCumSums, allCumSums_2, self.trainLens)
#                         transformedTss = discretizer.transfromTssFromCumSums(allCumSums, allCumSums_2, self.trainLens)
#                         discretizedTss = discretizer.discretizeTransformedDataset_(transformedTss, self.trainLens, self.trainLabels, 'GD', 'Default')

                    bop = BOP(discretizer, False)
                    #                         bagTss = bop.getBOP_DiscretizedTss(discretizedTss, True, -1) #With numerosity reduction, no bigrams
                    bagWord = bop.getWordFirstBop_DiscretizedTss(
                        discretizedTss)
                    bopId = self.createBOPId(winLenInd, numBitsWinLen,
                                             wordSizeInd, numBitsWordSize,
                                             wordType, numBitsWordType)
                    curBopsAndBags[bopId] = (bop, bagWord)

                    for word_nid, cntTs in bagWord.items():
                        feats = np.zeros(self.numTrain)
                        for tsId, cnt in cntTs.items():
                            feats[tsId] = cnt
                        chi2Val = chi2(feats, self.trainLabels)
                        word = bu.appendBits(word_nid, bopId, numBitsId)
                        words.append(word)
                        chi2s.append(chi2Val)

                #ranking the words
                numWords = len(words)
                wordRanks = np.argsort(-np.array(chi2s))

                #prepare feature matrix for xmeans clustering
                numFeats = min(numWords, self.maxXMeansFeat)
                feats = np.zeros((self.numTrain, numFeats))
                for i in range(numFeats):
                    idx = wordRanks[i]
                    word = words[idx]
                    bopId = bu.getBits(word, 0, numBitsId)
                    (bop, bagWord) = curBopsAndBags[bopId]
                    word_nid = bu.trimBits(word, numBitsId)
                    for tsId, cnt in bagWord[word_nid].items():
                        feats[tsId][i] = cnt

                #Xmeans
                tic_x = perf_counter()
                XMeansRelabelMap = {
                }  #XMeans label -> originally relabeled label
                XMeansTrainLabels = np.zeros(self.numTrain, dtype='uint32')
                XMeansNumTrainByCls = []
                nextXMeansLabel = 0
                for label, indsToCluster in enumerate(self.trainIndsByCls):
                    numToCluster = len(indsToCluster)
                    if numToCluster > self.minClusterSize:
                        curFeats = feats[indsToCluster][:]
                        initial_centers = kmeans_plusplus_initializer(
                            curFeats, 2).initialize()
                        xmeans_instance = xmeans(curFeats, initial_centers,
                                                 self.maxClusters)
                        xmeans_instance.process()
                        clusters = xmeans_instance.get_clusters()
                    else:
                        clusters = [range(numToCluster)]
                    for cluster in clusters:
                        XMeansTrainLabels[indsToCluster[np.array(
                            cluster)]] = nextXMeansLabel
                        XMeansRelabelMap[nextXMeansLabel] = label
                        XMeansNumTrainByCls.append(len(cluster))
                        nextXMeansLabel += 1
                XMeansNumCls = nextXMeansLabel
                XMeansNumTrainByCls = np.array(XMeansNumTrainByCls)
                self.XMeansTime += perf_counter() - tic_x

                #cross validation
                bopIds, bestAcc_ed_nx, bestAcc_cos_nx, bestPreLabels_ed_nx, bestPreLabels_cos_nx, \
                numSelected_ed_nx, numSelected_cos_nx, meanCntsByCls_nx, tfIdfsByCls_nx, sigmas2Centroids_nx = \
                self.crossValidation(numWords, words, wordRanks, curBopsAndBags, numBitsId, self.trainLabels,
                        self.numCls, self.numTrainByCls, None)
                #                 matsToSave = [meanCntsByCls_nx, tfIdfsByCls_nx, sigmas2Centroids_nx]

                if np.array_equal(XMeansTrainLabels, self.trainLabels):
                    bestAccs = [bestAcc_ed_nx, bestAcc_cos_nx]
                    bestPreLabels = [bestPreLabels_ed_nx, bestPreLabels_cos_nx]
                    numsSelected = np.array(
                        [numSelected_ed_nx, numSelected_cos_nx])
                else:
                    bopIds, bestAcc_ed_x, bestAcc_cos_x, bestPreLabels_ed_x, bestPreLabels_cos_x, numSelected_ed_x, numSelected_cos_x, \
                    meanCntsByCls_x, tfIdfsByCls_x, sigmas2Centroids_x = \
                    self.crossValidation(numWords, words, wordRanks, curBopsAndBags, numBitsId, XMeansTrainLabels,
                            XMeansNumCls, XMeansNumTrainByCls, XMeansRelabelMap)
                    #                     matsToSave += [meanCntsByCls_x, tfIdfsByCls_x, sigmas2Centroids_x]
                    bestAccs = [
                        bestAcc_ed_nx, bestAcc_cos_nx, bestAcc_ed_x,
                        bestAcc_cos_x
                    ]
                    bestPreLabels = [
                        bestPreLabels_ed_nx, bestPreLabels_cos_nx,
                        bestPreLabels_ed_x, bestPreLabels_cos_x
                    ]
                    numsSelected = np.array([
                        numSelected_ed_nx, numSelected_cos_nx,
                        numSelected_ed_x, numSelected_cos_x
                    ])
                xSimIdRange = np.argsort(numsSelected)

                methodId = self.createMethodId(wordSizeInd, numBitsWordSize,
                                               wordType, numBitsWordType)
                numCases = len(xSimIdRange)
                cv1_scores = [None] * numCases
                selectedWords = [None] * numCases
                selectedBopIds = [None] * numCases
                curWords = set()
                curBopIds = set()
                selectedWordInfo = {}  #{word: meanCntByCls, tfIdfByCls}
                bops = {}
                prevNumSelected = 0
                for xSimId in xSimIdRange:
                    cv1_score = bestAccs[xSimId]
                    cv1_scores[xSimId] = cv1_score
                    fineMetId = bu.appendBits(methodId, xSimId, numBitsXSimId)

                    if flg:
                        all_cv1_scores_hy.append(cv1_score)
                        if cv1_score not in all_cv1_scoresMap_hy.keys():
                            all_cv1_scoresMap_hy[cv1_score] = {}
                        all_cv1_scoresMap_hy[cv1_score][
                            fineMetId] = bestPreLabels[xSimId]

                    ###Not timed
                    tic_o = perf_counter()
                    all_cv1_scores.append(cv1_score)
                    if cv1_score not in all_cv1_scoresMap.keys():
                        all_cv1_scoresMap[cv1_score] = {}
                    all_cv1_scoresMap[cv1_score][fineMetId] = bestPreLabels[
                        xSimId]
                    if numCases == 2:
                        otherMetId = bu.appendBits(methodId, xSimId + 2,
                                                   numBitsXSimId)
                        transferMetIdMap[fineMetId] = otherMetId
                    time_o = perf_counter() - tic_o
                    #####################

                    numSelected = numsSelected[xSimId]
                    for i in range(prevNumSelected, numSelected):
                        idx = wordRanks[i]
                        word = words[idx]
                        curWords.add(word)
                        bopId = bopIds[i]
                        if bopId not in bops.keys():
                            curBopIds.add(bopId)
                            bops[bopId] = curBopsAndBags[bopId][0]
                        if numCases == 4:
                            selectedWordInfo[word] = (meanCntsByCls_nx[idx][:],
                                                      tfIdfsByCls_nx[idx][:],
                                                      meanCntsByCls_x[idx][:],
                                                      tfIdfsByCls_x[idx][:])
                        else:
                            selectedWordInfo[word] = (meanCntsByCls_nx[idx][:],
                                                      tfIdfsByCls_nx[idx][:])
                    selectedWords[xSimId] = deepcopy(curWords)
                    selectedBopIds[xSimId] = deepcopy(curBopIds)
                    #                     selectedWords[xSimId] = curWords
                    #                     selectedBopIds[xSimId] = curBopIds
                    prevNumSelected = numSelected

                if numCases == 2:
                    XMeansRelabelMap = None
                    sigmas2Centroids_x = None
                allInfo[methodId] = (bops, selectedWordInfo, selectedBopIds,
                                     selectedWords, sigmas2Centroids_nx,
                                     sigmas2Centroids_x, cv1_scores,
                                     XMeansRelabelMap)

                if flg:
                    self.trainTime += perf_counter() - tic - time_o

        tic = perf_counter()
        all_cv1_scores_hy = -np.sort(-np.array(all_cv1_scores_hy))
        self.fineMetIds_hy = []
        bestAcc_weighted = -1
        votesByLabels_weighted = np.zeros((self.numTrain, self.numCls))
        numMethods = 0
        prev_cv1_score = -1
        for cv1_score in all_cv1_scores_hy:
            if cv1_score == prev_cv1_score:
                continue
            prev_cv1_score = cv1_score
            metPreMap = all_cv1_scoresMap_hy[cv1_score]
            for fineMetId, preLabels_met in metPreMap.items():
                self.fineMetIds_hy.append(fineMetId)
                numMethods += 1

                for j in range(self.numTrain):
                    votesByLabels_weighted[j][preLabels_met[j]] += cv1_score
                preLabels_weighted = np.argmax(votesByLabels_weighted, axis=1)
                acc_weighted = accuracy_score(self.trainLabels,
                                              preLabels_weighted)
                if acc_weighted >= bestAcc_weighted:
                    bestAcc_weighted = acc_weighted
                    bestNumMethods_weighted_hy = numMethods
        self.fineMetIds_hy = self.fineMetIds_hy[:bestNumMethods_weighted_hy]
        self.allInfo_hy = {}
        for fineMetId in self.fineMetIds_hy:
            methodId = bu.trimBits(fineMetId, numBitsXSimId)
            if methodId not in self.allInfo_hy.keys():
                self.allInfo_hy[methodId] = allInfo[methodId]
        self.trainTime += perf_counter() - tic

        ####Not timed
        all_cv1_scores = -np.sort(-np.array(all_cv1_scores))
        allValidWordTypes = [[0, 1], [1]]
        allValidSLAWordSizes = [[6, 8, 10], []]
        allValidSAXWordSizes = [[6, 7, 8], [3, 4, 5, 6, 7, 8]]
        allValidXSimIds = [[0, 1, 2, 3], [0, 1], [2, 3]]
        self.allFineMetIds = []
        self.allBestNumMethods = []
        self.allBestNumMethods_weighted = []
        for i, validWordTypes in enumerate(allValidWordTypes):
            validSLAWordSizes = np.array(allValidSLAWordSizes[i],
                                         dtype='uint32')
            validSLAWordSizeInds = (self.maxSLASize -
                                    validSLAWordSizes) / self.SLASizeStep
            validSAXWordSizes = np.array(allValidSAXWordSizes[i],
                                         dtype='uint32')
            validSAXWordSizeInds = (self.maxSAXSize -
                                    validSAXWordSizes) / self.SAXSizeStep
            for validXSimIds in allValidXSimIds:

                bestAcc = -1
                bestAcc_weighted = -1
                votesByLabels = np.zeros((self.numTrain, self.numCls))
                votesByLabels_weighted = np.zeros((self.numTrain, self.numCls))
                fineMetIds = []
                numMethods = 0
                prev_cv1_score = -1
                for cv1_score in all_cv1_scores:
                    if cv1_score == prev_cv1_score:
                        continue
                    prev_cv1_score = cv1_score
                    metPreMap = all_cv1_scoresMap[cv1_score]
                    for fineMetId, preLabels_met in metPreMap.items():

                        wordType = bu.getBits(fineMetId, numBitsXSimId,
                                              numBitsWordType)
                        if wordType not in validWordTypes:
                            continue

                        if wordType == 0:
                            numBitsWordSize = numBitsSLASize
                        elif wordType == 1:
                            numBitsWordSize = numBitsSAXSize

                        wordSizeInd = bu.getBits(
                            fineMetId, numBitsXSimId + numBitsWordType,
                            numBitsWordSize)
                        if wordType == 0 and wordSizeInd not in validSLAWordSizeInds:
                            continue
                        if wordType == 1 and wordSizeInd not in validSAXWordSizeInds:
                            continue

                        xSimId = bu.getBits(fineMetId, 0, numBitsXSimId)
                        if xSimId not in validXSimIds:
                            if fineMetId not in transferMetIdMap.keys():
                                continue
                            else:
                                otherMethodId = transferMetIdMap[fineMetId]
                                if bu.getBits(
                                        otherMethodId, 0,
                                        numBitsXSimId) not in validXSimIds:
                                    continue
                        fineMetIds.append(fineMetId)
                        numMethods += 1

                        for j in range(self.numTrain):
                            votesByLabels[j][preLabels_met[j]] += 1
                            votesByLabels_weighted[j][
                                preLabels_met[j]] += cv1_score
                        preLabels = np.argmax(votesByLabels, axis=1)
                        preLabels_weighted = np.argmax(votesByLabels_weighted,
                                                       axis=1)
                        acc = accuracy_score(self.trainLabels, preLabels)
                        acc_weighted = accuracy_score(self.trainLabels,
                                                      preLabels_weighted)
                        if acc >= bestAcc:
                            bestAcc = acc
                            bestNumMethods = numMethods
                        if acc_weighted >= bestAcc_weighted:
                            bestAcc_weighted = acc_weighted
                            bestNumMethods_weighted = numMethods
                fineMetIds = fineMetIds[:max(
                    [bestNumMethods, bestNumMethods_weighted])]
                self.allFineMetIds.append(fineMetIds)
                self.allBestNumMethods.append(bestNumMethods)
                self.allBestNumMethods_weighted.append(bestNumMethods_weighted)

        self.allInfo = {}
        for fineMetIds in self.allFineMetIds:
            for fineMetId in fineMetIds:
                methodId = bu.trimBits(fineMetId, numBitsXSimId)
                if methodId not in self.allInfo.keys():
                    self.allInfo[methodId] = allInfo[methodId]
コード例 #11
0
    def increGetWordFirstBop_Tss(self, prevBagWord, numLetters=2, indictBi=-1):
        '''
        Derive the word-first bag for shortened words from an existing bag.

        Each word of prevBagWord is shortened with bu.trimBits by
        numLetters * bu.numBits(self.discretizer.card) bits per unigram.
        All previous words that map onto the same shortened word have their
        entries merged: set union when self.binary is true, otherwise the
        per-series counts are summed.

        The idea of incrementally obtaining bag-of-patterns comes from

        Patrick Schafer:
        The BOSS is concerned with time series classification in the presence of noise. Data Min. Knowl. Discov. 29(6): 1505-1530 (2015)

        This method is suitable to SFA, not to SLA and SAX. We have not used this.

        prevBagWord: {word: set of series ids} when self.binary is true,
            otherwise {word: {series id: count}}.
        numLetters: number of letters removed from every unigram.
        indictBi: when >= 0, bit 0 of every word is read as a bigram flag
            and is re-appended to the shortened word; a non-zero flag means
            the word is handled as two packed unigrams. When negative, words
            are treated as plain unigrams without a flag bit.

        Returns a new dict laid out like prevBagWord, keyed by shortened word.
        '''
        binary = self.binary
        bitsPerLetter = bu.numBits(self.discretizer.card)
        numBitsTrim = bitsPerLetter * numLetters
        # Number of distinct bit patterns the removed letters could have had.
        numSuffixes = 2**numBitsTrim
        curBagWord = {}

        def plainVariants(stem):
            # Previous unigrams (no flag bit) that shorten to stem.
            for sfx in range(numSuffixes):
                yield bu.appendBits(stem, sfx, numBitsTrim)

        def flaggedVariants(stem, flag):
            # Previous flagged unigrams that shorten to stem.
            for sfx in range(numSuffixes):
                yield bu.appendBits(
                    bu.appendBits(stem, sfx, numBitsTrim), flag, 1)

        def bigramVariants(stem0, stem1, numBitsUni, flag):
            # Previous bigrams whose two unigrams shorten to stem0 / stem1.
            for sfx0 in range(numSuffixes):
                full0 = bu.appendBits(stem0, sfx0, numBitsTrim)
                for sfx1 in range(numSuffixes):
                    full1 = bu.appendBits(stem1, sfx1, numBitsTrim)
                    yield bu.appendBits(
                        bu.appendBits(full0, full1, numBitsUni), flag, 1)

        def absorb(target, variants):
            # Register target, then fold in every variant found in prevBagWord.
            if binary:
                merged = curBagWord[target] = set()
                for cand in variants:
                    if cand in prevBagWord:
                        merged |= prevBagWord[cand]
            else:
                counts = curBagWord[target] = {}
                for cand in variants:
                    if cand in prevBagWord:
                        for tsId, cnt in prevBagWord[cand].items():
                            counts[tsId] = (counts[tsId] + cnt
                                            if tsId in counts else cnt)

        for word in prevBagWord:
            if indictBi >= 0:
                flag = bu.getBits(word, 0, 1)
                if flag:
                    numBitsUni = bitsPerLetter * self.discretizer.wordSize
                    numBitsKept = numBitsUni - numBitsTrim
                    stem0 = bu.trimBits(word, numBitsTrim + numBitsUni + 1)
                    stem1 = bu.getBits(word, numBitsTrim + 1, numBitsKept)
                    newWord = bu.appendBits(
                        bu.appendBits(stem0, stem1, numBitsKept), flag, 1)
                    variants = bigramVariants(stem0, stem1, numBitsUni, flag)
                else:
                    stem = bu.trimBits(word, numBitsTrim + 1)
                    newWord = bu.appendBits(stem, flag, 1)
                    variants = flaggedVariants(stem, flag)
            else:
                newWord = bu.trimBits(word, numBitsTrim)
                variants = plainVariants(newWord)

            # A shortened word is assembled once, the first time it is seen.
            if newWord not in curBagWord:
                absorb(newWord, variants)
        return curBagWord
コード例 #12
0
    def increGetBOP_Ts(self, prevBagTs, numLetters=2, indictBi=-1):
        '''
        Derive one series' bag for shortened words from its existing bag.

        Each word of prevBagTs is shortened with bu.trimBits by
        numLetters * bu.numBits(self.discretizer.card) bits per unigram.
        When self.binary is true the result is the set of shortened words
        for which at least one matching previous word exists; otherwise it
        is a dict mapping each shortened word to the sum of the counts of
        all previous words that map onto it.

        The idea of incrementally obtaining bag-of-patterns comes from

        Patrick Schafer:
        The BOSS is concerned with time series classification in the presence of noise. Data Min. Knowl. Discov. 29(6): 1505-1530 (2015)

        This method is suitable to SFA, not to SLA and SAX. We have not used this.

        prevBagTs: collection of words when self.binary is true, otherwise
            a {word: count} mapping.
        numLetters: number of letters removed from every unigram.
        indictBi: when >= 0, bit 0 of every word is read as a bigram flag
            and is re-appended to the shortened word; a non-zero flag means
            the word is handled as two packed unigrams. When negative, words
            are treated as plain unigrams without a flag bit.

        Returns a new set (binary) or dict (counts) keyed by shortened word.
        '''
        binary = self.binary
        bitsPerLetter = bu.numBits(self.discretizer.card)
        numBitsTrim = bitsPerLetter * numLetters
        # Number of distinct bit patterns the removed letters could have had.
        numSuffixes = 2**numBitsTrim
        curBagTs = set() if binary else {}
        # Words to walk and to test membership against.
        prevWords = prevBagTs if binary else prevBagTs.keys()

        def plainVariants(stem):
            # Previous unigrams (no flag bit) that shorten to stem.
            for sfx in range(numSuffixes):
                yield bu.appendBits(stem, sfx, numBitsTrim)

        def flaggedVariants(stem, flag):
            # Previous flagged unigrams that shorten to stem.
            for sfx in range(numSuffixes):
                yield bu.appendBits(
                    bu.appendBits(stem, sfx, numBitsTrim), flag, 1)

        def bigramVariants(stem0, stem1, numBitsUni, flag):
            # Previous bigrams whose two unigrams shorten to stem0 / stem1.
            for sfx0 in range(numSuffixes):
                full0 = bu.appendBits(stem0, sfx0, numBitsTrim)
                for sfx1 in range(numSuffixes):
                    full1 = bu.appendBits(stem1, sfx1, numBitsTrim)
                    yield bu.appendBits(
                        bu.appendBits(full0, full1, numBitsUni), flag, 1)

        for word in prevWords:
            if indictBi >= 0:
                flag = bu.getBits(word, 0, 1)
                if flag:
                    numBitsUni = bitsPerLetter * self.discretizer.wordSize
                    numBitsKept = numBitsUni - numBitsTrim
                    stem0 = bu.trimBits(word, numBitsTrim + numBitsUni + 1)
                    stem1 = bu.getBits(word, numBitsTrim + 1, numBitsKept)
                    newWord = bu.appendBits(
                        bu.appendBits(stem0, stem1, numBitsKept), flag, 1)
                    variants = bigramVariants(stem0, stem1, numBitsUni, flag)
                else:
                    stem = bu.trimBits(word, numBitsTrim + 1)
                    newWord = bu.appendBits(stem, flag, 1)
                    variants = flaggedVariants(stem, flag)
            else:
                newWord = bu.trimBits(word, numBitsTrim)
                variants = plainVariants(newWord)

            # Each shortened word is resolved only the first time it appears.
            if newWord in curBagTs:
                continue
            if binary:
                # Presence is all that matters: stop at the first match.
                if any(cand in prevWords for cand in variants):
                    curBagTs.add(newWord)
            else:
                # Total count over every previous word that collapses here.
                curBagTs[newWord] = sum(prevBagTs[cand] for cand in variants
                                        if cand in prevWords)
        return curBagTs