예제 #1
0
    def test(self):
        """Predict a label for every test series and score the predictions.

        For each test series the cumulative sums of the values and of the
        squared values are computed once.  Every retained method in
        ``self.allInfo`` then turns the series into a bag of words and casts
        one vote for each similarity measure it was selected for in
        ``self.allMethodIds``.  The class with the most votes is the
        prediction.

        Similarity ids, as used below:
            0 -- sum over the selected words of the squared difference
                 between the word count and the stored per-class value
                 (tagged "ed" in the original code).
            1 -- ``1 - dot**2 / (norm2_ts * sigma2Centroids)`` where the
                 series side uses ``tf = 1 + log10(count)`` (0 for a
                 missing word).  ``sigma2Centroids`` is presumably the
                 squared norm of each class centroid -- confirm against
                 ``crossValidation``.

        Side effects:
            self.preLabels: uint32 array holding one predicted class per
                test series.
            self.accuracy: ``accuracy_score(self.testLabels,
                self.preLabels)``.
            self.testTimePerTs: mean ``perf_counter`` time per series;
                loading the series and the progress output are not timed.
        """
        self.testTimePerTs = 0
        self.preLabels = np.zeros(self.numTest, dtype='uint32')

        for seriesIdx in range(self.numTest):
            # Progress trace: the index every 10 series, a line break every
            # 100 series.
            if int(seriesIdx) % 10 == 0:
                print(seriesIdx, end=', ')
                sys.stdout.flush()
            if int(seriesIdx) % 100 == 0:
                print()
                sys.stdout.flush()

            # NOTE: z-normalisation through scale() is disabled here; the
            # raw values are used.
            series = np.array(self.testTss[seriesIdx])
            seriesLen = self.testLens[seriesIdx]

            started = perf_counter()

            sums = gu.getCumSums(series)
            squaredSums = gu.getCumSums(series * series)
            ballots = np.zeros(self.numCls, dtype='uint32')

            for methodId, methodInfo in self.allInfo.items():
                (bop, selectedWords, selectedWordInfo,
                 sigma2Centroids) = methodInfo
                discretizer = bop.discretizer

                if discretizer.winLen > seriesLen:
                    # The window is longer than the series: extend both
                    # cumulative-sum arrays by repeating their last entry.
                    filler = np.ones(discretizer.winLen - seriesLen)
                    methodSums = np.concatenate((sums, sums[-1] * filler))
                    methodSquaredSums = np.concatenate(
                        (squaredSums, squaredSums[-1] * filler))
                else:
                    methodSums = sums
                    methodSquaredSums = squaredSums

                transformed = discretizer.transformTsFromCumSums(
                    methodSums, methodSquaredSums)
                discretized = discretizer.discretizeTransformedTs(transformed)
                bag = bop.getBOP_DiscretizedTs(discretized)

                for simId, chosenMethods in enumerate(self.allMethodIds):
                    if methodId not in chosenMethods:
                        continue
                    wordsForSim = selectedWords[simId]

                    dists = np.zeros(self.numCls)
                    if simId == 1:
                        tsNorm2 = 0
                        dotProducts = np.zeros(self.numCls)

                    for word in wordsForSim:
                        classInfo = selectedWordInfo[word][simId]
                        # A word absent from this series counts as zero.
                        count = bag[word] if word in bag.keys() else 0

                        if simId == 0:
                            dists += (count - classInfo)**2
                            continue
                        tf = 1 + np.log10(count) if count != 0 else 0
                        tsNorm2 += tf**2
                        dotProducts += tf * classInfo

                    if simId == 1:
                        denominator = tsNorm2 * sigma2Centroids
                        # Avoid dividing by zero; any non-zero stand-in
                        # keeps the quotient finite.
                        denominator[denominator == 0] = -1
                        dists = 1 - dotProducts**2 / denominator

                    # The closest class receives this method's vote.
                    ballots[np.argmin(dists)] += 1

            self.preLabels[seriesIdx] = np.argmax(ballots)

            self.testTimePerTs += perf_counter() - started

        self.accuracy = accuracy_score(self.testLabels, self.preLabels)
        self.testTimePerTs /= self.numTest
예제 #2
0
def getAllCumSums(data):
    """Return three cumulative-sum arrays derived from *data*.

    ``gu.getCumSums`` is applied to the data itself, to its element-wise
    square, and to the data multiplied by the position index along the
    last axis (0, 1, 2, ...).  *data* must expose ``.shape`` and support
    element-wise multiplication (e.g. a NumPy array).

    Returns:
        A tuple ``(cumSums, cumSums_2, weightedCumSums)`` holding the three
        results in that order; their exact layout is whatever
        ``gu.getCumSums`` produces.
    """
    plainSums = gu.getCumSums(data)
    squaredSums = gu.getCumSums(data * data)
    # Weight every sample by its position along the last axis.
    positions = np.arange(data.shape[-1])
    positionWeightedSums = gu.getCumSums(data * positions)
    return plainSums, squaredSums, positionWeightedSums
예제 #3
0
    def train(self):
        """Fit the ensemble of SAX bag-of-words configurations.

        Steps, in order:
          1. Right-pad every training series with zeros to the longest
             training length and stack them into one 2-D array.
          2. Derive the window-length range (``minWinLen``, ``maxWinLen``,
             ``winLenStep``) from ``minTrainLen`` and the configured ratios.
          3. For every (word size, window length) pair build a SAX
             discretizer and a word-first bag, keep the words for which
             ``FStat_2`` returns a truthy score, rank them by descending
             score and let ``self.crossValidation`` pick how many top words
             to use for each of the two similarity measures.
          4. Per similarity keep at most ``topK`` configurations with the
             highest cross-validation score, then drop a similarity
             entirely when its mean score is not above
             ``accRatio * best mean``.

        Side effects:
            self.minWinLen, self.maxWinLen, self.winLenStep: updated window
                settings.
            self.allMethodIds: list of two sets of retained configuration
                ids (positions in the candidate list), one per similarity.
            self.allInfo: dict id -> ``(bop, selectedWords,
                selectedWordInfo, sigmas2Centroids)`` for retained ids.
            self.trainTime: elapsed ``perf_counter`` time; the padding and
                window set-up above the timer are not included.
        """
        # NOTE: z-normalisation through scale() is disabled here; the raw
        # values are padded and used as they are.
        longest = max(self.trainLens)
        paddedRows = []
        for rowIdx in range(self.numTrain):
            raw = np.array(self.trainTss[rowIdx])
            gap = longest - self.trainLens[rowIdx]
            paddedRows.append(np.concatenate((raw, np.zeros(gap))))
        paddedTrain = np.array(paddedRows)

        # Window lengths are expressed as ratios of the shortest training
        # series and clamped so that they never exceed that length.
        ratioMin = int(np.around(self.minTrainLen * self.minWinRatio))
        self.minWinLen = np.maximum(ratioMin, self.minWinLen, dtype='int32')
        ratioMax = int(np.around(self.minTrainLen * self.maxWinRatio))
        self.maxWinLen = np.minimum(ratioMax,
                                    self.minTrainLen,
                                    dtype='int32')
        ratioStep = int(np.around(self.minTrainLen * self.winRatioStep))
        self.winLenStep = np.maximum(ratioStep, 1, dtype='int32')
        if self.minWinLen > self.minTrainLen:
            self.minWinLen = self.minTrainLen
        if self.maxWinLen > self.minTrainLen:
            self.maxWinLen = self.minTrainLen

        started = perf_counter()

        cumSums = gu.getCumSums(paddedTrain)
        cumSums_2 = gu.getCumSums(paddedTrain * paddedTrain)

        # cvScoresBySim[simId][k] is the cross-validation score of the k-th
        # candidate; candidates[k] holds that candidate's artefacts.  Both
        # grow in lock-step, so k doubles as the method id.
        cvScoresBySim = [[], []]
        candidates = []
        for wordSize in range(self.minWordSize, self.maxWordSize + 1,
                              self.wordSizeStep):
            for winLen in range(self.minWinLen, self.maxWinLen + 1,
                                self.winLenStep):

                # The meaning of the positional True, True flags is not
                # visible from here -- see SAX.SAX.
                discretizer = SAX.SAX(winLen, wordSize, self.card, True, True,
                                      self.binSizeTh)
                # "transfrom" (sic) is the method name exposed by SAX.
                transformed = discretizer.transfromTssFromCumSums(
                    cumSums, cumSums_2, self.trainLens)
                discretized = discretizer.discretizeTransformedDataset_(
                    transformed, self.trainLens, None, 'GD', 'Default')
                bop = BOP(discretizer, False)
                bagWord = bop.getWordFirstBop_DiscretizedTss(discretized)

                # Score each word on its per-series counts and keep only
                # the words whose score is truthy.
                keptWords = []
                keptScores = []
                for word, countsBySeries in bagWord.items():
                    counts = np.zeros(self.numTrain)
                    for seriesIdx, count in countsBySeries.items():
                        counts[seriesIdx] = count
                    score = FStat_2(counts, self.trainLabels, self.numCls)
                    if not score:
                        continue
                    keptWords.append(word)
                    keptScores.append(score)

                numWords = len(keptWords)
                if numWords == 0:
                    continue
                # Highest score first.
                wordRanks = np.argsort(-np.array(keptScores))

                cvResult = self.crossValidation(numWords, keptWords,
                                                wordRanks, bagWord)
                (bestAcc_ed, bestAcc_cos, numSelected_ed, numSelected_cos,
                 meanCntsByCls, tfIdfsByCls, sigmas2Centroids) = cvResult

                bestAccs = [bestAcc_ed, bestAcc_cos]
                numsSelected = np.array([numSelected_ed, numSelected_cos])
                selectedWordInfo = {}
                selectedWords = [None, None]

                # Handle the similarity that needs fewer words first, so the
                # other one only has to add the remaining ranks to the same
                # growing set.
                alreadyTaken = 0
                growingWords = set()
                for simId in np.argsort(numsSelected):
                    cvScoresBySim[simId].append(bestAccs[simId])

                    wanted = numsSelected[simId]
                    for rankPos in range(alreadyTaken, wanted):
                        wordIdx = wordRanks[rankPos]
                        word = keptWords[wordIdx]
                        growingWords.add(word)
                        # Slot 0 serves similarity 0, slot 1 similarity 1.
                        selectedWordInfo[word] = (meanCntsByCls[wordIdx][:],
                                                  tfIdfsByCls[wordIdx][:])
                    selectedWords[simId] = deepcopy(growingWords)
                    alreadyTaken = wanted

                candidates.append(
                    (bop, selectedWords, selectedWordInfo, sigmas2Centroids))

        # Per similarity, keep the topK candidates with the best CV score.
        self.allMethodIds = []
        meanScores = np.empty(2)
        for simId, rawScores in enumerate(cvScoresBySim):
            scores = np.array(rawScores)
            numCandidates = len(scores)
            if numCandidates > self.topK:
                keptIds = np.argpartition(-scores, self.topK)[:self.topK]
            else:
                keptIds = np.arange(numCandidates)
            self.allMethodIds.append(set(keptIds))
            meanScores[simId] = np.mean(scores[keptIds])

        # Discard a similarity whose mean score does not exceed the
        # accRatio fraction of the best mean score.
        threshold = self.accRatio * np.amax(meanScores)
        for simId in range(2):
            if meanScores[simId] <= threshold:
                self.allMethodIds[simId] = set()

        # Keep the artefacts of every candidate still referenced by at
        # least one similarity.
        self.allInfo = {}
        for keptIds in self.allMethodIds:
            for methodId in keptIds:
                if methodId in self.allInfo.keys():
                    continue
                self.allInfo[methodId] = candidates[methodId]

        self.trainTime = perf_counter() - started