Beispiel #1
0
def getTagsForListOfVideoIds(youtube, videoIds):
    """Fetch tag and topic data for a list of YouTube video ids.

    Args:
        youtube: an authorized YouTube Data API client resource.
        videoIds: iterable of video-id strings.

    Returns:
        dict mapping video title -> {'tags': [...], 'topicDetails': {...}};
        either key is omitted when the API response lacks it.
    """
    videoTagData = {}

    # videos().list accepts at most 50 ids per request, so query in chunks.
    for chunk in GeneralUtil.chunkList(videoIds, 50):
        # join replaces the quadratic `+=` loop and the manual [:-1] strip
        idString = ",".join(chunk)

        request = youtube.videos().list(
            part="snippet,topicDetails",
            id=idString
        )
        response = request.execute()

        for video in response['items']:
            tagData = {}

            if 'tags' in video['snippet']:
                tagData['tags'] = video['snippet']['tags']
            # topicDetails is not present for every video
            if 'topicDetails' in video:
                tagData['topicDetails'] = video['topicDetails']

            videoTagData[video['snippet']['title']] = tagData

    return videoTagData
Beispiel #2
0
 def __init__(self, winLen, wordSize, card, meanNorm = True, stdNorm = True, binSizeTh = 3, step = -1):
     """SAX discretizer: store normalization flags and precompute the segment layout."""
     super().__init__(winLen, wordSize, card, binSizeTh, step)
     self.type = 'SAX'
     self.meanNorm = meanNorm
     self.stdNorm = stdNorm
     # A fully z-normalized window has fixed statistics: mean 0, std 1.
     if meanNorm and stdNorm:
         self.avg = 0
         self.stdv = 1
     starts = gu.getSegStarts(self.winLen, self.wordSize)
     self.segStarts = starts
     # Per-segment lengths: difference of consecutive start offsets.
     self.segSizes = starts[1:] - starts[:-1]
Beispiel #3
0
    def transformSub(self, cumSums, cumSums_2, wCumSums, pos):
        """Transform the window starting at *pos* into interleaved slope/intercept features.

        Fits a least-squares line to each of the window's segments (on the
        optionally mean/std-normalized values) using only prefix sums, so each
        segment costs O(1). Slopes go to even positions and intercepts to odd
        positions of the result; assumes self.wordSize is even — TODO confirm
        against the constructor.

        Args:
            cumSums: prefix sums of the series values.
            cumSums_2: prefix sums of the squared values.
            wCumSums: prefix sums of index-weighted values (value * timestamp).
            pos: start index of the window.

        Returns:
            np.ndarray of length self.wordSize with slopes/intercepts interleaved.
        """

        transformedSub = np.zeros(self.wordSize)

        # Window mean and std, per the normalization flags; prefix sums give
        # both in O(1).
        if not (self.meanNorm or self.stdNorm):
            meanSub = 0
            sigmaSub = 1
        elif self.stdNorm:
            meanSub = (cumSums[pos + self.winLen] - cumSums[pos]) / self.winLen
            meanSub_2 = (cumSums_2[pos + self.winLen] -
                         cumSums_2[pos]) / self.winLen
            varSub = meanSub_2 - meanSub * meanSub
            sigmaSub = np.sqrt(varSub) if varSub > 0 else 1  # guard (near-)constant windows
            if not self.meanNorm:
                meanSub = 0
        else:
            meanSub = (cumSums[pos + self.winLen] - cumSums[pos]) / self.winLen
            sigmaSub = 1

        # Timestamp parameters: per-segment sums/means of x and x^2 for the
        # regression denominators.
        startPts = self.segStarts[:len(self.segStarts) - 1] + pos
        finishPts = self.segStarts[1:] + pos
        sum_X = gu.getAriSeqSum(startPts, finishPts - 1)
        mean_X = sum_X / self.segSizes
        mean_X2 = gu.getSumOfSquares(startPts, finishPts - 1) / self.segSizes

        # Segment parameters: normalized segment means and weighted means.
        #             sumSegs = cumSums[self.segStarts[1 :]] - cumSums[self.segStarts[: len(self.segStarts) - 1]]
        sumSegs = cumSums[finishPts] - cumSums[startPts]
        meanSegs = (sumSegs / self.segSizes - meanSub) / sigmaSub
        #             wCumSegs = wCumSums[self.segStarts[1 :]] - wCumSums[self.segStarts[: len(self.segStarts) - 1]]
        wCumSegs = wCumSums[finishPts] - wCumSums[startPts]
        wMeanSegs = (wCumSegs - meanSub * sum_X) / self.segSizes / sigmaSub

        # Least-squares coefficients per segment: slope = cov(x, y) / var(x).
        slopes = (wMeanSegs - mean_X * meanSegs) / (mean_X2 - mean_X * mean_X)
        intercepts = meanSegs - slopes * mean_X
        if self.posNorm:
            intercepts += startPts * slopes  #shift to the same starting timestamp of 0
        transformedSub[0:self.wordSize - 1:2] = slopes
        transformedSub[1:self.wordSize:2] = intercepts
        return transformedSub
def infoGain_singleSplit(vals, labels, retMajorClasses = False):
        """Find the binary split threshold on *vals* that maximizes information gain.

        Scans the sorted values once, migrating items from the "out" side
        (>= threshold) to the "in" side (< threshold) and evaluating the gain
        at every value boundary.

        Args:
            vals: 1-D np.array of feature values.
            labels: 1-D np.array of class labels aligned with vals.
            retMajorClasses: if True, also return the majority class(es) of
                the items at or above the split point.

        Returns:
            (bestGain, splitPt) or (bestGain, splitPt, majorClasses);
            (-1, -1) / (-1, -1, -1) when vals is constant.
        """
        
        #takes in np.array
        
        if len(np.unique(vals)) == 1:   #no distinguishing power at all
            if retMajorClasses:
                return (-1, -1, -1)
            return (-1, -1)
        
        total = len(vals)
        order = np.argsort(vals)
        sortedVals = vals[order]
        sortedLabels = labels[order]
        
        bestGain = -1
        bestPos = -1
        
        # cOut starts holding all class counts; items are moved to cIn as the
        # scan advances. NOTE: cOut is mutated in the loop and rebound below.
        uniqLabels, cOut = np.unique(sortedLabels, return_counts = True)
        numCls = len(np.unique(sortedLabels))
        
        # map label value -> index into the count arrays
        labelMap = {}
        for i in range(numCls):
            labelMap[uniqLabels[i]] = i
        
        entAll = entropy(cOut, total)
        
        lastCVal = sortedVals[0]
        nOut = total
        nIn = 0
        cIn = np.zeros(numCls)
        
        for split in range(total):
            cVal = sortedVals[split]
            
            # Only evaluate at boundaries between distinct values; at least
            # one exists because vals is not constant.
            if lastCVal != cVal:
                gain = infoGain(cIn, cOut, entAll, total, nIn, nOut)
                if gain >= bestGain:
                    bestPos = split
                    bestGain = gain
                lastCVal = cVal
                
            # move item `split` from the out side to the in side
            labelIdx = labelMap[sortedLabels[split]]
            cOut[labelIdx] -= 1
            nOut -= 1
            cIn[labelIdx] += 1
            nIn += 1
        
        splitPt = sortedVals[bestPos]
        if retMajorClasses:
            labelsOut = sortedLabels[sortedVals >= splitPt]
            uniqLabelsOut, cOut = np.unique(labelsOut, return_counts = True)
            maxC, maxCIdx = gu.maxWithTies(cOut)
            majorClasses = uniqLabelsOut[maxCIdx]
            return bestGain, splitPt, majorClasses
        return bestGain, splitPt
Beispiel #5
0
def getChannelData(youtube, channelIds, part):
    """Fetch the requested API parts for a list of YouTube channel ids.

    Args:
        youtube: an authorized YouTube Data API client resource.
        channelIds: iterable of channel-id strings.
        part: comma-separated API part string (e.g. "snippet,statistics").

    Returns:
        dict mapping channel id -> full API item.
    """
    channelMap = {}

    # channels().list accepts at most 50 ids per request, so query in chunks.
    for chunk in GeneralUtil.chunkList(channelIds, 50):
        # join replaces the quadratic `+=` loop (which also shadowed builtin `id`)
        idString = ",".join(chunk)
        request = youtube.channels().list(part=part, id=idString)

        response = request.execute()

        for item in response['items']:
            channelMap[item['id']] = item

    return channelMap
Beispiel #6
0
 def __init__(self,
              winLen,
              wordSize,
              card,
              meanNorm=True,
              stdNorm=True,
              posNorm=True,
              binSizeTh=3,
              step=-1):
     """SLA (slope + intercept) discretizer setup.

     Forces wordSize to be even so each segment contributes one slope and
     one intercept, stores the normalization flags, and precomputes segment
     start offsets and sizes for wordSize // 2 segments per window.
     """
     super().__init__(winLen, wordSize, card, binSizeTh, step)
     self.type = 'SLA'
     if self.wordSize % 2:
         self.wordSize += 1  #make it even so that both the slopes and the intercepts can be kept
     self.meanNorm = meanNorm
     self.stdNorm = stdNorm
     self.posNorm = posNorm
     # Integer division: wordSize is even here and getSegStarts receives a
     # segment *count*; '/' passed a float. NOTE(review): the SAX variant
     # passes an int count — confirm getSegStarts never relied on float input.
     self.segStarts = gu.getSegStarts(self.winLen, self.wordSize // 2)
     self.segSizes = self.segStarts[1:] - self.segStarts[:-1]
Beispiel #7
0
def getChannelTopics(youtube, channelIds):
    """Fetch the topic categories for a list of YouTube channel ids.

    Args:
        youtube: an authorized YouTube Data API client resource.
        channelIds: iterable of channel-id strings.

    Returns:
        dict mapping channel title -> list of topic-category URLs. Channels
        whose response carries no topicDetails are skipped (previously this
        raised KeyError, unlike the guarded video-tag helper).
    """
    channelTopicMap = {}

    # channels().list accepts at most 50 ids per request, so query in chunks.
    for chunk in GeneralUtil.chunkList(channelIds, 50):
        # join replaces the quadratic `+=` loop and the manual [:-1] strip
        idString = ",".join(chunk)
        request = youtube.channels().list(
            part="snippet,topicDetails",
            id=idString
        )

        response = request.execute()

        for item in response['items']:
            topicDetails = item.get('topicDetails')
            if topicDetails is None:
                continue  # not every channel has topic data
            channelTopicMap[item['snippet']['title']] = topicDetails['topicCategories']

    return channelTopicMap
Beispiel #8
0
def getAllMeanX2(tsLen, segSize):
    """Mean of squared timestamps for every length-*segSize* window of a length-*tsLen* series."""
    lastStart = tsLen - segSize
    windowStarts = np.arange(lastStart + 1)
    windowEnds = windowStarts + segSize - 1
    return gu.getSumOfSquares(windowStarts, windowEnds) / segSize
Beispiel #9
0
def getAllSumX(tsLen, segSize):
    """Sum of timestamps for every length-*segSize* sliding window.

    Consecutive windows shift by one, so their timestamp sums form an
    arithmetic progression with common difference segSize.
    """
    firstSum = gu.getAriSeqSum(0, segSize - 1, 1)
    numWindows = tsLen - segSize + 1
    return firstSum + segSize * np.arange(numWindows)
Beispiel #10
0
def getAllCumSums(data):
    """Prefix sums of the series, its squares, and its index-weighted values."""
    positions = np.arange(data.shape[-1])
    return (
        gu.getCumSums(data),
        gu.getCumSums(data * data),
        gu.getCumSums(data * positions),
    )
Beispiel #11
0
 def getCumSums_Ts(self, ts):
     """Return the cumulative-sum array of 1-D time series *ts* (delegates to gu.getCumSums_1D)."""
     return gu.getCumSums_1D(ts)
Beispiel #12
0
    def test(self):
        """Classify every test series by majority vote over the trained methods.

        Each (bop, selectedWords, selectedWordInfo, sigma2Centroids) entry in
        self.allInfo casts one 1-NN vote per similarity measure it belongs to
        (simId 0: squared Euclidean on word counts; simId 1: cosine-style on
        tf values). Sets self.preLabels, self.accuracy and the mean
        self.testTimePerTs.
        """

        self.testTimePerTs = 0

        self.preLabels = np.zeros(self.numTest, dtype='uint32')
        for tsId in range(self.numTest):
            # lightweight progress output
            if int(tsId) % 10 == 0:
                print(tsId, end=', ')
                sys.stdout.flush()
            if int(tsId) % 100 == 0:
                print()
                sys.stdout.flush()


#             ts = scale(self.testTss[tsId])
            ts = np.array(self.testTss[tsId])
            tsLen = self.testLens[tsId]

            tic = perf_counter()

            # prefix sums of the series and its squares, shared by all methods
            cumSums = gu.getCumSums(ts)
            cumSums_2 = gu.getCumSums(ts * ts)
            votes = np.zeros(self.numCls, dtype='uint32')
            for methodId, (bop, selectedWords, selectedWordInfo,
                           sigma2Centroids) in self.allInfo.items():

                # If the window is longer than this series, extend the prefix
                # sums by repeating the last value (equivalent to zero-padding
                # the series itself).
                if bop.discretizer.winLen > tsLen:
                    curCumSums = np.concatenate(
                        (cumSums, cumSums[-1] *
                         np.ones(bop.discretizer.winLen - tsLen)))
                    curCumSums_2 = np.concatenate(
                        (cumSums_2, cumSums_2[-1] *
                         np.ones(bop.discretizer.winLen - tsLen)))
                else:
                    curCumSums = cumSums
                    curCumSums_2 = cumSums_2

                # transform -> discretize -> bag-of-patterns for this method
                transformedTs = bop.discretizer.transformTsFromCumSums(
                    curCumSums, curCumSums_2)
                discretizedTs = bop.discretizer.discretizeTransformedTs(
                    transformedTs)
                bagTs = bop.getBOP_DiscretizedTs(discretizedTs)

                for simId, methodIds in enumerate(self.allMethodIds):
                    if methodId not in methodIds:
                        continue
                    curSelectedWords = selectedWords[simId]

                    dists = np.zeros(self.numCls)
                    if simId == 1:
                        sigma2Ts = 0
                        sigmaProd = np.zeros(self.numCls)
                    for word in curSelectedWords:
                        infoByCls = selectedWordInfo[word][simId]

                        cnt = 0
                        if word in bagTs.keys():
                            cnt = bagTs[word]

                        if simId == 0:  #ed: squared Euclidean on raw counts
                            dists += (cnt - infoByCls)**2
                        else:
                            # tf weighting: 1 + log10(count), 0 for absent words
                            tf = 0 if cnt == 0 else 1 + np.log10(cnt)
                            sigma2Ts += tf**2
                            sigmaProd += tf * infoByCls

                    if simId == 1:
                        divide = sigma2Ts * sigma2Centroids
                        divide[np.where(divide == 0)] = -1  # sentinel: avoid division by zero
                        dists = 1 - sigmaProd**2 / divide
                    preLabel = np.argmin(dists)
                    votes[preLabel] += 1

            self.preLabels[tsId] = np.argmax(votes)

            toc = perf_counter()
            self.testTimePerTs += toc - tic

        self.accuracy = accuracy_score(self.testLabels, self.preLabels)
        self.testTimePerTs /= self.numTest
Beispiel #13
0
    def train(self):
        """Train the ensemble over a grid of (wordSize, winLen) SAX configurations.

        For each configuration: build bags of words over the training set,
        filter words by an F-statistic, cross-validate two similarity measures
        (index 0 and 1 of the per-word info tuples), then keep the top-K
        methods per measure and drop a measure whose average CV accuracy falls
        at or below accRatio * best. Populates self.allMethodIds, self.allInfo
        and self.trainTime.
        """

        # Zero-pad every training series to the maximum length so they stack
        # into one 2-D array.
        trainTss_padded = []
        maxTsLen = max(self.trainLens)
        for i in range(self.numTrain):
            ts = np.array(self.trainTss[i])
            tsLen = self.trainLens[i]
            #             zTs = scale(np.array(ts))
            #             zTs = np.concatenate((zTs, np.zeros(maxTsLen - tsLen)))
            #             trainTss_padded.append(zTs)
            ts = np.concatenate((ts, np.zeros(maxTsLen - tsLen)))
            trainTss_padded.append(ts)
        trainTss_padded = np.array(trainTss_padded)

        # Window-length search range derived from ratios of the shortest
        # training series, then clamped so windows never exceed that series.
        self.minWinLen = np.maximum(int(
            np.around(self.minTrainLen * self.minWinRatio)),
                                    self.minWinLen,
                                    dtype='int32')
        self.maxWinLen = np.minimum(int(
            np.around(self.minTrainLen * self.maxWinRatio)),
                                    self.minTrainLen,
                                    dtype='int32')
        self.winLenStep = np.maximum(int(
            np.around(self.minTrainLen * self.winRatioStep)),
                                     1,
                                     dtype='int32')
        if self.minTrainLen < self.minWinLen:
            self.minWinLen = self.minTrainLen
        if self.minTrainLen < self.maxWinLen:
            self.maxWinLen = self.minTrainLen
#         numBitsWinLen = bu.numBits(np.ceil((self.maxWinLen - self.minWinLen) / self.winLenStep) + 1)
#         numBitsWordSize = bu.numBits(np.ceil((self.maxWordSize - self.minWordSize) / self.wordSizeStep) + 1)

        tic = perf_counter()

        # prefix sums shared by every configuration
        allCumSums = gu.getCumSums(trainTss_padded)
        allCumSums_2 = gu.getCumSums(trainTss_padded * trainTss_padded)

        all_cv1_scores = [[], []]  # CV accuracies per similarity measure
        #         allMethodIds = [[], []]
        allInfo = []
        for wordSize in range(self.minWordSize, self.maxWordSize + 1,
                              self.wordSizeStep):
            for winLen in range(self.minWinLen, self.maxWinLen + 1,
                                self.winLenStep):

                # Bag-of-patterns for this (winLen, wordSize) configuration.
                discretizer = SAX.SAX(winLen, wordSize, self.card, True, True,
                                      self.binSizeTh)
                transformedTss = discretizer.transfromTssFromCumSums(
                    allCumSums, allCumSums_2, self.trainLens)
                discretizedTss = discretizer.discretizeTransformedDataset_(
                    transformedTss, self.trainLens, None, 'GD', 'Default')
                bop = BOP(discretizer, False)
                bagWord = bop.getWordFirstBop_DiscretizedTss(discretizedTss)

                # Keep only words with a nonzero F-statistic across classes.
                words = []
                fs = []
                for word, cntTs in bagWord.items():
                    feats = np.zeros(self.numTrain)
                    for tsId, cnt in cntTs.items():
                        feats[tsId] = cnt
                    f = FStat_2(feats, self.trainLabels, self.numCls)
                    if f:
                        words.append(word)
                        fs.append(f)

                numWords = len(words)
                if numWords == 0:
                    continue
                wordRanks = np.argsort(-np.array(fs))  # best F-stat first

                bestAcc_ed, bestAcc_cos, numSelected_ed, numSelected_cos, meanCntsByCls, tfIdfsByCls, sigmas2Centroids\
                 = self.crossValidation(numWords, words, wordRanks, bagWord)

                bestAccs = [bestAcc_ed, bestAcc_cos]
                numsSelected = np.array([numSelected_ed, numSelected_cos])
                simIdRange = np.argsort(numsSelected)
                selectedWordInfo = {}
                selectedWords = [None, None]

                #                 methodId = self.createMethodId(winLenInd, numBitsWinLen, wordSizeInd, numBitsWordSize)
                # Process measures in order of word count so the smaller set
                # is a prefix of the larger one and info is recorded once.
                prevNumSelected = 0
                curSelectedWords = set()
                for simId in simIdRange:
                    cv1_score = bestAccs[simId]
                    all_cv1_scores[simId].append(cv1_score)
                    #                     allMethodIds[simId].append(methodId)

                    numSelected = numsSelected[simId]
                    for i in range(prevNumSelected, numSelected):
                        idx = wordRanks[i]
                        word = words[idx]
                        curSelectedWords.add(word)
                        selectedWordInfo[word] = (meanCntsByCls[idx][:],
                                                  tfIdfsByCls[idx][:])
                    selectedWords[simId] = deepcopy(curSelectedWords)
                    prevNumSelected = numSelected
                allInfo.append(
                    (bop, selectedWords, selectedWordInfo, sigmas2Centroids))

        # Per measure, keep the indices of the top-K CV scores.
        self.allMethodIds = []
        allAvgAcc = np.empty(2)
        for i in range(2):
            cur_cv1_scores = np.array(all_cv1_scores[i])
            numMet = len(cur_cv1_scores)
            if numMet > self.topK:
                methodIds = np.argpartition(-cur_cv1_scores,
                                            self.topK)[:self.topK]
            else:
                methodIds = np.arange(numMet)
            self.allMethodIds.append(set(methodIds))
            allAvgAcc[i] = np.mean(cur_cv1_scores[methodIds])
        maxAcc = np.amax(allAvgAcc)
        # Drop a measure whose average accuracy is not competitive.
        for i in range(2):
            if allAvgAcc[i] <= self.accRatio * maxAcc:
                self.allMethodIds[i] = set()

        # Merge the surviving method ids into one id -> info mapping.
        self.allInfo = {}
        for methodIds in self.allMethodIds:
            for methodId in methodIds:
                if methodId not in self.allInfo.keys():
                    self.allInfo[methodId] = allInfo[methodId]

        toc = perf_counter()
        self.trainTime = toc - tic