Ejemplo n.º 1
0
    def test_getTerms(self):
        """Compare every term returned by ``Baseupdator.getTerms`` for user 0
        against values rebuilt here with ``sumOverW`` and ``sigmoid``.

        All constructor arguments except ITEM_FIELDS_NUM are random
        placeholders; ITEM_FIELDS_NUM is taken from the row count of ``W``.
        """
        # Fixtures (dicts are keyed by user id)
        usrid = 0
        usr2labels = {0: [0, 1, 0, 1], 1: [1, 0, 1, 0]}
        usr2NonzeroCols = {0: [1, 3], 1: [0, 2]}
        usr2itemsIndx = {0: [0, 1], 1: [2, 3]}
        usr2negsNonzeroCols = {
            0: [[0, 3], [1, 2]],
            1: [[1, 3], [0, 2]],
        }
        W = np.array([[0, 1, 2, 4], [0, 1, 2, 4]])
        usr_rep = np.array([[1, 1], [2, 2]])[0]  # first row, i.e. [1, 1]

        # Updator configuration: only the field count matters to this test
        neg_sample_num = int(random.random() * 10)
        item_fields_num = W.shape[0]
        max_train_num = int(random.random() * 10)
        learning_rate = random.random()
        momentum = int(random.random() * 10)
        lambda_ = random.random()
        baseupdator = Baseupdator(
            neg_sample_num,
            item_fields_num,
            max_train_num,
            learning_rate,
            momentum,
            lambda_,
        )

        # Values under test
        (y, y_nonzeroCols, itemsIndx, sumedW_y, sigmoid_y,
         y_negsNonzeroCols, sumedW_negs, sigmoids_negs,
         sigmoidedSumedW) = baseupdator.getTerms(
            usrid,
            usr2labels,
            usr2NonzeroCols,
            usr2itemsIndx,
            W,
            usr_rep,
            usr2negsNonzeroCols,
        )

        # Reference values
        want_y = [0, 1, 0, 1]
        want_cols = [1, 3]
        want_items = [0, 1]
        want_sumedW_y = sumOverW(W, want_cols)
        want_sigmoid_y = sigmoid(- usr_rep.transpose().dot(want_sumedW_y))
        want_neg_cols = [[0, 3], [1, 2]]
        want_sumedW_negs = [
            sumOverW(W, cols).reshape(item_fields_num, 1)
            for cols in ([0, 3], [1, 2])
        ]
        want_sigmoids_negs = [
            sigmoid(usr_rep.transpose().dot(summed))
            for summed in want_sumedW_negs
        ]
        want_sigmoidedSumedW = (
            want_sigmoids_negs[0] * want_sumedW_negs[0]
            + want_sigmoids_negs[1] * want_sumedW_negs[1]
        )

        # Checks, in the same order as the tuple returned by getTerms
        self.assertEqual(want_y, y)
        self.assertEqual(want_cols, y_nonzeroCols)
        self.assertEqual(want_items, itemsIndx)
        for pos, want in enumerate(want_sumedW_y):
            self.assertEqual(want, sumedW_y[pos])
        self.assertEqual(want_sigmoid_y, sigmoid_y)
        self.assertEqual(want_neg_cols, y_negsNonzeroCols)
        for neg_pos, want_neg in enumerate(want_sumedW_negs):
            for row, want_row in enumerate(want_neg):
                self.assertEqual(want_row, sumedW_negs[neg_pos][row])
        self.assertEqual(want_sigmoids_negs, sigmoids_negs)
        for pos, want in enumerate(want_sigmoidedSumedW):
            self.assertEqual(want, sigmoidedSumedW[pos])
Ejemplo n.º 2
0
def main(argv):
    """Train and validate W and V with k-fold cross validation.

    Steps, in order: parse ``argv``, build the dataloader, read the training
    configuration, load the user -> item-index mapping, split it into folds
    and, for each fold, randomly initialise ``V`` and ``W`` and update them
    user by user while ``statevalidator.notConv()`` holds. Whenever the
    validator asks for it, the loss is computed, labels are predicted and
    the KPIs are written through ``statevalidator.writeCSVStats``.

    Args:
        argv: argument list. The first four entries are passed positionally
            to ``parseArgs``; each later entry must have the form
            ``key=value`` and is passed to ``parseArgs`` as a keyword
            argument.

    Returns:
        1 after every fold has been processed.

    Raises:
        Exception: if ``foldNum`` is larger than the number of loaded users.
    """
    # Parse args, init dataloader.
    # Split on the first '=' only, so a value may itself contain '='.
    foldNum, dataset, subtitle, rating_file, usr2labels_file = parseArgs(
        argv[:4], **dict(arg.split('=', 1) for arg in argv[4:]))
    if rating_file and usr2labels_file:
        dataloader = DATA2LOADER[dataset](
            rating_file=rating_file,
            usr2labels_file=usr2labels_file,
            sub=subtitle,
        )
    else:
        dataloader = DATA2LOADER[dataset]()

    # Load training configs
    NEG_SAMPLE_NUM, \
        ITEM_FIELDS_NUM, \
        MAX_TRAIN_NUM, \
        LEARNING_RATE, \
        MOMENTUM, \
        LAMBDA = dataloader.getTrainingConf()

    # Load each usr's BOI (and for valid data)
    usr2itemsIndx, ind2itemNum = dataloader.load()
    # A real list (not a lazy map object) so len() works on Python 2 and 3.
    usrs = list(usr2itemsIndx)

    # Assert enough usrs
    if foldNum > len(usrs):
        s = ' '.join(['foldNum: ', str(foldNum), '>',
                      'usrNums:', str(len(usrs))])
        raise Exception(s)

    # Acquire (for all usrs) usr2labels & usr2NonzeroCols
    usr2labels, usr2NonzeroCols = dataloader.get_labels(usrs)

    # Init Baseupdator
    baseupdator = Baseupdator(*dataloader.getTrainingConf())

    # KPI name -> function computing it from the KPIArgs dict built below.
    KPI2getters = {
        'microF1': getMicroF1ByCol,
        'oneError': getOneError,
        'RL': getRL,
        'coverage': getCoverage,
        'avgPrec': getAvgPrecision,
        'hammingLoss': getHammingLoss,
    }

    # K-fold validation
    kfolds = splitKfolds(usr2itemsIndx, foldNum)
    for ind, fold in enumerate(kfolds):
        # Init train/valid folds: the current fold validates, the rest train
        usr2itemsIndxValid = fold
        usr2itemsIndxTrain = {}
        for tind, tfold in enumerate(kfolds):
            if ind != tind:
                usr2itemsIndxTrain = merge_two_dicts(usr2itemsIndxTrain, tfold)

        # Init statevalidator
        statevalidator = DATA2VALIDATOR[dataset](
            dataset=dataset,
            datasetSub=dataloader.getDataSub(),
            curFold=ind,
            totalFolds=len(kfolds),
            usr2itemsIndxTrain=usr2itemsIndxTrain,
            usr2itemsIndxValid=usr2itemsIndxValid,
            MAX_TRAIN_NUM=MAX_TRAIN_NUM,
            ITEM_FIELDS_NUM=ITEM_FIELDS_NUM,
        )
        statevalidator.logFoldInfo()

        # Acquire (k times) usr2NegativeSamples & usr2negsNonzeroCols
        cdfByLabels, labelsList = getDistribution(usr2labels)
        usr2NegativeSamples, usr2negsNonzeroCols = negativeSample(
            usr2labels, cdfByLabels, labelsList, k=NEG_SAMPLE_NUM)
        logging.info('usr2NegativeSamples, usr2negsNonzeroCols created')

        # Init V to [-1, 1)
        numOfItems = len(ind2itemNum)
        V = 2 * nprandom.rand(numOfItems, ITEM_FIELDS_NUM) - 1
        logging.info('V inited, V.shape == ' + str(V.shape) +
                     ' == (num items, itemFeatures length)')

        # Init W to [-1, 1); init pooler
        # Warn: assume ITEM_FIELDS_NUM is the same as usr's representation's dimension
        # (No dimension reduction in pooler!)
        totalLabelsNum = dataloader.gettotalLabelsNum()
        W = 2 * nprandom.rand(ITEM_FIELDS_NUM, totalLabelsNum) - 1
        pooler = sample_pooler()
        logging.info('W & pooler inited, W.shape == ' + str(W.shape) +
                     ' == (itemFeatures length, total labels num)')
        logging.debug(' '.join(['W', str(W)]))
        logging.debug(' '.join(['V', str(V)]))

        # Learn W, V
        while statevalidator.notConv():
            # Init next run
            statevalidator.nextRun()

            # NegSampling or not
            if statevalidator.shouldNegSample():
                statevalidator.logStartNegSample()
                usr2NegativeSamples, usr2negsNonzeroCols = negativeSample(
                    usr2labels, cdfByLabels, labelsList, k=NEG_SAMPLE_NUM)
                statevalidator.logNegSampleInfo(usr2NegativeSamples)

            for usrid in usr2itemsIndxTrain:
                # Pooling
                usr_rep = pooler.pool_all(usr2itemsIndxTrain[usrid], V)

                # Get y, sumedW(for y AND negs), sigmoids(for y AND negs)
                y, y_nonzeroCols, itemsIndx, sumedW_y, sigmoid_y, \
                    y_negsNonzeroCols, sumedW_negs, sigmoids_negs, \
                    sigmoidedSumedW = baseupdator.getTerms(
                        usrid,
                        usr2labels,
                        usr2NonzeroCols,
                        usr2itemsIndxTrain,
                        W,
                        usr_rep,
                        usr2negsNonzeroCols,)

                # Get gradient of Wq (i.e. q-th column of W)
                gradsOfW = baseupdator.getGradsOfW(
                    W,
                    y_nonzeroCols,
                    sigmoid_y,
                    usr_rep,
                    sigmoids_negs,
                    y_negsNonzeroCols,
                )

                # Get gradient of Vitem
                gradsOfV = baseupdator.getGradsOfV(
                    V,
                    itemsIndx,
                    sumedW_y,
                    sigmoid_y,
                    sigmoidedSumedW,
                )

                # Update W, V by usr, not by epoch
                # Update gradients to W, V
                W, V = baseupdator.updateByGradients(
                    W,
                    V,
                    gradsOfW,
                    gradsOfV,
                    statevalidator.incrInd,
                )

            # Reveal stats/predictions
            if statevalidator.shouldRevealStats():
                # Cal loss if needed
                if statevalidator.shouldCalLoss():
                    loss = baseupdator.getLoss(
                        W,
                        V,
                        usr2NonzeroCols,
                        usr2negsNonzeroCols,
                        usr2itemsIndxTrain,
                        pooler,
                    )
                    statevalidator.updateLossState(loss)
                    statevalidator.logLossStates(W, V, loss)

                # Do predictions
                statevalidator.logStartPrediction()
                dataStats = statevalidator.getDataStats(
                    usr2itemsIndxValid, usr2itemsIndxTrain, usr2NonzeroCols)
                for d in dataStats:
                    # Distinct name: do not rebind the full usr2itemsIndx
                    # mapping loaded at the top of main.
                    statUsr2itemsIndx = d['usr2itemsIndx']
                    u2predictions = d['u2predictions']
                    for usrid in statUsr2itemsIndx:
                        usr_rep = pooler.pool_all(statUsr2itemsIndx[usrid], V)
                        bestCols = baseupdator.predictLabels(
                            usr_rep, W, dataloader.getBds())
                        u2predictions[usrid] = bestCols

                # Collect Stats
                statevalidator.logCollectingStats()
                for d in dataStats:
                    KPIArgs = {
                        'W': W,
                        'V': V,
                        'usr2itemsIndx': d['usr2itemsIndx'],
                        'usr2NonzeroCols': usr2NonzeroCols,
                        'u2predictions': d['u2predictions'],
                        'totalLabelsNum': dataloader.gettotalLabelsNum(),
                        'rlPairsCnt': dataloader.getRLPairsCnt(),
                    }
                    # .items() exists on both Python 2 and 3 (iteritems does not).
                    d['KPIs'] = {
                        kpi: getter(KPIArgs)
                        for kpi, getter in KPI2getters.items()
                    }
                    # OR (no write): statevalidator.logStats(d)
                    statevalidator.writeCSVStats(d)

                # Log real, predicted
                if not TEST_SNE:
                    for d in dataStats:
                        statevalidator.logRealPredictedVals(d)
    return 1