Ejemplo n.º 1
0
    def test_getHammingLoss(self):
        pooler = sample_pooler.sample_pooler()
        V = np.array([[1, 1, 1],
                      [1, 1, 1],
                      ])
        W = np.array([[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
                      [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
                     # 0,       3, 4, 5,            ... 11 ...                                       26
                      ])
        usr2itemsIndx = {
          0: [0, 1],
          1: [1],
          2: [0],
        }

        usr2NonzeroCols = {
          0: [0, 4, 11],
          1: [1, 4, 10],
          2: [2, 4, 26],
        }

        bds = [[0, 3], [4, 5], [6, 26]]
        NEG_SAMPLE_NUM = int(random.random() * 10)  # Dont care
        ITEM_FIELDS_NUM = W.shape[0]
        MAX_TRAIN_NUM = int(random.random() * 10)  # Dont care
        LEARNING_RATE = random.random()  # Dont care
        MOMENTUM = int(random.random() * 10)  # Dont care
        LAMBDA = random.random()  # Dont care
        args = (NEG_SAMPLE_NUM, ITEM_FIELDS_NUM, MAX_TRAIN_NUM,
                LEARNING_RATE, MOMENTUM, LAMBDA)
        baseupdator = Baseupdator(*args)

        u2predictions = {}
        for usrid in usr2itemsIndx:
            usr_rep = pooler.pool_all(usr2itemsIndx[usrid], V)
            bestCols = baseupdator.predictLabels(usr_rep, W, bds)
            u2predictions[usrid] = bestCols

        KPIArgs = {
            'W': W,
            'V': V,
            'usr2itemsIndx': usr2itemsIndx,
            'usr2NonzeroCols': usr2NonzeroCols,
            'u2predictions': u2predictions,
            'totalLabelsNum': W.shape[1],
            'rlPairsCnt': int(random.random() * 10),  # Dont care
        }

        # should all guess 3,5,11
        expectHammingLoss = ((1 + 1 + 0) / 3.0 + (1 + 1 + 1) / 3.0 + (1 + 1 + 1) / 3.0) / 3
        actualHammingLoss = getHammingLoss(KPIArgs)
        self.assertEqual(expectHammingLoss, actualHammingLoss)
Ejemplo n.º 2
0
    def test_getRL(self):
        pooler = sample_pooler.sample_pooler()
        V = np.array([[1, 1, 1],
                      [1, 1, 1],
                      ])
        W = np.array([[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
                      [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
                     # 0,       3, 4, 5,            ... 11 ...                                       26
                      ])
        usr2itemsIndx = {
          0: [0, 1],
          1: [1],
        }

        usr2NonzeroCols = {
          0: [3, 5, 11],  # --> should be 0.0
          1: [1, 4, 10],  # --> should be 18/72.0
        }

        bds = [[0, 3], [4, 5], [6, 26]]
        NEG_SAMPLE_NUM = int(random.random() * 10)  # Dont care
        ITEM_FIELDS_NUM = W.shape[0]
        MAX_TRAIN_NUM = int(random.random() * 10)  # Dont care
        LEARNING_RATE = random.random()  # Dont care
        MOMENTUM = int(random.random() * 10)  # Dont care
        LAMBDA = random.random()  # Dont care
        args = (NEG_SAMPLE_NUM, ITEM_FIELDS_NUM, MAX_TRAIN_NUM,
                LEARNING_RATE, MOMENTUM, LAMBDA)
        baseupdator = Baseupdator(*args)

        u2predictions = {}
        for usrid in usr2itemsIndx:
            usr_rep = pooler.pool_all(usr2itemsIndx[usrid], V)
            bestCols = baseupdator.predictLabels(usr_rep, W, bds)
            u2predictions[usrid] = bestCols

        onesCnt = len(next(usr2NonzeroCols.itervalues()))

        KPIArgs = {
            'W': W,
            'V': V,
            'usr2itemsIndx': usr2itemsIndx,
            'usr2NonzeroCols': usr2NonzeroCols,
            'u2predictions': u2predictions,
            'totalLabelsNum': W.shape[1],
            'rlPairsCnt': (W.shape[1] - onesCnt) * onesCnt,
        }

        # should all guess 3,5,11
        expectRL = (0.0 + 1/4.0) / 2
        actualRL = getRL(KPIArgs)
        self.assertEqual(expectRL, actualRL)
Ejemplo n.º 3
0
 def test_pooler(self):
     pooler = sample_pooler.sample_pooler()
     usr2itemsIndx = {
       1: [0, 3, 4],
     }
     V = np.array([[1.0, 1.0],
                   [2.0, 2.0],
                   [2.0, 2.0],
                   [2.0, 2.0],
                   [3.0, 3.0],
                   ])
     usr_rep = pooler.pool_all(usr2itemsIndx[1], V)
     for x, val in enumerate(usr_rep):
         self.assertEqual(usr_rep[x], 2.0)
Ejemplo n.º 4
0
def main(argv):
    ''' Parse args, init dataloader '''
    foldNum, dataset, subtitle, rating_file, usr2labels_file = parseArgs(
        argv[:4], **dict(arg.split('=') for arg in argv[4:]))
    if rating_file and usr2labels_file:
        dataloader = DATA2LOADER[dataset](
            rating_file=rating_file,
            usr2labels_file=usr2labels_file,
            sub=subtitle,
        )
    else:
        dataloader = DATA2LOADER[dataset]()
    ''' Load training conifgs '''
    NEG_SAMPLE_NUM, \
        ITEM_FIELDS_NUM, \
        MAX_TRAIN_NUM, \
        LEARNING_RATE, \
        MOMENTUM, \
        LAMBDA = dataloader.getTrainingConf()
    ''' Load each usr's BOI (and for valid data) '''
    usr2itemsIndx, ind2itemNum = dataloader.load()
    usrs = map(lambda usr: usr, usr2itemsIndx)
    ''' Assert enough usrs '''
    if foldNum > len(usrs):
        s = ' '.join(['foldNum: ', str(foldNum), '>', 'usrNums:', str(usrs)])
        raise Exception(s)
    ''' Acquire (for all usrs) usr2labels & usr2NonzeroCols '''
    usr2labels, usr2NonzeroCols = dataloader.get_labels(usrs)
    ''' Init Baseupdator '''
    baseupdator = Baseupdator(*dataloader.getTrainingConf())
    ''' K-fold validation '''
    kfolds = splitKfolds(usr2itemsIndx, foldNum)
    for ind, fold in enumerate(kfolds):
        # Init train/valid folds
        usr2itemsIndxValid = fold
        usr2itemsIndxTrain = {}
        for tind, tfold in enumerate(kfolds):
            if ind != tind:
                usr2itemsIndxTrain = merge_two_dicts(usr2itemsIndxTrain, tfold)

        # Init statevalidator
        statevalidator = DATA2VALIDATOR[dataset](
            dataset=dataset,
            datasetSub=dataloader.getDataSub(),
            curFold=ind,
            totalFolds=len(kfolds),
            usr2itemsIndxTrain=usr2itemsIndxTrain,
            usr2itemsIndxValid=usr2itemsIndxValid,
            MAX_TRAIN_NUM=MAX_TRAIN_NUM,
            ITEM_FIELDS_NUM=ITEM_FIELDS_NUM,
        )
        statevalidator.logFoldInfo()
        ''' acquire (k times) usr2NegativeSamples & usr2negsNonzeroCols '''
        cdfByLabels, labelsList = getDistribution(usr2labels)
        usr2NegativeSamples, usr2negsNonzeroCols = negativeSample(
            usr2labels, cdfByLabels, labelsList, k=NEG_SAMPLE_NUM)
        logging.info('usr2NegativeSamples, usr2negsNonzeroCols created')
        ''' init V to [-1, 1) '''
        numOfItems = len(ind2itemNum)
        V = 2 * nprandom.rand(numOfItems, ITEM_FIELDS_NUM) - 1
        logging.info('V inited, V.shape == ' + str(V.shape) +
                     ' == (num items, itemFeatures length)')
        ''' init W to [-1, 1); init pooler'''
        # Warn: assume ITEM_FIELDS_NUM is the same as usr's representation's dimension
        # (No dimension reduction in pooler!)
        totalLabelsNum = dataloader.gettotalLabelsNum()
        W = 2 * nprandom.rand(ITEM_FIELDS_NUM, totalLabelsNum) - 1
        pooler = sample_pooler()
        logging.info('W & pooler inited, W.shape == ' + str(W.shape) +
                     ' == (itemFeatures length, total labels num)')
        logging.debug(' '.join(['W', str(W)]))
        logging.debug(' '.join(['V', str(V)]))
        ''' learn W, V '''
        while statevalidator.notConv():
            # Init next run
            statevalidator.nextRun()

            # NegSampling or not
            if statevalidator.shouldNegSample():
                statevalidator.logStartNegSample()
                usr2NegativeSamples, usr2negsNonzeroCols = negativeSample(
                    usr2labels, cdfByLabels, labelsList, k=NEG_SAMPLE_NUM)
                statevalidator.logNegSampleInfo(usr2NegativeSamples)

            for usrid in usr2itemsIndxTrain:
                # Pooling
                usr_rep = pooler.pool_all(usr2itemsIndxTrain[usrid], V)

                # Get y, sumedW(for y AND negs), sigmoids(for y AND negs)
                y, y_nonzeroCols, itemsIndx, sumedW_y, sigmoid_y, \
                    y_negsNonzeroCols, sumedW_negs, sigmoids_negs, \
                    sigmoidedSumedW = baseupdator.getTerms(
                        usrid,
                        usr2labels,
                        usr2NonzeroCols,
                        usr2itemsIndxTrain,
                        W,
                        usr_rep,
                        usr2negsNonzeroCols,)

                # Get gradient of Wq (i.e. q-th column of W)
                gradsOfW = baseupdator.getGradsOfW(
                    W,
                    y_nonzeroCols,
                    sigmoid_y,
                    usr_rep,
                    sigmoids_negs,
                    y_negsNonzeroCols,
                )

                # Get gradient of Vitem
                gradsOfV = baseupdator.getGradsOfV(
                    V,
                    itemsIndx,
                    sumedW_y,
                    sigmoid_y,
                    sigmoidedSumedW,
                )

                # Update W, V by usr, not by epoch
                # Update gradients to W, V
                W, V = baseupdator.updateByGradients(
                    W,
                    V,
                    gradsOfW,
                    gradsOfV,
                    statevalidator.incrInd,
                )

            # Reveal stats/predictions
            if statevalidator.shouldRevealStats():
                # Cal loss if needed
                if statevalidator.shouldCalLoss():
                    loss = baseupdator.getLoss(
                        W,
                        V,
                        usr2NonzeroCols,
                        usr2negsNonzeroCols,
                        usr2itemsIndxTrain,
                        pooler,
                    )
                    statevalidator.updateLossState(loss)
                    statevalidator.logLossStates(W, V, loss)

                # Do predictions
                statevalidator.logStartPrediction()
                dataStats = statevalidator.getDataStats(
                    usr2itemsIndxValid, usr2itemsIndxTrain, usr2NonzeroCols)
                for d in dataStats:
                    usr2itemsIndx = d['usr2itemsIndx']
                    u2predictions = d['u2predictions']
                    for usrid in usr2itemsIndx:
                        usr_rep = pooler.pool_all(usr2itemsIndx[usrid], V)
                        bestCols = baseupdator.predictLabels(
                            usr_rep, W, dataloader.getBds())
                        u2predictions[usrid] = bestCols

                # Collect Stats
                statevalidator.logCollectingStats()
                KPI2getters = {
                    'microF1': getMicroF1ByCol,
                    'oneError': getOneError,
                    'RL': getRL,
                    'coverage': getCoverage,
                    'avgPrec': getAvgPrecision,
                    'hammingLoss': getHammingLoss,
                }
                for d in dataStats:
                    KPIArgs = {
                        'W': W,
                        'V': V,
                        'usr2itemsIndx': d['usr2itemsIndx'],
                        'usr2NonzeroCols': usr2NonzeroCols,
                        'u2predictions': d['u2predictions'],
                        'totalLabelsNum': dataloader.gettotalLabelsNum(),
                        'rlPairsCnt': dataloader.getRLPairsCnt(),
                    }
                    d['KPIs'] = {
                        kpi: getter(KPIArgs)
                        for kpi, getter in KPI2getters.iteritems()
                    }
                    # OR (no write): statevalidator.logStats(d)
                    statevalidator.writeCSVStats(d)

                # Log real, predicted
                if not TEST_SNE:
                    for d in dataStats:
                        statevalidator.logRealPredictedVals(d)
    return 1