def check_classification_losses(loss, degree):
    """Check that a factorization machine classifier fits sign labels exactly.

    Labels are the sign of ``_poly_predict`` evaluated on the module-level
    ``X``, ``P`` and ``lams`` with the ANOVA kernel of the given ``degree``.
    A classifier using ``loss`` is then fitted on ``(X, y)`` and its score on
    the training data is asserted to be exactly 1.0.
    """
    raw_scores = _poly_predict(X, P, lams, kernel="anova", degree=degree)
    y = np.sign(raw_scores)

    clf_params = dict(
        degree=degree,
        loss=loss,
        beta=1e-3,
        fit_lower=None,
        fit_linear=False,
        tol=1e-3,
        random_state=0,
    )
    clf = FactorizationMachineClassifier(**clf_params)
    clf.fit(X, y)

    train_score = clf.score(X, y)
    assert_equal(1.0, train_score)
Ejemplo n.º 2
0
def test_augment():
    """Compare a plain FM with an augmented FM on a linearly separable set.

    Without a linear term and without lower-order handling
    (``fit_lower=None``) the classifier is expected to score 0.5 on the
    two-point dataset. With ``fit_lower='augment'`` (a dummy column of all
    ones is added) the linear effect can be captured and the expected score
    is 1.0.
    """
    # The following linear separable dataset cannot be modeled with just an FM
    samples = np.array([[-1, -1], [1, 1]])
    targets = np.array([-1, 1])

    # (fit_lower setting, expected training accuracy), checked in this order.
    scenarios = (
        (None, 0.5),
        ('augment', 1.0),
    )
    for lower_mode, expected_score in scenarios:
        clf = FactorizationMachineClassifier(fit_linear=False,
                                             fit_lower=lower_mode,
                                             random_state=0)
        clf.fit(samples, targets)
        assert_equal(expected_score, clf.score(samples, targets))
Ejemplo n.º 3
0
def test_augment():
    """Check that ``fit_lower='augment'`` recovers a purely linear effect.

    The two-sample dataset below is linearly separable, yet the test expects
    a factorization machine without linear term to reach only 0.5 accuracy
    on it, and 1.0 once the input is augmented.
    """
    data = np.array([[-1, -1], [1, 1]])
    labels = np.array([-1, 1])
    shared_kwargs = {'fit_linear': False, 'random_state': 0}

    # Plain FM: the linear effect cannot be modeled.
    plain = FactorizationMachineClassifier(fit_lower=None, **shared_kwargs)
    plain.fit(data, labels)
    plain_score = plain.score(data, labels)
    assert_equal(0.5, plain_score)

    # Adding a dummy feature (a column of all ones) lets the model capture
    # the linear effect.
    augmented = FactorizationMachineClassifier(fit_lower='augment',
                                               **shared_kwargs)
    augmented.fit(data, labels)
    augmented_score = augmented.score(data, labels)
    assert_equal(1.0, augmented_score)
Ejemplo n.º 4
0
def ModelParamSearch(Param, Model, NumbModel, N_splits, X_train, y_train):
    """Create the base classifier for ``Model`` and run a random param search.

    Parameters
    ----------
    Param :
        Search space, forwarded unchanged to ``RandomParamSearch``.
    Model : str
        Model key; one of ``'XGB'``, ``'NN'``, ``'FM'``, ``'LR'``, ``'KNN'``
        or ``'RF'``.
    NumbModel : int
        Forwarded to ``RandomParamSearch``; also drives the number of search
        iterations, ``max(10, 4 * NumbModel)``.
    N_splits : int
        Number of shuffle splits (each with a 20% test share).
    X_train, y_train :
        Training data, forwarded unchanged to ``RandomParamSearch``.

    Returns
    -------
    tuple
        ``(ParamDict, clf)``: the result of ``RandomParamSearch`` and the
        classifier instance that was searched.

    Raises
    ------
    ValueError
        If ``Model`` is not one of the supported keys. Previously this
        surfaced later as an ``UnboundLocalError`` on ``clf``.
    """
    # Factories are lazy so that only the requested estimator is built.
    factories = {
        'XGB': lambda: XGBoostClassifier(eval_metric='auc', num_class=2,
                                         nthread=4, silent=1),
        'NN': lambda: MLPClassifier(max_iter=500),
        'FM': lambda: FactorizationMachineClassifier(max_iter=500),
        'LR': lambda: LogisticRegression(max_iter=500),
        'KNN': lambda: KNeighborsClassifier(),
        'RF': lambda: RandomForestClassifier(oob_score=True, bootstrap=True),
    }
    try:
        make_clf = factories.get(Model)
    except TypeError:
        # Unhashable value: cannot be a valid model key.
        make_clf = None
    if make_clf is None:
        raise ValueError(
            "Unknown model {!r}; expected one of {}".format(
                Model, sorted(factories)))

    N_iter_search = np.maximum(10, NumbModel * 4)
    ss = ShuffleSplit(n_splits=N_splits, test_size=0.2, random_state=0)
    clf = make_clf()

    ParamDict = RandomParamSearch(clf, X_train, y_train, N_iter_search,
                                  NumbModel, Param, ss, 0, Model)

    return ParamDict, clf
Ejemplo n.º 5
0
from polylearn import FactorizationMachineClassifier

# Evaluation grid covering [-3, 3] x [-3, 3] with 500 points per axis.
grid_axis_x = np.linspace(-3, 3, 500)
grid_axis_y = np.linspace(-3, 3, 500)
xx, yy = np.meshgrid(grid_axis_x, grid_axis_y)

# Training data: 300 Gaussian points labelled by the XOR of the coordinate
# signs.
rng = np.random.RandomState(42)
X = rng.randn(300, 2)
first_positive = X[:, 0] > 0
second_positive = X[:, 1] > 0
y = np.logical_xor(first_positive, second_positive)

# XOR is too easy for factorization machines, so add noise :)
# (15 indices are drawn, possibly with repeats, and their labels inverted)
flip = rng.randint(300, size=15)
y[flip] = np.logical_not(y[flip])

# Fit the factorization machine.
fm = FactorizationMachineClassifier(
    n_components=1,
    fit_linear=False,
    random_state=0,
)
fm.fit(X, y)

# Fit a NuSVC with a degree-2 polynomial kernel for comparison.
svc = NuSVC(kernel='poly', degree=2)
svc.fit(X, y)

# Evaluate both decision functions at every grid point and fold the results
# back into the shape of the grid.
Z = fm.decision_function(np.column_stack((xx.ravel(), yy.ravel())))
Z = Z.reshape(xx.shape)

Z_svc = svc.decision_function(np.column_stack((xx.ravel(), yy.ravel())))
Z_svc = Z_svc.reshape(xx.shape)

plt.imshow(Z,
Ejemplo n.º 6
0
from polylearn import FactorizationMachineClassifier

# 500 x 500 evaluation grid over the square [-3, 3]^2.
xx, yy = np.meshgrid(*(np.linspace(-3, 3, 500) for _ in range(2)))

# 300 random 2-D points; the label is True when exactly one coordinate is
# positive (XOR of the signs).
rng = np.random.RandomState(42)
X = rng.randn(300, 2)
y = (X[:, 0] > 0) ^ (X[:, 1] > 0)

# XOR is too easy for factorization machines, so add noise :)
flip = rng.randint(300, size=15)
y[flip] = ~y[flip]

# Model under study: factorization machine without linear term.
fm = FactorizationMachineClassifier(
    n_components=1, fit_linear=False, random_state=0)
fm.fit(X, y)

# Baseline: NuSVC with a quadratic polynomial kernel.
svc = NuSVC(kernel='poly', degree=2)
svc.fit(X, y)

# Decision function of each model for every datapoint on the grid.
Z = fm.decision_function(
    np.stack([xx.ravel(), yy.ravel()], axis=1)).reshape(xx.shape)

Z_svc = svc.decision_function(
    np.stack([xx.ravel(), yy.ravel()], axis=1)).reshape(xx.shape)

plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()), aspect='auto',
Ejemplo n.º 7
0
def ModelStacking(Params, Models, NumbModels, X_train, y_train, X):
    """Build stacked (level-1) features from several tuned base models.

    For every model key a random parameter search is run via
    ``ModelParamSearch``; each resulting parameter set is fitted on the
    training data and its training-set predictions become one column named
    ``<model key><1-based index>``. Highly correlated columns are then
    dropped by ``RemoveCorrFeat``; the whole search is repeated until at
    least one column survives. Finally every surviving model is refitted and
    used to predict ``X``.

    Parameters
    ----------
    Params : sequence
        ``Params[i]`` is the search space for ``Models[i]``.
    Models : list of str
        Model keys understood by ``ModelParamSearch``.
    NumbModels : sequence of int
        ``NumbModels[i]`` is the number of parameter sets for ``Models[i]``.
        NOTE(review): column naming assumes the search returns exactly this
        many parameter sets per model -- confirm against
        ``RandomParamSearch``.
    X_train, y_train :
        Training data; ``y_train`` must expose a pandas ``index``.
    X :
        Data to predict with the surviving models.

    Returns
    -------
    tuple
        ``(NewX, NewNewX)``: training-set predictions of the surviving
        models, and their predictions on ``X`` with the same column names.
    """
    N_splits = 5  # number of splits for shuffle splitting
    NotDone = 1
    while NotDone:
        # Placeholder column aligned with y_train; removed again below.
        NewX = pd.Series().reindex_like(y_train)
        ResultParams = [None]*len(Models)
        for i in range(len(NumbModels)):
            print('## Searching parameter set for model: {} ... '.format(Models[i]))
            ResultParams[i], ThisClf = ModelParamSearch(Params[i], Models[i], NumbModels[i],
                                                        N_splits, X_train, y_train)
            print('## Creating classifiers and predicting results of model: {} ...'.format(Models[i]))
            for ii in range(len(ResultParams[i])):
                # configure the classifier with this parameter set and fit it
                ThisClf.set_params(**ResultParams[i][ii]).fit(X_train, y_train)
                ThisPred = ThisClf.predict(X_train)  # prediction of this model
                ThisTempPredSeries = pd.Series(ThisPred, index=y_train.index)
                NewX = pd.concat([NewX, ThisTempPredSeries], axis=1)

        # Drop the placeholder first column.
        ColumnNumbers = [x for x in range(NewX.shape[1])]
        ColumnNumbers.remove(0)
        NewX = NewX.iloc[:, ColumnNumbers]

        ColNames = []
        for i in range(len(NumbModels)):
            for ii in range(NumbModels[i]):
                ColNames.append(Models[i]+str(ii+1))
        NewX.columns = ColNames  # rename columns

        # Let's remove some features that show high correlation
        NewX = RemoveCorrFeat(NewX, 0.9, np.around(NewX.shape[1]*.8).astype(int))
        if NewX.shape[1] != 0:
            NotDone = 0

    NewNewX = pd.Series()
    for i in range(NewX.columns.shape[0]):
        ThisModelName = NewX.columns[i]
        # Split "<key><number>" into its parts. ``filter`` returns an
        # iterator on Python 3, so the digits are joined into a string
        # explicitly; this yields the same result on Python 2.
        ThisModelDigits = ''.join(ch for ch in ThisModelName if ch.isdigit())
        ThisModelNameStr = ThisModelName[0:len(ThisModelName)-len(ThisModelDigits)]
        ThisModelNumber = int(ThisModelDigits)
        WhichModel = Models.index(ThisModelNameStr)
        if ThisModelNameStr == 'XGB':
            ThisClf = XGBoostClassifier(eval_metric='auc', num_class=2,
                                        nthread=4, silent=1)
        elif ThisModelNameStr == 'NN':
            ThisClf = MLPClassifier(max_iter=500)
        elif ThisModelNameStr == 'FM':
            ThisClf = FactorizationMachineClassifier(max_iter=500)
        elif ThisModelNameStr == 'LR':
            ThisClf = LogisticRegression(max_iter=500)
        elif ThisModelNameStr == 'KNN':
            ThisClf = KNeighborsClassifier()
        elif ThisModelNameStr == 'RF':
            ThisClf = RandomForestClassifier(oob_score=True, bootstrap=True)

        # Column numbers are 1-based, parameter sets are 0-based.
        ThisClf.set_params(**ResultParams[WhichModel][ThisModelNumber-1]).fit(X_train, y_train)
        ThisPred = ThisClf.predict(X)  # prediction of this model
        ThisTempPredSeries = pd.Series(ThisPred)
        NewNewX = pd.concat([NewNewX, ThisTempPredSeries], axis=1)

    # Drop the empty placeholder column that seeded the concatenation.
    ColumnNumbers = [x for x in range(NewNewX.shape[1])]
    ColumnNumbers.remove(0)
    NewNewX = NewNewX.iloc[:, ColumnNumbers]

    NewNewX.columns = NewX.columns  # rename columns

    return NewX, NewNewX
Ejemplo n.º 8
0
import numpy as np
import scipy.sparse as sp

from sklearn.base import clone
from sklearn.metrics import accuracy_score, f1_score
from sklearn.datasets import fetch_20newsgroups_vectorized

from polylearn import (FactorizationMachineClassifier,
                       PolynomialNetworkClassifier)

# Degree-2 baselines: a factorization machine and a polynomial network,
# both without lower-order terms and limited to 10 iterations.
estimators = dict([
    ('fm-2', FactorizationMachineClassifier(
        n_components=30,
        fit_linear=False,
        fit_lower=None,
        degree=2,
        random_state=0,
        max_iter=10)),
    ('polynet-2', PolynomialNetworkClassifier(
        n_components=15,
        degree=2,
        fit_lower=None,
        max_iter=10,
        random_state=0)),
])

# Degree-3 variants are clones of the degree-2 estimators with adjusted
# parameters (the polynomial network also gets fewer components).
fm_degree3 = clone(estimators['fm-2'])
estimators['fm-3'] = fm_degree3.set_params(degree=3)

polynet_degree3 = clone(estimators['polynet-2'])
estimators['polynet-3'] = polynet_degree3.set_params(degree=3,
                                                     n_components=10)

if __name__ == '__main__':
Ejemplo n.º 9
0
class FMBidModel(BidModelInterface):
    """Bid model that prices bids from a factorisation-machine click model.

    The click classifier is selected by ``modelType`` and created in
    ``trainModel``; bid prices are derived from the predicted click
    probability and trimmed so that total spend stays within ``cBudget``.
    """

    # Class-level defaults; instances overwrite them in __init__/trainModel.
    _model = None
    _cBudget = 0
    _modelType = None

    def __init__(self, cBudget=6250 * 1000, modelType="fmclassificationsgd"):
        """
        :param cBudget: total budget used as spending cap when trimming bids.
        :param modelType: Options ['fmclassificationsgd','fmclassificationals','polylearn']
        """
        self._cBudget = cBudget
        self._modelType = modelType

    def getThreshold(self):
        """Return the decision threshold passed to the model's ``predict``."""
        return 0.5

    def __computeBidPrice(self, pCTR=None):
        """
        The default computation to compute bid price.

        Delegates to ``BidEstimator().linearBidPrice_mConfi`` using the
        budget as base bid, ``m_conf=0.8`` and ``variable_bid=10``.

        :param pCTR: predicted probability that click=1 for the bid request(s)
        :return: bid
        """
        bid = BidEstimator().linearBidPrice_mConfi(y_pred=pCTR,
                                                   base_bid=self._cBudget,
                                                   m_conf=0.8,
                                                   variable_bid=10)
        print("Bid type:", type(bid))
        return bid

    def _toSparseTestMatrix(self, testDF):
        """Convert a test feature frame to a CSC sparse matrix, logging progress."""
        print("Setting up X test for prediction")
        print("Converting to sparse matrix")
        # ``.values`` replaces ``as_matrix()``, which pandas 1.0 removed.
        xTest = scipy.sparse.csc_matrix(testDF.values)

        # predict click labels for the test set
        print("Predicting test set...")
        return xTest

    def __predictClickOneProb(self, testDF):
        """
        Predict class probabilities for the click label.

        :param testDF: feature frame of the bid requests
        :return: output of the model's ``predict_proba``; callers in this
            class read column 1 as the probability of click=1
        """
        xTest = self._toSparseTestMatrix(testDF)
        return self._model.predict_proba(xTest)

    def __predictClickOne(self, testDF):
        """
        Predict the click label using ``getThreshold()`` as threshold.

        :param testDF: feature frame of the bid requests
        :return: output of the model's ``predict``
        """
        xTest = self._toSparseTestMatrix(testDF)
        return self._model.predict(xTest, self.getThreshold())

    def trimToBudget(self, bidpriceDF, budget):
        """
        In case the bidding process exceeds the budget, trim down the bidding.

        Bids are zeroed from the last entry backwards until the total no
        longer exceeds ``budget``. The input is modified in place and also
        returned.

        :param bidpriceDF: indexable sequence of bid prices supporting
            negative positions (presumably a numpy array -- confirm with caller)
        :param budget: maximum total spend
        :return: the trimmed ``bidpriceDF``
        """
        print("Trimming....")
        totalspend = np.sum(bidpriceDF)
        overspend = totalspend - budget
        print("bidpriceDF:", bidpriceDF.shape)
        print("budget:", budget)
        print("totalspend:", totalspend)
        print("overspend:", overspend)
        i = -1
        # ``>= 0`` so that the very first bid (position -len) can be zeroed
        # too; the former ``> 0`` stopped one element early.
        while overspend > 0 and len(bidpriceDF) + i >= 0:
            overspend -= bidpriceDF[i]
            bidpriceDF[i] = 0
            i -= 1

        print("bidpriceDF:", bidpriceDF)
        print("np.sum(bidpriceDF:", np.sum(bidpriceDF))
        # Spending exactly the budget is still within budget.
        assert (np.sum(bidpriceDF) <= budget)
        return bidpriceDF

    def getBidPrice(self,
                    xTestOneHotDF,
                    yValDF,
                    noBidThreshold=0.2833333,
                    minBid=200,
                    bidRange=90,
                    sigmoidDegree=-10):
        """
        Retrieve the bidding price for every bid request.

        :param xTestOneHotDF: feature frame of the bid requests
        :param yValDF: frame providing the ``bidid`` column
        :param noBidThreshold: forwarded to ``BidEstimator.thresholdSigmoid``
        :param minBid: forwarded to ``BidEstimator.thresholdSigmoid``
        :param bidRange: forwarded to ``BidEstimator.thresholdSigmoid``
        :param sigmoidDegree: forwarded to ``BidEstimator.thresholdSigmoid``
        :raises ModelNotTrainedException: if no model has been trained/loaded
        :return: frame combining ``bidid`` and the budget-trimmed bid price
        """
        print("Computing bid price")
        print("xTestOneHotDF:", xTestOneHotDF.shape, list(xTestOneHotDF))
        print("yValDF:", yValDF.shape, list(yValDF))
        if self._model is None:
            raise ModelNotTrainedException(
                "Model must be trained prior to prediction!")

        # Prob of click==1
        pCTR = self.__predictClickOneProb(xTestOneHotDF)[:, 1]
        # Forward the caller's settings; they used to be ignored in favour of
        # hard-coded copies of the defaults.
        bidprice = BidEstimator().thresholdSigmoid(
            predOneProb=pCTR,
            noBidThreshold=noBidThreshold,
            minBid=minBid,
            bidRange=bidRange,
            sigmoidDegree=sigmoidDegree)
        print("bidprice:", bidprice)
        bidprice = self.trimToBudget(bidprice, self._cBudget)
        print("bidprice after trim:", bidprice)

        # merge with bidid
        bidpriceDF = pd.DataFrame(bidprice, columns=['bidprice'])
        print("bidpriceDF:", bidpriceDF.shape, list(bidpriceDF))
        bididDF = pd.DataFrame(yValDF['bidid'], columns=['bidid'])
        print("bididDF:", bididDF.shape, list(bididDF))
        bidIdPriceDF = pd.concat([bididDF, bidpriceDF],
                                 axis=1,
                                 ignore_index=True)
        print("bidIdPriceDF:", bidIdPriceDF.shape, list(bidIdPriceDF))
        return bidIdPriceDF

    def trainModel(self, X, y, retrain=True, modelFile=None):
        """
        Train (or reload) the factorisation-machine click model.

        Side effects: ``y['click']`` is remapped in place from {0, 1} to
        {-1, 1}, both frames are written to ``data.pruned/xTrain.csv`` and
        ``data.pruned/yTrain.csv``, and a freshly fitted model is handed to
        ``saveModel`` together with ``modelFile``.

        :param X: feature frame
        :param y: label frame with a ``click`` column holding 0/1
        :param retrain: If True, oversample with SMOTE, fit and save the
            model. If False, load the model from ``modelFile`` instead.
        :param modelFile: file used to save / load the trained model
        :raises ModelNotTrainedException: if the configured model type is
            not supported
        :return: None
        """
        self._modelFile = modelFile
        print("Getting xTrain")
        xTrain = X
        yTrain = y
        print("xTrain:", xTrain.shape, list(xTrain))
        print("yTrain:", yTrain.shape, set(yTrain['click']), "ListL",
              list(yTrain))
        # NOTE(review): this rewrites the caller's frame; passing the same
        # frame in again would map the -1 labels to NaN.
        yTrain['click'] = yTrain['click'].map({0: -1, 1: 1})

        xTrain.to_csv("data.pruned/xTrain.csv")
        yTrain.to_csv("data.pruned/yTrain.csv")

        print("xTrain:", list(xTrain))
        # ``.values`` replaces ``as_matrix()``, which pandas 1.0 removed.
        xTrain = xTrain.values
        yTrain = yTrain['click'].values

        if retrain:
            print("Performing oversampling to even out")
            # ADASYN is slower and doesn't offer better model performance,
            # so SMOTE is used instead.
            xTrain, yTrain = ImbalanceSampling().oversampling_SMOTE(X=xTrain,
                                                                    y=yTrain)

        # instantiate the factorisation machine selected by the model type
        if self._modelType == 'fmclassificationals':
            # Don't use this
            print(
                "Factorisation Machine with ALS solver will be used for training"
            )
            print("Converting X to sparse matrix, required by FastFM")
            xTrain = scipy.sparse.csc_matrix(xTrain)
            self._model = als.FMClassification(n_iter=3000, rank=2, verbose=1)

        elif self._modelType == 'fmclassificationsgd':
            # Use this, best results
            print(
                "Factorisation Machine with SGD solver will be used for training"
            )
            print("Converting X to sparse matrix, required by FastFM")
            xTrain = scipy.sparse.csc_matrix(xTrain)
            # The message now reports the rank that is actually configured.
            print(
                "Training with n_iter=200000, rank=16, l2_reg_w=0.0005, l2_reg_V=0.0005, l2_reg=0.0005,step_size=0.01"
            )

            # Best Training set score: 0.9121148444887212
            # Best Param: {'n_iter': 200000, 'l2_reg_w': 0.0005, 'step_size': 0.004, 'l2_reg_V': 0.005, 'rank': 16}
            self._model = SGDFMClassification(n_iter=200000,
                                              rank=16,
                                              l2_reg_w=0.0005,
                                              l2_reg_V=0.0005,
                                              l2_reg=0.0005,
                                              step_size=0.01)

        elif self._modelType == 'polylearn':
            # Don't use this
            print(
                "Factorisation Machine from scitkit-learn-contrib polylearn will be used for training"
            )
            self._model = FactorizationMachineClassifier(degree=2,
                                                         loss='squared_hinge',
                                                         n_components=2,
                                                         alpha=1,
                                                         beta=1,
                                                         tol=1e-3,
                                                         fit_lower='explicit',
                                                         fit_linear=True,
                                                         warm_start=False,
                                                         init_lambdas='ones',
                                                         max_iter=5000,
                                                         verbose=True,
                                                         random_state=None)

        else:
            raise ModelNotTrainedException(
                'Selected model not available',
                'Valid models are polylearn,fmclassificationsgd,fmclassificationals'
            )

        if retrain:
            print("Setting up Y and X for training")
            print(datetime.datetime.now())

            print("Training Model...")
            print(datetime.datetime.now())

            self._model = self._model.fit(xTrain, yTrain)
            super(FMBidModel, self).saveModel(self._model, self._modelFile)

        else:
            self._model = super(FMBidModel,
                                self).loadSavedModel(self._modelFile)

        print("Training completed")
        print(datetime.datetime.now())

    def optimiseBid(self, xTestDF, yTestDF):
        """
        Perform bid optimisation via ``BidEstimator.gridSearch_bidPrice``.

        :param xTestDF: feature frame
        :param yTestDF: label frame, concatenated column-wise with the features
        :return: None
        """
        print(" xTestDF:", xTestDF.shape, "\n", list(xTestDF))
        print(" yTestDF:", yTestDF.shape, "\n", list(yTestDF))
        result = pd.concat([xTestDF, yTestDF], axis=1)
        print(" result:", result.shape, "\n", list(result))
        predProb = self.__predictClickOneProb(xTestDF)
        be = BidEstimator()
        be.gridSearch_bidPrice(predProb[:, 1],
                               0,
                               0,
                               result,
                               bidpriceest_model='thresholdsigmoid')

    def gridSearchandCrossValidateFastSGD(self, X, y, retrain=True):
        """
        Perform a 5-fold, ROC-AUC scored grid search on the SGD FM model.

        ``y['click']`` is remapped in place from {0, 1} to {-1, 1} and the
        training data is oversampled with SMOTE before the search.

        :param X: feature frame
        :param y: label frame with a ``click`` column holding 0/1
        :param retrain: If True, run the search, store the fitted search
            object in ``self._model`` and print the best score/parameters.
            If False, the search is only configured.
        :return: None
        """
        print("Getting xTrain")
        xTrain = X
        yTrain = y
        print("xTrain:", xTrain.shape, list(xTrain))
        print("yTrain:", yTrain.shape, set(yTrain['click']), "ListL",
              list(yTrain))
        # NOTE(review): rewrites the caller's frame in place.
        yTrain['click'] = yTrain['click'].map({0: -1, 1: 1})

        print("xTrain:", list(xTrain))
        # ``.values`` replaces ``as_matrix()``, which pandas 1.0 removed.
        xTrain = xTrain.values
        yTrain = yTrain['click'].values
        print("Performing oversampling to even out")
        xTrain, yTrain = ImbalanceSampling().oversampling_SMOTE(X=xTrain,
                                                                y=yTrain)

        print(
            "Factorisation Machine with SGD solver will be used for training")
        print("Converting X to sparse matrix, required by FastFM")
        xTrain = scipy.sparse.csc_matrix(xTrain)

        param_grid = [{
            'n_iter': [150000, 200000, 250000],
            'l2_reg_w': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1],
            'l2_reg_V': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1],
            'step_size': [0.0005, 0.004, 0.007],
            'rank': [32, 36, 42, 46, 52, 56, 64],
        }]

        search = GridSearchCV(
            SGDFMClassification(),
            param_grid=param_grid,
            scoring='roc_auc',
            cv=5,
            error_score='raise',
            verbose=1)
        print("Training model..")
        print(datetime.datetime.now())
        if retrain:
            self._model = search.fit(xTrain, yTrain)
        print("Training complete")
        print(datetime.datetime.now())

        if retrain:
            # best_score_ / best_params_ only exist once the search was
            # fitted; reading them unconditionally failed for retrain=False.
            print("Best Score: ", search.best_score_)
            print("Best Param: ", search.best_params_)

    def validateModel(self, xVal, yVal):
        """
        Validate the trained model and report ROC AUC on the validation set.

        ``yVal['click']`` is remapped in place from {0, 1} to {-1, 1}.

        :param xVal: validation feature frame
        :param yVal: validation label frame with a ``click`` column
        :raises ModelNotTrainedException: if no model has been trained or
            loaded (this used to end in an UnboundLocalError)
        :return: predictedProb[:,1]  Prob of all click=1
        """
        if self._model is None:
            print("Error: No model was trained in this instance....")
            raise ModelNotTrainedException(
                "Model must be trained prior to prediction!")

        print("Setting up X Y validation for prediction")

        # NOTE(review): rewrites the caller's frame in place.
        yVal['click'] = yVal['click'].map({0: -1, 1: 1})
        # Positional index so the row lists below can index predictedProb.
        yVal = yVal.reset_index(drop=True)

        click1list = yVal[yVal['click'] == 1].index.tolist()
        click0list = yVal[yVal['click'] == -1].index.tolist()
        print("yVal:", (yVal).shape)
        print("click1list:", len(click1list))
        print("click0list:", len(click0list))

        print("Converting to sparse matrix")
        # ``.values`` replaces ``as_matrix()``, which pandas 1.0 removed.
        xValidate = scipy.sparse.csc_matrix(xVal.values)

        # predict click labels for the validation set
        print("Predicting validation set...")
        predicted = self._model.predict(xValidate)
        predictedProb = self._model.predict_proba(xValidate)

        # click=1 probability, split by gold label
        predictedOneProbForclick1 = predictedProb[click1list][:, 1]
        predictedOneProbForclick0 = predictedProb[click0list][:, 1]
        print("predictedProbclick1:", (predictedOneProbForclick1).shape)
        print("predictedProbclick0:", (predictedOneProbForclick0).shape)
        print("yVal['click']", yVal['click'].shape)
        print("predictedProb:", predictedProb.shape)
        print("roc_auc", roc_auc_score(yVal['click'], predictedProb[:, 1]))

        if (False):  # Set this to True if want to see plots
            # Histogram of predicted click=1 probability for gold click==1
            Evaluator.ClickEvaluator().clickProbHistogram(
                predictedOneProbForclick1,
                title='Click=1',
                showGraph=False)

            # Same histogram for gold click==0
            Evaluator.ClickEvaluator().clickProbHistogram(
                predictedOneProbForclick0,
                title='Click=0',
                showGraph=False)

            Evaluator.ClickEvaluator().clickROC(yVal['click'],
                                                predictedProb[:, 1],
                                                showGraph=False)

            # Convert -1 to 0 as Evaluator printClickPredictionScore cannot handle -1
            predicted[predicted == -1] = 0
            yVal['click'] = yVal['click'].map({-1: 0, 1: 1})
            Evaluator.ClickEvaluator().printClickPredictionScore(
                predicted, yVal['click'])

            # Change back, just in case
            predicted[predicted == 0] = -1
            yVal['click'] = yVal['click'].map({0: -1, 1: 1})

            print("Gold label: ", yVal['click'])
            print("predicted label: ", predicted)

            print("Writing to validated prediction csv")
            valPredictionWriter = ResultWriter()
            valPredictionWriter.writeResult(
                filename="data.pruned/FastFMpredictValidate.csv",
                data=predicted)

        return predictedProb[:, 1]
Ejemplo n.º 10
0
    def trainModel(self, X, y, retrain=True, modelFile=None):
        """
        Train (or reload) the factorisation-machine click model.

        Side effects: ``y['click']`` is remapped in place from {0, 1} to
        {-1, 1}, both frames are written to ``data.pruned/xTrain.csv`` and
        ``data.pruned/yTrain.csv``, and a freshly fitted model is handed to
        ``saveModel`` together with ``modelFile``.

        :param X: feature frame
        :param y: label frame with a ``click`` column holding 0/1
        :param retrain: If True, oversample with SMOTE, fit and save the
            model. If False, load the model from ``modelFile`` instead.
        :param modelFile: file used to save / load the trained model
        :raises ModelNotTrainedException: if the configured model type is
            not supported
        :return: None
        """
        self._modelFile = modelFile
        print("Getting xTrain")
        xTrain = X
        yTrain = y
        print("xTrain:", xTrain.shape, list(xTrain))
        print("yTrain:", yTrain.shape, set(yTrain['click']), "ListL",
              list(yTrain))
        # NOTE(review): this rewrites the caller's frame; passing the same
        # frame in again would map the -1 labels to NaN.
        yTrain['click'] = yTrain['click'].map({0: -1, 1: 1})

        xTrain.to_csv("data.pruned/xTrain.csv")
        yTrain.to_csv("data.pruned/yTrain.csv")

        print("xTrain:", list(xTrain))
        # ``.values`` replaces ``as_matrix()``, which pandas 1.0 removed.
        xTrain = xTrain.values
        yTrain = yTrain['click'].values

        if retrain:
            print("Performing oversampling to even out")
            # ADASYN is slower and doesn't offer better model performance,
            # so SMOTE is used instead.
            xTrain, yTrain = ImbalanceSampling().oversampling_SMOTE(X=xTrain,
                                                                    y=yTrain)

        # instantiate the factorisation machine selected by the model type
        if self._modelType == 'fmclassificationals':
            # Don't use this
            print(
                "Factorisation Machine with ALS solver will be used for training"
            )
            print("Converting X to sparse matrix, required by FastFM")
            xTrain = scipy.sparse.csc_matrix(xTrain)
            self._model = als.FMClassification(n_iter=3000, rank=2, verbose=1)

        elif self._modelType == 'fmclassificationsgd':
            # Use this, best results
            print(
                "Factorisation Machine with SGD solver will be used for training"
            )
            print("Converting X to sparse matrix, required by FastFM")
            xTrain = scipy.sparse.csc_matrix(xTrain)
            # The message now reports the rank that is actually configured.
            print(
                "Training with n_iter=200000, rank=16, l2_reg_w=0.0005, l2_reg_V=0.0005, l2_reg=0.0005,step_size=0.01"
            )

            # Best Training set score: 0.9121148444887212
            # Best Param: {'n_iter': 200000, 'l2_reg_w': 0.0005, 'step_size': 0.004, 'l2_reg_V': 0.005, 'rank': 16}
            self._model = SGDFMClassification(n_iter=200000,
                                              rank=16,
                                              l2_reg_w=0.0005,
                                              l2_reg_V=0.0005,
                                              l2_reg=0.0005,
                                              step_size=0.01)

        elif self._modelType == 'polylearn':
            # Don't use this
            print(
                "Factorisation Machine from scitkit-learn-contrib polylearn will be used for training"
            )
            self._model = FactorizationMachineClassifier(degree=2,
                                                         loss='squared_hinge',
                                                         n_components=2,
                                                         alpha=1,
                                                         beta=1,
                                                         tol=1e-3,
                                                         fit_lower='explicit',
                                                         fit_linear=True,
                                                         warm_start=False,
                                                         init_lambdas='ones',
                                                         max_iter=5000,
                                                         verbose=True,
                                                         random_state=None)

        else:
            raise ModelNotTrainedException(
                'Selected model not available',
                'Valid models are polylearn,fmclassificationsgd,fmclassificationals'
            )

        if retrain:
            print("Setting up Y and X for training")
            print(datetime.datetime.now())

            print("Training Model...")
            print(datetime.datetime.now())

            self._model = self._model.fit(xTrain, yTrain)
            super(FMBidModel, self).saveModel(self._model, self._modelFile)

        else:
            self._model = super(FMBidModel,
                                self).loadSavedModel(self._modelFile)

        print("Training completed")
        print(datetime.datetime.now())