def test_random_train_test_split(test_percentage):
    """Check that the split honours the requested test fraction and is disjoint."""
    interactions = fetch_movielens()["train"]
    train_set, test_set = random_train_test_split(
        interactions, test_percentage=test_percentage
    )
    # The split is exact on the interaction count, so the ratio must match.
    assert test_set.nnz / float(interactions.nnz) == test_percentage
    _assert_disjoint(train_set, test_set)
def model(df, params, u=None, i=None):
    """Fit a LightFM model on user/POI interactions and evaluate it.

    Parameters
    ----------
    df : DataFrame with 'userID' and 'poiID' interaction columns.
    params : dict with keys 'seed', 'test', 'f', 'lr', 'loss', 'alpha',
        'epochs', 'k'.
    u : optional (user_feature_series, feature_names) pair; the series maps
        user id -> feature value.  NOTE(review): presumed from usage — confirm.
    i : optional (item_feature_series, feature_names) pair, same shape as `u`.

    Returns
    -------
    dict with precision@k and mean reciprocal rank on the train and test
    splits, expressed as percentages.
    """
    state = np.random.RandomState(params['seed'])
    data = Dataset()
    data.fit(df['userID'].unique(),
             df['poiID'].unique(),
             user_features=u[1] if u is not None else None,
             item_features=i[1] if i is not None else None)

    if u is not None:
        # Series.items() replaces iteritems(), which was removed in
        # pandas 2.0; it yields the same (user_id, feature) pairs.
        user_features = data.build_user_features(u[0].items(),
                                                 normalize=False)
    else:
        user_features = None

    if i is not None:
        # Each item feature value must be wrapped in a list for
        # build_item_features: (item_id, [feature]).
        item_features_iterable = ((item_id, [feature])
                                  for item_id, feature in i[0].items())
        item_features = data.build_item_features(item_features_iterable,
                                                 normalize=False)
    else:
        item_features = None

    ratings, weights = data.build_interactions(
        df[['userID', 'poiID']].itertuples(index=False, name=None))

    train, test = random_train_test_split(ratings,
                                          test_percentage=params['test'],
                                          random_state=state)

    lfm = LightFM(no_components=params['f'],
                  learning_rate=params['lr'],
                  loss=params['loss'],
                  user_alpha=params['alpha'],
                  random_state=state)
    lfm.fit(train,
            epochs=params['epochs'],
            user_features=user_features,
            item_features=item_features)

    return {
        'pr-train':
        100.0 * precision_at_k(lfm,
                               train,
                               k=params['k'],
                               user_features=user_features,
                               item_features=item_features).mean(),
        'mrr-train':
        100.0 * reciprocal_rank(lfm,
                                train,
                                user_features=user_features,
                                item_features=item_features).mean(),
        'pr-test':
        100.0 * precision_at_k(lfm,
                               test,
                               k=params['k'],
                               user_features=user_features,
                               item_features=item_features).mean(),
        'mrr-test':
        100.0 * reciprocal_rank(lfm,
                                test,
                                user_features=user_features,
                                item_features=item_features).mean()
    }
Beispiel #3
0
    # NOTE(review): fragment — the enclosing function's signature is not
    # visible in this chunk; `traindata`, `user_key`, `item_key` and
    # `create_sparse_matrix` come from it.
    csr_data1, user_lookup1, item_lookup1 = create_sparse_matrix(
        traindata, user_key, item_key)
    #csr_data2, user_lookup2, item_lookup2 = create_sparse_matrix(testdata,user_key,item_key)

    # Transpose to a user x item orientation before splitting/fitting.
    user_items_train = csr_data1.T.tocsr()
    #user_items_test = csr_data2.T.tocsr()

    print(user_items_train)
    print('\n')
    #print(user_items_test)
    #print('\n')
    print(user_items_train.shape)
    #print(user_items_test.shape)

    print("Splitting the data into train/test set...\n")
    train, test = cross_validation.random_train_test_split(user_items_train)
    # print(train,test)
    # print(train.shape(),test.shape())

    # Two ranking models trained on the same split, for comparison.
    model1 = LightFM(learning_rate=0.05, loss='bpr')
    model2 = LightFM(learning_rate=0.05, loss='warp')

    print("Fitting models of BPR & WARP ranking losses...\n")
    model1.fit(train, epochs=10)
    model2.fit(train, epochs=10)
    #ranks = model.predict(user_items_train,num_threads=1)
    #print(ranks)

    # NOTE(review): ranks are computed only for the BPR model here.
    res = model1.predict_rank(test)
    print(res)
    print("Evaluating methods...\n")
Beispiel #4
0
# NOTE(review): script fragment — `dataset`, `get_user_features`, `get_ratings`,
# `interactions` and `item_features` are defined earlier in the original file.
user_features = dataset.build_user_features(((x['User-ID'], [x['Age']])
                                              for x in get_user_features()))


# ISBN labels for the rated items (used elsewhere for reporting).
labels = np.array([x['ISBN'] for x in get_ratings()])

# ---------------------------------
#        Training the model
# ---------------------------------

model = LightFM(loss='warp')

# Hold out 20% of the interactions for evaluation.
(train, test) = random_train_test_split(interactions=interactions, test_percentage=0.2)

model.fit(train, item_features=item_features, user_features=user_features, epochs=2)

### model performance evaluation (currently disabled)

#train_precision = precision_at_k(model, train,item_features=item_features, k=10).mean()
#test_precision = precision_at_k(model, test, item_features=item_features,k=10).mean()

#train_auc = auc_score(model, train,item_features=item_features).mean()
#test_auc = auc_score(model, test,item_features=item_features).mean()

#print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
#print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

#print("testing testing testing")
# NOTE(review): fragment — the opening of this build_item_features(...) call
# was lost when the file was chunked; the two lines below are its tail.
    ((x['VacatureId'], [x['Naam']]) for x in qd.getVacancies()),
    normalize=False)

# print(item_features.toarray())

print(dataset.mapping())
'''
user_features = dataset.build_user_features(((x['Id'], [x['Motivatie']])
                                             for x in qd.getProfiles()))
print(user_features)
'''

# Split the set in train and test.
# NOTE(review): random_train_test_split returns (train, test) in that order,
# so this unpacking binds the larger split to `test` — confirm the swap is
# intentional before relying on these names.
test, train = random_train_test_split(interactions,
                                      test_percentage=0.2,
                                      random_state=None)

# Start training the model
print("--- Start model training ---")
model = LightFM(no_components=1, learning_rate=0.027, loss='warp')
model.fit(train,
          item_features=item_features,
          epochs=100,
          num_threads=4,
          verbose=False)
# model.fit(train,epochs=12,num_threads=4)

# A second model without item features, for comparison.
modelnofeatures = LightFM(no_components=1, learning_rate=0.027, loss='warp')
modelnofeatures.fit(train, epochs=100, num_threads=4, verbose=False)
Beispiel #6
0
# NOTE(review): fragment — the pivot call producing `ratings_df` starts above
# this chunk; the line below is its tail.
                      values='playCountScaled')
ratings = ratings_df.fillna(0).values
# Fraction of non-zero entries, in percent.
sparsity = float(len(
    ratings.nonzero()[0])) / (ratings.shape[0] * ratings.shape[1]) * 100

X = csr_matrix(ratings)
n_users, n_items = ratings_df.shape
user_ids = ratings_df.index.values
artist_names = ap.sort_values("artistID")["name"].unique()

# Rebuild interactions through LightFM's Dataset from the COO triples.
Xcoo = X.tocoo()
data = Dataset()
data.fit(np.arange(n_users), np.arange(n_items))
interactions, weights = data.build_interactions(
    zip(Xcoo.row, Xcoo.col, Xcoo.data))
train, test = random_train_test_split(interactions)

model = LightFM(learning_rate=0.05, loss='warp')
model.fit(train, epochs=10, num_threads=2)

# Generating the list of artists at start-up: one HTML checkbox per artist.
artIDs = ap['artistID'].unique()
numarts = len(ap['artistID'].unique())
listart = ""
for it, artName in enumerate(ap['name'].unique()):
    listart = listart + '<input type="checkbox" name="' + str(
        artIDs[it]) + '" value="' + str(artName) + '">' + artName + '<br>'
# get_recommendation from Jupyter notebook:
def get_recommendation(userid, ratings=ratings):
    """Fit and return a WARP LightFM model on the module-level train split.

    NOTE(review): ``userid`` and ``ratings`` are currently unused — the model
    is always fitted on the global ``train`` matrix; confirm against callers.
    """
    item_alpha = 1e-6  # regularisation strength recommended by LightFM

    # WARP models generally give the best ranking performance.
    warp_model = LightFM(loss='warp',
                         item_alpha=item_alpha,
                         no_components=NUM_COMPONENTS)

    # fit() returns the model itself, so this can be returned directly.
    return warp_model.fit(train, epochs=num_epochs, num_threads=NUM_THREADS)


if __name__ == "__main__":
    # Build the user/hike interaction matrix and binarise at a 2.5-star cutoff.
    hike_user_rating_matrix = gen_collabfilt_matrix(
        hike_data)  # generate interaction matrix
    df = convert_to_binary(hike_user_rating_matrix,
                           2.5)  # binarize interaction matrix

    # Fit model
    # NOTE(review): `df` is computed above but `interaction_matrix` (defined
    # elsewhere) is passed here — confirm which matrix is intended.
    dataset, interactions = lightfm_implicit_matrix(interaction_matrix)
    # Create training/test set (fixed seed for reproducibility)
    train, test = cross_validation.random_train_test_split(
        interactions,
        test_percentage=0.2,
        random_state=np.random.RandomState(seed=1))
    # Train model
    model = lightfm_train(train, 30, 30)
    print('Great job! You trained your model!')
Beispiel #8
0
def run_learning_curve(test_fraction, max_epoch):
    """Trace precision/recall learning curves for cold- vs warm-start models.

    Trains one model without user features (cold start, _cs) and one with
    them (warm start, _ws) for an increasing number of epochs, evaluates
    both after each training budget, and writes the curves to
    ``data/validation/df.epoch.csv``.

    Parameters
    ----------
    test_fraction : fraction of interactions held out for evaluation.
    max_epoch : upper bound on training epochs; budgets 0, 2, 4, ...
        up to (but excluding) max_epoch are evaluated.
    """
    # create data_train
    data = Dataset(user_identity_features=True)

    # user features plus their names (used to size the warm-start model)
    user_features, user_feature_names = get_user_features()

    # map user_id, post_id and user features to internal indices
    data.fit((x['user_id'] for x in get_data()),
             (x['post_id'] for x in get_data()),
             user_features=user_features)

    num_users, num_items = data.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))

    # ---------------------------
    # Building the interactions matrix
    # ---------------------------
    (interactions, weights) = data.build_interactions(
        (x['user_id'], x['post_id']) for x in get_data())
    print(repr(interactions))

    # retrieve mapping from dataset
    user_id_map, user_feature_map, item_id_map, item_feature_map = data.mapping()

    # split test and train
    interaction_train, interaction_test = cross_validation.random_train_test_split(
        interactions, test_fraction)

    # ---------------------------
    # train models
    # ---------------------------
    model_cs = LightFM(learning_rate=0.05, loss='warp')
    model_ws = LightFM(learning_rate=0.05, loss='warp',
                       no_components=len(user_feature_names))

    precision_cs, precision_ws = [], []
    recall_cs, recall_ws = [], []

    for epoch in range(max_epoch // 2):
        n_epochs = 2 * epoch
        # LightFM.fit() re-initialises the model each call, so every
        # iteration trains from scratch with a larger epoch budget —
        # exactly what a learning curve needs.
        model_cs.fit(interaction_train, epochs=n_epochs)
        model_ws.fit(interaction_train, user_features=user_features,
                     epochs=n_epochs)

        # precision/recall on the held-out set, excluding train interactions
        precision_at_k_cs = evaluation.precision_at_k(
            model_cs, interaction_test, interaction_train)
        precision_at_k_ws = evaluation.precision_at_k(
            model_ws, interaction_test, interaction_train,
            user_features=user_features)

        recall_at_k_cs = evaluation.recall_at_k(
            model_cs, interaction_test, interaction_train)
        recall_at_k_ws = evaluation.recall_at_k(
            model_ws, interaction_test, interaction_train,
            user_features=user_features)

        # mean over users (equivalent to the former sum(...)/len(...))
        precision_cs.append(precision_at_k_cs.mean())
        precision_ws.append(precision_at_k_ws.mean())
        recall_cs.append(recall_at_k_cs.mean())
        recall_ws.append(recall_at_k_ws.mean())

    df_result = pd.DataFrame({
        "precision_cs": precision_cs,
        "precision_ws": precision_ws,
        "recall_cs": recall_cs,
        "recall_ws": recall_ws,
    })

    # save to file
    df_result.to_csv("data/validation/df.epoch.csv", index=False)

    return
Beispiel #9
0
# NOTE(review): script fragment — `resume_embeddings` and `job_embeddings`
# are loaded earlier in the original file.
resume_text = pd.read_csv("~/data/Candidate Report_tokenized.csv").fillna('')
####### prepare item features and user features
# item features: one dense embedding row per resume ID
resume_embeddings.set_index("ID", inplace=True)
resume_features_sparse = sparse.csr_matrix(resume_embeddings.values)

job_embeddings.set_index("ID", inplace=True)
job_features_sparse = sparse.csr_matrix(job_embeddings.values)

# read the interaction matrix
# interaction_sparse = sparse.load_npz('data/interaction_v4.npz')
interaction_sparse = sparse.load_npz('data/interaction_v5.npz')
# replace NaNs in place to avoid duplicating the (large) data array
interaction_sparse.data = np.nan_to_num(interaction_sparse.data, copy=False)

# train test split for cv
train, test = random_train_test_split(interaction_sparse, test_percentage=0.3, random_state = None)

# free memory before model training
del job_embeddings
del resume_embeddings
del interaction_sparse
gc.collect()

##### create and train LightFM model ######
NUM_THREADS = 4
NUM_COMPONENTS = 30
NUM_EPOCHS = 50
ITEM_ALPHA = 1e-6  # regularisation strength
K_num = 5
model = LightFM(loss='warp'
def preprocess():
    """Load the review CSVs, build the LightFM dataset and feature matrices.

    Returns
    -------
    (train, test, data_business, dataset, user_features, item_features)
    where train/test are disjoint sparse interaction splits.
    """
    import pandas as pd
    import math
    import numpy as np

    data_users = pd.read_csv('users_tag.csv', index_col=0)
    data_business = pd.read_csv('business_Nora.csv', index_col=0)
    data_review = pd.read_csv('reviews_cleaned.csv', index_col=0)

    # Tame skewness with log(1 + x).  np.log1p keeps the original index;
    # the previous list-comprehension + pd.Series rebuild used a fresh
    # RangeIndex, which could misalign on assignment since the frames are
    # read with index_col=0.
    data_users.review_count = np.log1p(data_users.review_count)
    data_users.useful = np.log1p(data_users.useful)

    # clean business skewness
    data_business.review_count = np.log1p(data_business.review_count)

    from lightfm.data import Dataset

    # model establishment
    dataset = Dataset()
    dataset.fit(data_review.user_id, data_review.business_id)
    num_users, num_items = dataset.interactions_shape()

    # register item features: rating, review count, then the tag columns
    dataset.fit_partial(items=data_business.business_id,
                        item_features=['stars'])
    dataset.fit_partial(items=data_business.business_id,
                        item_features=['review_count'])

    tar_cols = list(data_business.columns[24:])
    dataset.fit_partial(items=data_business.business_id,
                        item_features=tar_cols)

    # user feature columns (these must exist in users_tag.csv)
    user_cols = ['review_count', 'useful',
                 'Ice Cream & Frozen Yogurt', 'Korean', 'Tapas/Small Plates',
                 'Vietnamese', 'Vegan', 'Caribbean', 'Food Delivery Services', 'Lounges',
                 'Pubs', 'Greek', 'Cocktail Bars', 'Mexican', 'Wine Bars', 'Tea Rooms',
                 'Delis', 'Vegetarian', 'Ethnic Food', 'Salad', 'Seafood', 'Beer',
                 'American (New)', 'Juice Bars & Smoothies', 'Shopping', 'Barbeque',
                 'Sports Bars', 'French', 'Chicken Wings', 'Gastropubs', 'Diners',
                 'Gluten-Free', 'Thai', 'Comfort Food', 'Health Markets', 'Halal',
                 'Caterers', 'Arts & Entertainment']

    dataset.fit_partial(users=data_users.user_id,
                        user_features=user_cols)

    print("Building Interactions")
    (interactions, weights) = dataset.build_interactions(
        (x['user_id'], x['business_id'], x['stars'])
        for index, x in data_review.iterrows())
    print("Interactions Build")

    # The original file had two byte-identical nested helpers
    # (build_dict / user_build_dict); one shared helper replaces both.
    def _reweight_tags(row, cols, fixed_weights):
        """Rescale the row's tag values so that, together with the fixed
        feature weights, they sum to the intended total; returned unchanged
        when all tag values are zero."""
        tags = {col: row[col] for col in cols}
        total = sum(tags.values())  # sum of all the tfidf values
        if total == 0:
            return tags
        # weight for each tag so the feature vector sums to 1 overall
        w = (2 - sum(fixed_weights)) / total
        return {key: value * w for key, value in tags.items()}

    # column maxima used to normalise values into [0, 1]
    max_star = max(data_business.stars)
    max_b_rc = max(data_business.review_count)
    print('max_b_rc')
    print(max_b_rc)

    # Give CF info weight 0.5, all other 0.5; within "other", stars and
    # review count get 0.25 each and the tags share the remainder.
    item_features = dataset.build_item_features(
        ((x['business_id'],
          {'stars': 0.5 * x['stars'] / max_star,
           'review_count': 0.5 * x['review_count'] / max_b_rc,
           **_reweight_tags(x, tar_cols,
                            [0.5 * x['stars'] / max_star,
                             0.5 * x['review_count'] / max_b_rc])})
         for index, x in data_business.iterrows()))

    max_u_rc = max(data_users.review_count)
    max_useful = max(data_users.useful)
    user_features = dataset.build_user_features(
        ((x['user_id'],
          {'review_count': 0.35 * x['review_count'] / max_u_rc,
           'useful': 0.35 * x['useful'] / max_useful,
           **_reweight_tags(x, user_cols,
                            [0.35 * x['review_count'] / max_u_rc,
                             0.35 * x['useful'] / max_useful])})
         for index, x in data_users.iterrows()))

    # train-test split (other seeds previously tried: 12345, 101, 186)
    seed = 123
    from lightfm.cross_validation import random_train_test_split
    train, test = random_train_test_split(
        interactions,
        test_percentage=0.2,
        random_state=np.random.RandomState(seed))

    print('The dataset has %s users and %s items, '
          'with %s interactions in the test and %s interactions in the training set.'
          % (train.shape[0], train.shape[1], test.getnnz(), train.getnnz()))

    # Previously this check's result was silently discarded; assert it so an
    # overlapping split fails loudly.
    assert train.multiply(test).nnz == 0, "train and test sets overlap"
    return train, test, data_business, dataset, user_features, item_features
Beispiel #11
0
def auto_tune_parameter(k,
                        interactions,
                        model,
                        data,
                        param1,
                        param_type="components",
                        user_features=None,
                        item_features=None):
    """
    Function that identifies the optimal values of parameters which maximizes performance of model

    Parameters:
    - k: Number of folds used to tune parameters
    - interactions: matrix of interactions between users and artists
    - model: specified model used in recommender system
    - data: sparse user-item matrix
    - param1: list of values to try for hyperparameter
    - param_type: name of the parameter we want to optimize;
        options are:
        - "components"
        - "learning rate"
        - "loss function"
    - user_features: parameter used for evaluating and fitting the LightFM model.
    - item_features: parameter used for evaluating and fitting the LightFM model.

    Ouput:
    - max_recall_list: a list of k tuples, one for each fold.
        each tuple is in the form (max_recall,max_first_param,max_precision,max_coverage)
        which records the best recall, and the param that achieved it,
        and the max_precision and max_coverage achieved (which may be from different param values).
    - heatmap_list: a list of k heatmaps of the recall values for the tested
        parameter (one heatmap per fold). Useful for visualizations

    Raises:
    - ValueError: if param_type is not one of the supported options.
    """

    def _build_model(value):
        # Construct a model with `value` substituted for the tuned parameter.
        # Previously an unknown param_type left `usemodel` unbound and the
        # function crashed later with NameError; fail fast instead.
        if param_type == "components":
            return model(learning_rate=0.05, no_components=value, loss='warp')
        elif param_type == "learning_rate":
            return model(learning_rate=value, no_components=50, loss='warp')
        elif param_type == "loss_function":
            return model(learning_rate=0.05, no_components=50, loss=value)
        raise ValueError("Unknown param_type: {!r}".format(param_type))

    # Train model
    # Create list of MAX Recall depending on # params
    max_recall_list = [
    ]  # will end up being length k list of tuples of best param values
    heatmap_list = []
    # Hold out a final test split, then derive k train/tune splits from the rest.
    train_and_tune, test = cross_validation.random_train_test_split(
        data, test_percentage=.2, random_state=None)
    train_list = []
    tune_list = []
    for i in range(k):
        trainvals, tunevals = cross_validation.random_train_test_split(
            train_and_tune, test_percentage=.2, random_state=None)
        train_list.append(trainvals)
        tune_list.append(tunevals)
    test_recall = 0
    test_first_param = param1[0]
    # create recall matrix storing for each combination of params
    for fold in range(
            k):  # For each fold; there are k-1 folds within train_and_tune
        recall_heatmap = [0 for _ in range(len(param1))]
        train = train_list[fold]
        tune = tune_list[fold]
        # initialize best value of first_param for this fold
        max_first_param = param1[0]
        max_recall = 0
        max_precision = 0
        max_coverage = 0
        print("Fitting fold number...", fold)
        for value1_index, value1 in enumerate(param1):
            print("Trying ", (value1))
            usemodel = _build_model(value1)
            usemodel.fit(train,
                         user_features=user_features,
                         item_features=item_features,
                         epochs=25)
            coverage, precision, recall = evaluate_lightfm(
                usemodel,
                data,
                train,
                tune,
                item_features=item_features,
                user_features=user_features)

            print(value1_index)
            recall_heatmap[value1_index] = recall  # update heatmap
            # update maximum values
            max_precision = max(max_precision, precision)
            max_coverage = max(max_coverage, coverage)
            if recall > max_recall:
                max_recall = recall
                max_first_param = value1
        max_recall_list.append(
            [max_recall, max_first_param, max_precision, max_coverage])
        if max_recall > test_recall:
            print("Fold ", fold, " beat the record for recall!")
            print("New best recall is ", max_recall)
            print("New best param is ", (max_first_param))
            test_recall = max_recall
            test_first_param = max_first_param
        heatmap_list.append(recall_heatmap)
        print("end of fold---------------------------")

    # Now, test_first_param should be optimized; re-fit on the full
    # train+tune data and evaluate once on the held-out test split.
    usemodel = _build_model(test_first_param)
    usemodel.fit(train_and_tune,
                 user_features=user_features,
                 item_features=item_features,
                 epochs=25)
    final_coverage, final_precision, final_recall = evaluate_lightfm(
        usemodel,
        data,
        train_and_tune,
        test,
        user_features=user_features,
        item_features=item_features)

    print("The recall on the test set is ", final_recall,
          ", after hyperparameter optimization")
    print("The precision on the test set is ", final_precision,
          ", after hyperparameter optimization")
    print("The coverage on the test set is ", final_coverage,
          ", after hyperparameter optimization")

    return max_recall_list, heatmap_list
Beispiel #12
0
# NOTE(review): fragment — the loop header that scans `lines` to find the
# matrix extent starts above this chunk.
    temp_rows,temp_columns,rate=line.split("\t")
    rows=max(rows,int(temp_rows))
    columns=max(columns,int(temp_columns))

# Dense 0/1 interaction matrix sized by the largest user/item ids seen.
arr_train=np.zeros([rows+1,columns+1])
for line in lines:
    line=line.strip()
    temp_rows,temp_columns,rate=line.split("\t")
    if rate=="1":
        #print(temp_rows,temp_columns)
        arr_train[int(temp_rows),int(temp_columns)]=1

df_data=pd.DataFrame(arr_train,index=list(range(rows+1)),columns=list(range(columns+1)))
data1 = csr_matrix(df_data)
data1.toarray()
# Fixed seed so the 60/40 split is reproducible.
train,test=random_train_test_split(data1,test_percentage=0.4, random_state=np.random.RandomState(1))


# Pre-trained entity embeddings as item features (unused by the active fit below).
arr_itemfeature=np.load("entity_embedding.npy")[:columns+1,:]
#arr_itemfeature*=10
df_itemfeature=pd.DataFrame(arr_itemfeature,index=list(range(columns+1)),columns=list(range(20)))
data_feature=csr_matrix(df_itemfeature)
data_feature.toarray()

#model.fit(train,item_features=data_feature,epochs=50,verbose=True)
model.fit(train,epochs=10)
#print(auc_score(model,test,item_features=data_feature).mean())
print(auc_score(model,test).mean())
y_true=[]
y_predict=[]
max_rate=0
def train_val_split(csr_mat):
    """Split a sparse interaction matrix into 80% train / 20% validation."""
    # random_train_test_split already returns a (train, validation) pair.
    return random_train_test_split(csr_mat, test_percentage=0.2)
    def randomized_search(self,
                          params,
                          metric='auc',
                          max_iterations=None,
                          max_epochs=50,
                          early_stopping=False,
                          use_weights=False):
        """
        Standard randomized search method to select the hyper-parameters that result in the highest score on the test
        set. Each iteration will sample one of the possible combinations of hyper-parameters.
        Uses ParameterGrid class from scikit-learn in order to create an iterable of all possible hyper-parameter
        combinations.
        The user can supply a max_iterations value that will stop the search once said number of combinations has been
        reached. Furthermore, early_stopping can be set to True to stop the training of a particular model when the test
        score has stopped improving, which is particularly useful when overfitting.
        :param params:(dict, required) - dictionary of parameters to test, {parameter: [list of values to try]}
        :param metric:(string, optional) - metric to use to pick the best model
        :param max_iterations:(int, optional) - if provided, the hyper-parameter optimization will stop after this many
               tests, irrespective of len(ParameterGrid(params))
        :param max_epochs:(int, optional) - max number of epochs to train each model
        :param early_stopping:(bool, optional) - if True, the training of a model will be partial and will stop after 5
               epochs of non-improvement on the test score; the model will then be re-trained using the optimal number
               of epochs
        :param use_weights:(bool, optional) - if True, the training procedure will use weights to value repeated
               interactions more
        """
        # Raise an error if any of the parameters supplied is not one of the arguments used by self.init_model
        valid_params = self.init_model.__code__.co_varnames
        # Generator expression instead of a throwaway list; iterating the
        # dict directly yields its keys.
        if any(x not in valid_params for x in params):
            raise ValueError(
                "One of the hyper-parameters supplied is invalid. Please make sure there are no typos."
            )

        # Reset best values
        self.best_model = None
        self.best_params = None
        self.best_score = 0

        # create train and test datasets
        (train_set, test_set) = random_train_test_split(self._interactions,
                                                        test_percentage=0.2)
        if use_weights and self._weights is not None:
            # Look up the weight of every retained (user, item) pair so the
            # training weights stay aligned with the new train split.
            weights_csr = self._weights.tocsr()
            data = [
                weights_csr[u, i] for u, i in zip(train_set.row, train_set.col)
            ]

            train_weights = sp.coo_matrix(
                (data, (train_set.row, train_set.col)),
                shape=self._weights.shape,
                dtype=self._weights.dtype)
        else:
            train_weights = None

        # Create ParameterGrid instance to be iterated and cast it to list
        grid = list(ParameterGrid(params))
        # If max_iterations has not been provided then test all parameter combinations
        if not max_iterations:
            max_iterations = len(grid)
        # Shuffle the list and pop out and remove the last element
        random.shuffle(grid)
        test_params = grid.pop()
        test_params_idx = 1

        start_time = time.time()

        while test_params and test_params_idx <= max_iterations:
            # Initialize model with current combination of hyper-parameters to be tested
            self.init_model(**test_params)

            if early_stopping:
                best_iter = 0
                best_score = 0
                iters_no_improvement = 0
                # Train the model for max_epochs, evaluating it at each step
                for i in range(max_epochs):
                    self.train(train_set,
                               sample_weight=train_weights,
                               partial=True)
                    test_score = self.evaluate_model(self.model, metric,
                                                     test_set, train_set)
                    if test_score > best_score:
                        best_iter = i + 1
                        best_score = test_score
                        iters_no_improvement = 0
                    else:
                        iters_no_improvement += 1
                        # If the test score has not improved in the last 5 epochs stop the training
                        if iters_no_improvement == 5:
                            break

                # If the last epoch did not result in the highest test score, re-train the model for the optimal number
                # of epochs
                # NOTE(review): if the score never improves at all, best_iter
                # stays 0 and the re-train runs with epochs=0 — confirm this
                # edge case is acceptable.
                if best_iter != max_epochs:
                    self.init_model(**test_params)
                    self.train(train_set,
                               sample_weight=train_weights,
                               epochs=best_iter)
                    test_score = self.evaluate_model(self.model, metric,
                                                     test_set, train_set)

            else:
                self.train(train_set,
                           sample_weight=train_weights,
                           epochs=max_epochs)
                test_score = self.evaluate_model(self.model, metric, test_set,
                                                 train_set)

            # If the test score achieved by this model was the highest so far, set the class variables accordingly
            if test_score > self.best_score:
                self.best_model = self.model
                self.best_params = test_params
                self.best_score = test_score

            random.shuffle(grid)
            if grid:
                test_params = grid.pop()
            else:
                test_params = None

            elapsed_time = (time.time() - start_time) / 60

            print(
                'Hyperparameters tested: {}/{}; {} score: {}; total time: {:.2f} minutes'
                .format(test_params_idx, max_iterations, metric, test_score,
                        elapsed_time))
            test_params_idx += 1

        print(
            'The best model achieved a {} score of {} on the test set, with parameters {}'
            .format(metric, self.best_score, self.best_params))
"""


"""# Recommender System

### LightFM Implementation
"""

# NOTE(review): `pip install ...` is notebook shell syntax and a SyntaxError in
# a plain .py module — commented out; install dependencies outside the module.
# pip install -qq lightfm

from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
# Explicit import instead of `import *` (PEP 8): only auc_score is used below.
from lightfm.evaluation import auc_score

# Hold out 10% of the interactions for evaluation (fixed seed → reproducible).
# NOTE(review): `interactions`, `user_features` and `item_features` are assumed
# to be defined earlier in the notebook — confirm against the full file.
train, test = random_train_test_split(interactions, test_percentage=0.1, random_state=42)

"""##### Define LightFM model"""

# WARP-loss hybrid model with light L2 regularisation on both embedding sets.
hybrid = LightFM(no_components=32, random_state=42, loss='warp',
                 item_alpha=1e-06, user_alpha=1e-06)
# Positional args 2 and 3 of LightFM.fit are user_features / item_features.
hybrid.fit(train, user_features, item_features, epochs=10, num_threads=4, verbose=True)

"""##### Evaluation: AUC score

"""

# Train AUC measures fit quality; for test AUC the train interactions are
# excluded from the ranking so known items do not inflate the score.
hybrid_train_auc = auc_score(hybrid, train, item_features=item_features, user_features=user_features, num_threads=4)
hybrid_test_auc = auc_score(hybrid, test, train_interactions=train, item_features=item_features, user_features=user_features, num_threads=4)

print('Hybrid model train AUC score: %.5f' % hybrid_train_auc.mean())
print('Hybrid model test AUC score: %.5f' % hybrid_test_auc.mean())
Beispiel #16
0
def run_validation(test_fraction, max_val):
    """Compare cold-start vs. warm-start LightFM models over repeated splits.

    Runs ``max_val`` validation iterations. Each iteration re-splits the
    interaction data (``test_fraction`` held out), trains a cold-start model
    (no user features) and a warm-start model (with user features), and
    records precision@k, recall@k and AUC for both. Per-iteration averages
    are printed and written to ``data/validation/df.csv``.

    Args:
        test_fraction: fraction of interactions held out for testing.
        max_val: number of validation iterations to run.

    Returns:
        None. Results are persisted to disk as a side effect.
    """

    # Containers holding one averaged score per iteration
    # (cs = cold start, ws = warm start).
    ave_precision_at_k_cs   = []
    ave_recall_at_k_cs      = []
    ave_auc_score_cs        = []

    ave_precision_at_k_ws   = []
    ave_recall_at_k_ws      = []
    ave_auc_score_ws        = []

    # perform validation
    validation_itr = 0

    while (validation_itr < max_val):

        print("Start validating cold, warm start, iteration %s" % validation_itr)

        # Prevent a random failure from aborting the entire job; note that a
        # failed iteration still counts toward max_val because the counter is
        # incremented before the work is done.
        try:

            # count
            validation_itr += 1

            # create data_train
            data_cs = Dataset()
            data_ws = Dataset(user_identity_features=True)

            # user features
            user_features, user_feature_names = get_user_features()
            print(user_feature_names)

            # create map between user_id, post_id, user_features and internal indices
            data_cs.fit((x['user_id'] for x in get_data()), (x['post_id'] for x in get_data()))
            data_ws.fit((x['user_id'] for x in get_data()), (x['post_id'] for x in get_data()), user_features=user_features)

            # print shape
            num_users, num_items = data_ws.interactions_shape()
            print('Num users: {}, num_items {}.'.format(num_users, num_items))

            #---------------------------
            # Building the interactions matrix
            #---------------------------
            # create interaction matrix to optimize
            (interactions_cs, weights_cs) = data_cs.build_interactions(((x['user_id'], x['post_id'])) for x in get_data())
            (interactions_ws, weights_ws) = data_ws.build_interactions(((x['user_id'], x['post_id'])) for x in get_data())
            print(repr(interactions_ws))

            # retrieve mapping from dataset
            user_id_map_cs, user_feature_map_cs, item_id_map_cs, item_feature_map_cs = data_cs.mapping()
            user_id_map_ws, user_feature_map_ws, item_id_map_ws, item_feature_map_ws = data_ws.mapping()

            # split test and train (second positional arg is test_percentage)
            interaction_train_cs, interaction_test_cs = cross_validation.random_train_test_split(interactions_cs, test_fraction)
            interaction_train_ws, interaction_test_ws = cross_validation.random_train_test_split(interactions_ws, test_fraction)

            #---------------------------
            # train model
            #---------------------------
            # NOTE(review): tying no_components to len(user_feature_names) looks
            # intentional but is unusual — confirm this is the desired latent dim.
            model_cs  = LightFM(learning_rate=0.05, loss='warp')
            model_ws  = LightFM(learning_rate=0.05, loss='warp', no_components=len(user_feature_names))

            model_cs.fit(interaction_train_cs, epochs=30)
            model_ws.fit(interaction_train_ws, user_features=user_features, epochs=30)

            #---------------------------
            # make predictions
            #---------------------------
            # Third positional arg is train_interactions (excluded from ranking).
            precision_at_k_cs = evaluation.precision_at_k(model_cs, interaction_test_cs, interaction_train_cs)
            recall_at_k_cs = evaluation.recall_at_k(model_cs, interaction_test_cs, interaction_train_cs)
            auc_score_cs = evaluation.auc_score(model_cs, interaction_test_cs, interaction_train_cs)

            precision_at_k_ws = evaluation.precision_at_k(model_ws, interaction_test_ws, interaction_train_ws, user_features=user_features)
            recall_at_k_ws = evaluation.recall_at_k(model_ws, interaction_test_ws, interaction_train_ws, user_features=user_features)
            auc_score_ws = evaluation.auc_score(model_ws, interaction_test_ws, interaction_train_ws, user_features=user_features)

            # append the per-user mean score from this iteration to the results
            ave_precision_at_k_cs.append(sum(precision_at_k_cs) / len(precision_at_k_cs))
            ave_recall_at_k_cs.append(sum(recall_at_k_cs) / len(recall_at_k_cs))
            ave_auc_score_cs.append(sum(auc_score_cs) / len(auc_score_cs))

            ave_precision_at_k_ws.append(sum(precision_at_k_ws) / len(precision_at_k_ws))
            ave_recall_at_k_ws.append(sum(recall_at_k_ws) / len(recall_at_k_ws))
            ave_auc_score_ws.append(sum(auc_score_ws) / len(auc_score_ws))

        except Exception as exc:
            # Catch only ordinary errors — a bare `except:` would also swallow
            # KeyboardInterrupt/SystemExit — and report the reason instead of
            # silently discarding it.  (Also fixes the "teration" typo.)
            print("Iteration %s failed (%s). Skipping.." % (validation_itr, exc))

    print("Validation score for test")
    print(ave_precision_at_k_cs)
    print(ave_recall_at_k_cs)
    print(ave_auc_score_cs)
    print(ave_precision_at_k_ws)
    print(ave_recall_at_k_ws)
    print(ave_auc_score_ws)

    df_result = pd.DataFrame({
        'precision_at_k_cs': ave_precision_at_k_cs,
        'recall_at_k_cs': ave_recall_at_k_cs,
        'auc_score_cs': ave_auc_score_cs,
        'precision_at_k_ws': ave_precision_at_k_ws,
        'recall_at_k_ws': ave_recall_at_k_ws,
        'auc_score_ws': ave_auc_score_ws,
        })

    # save to file
    df_result.to_csv("data/validation/df.csv", index=False)

    return
Beispiel #17
0
                                   , how="left"
                                   , on="ID")
# Keep a single row per ID — the left-join above can duplicate keys.
pos4_full_tfidf.drop_duplicates(subset="ID", inplace=True)
# Dummy columns missing for an ID come back as NaN; treat them as 0.
pos4_full_tfidf = pos4_full_tfidf.fillna(value=0)

# Same joining/cleanup for position 5 features.
pos5_full_tfidf = pos5_tfidf.merge(all_dummies
                                   , how="left"
                                   , on="ID")
pos5_full_tfidf.drop_duplicates(subset="ID", inplace=True)
pos5_full_tfidf = pos5_full_tfidf.fillna(value=0)

### Convert data to sparse matrix and split for cv###
# ID becomes the (implicit) row index; only feature values go into the matrix.
pos1_spr = sp.sparse.csr_matrix(pos1_full_tfidf.set_index("ID").values)

# Hold out 25% of the entries; random_state=None => split differs per run.
pos1_train, pos1_test = random_train_test_split(pos1_spr
                                                , test_percentage=0.25
                                                , random_state = None)

### create and train LightFM model ###
NUM_THREADS = 4
NUM_COMPONENTS = 5
NUM_EPOCHS = 30
ITEM_ALPHA = 1e-6  # L2 regularisation on item embeddings

pos1_model = LightFM(loss='warp'
                    , item_alpha=ITEM_ALPHA
                    , no_components=NUM_COMPONENTS)


# NOTE(review): `%time` is an IPython magic — this line only runs in a notebook.
%time pos1_model = pos1_model.fit(pos1_train, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)
Beispiel #18
0
def init_movielens(path,
                   min_rating=0.0,
                   k=3,
                   item_features=None,
                   cluster_n=18,
                   model='vgg19',
                   test_percentage=0.2):
    """Load MovieLens ratings and build LightFM train/test datasets.

    Reads ``<path>/ratings.csv``, keeps ratings >= ``min_rating``, splits the
    interactions with a fixed seed, and optionally attaches genre and/or
    poster-cluster item features.

    Args:
        path: directory containing ``ratings.csv`` (and feature files).
        min_rating: minimum rating for an interaction to be kept.
        k: threshold used by ``__info_no_of_min_interactions`` reporting.
        item_features: optional list drawn from {'genres', 'clusters'}.
        cluster_n: number of poster clusters (only used with 'clusters').
        model: CNN name passed to the poster-clustering helper.
        test_percentage: fraction of interactions held out for testing.

    Returns:
        dict with keys 'train', 'test', 'train-mapping' and 'item_features'
        (the last is None when no item features were requested).
    """
    valid_item_features = {'genres': 'genres', 'clusters': 'clusters'}
    if item_features is not None:
        # NOTE(review): `assert` is stripped under `python -O`; keep in mind if
        # this validation must always run.
        assert all(item in valid_item_features.values() for item in item_features), \
            'Your specified item features is invalid. You have to use one or more of this: ' \
            + ', '.join(valid_item_features)

    train_dataset = Dataset()
    test_dataset = Dataset()

    data = dict()
    min_interactions = dict()  # user_id -> interaction count (for reporting)

    with open(path + '/ratings.csv', 'r') as ratings_file:
        reader = csv.reader(
            ratings_file,
            delimiter=',',
        )
        next(reader)  # skip header

        ratings = []
        users = set()
        items = set()
        for row in reader:
            user_id = int(row[0])
            item_id = int(row[1])

            # All users/items are registered even when the rating is filtered
            # out below, so the id space stays stable.
            users.add(user_id)
            items.add(item_id)

            rating = float(row[2])

            if rating >= min_rating:
                ratings.append((user_id, item_id, rating))
                __add_interaction(min_interactions, user_id)

        __info_no_of_min_interactions(
            k, 'No of interactions per user overall ==> ', min_interactions)

        users = list(users)
        items = list(items)

        # Build a COO matrix directly from the raw ids as row/col indices.
        # NOTE(review): assumes user/item ids are non-negative ints small
        # enough to act as matrix indices — confirm for this dataset.
        users_column, items_column, ratings_column = zip(*ratings)
        ratings = sparse.coo_matrix(
            (ratings_column, (users_column, items_column)))

        # Fixed seed (7) => reproducible split across runs.
        ratings_train, ratings_test = random_train_test_split(
            ratings,
            test_percentage=test_percentage,
            random_state=np.random.RandomState(7))

        # Two zip iterators per split: each is single-use, one consumed by the
        # counting helper below and one by build_interactions. The order of
        # consumption matters — do not reorder these statements.
        ratings_train_to_count = zip(ratings_train.row, ratings_train.col,
                                     ratings_train.data)
        ratings_train = zip(ratings_train.row, ratings_train.col,
                            ratings_train.data)

        ratings_test_to_count = zip(ratings_test.row, ratings_test.col,
                                    ratings_test.data)
        ratings_test = zip(ratings_test.row, ratings_test.col,
                           ratings_test.data)

        min_interactions = __count_train_test_min_interactions(
            ratings_train_to_count)
        __info_no_of_min_interactions(
            k, 'No of interactions per user on train ==> ', min_interactions)

        min_interactions = __count_train_test_min_interactions(
            ratings_test_to_count)
        __info_no_of_min_interactions(
            k, 'No of interactions per user on test ==> ', min_interactions)

        # Both datasets are fitted on the full id sets so their internal
        # mappings agree.
        train_dataset.fit(users=users, items=items)
        test_dataset.fit(users=users, items=items)

        (train_interactions,
         train_weights) = train_dataset.build_interactions(ratings_train)
        (test_interactions,
         test_weights) = test_dataset.build_interactions(ratings_test)

        data.update({'train': train_interactions})
        data.update({'test': test_interactions})
        data.update({'train-mapping': train_dataset.mapping()})

    # add item features
    if item_features is not None:
        aggregated_features = []

        if valid_item_features.get('genres') in item_features:
            movie_genres, genres = __init_movies_genres(path)
            aggregated_features.append(movie_genres)

            # Register the feature names and any movies not seen in ratings.
            train_dataset.fit_partial(item_features=genres)
            test_dataset.fit_partial(item_features=genres)

            train_dataset.fit_partial(items=list(movie_genres.keys()))
            test_dataset.fit_partial(items=list(movie_genres.keys()))

        if valid_item_features.get('clusters') in item_features:
            movies_posters_clusters, clusters = __init_movies_posters_clusters(
                path, cluster_n, model=model)
            aggregated_features.append(movies_posters_clusters)

            train_dataset.fit_partial(item_features=clusters)
            test_dataset.fit_partial(item_features=clusters)

            train_dataset.fit_partial(
                items=list(movies_posters_clusters.keys()))
            test_dataset.fit_partial(
                items=list(movies_posters_clusters.keys()))

        # Rebinds `item_features` from the input list to the built matrix.
        aggregated_features = __aggregate_features(aggregated_features)
        item_features = train_dataset.build_item_features(
            ((movie_id, aggregated_features.get(movie_id))
             for movie_id in aggregated_features.keys()))

        # Built for its side effect on test_dataset's mapping; result unused.
        _ = test_dataset.build_item_features(
            ((movie_id, aggregated_features.get(movie_id))
             for movie_id in aggregated_features.keys()))

        data.update({'item_features': item_features})
    else:
        data.update({'item_features': None})

    return data
Beispiel #19
0
from lightfm.evaluation import precision_at_k
from scipy.sparse import identity
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from logger import Logger
#%%[markdown]
# # > Preparation
#%%[markdown]
# ## >> Load data
# In[ ]:
ratings_pivot_csr_filename = "data/intersect-20m/ratings.csr"
# SECURITY NOTE(review): pickle deserialization can execute arbitrary code —
# only load files produced by this project's own pipeline.
# Context manager closes the file handle deterministically; the original
# `pickle.load(open(...))` leaked it.
with open(ratings_pivot_csr_filename, 'rb') as ratings_csr_file:
    ratings_pivot = pickle.load(ratings_csr_file)
#%%[markdown]
# ## >> Split data
# NOTE(review): no random_state is passed, so the split differs between runs —
# confirm whether reproducibility is required here.
train, test = random_train_test_split(ratings_pivot, test_percentage=0.2)
# %%[markdown]
# ## >> User & Item features
# Identity matrices give each user/item its own indicator feature.
# In[ ]:
user_identity = identity(train.shape[0])
item_identity = identity(train.shape[1])
#%%[markdown]
# ## >> Set logger
# Timestamped session folder keeps logs from separate runs apart.
timestamp = str(datetime.timestamp(datetime.now()))

logger = Logger()
session_log_path = "log/{}/".format(timestamp)
logger.create_session_folder(session_log_path)
logger.set_default_filename(session_log_path + "log.txt")
# %%[markdown]