def test_save_and_load_model(self):
    """Check that predictions and ranks survive a save and a reload.

    A model is fit on the fixture data, saved to ``self.test_dir`` and
    queried again; then the session and default graph are discarded and the
    model is reloaded from disk. All three sets of outputs must be
    element-wise identical.
    """
    fitted = TensorRec(n_components=10)
    fitted.fit(self.interactions, self.user_features, self.item_features, epochs=10)

    # Both query methods take the same pair of feature matrices.
    features = dict(user_features=self.user_features, item_features=self.item_features)

    predictions = fitted.predict(**features)
    ranks = fitted.predict_rank(**features)
    fitted.save_model(directory_path=self.test_dir)

    # Saving must leave the in-memory model's outputs unchanged.
    predictions_after_save = fitted.predict(**features)
    ranks_after_save = fitted.predict_rank(**features)
    self.assertTrue((predictions == predictions_after_save).all())
    self.assertTrue((ranks == ranks_after_save).all())

    # Discard the session and graph so the reload starts from scratch.
    set_session(None)
    tf.reset_default_graph()

    # The reloaded model must reproduce the original outputs.
    restored = TensorRec.load_model(directory_path=self.test_dir)
    new_predictions = restored.predict(**features)
    new_ranks = restored.predict_rank(**features)

    self.assertTrue((predictions == new_predictions).all())
    self.assertTrue((ranks == new_ranks).all())
Example #2
0
    def test_save_and_load_model(self):
        """Verify save/load round-trips leave predictions and ranks unchanged.

        Outputs are collected three times: right after fitting, right after
        saving to ``self.test_dir``, and from a model reloaded once the
        session and default graph have been reset.
        """

        def outputs_of(rec):
            # Predictions first, then ranks, both on the fixture features.
            scores = rec.predict(user_features=self.user_features,
                                 item_features=self.item_features)
            positions = rec.predict_rank(user_features=self.user_features,
                                         item_features=self.item_features)
            return scores, positions

        trained = TensorRec(n_components=10)
        trained.fit(self.interactions,
                    self.user_features,
                    self.item_features,
                    epochs=10)

        predictions, ranks = outputs_of(trained)
        trained.save_model(directory_path=self.test_dir)

        # The act of saving must not change what the live model returns.
        predictions_after_save, ranks_after_save = outputs_of(trained)
        self.assertTrue((predictions == predictions_after_save).all())
        self.assertTrue((ranks == ranks_after_save).all())

        # Throw away the session and graph before reloading.
        set_session(None)
        tf.reset_default_graph()

        # A model restored from disk must give the same answers.
        reloaded = TensorRec.load_model(directory_path=self.test_dir)
        new_predictions, new_ranks = outputs_of(reloaded)

        self.assertTrue((predictions == new_predictions).all())
        self.assertTrue((ranks == new_ranks).all())
Example #3
0
 def setUpClass(cls):
     """Create shared fixtures: dummy data, a fitted model and its ranks.

     Data comes from ``generate_dummy_data_with_indicator`` (10 users,
     12 items, interaction density 0.5); the model uses 10 components and
     is fit for 10 epochs.
     """
     dummy_data = generate_dummy_data_with_indicator(
         num_users=10, num_items=12, interaction_density=.5)
     cls.interactions, cls.user_features, cls.item_features = dummy_data

     fitted = TensorRec(n_components=10)
     fitted.fit(cls.interactions, cls.user_features, cls.item_features, epochs=10)

     # Only publish the model on the class once fitting has succeeded.
     cls.model = fitted
     cls.ranks = fitted.predict_rank(user_features=cls.user_features, item_features=cls.item_features)
Example #4
0
    def metric_test(self):
        """Use TensorRec's ``eval`` as the benchmark for the ``metrics`` functions.

        Fits a TensorRec model on generated dummy data, ranks every item for
        every user, and asserts that the mean recall@k and precision@k
        computed by ``metrics`` equal the values computed by ``eval``.

        NOTE(review): the name does not start with ``test``, so default
        unittest discovery will not collect this method -- confirm it is
        invoked explicitly.
        """
        k = 10
        latent_factor = 10
        n_users = 10
        n_items = 12

        interactions, user_features, item_features = util.generate_dummy_data_with_indicator(
            num_users=n_users, num_items=n_items, interaction_density=.5)
        print("interactions shape={}".format(np.shape(interactions)))
        print("user features shape={}".format(np.shape(user_features.toarray())))
        print("item features shape={}".format(np.shape(item_features.toarray())))

        model = TensorRec(n_components=latent_factor)
        model.fit(interactions, user_features, item_features, epochs=19)

        ranks = model.predict_rank(user_features=user_features,
                                   item_features=item_features)
        print("Ranks shape={}".format(np.shape(ranks)))

        # assertEqual reports both shapes on failure, unlike assertTrue(a == b).
        self.assertEqual(np.shape(interactions), np.shape(ranks))

        # Every metric call below shares the same cut-off and row handling.
        at_k = dict(k=k, preserve_rows=False)

        # Benchmark values from TensorRec's own evaluation module.
        tr_recall_result = eval.recall_at_k(predicted_ranks=ranks,
                                            test_interactions=interactions,
                                            **at_k)
        tr_precision_result = eval.precision_at_k(predicted_ranks=ranks,
                                                  test_interactions=interactions,
                                                  **at_k)

        # The metrics under test are given the interactions in CSR form.
        interactions_ = interactions.tocsr()
        recall_result = metrics.recall_at_k(ranks, interactions_, **at_k)
        precision_result = metrics.precision_at_k(ranks, interactions_, **at_k)

        self.assertEqual(tr_recall_result.mean(), recall_result.mean())
        self.assertEqual(tr_precision_result.mean(), precision_result.mean())
Example #5
0
 def setUpClass(cls):
     """Build class-level fixtures shared by every test in the case.

     Stores the generated interactions and feature matrices, a TensorRec
     model (10 components, 10 epochs) fit on them, and that model's
     predicted ranks.
     """
     generated = generate_dummy_data_with_indicator(num_users=10,
                                                    num_items=12,
                                                    interaction_density=.5)
     cls.interactions, cls.user_features, cls.item_features = generated

     rec = TensorRec(n_components=10)
     rec.fit(cls.interactions, cls.user_features, cls.item_features, epochs=10)
     cls.model = rec

     # Ranks are computed once here so individual tests can reuse them.
     feature_kwargs = dict(user_features=cls.user_features,
                           item_features=cls.item_features)
     cls.ranks = rec.predict_rank(**feature_kwargs)
Example #6
0
    def test_predict_fail_unfit(self):
        """Every prediction method must raise ModelNotFitException before fit()."""
        model = TensorRec()

        # Each entry makes one prediction call on the unfit model. They run in
        # this order, and each call (including its attribute lookups) happens
        # inside the assertRaises context.
        unfit_calls = (
            lambda: model.predict(self.user_features, self.item_features),
            lambda: model.predict_rank(self.user_features, self.item_features),
            lambda: model.predict_user_representation(self.user_features),
            lambda: model.predict_item_representation(self.item_features),
            lambda: model.predict_user_attention_representation(self.user_features),
            lambda: model.predict_similar_items(self.item_features,
                                                item_ids=[1],
                                                n_similar=5),
            lambda: model.predict_item_bias(self.item_features),
            lambda: model.predict_user_bias(self.user_features),
        )

        for attempt in unfit_calls:
            with self.assertRaises(ModelNotFitException):
                attempt()
Example #7
0
	def test_save_and_load_model_same_session(self):
		"""Save a fitted model and reload it without resetting the session.

		The reloaded model must produce predictions and ranks that are
		element-wise equal to those of the original model.
		"""
		model = TensorRec(n_components=10)
		model.fit(self.interactions, self.user_features, self.item_features, epochs=10)

		predictions = model.predict(user_features=self.user_features, item_features=self.item_features)
		ranks = model.predict_rank(user_features=self.user_features, item_features=self.item_features)
		model.save_model(directory_path=self.test_dir)

		# Reload the model, predict, and check for equal predictions
		new_model = TensorRec.load_model(directory_path=self.test_dir)
		new_predictions = new_model.predict(user_features=self.user_features, item_features=self.item_features)
		new_ranks = new_model.predict_rank(user_features=self.user_features, item_features=self.item_features)

		# Compare element-wise. Comparing `a.all()` with `b.all()` only checks
		# that both arrays have the same truthiness, which nearly any two
		# arrays satisfy.
		self.assertTrue((predictions == new_predictions).all())
		self.assertTrue((ranks == new_ranks).all())
    def test_save_and_load_model_same_session(self):
        """Reload a saved model in the same session and expect identical outputs."""
        feature_kwargs = dict(user_features=self.user_features, item_features=self.item_features)

        original = TensorRec(n_components=10)
        original.fit(self.interactions, self.user_features, self.item_features, epochs=10)

        predictions = original.predict(**feature_kwargs)
        ranks = original.predict_rank(**feature_kwargs)
        original.save_model(directory_path=self.test_dir)

        # Load straight back from disk; the session is deliberately left alone.
        restored = TensorRec.load_model(directory_path=self.test_dir)
        new_predictions = restored.predict(**feature_kwargs)
        new_ranks = restored.predict_rank(**feature_kwargs)

        # Element-wise equality is required for both outputs.
        self.assertTrue((predictions == new_predictions).all())
        self.assertTrue((ranks == new_ranks).all())
Example #9
0
    ax.scatter(*zip(*movie_positions[movies_to_plot]), s=2)
    ax.set_aspect('equal')

    for i, movie in enumerate(movies_to_plot):
        movie_name = item_titles[movie]
        movie_position = movie_positions[movie]
        # Comment this line to remove movie titles to the plot.
        ax.annotate(movie_name, movie_position[0:2], fontsize='x-small')

    file = '/tmp/tensorrec/movielens/epoch_{}.jpg'.format(epoch)
    plt.savefig(file)

    logging.info("Finished epoch {}".format(epoch))

# Rank all items for all users, then score the ranks against test_interactions.
ranks = model.predict_rank(user_features=user_features, item_features=item_features)
p_at_k = precision_at_k(ranks, test_interactions, k=5)
r_at_k = recall_at_k(ranks, test_interactions, k=30)

logging.info("Precision@5: {}, Recall@30: {}".format(np.mean(p_at_k), np.mean(r_at_k)))

# Gather the per-epoch JPG frames and order them by the epoch number embedded
# in each file name ("epoch_<n>.jpg") so the clip plays in fitting order.
fps = 12
file_list = glob.glob('/tmp/tensorrec/movielens/*.jpg')
file_list.sort(key=lambda x: int(x.split('_')[1].split('.jpg')[0]))
clip = mpy.ImageSequenceClip(file_list, fps=fps)
vid_file = '/tmp/tensorrec/movielens/movielens.mp4'
clip.write_videofile(filename=vid_file,
                     fps=fps,
    ax.scatter(*zip(*user_positions[user_to_plot]), color='r', s=1)
    ax.scatter(*zip(*movie_positions[movies_to_plot]), s=2)
    ax.set_aspect('equal')

    for i, movie in enumerate(movies_to_plot):
        movie_name = item_titles[movie]
        movie_position = movie_positions[movie]
        # Comment this line to remove movie titles to the plot.
        ax.annotate(movie_name, movie_position[0:2], fontsize='x-small')

    file = '/tmp/tensorrec/movielens/epoch_{}.jpg'.format(epoch)
    plt.savefig(file)

    logging.info("Finished epoch {}".format(epoch))

# Final evaluation: rank every item for every user and score against test_interactions.
ranks = model.predict_rank(
    user_features=user_features,
    item_features=item_features,
)
p_at_k = precision_at_k(ranks, test_interactions, k=5)
r_at_k = recall_at_k(ranks, test_interactions, k=30)

logging.info("Precision@5: {}, Recall@30: {}".format(np.mean(p_at_k),
                                                     np.mean(r_at_k)))

# Turn the saved JPG frames into an MP4 of the fitting process, then delete the frames.
fps = 12
file_list = glob.glob('/tmp/tensorrec/movielens/*.jpg')
# Order frames by the epoch number embedded in "epoch_<n>.jpg".
file_list.sort(key=lambda x: int(x.split('_')[1].split('.jpg')[0]))
clip = mpy.ImageSequenceClip(file_list, fps=fps)
vid_file = '/tmp/tensorrec/movielens/movielens.mp4'
clip.write_videofile(
    filename=vid_file,
    fps=fps,
    codec='mpeg4',
    preset='veryslow',
    ffmpeg_params=['-qscale:v', '10'],
)
for file in file_list:
    os.remove(file)
Example #11
0
def main():
    """Cluster customers by RFMV score, train a TensorRec model and export item ranks.

    Steps, in order:

    1. Read ``./data/Transactions.csv``, ``./data/Customer.csv`` and
       ``./data/prod_cat_info.csv``.
    2. Derive recency / frequency / monetary / variety (RFMV) values per
       customer, score them against their quartiles and cluster the total
       score with KMeans.
    3. Build min-max scaled interaction, user-feature and item-feature
       matrices for customers whose recency is under 365 days.
    4. Fit a TensorRec model, print recall@80 for the train and test splits
       and write the predicted ranks to ``./result/ranks_df.csv``.

    NOTE(review): ``now``, ``RecencyScore``, ``FMVScore`` and
    ``interaction_masking`` are not defined in this function -- presumably
    they live at module level; confirm.
    """
    import os  # local import: only needed to create the output directory

    # ---- Load data ----

    ## Master data (interactions)
    masterdf = pd.read_csv('./data/Transactions.csv')
    # Rename the columns to clean up and standardize the data
    masterdf.columns = ['Transaction ID', 'Customer ID', 'Transaction Date', 'Prod Subcat Code',
                        'Prod Cat Code', 'Qty', 'Rate', 'Tax', 'Total Amt', 'Store Type']

    # Encode the store type as integer codes in a new column
    masterdf['Store Type Code'] = pd.factorize(masterdf['Store Type'])[0]

    # Standardize the transaction date through a pandas DatetimeIndex
    masterdf['Date'] = pd.DatetimeIndex(masterdf['Transaction Date'], dayfirst=True).date

    # Net sales from quantity and base rate (tax is excluded because it may differ by city)
    masterdf['Net Sales'] = masterdf['Qty'] * masterdf['Rate']

    # Unique material identifier built from category, subcategory and store type
    masterdf['Material'] = (masterdf['Prod Cat Code'].astype(str) + '-'
                            + masterdf['Prod Subcat Code'].astype(str) + '-'
                            + masterdf['Store Type'].astype(str))

    ## Customer data (customer features)
    custdf = pd.read_csv('./data/Customer.csv')
    custdf.columns = ['Customer ID', 'DOB', 'Gender', 'City Code']

    ## Item feature data
    skudf = pd.read_csv('./data/prod_cat_info.csv')
    skudf.columns = ['Prod Cat Code', 'Prod Cat', 'Prod Sub Cat Code', 'Prod Subcat']

    # ---- Build derived data ----

    ## RECENCY
    recency_df = masterdf.groupby('Customer ID').Date.max().reset_index()
    recency_df.columns = ['Customer ID', 'Last Purchase']
    recency_df['Recency'] = recency_df['Last Purchase'].apply(lambda x: (now - x).days)
    recency_df = recency_df[['Customer ID', 'Recency']]

    ## FREQUENCY
    frequency_df = masterdf.groupby('Customer ID')['Date'].count().reset_index()
    frequency_df.columns = ['Customer ID', 'Frequency']

    ## MONETARY
    monetary_df = masterdf.groupby('Customer ID')['Net Sales'].sum().reset_index()
    monetary_df.columns = ['Customer ID', 'Monetary']

    ## VARIETY
    variety_df = masterdf.groupby('Customer ID')['Material'].nunique().reset_index()
    variety_df.columns = ['Customer ID', 'Variety']

    ## RFMV
    rfmv = recency_df.copy()
    rfmv = rfmv.merge(frequency_df, on='Customer ID')
    rfmv = rfmv.merge(monetary_df, on='Customer ID')
    rfmv = rfmv.merge(variety_df, on='Customer ID')

    # 25% / 50% / 75% quantiles of R, F, M and V, stored as a dictionary
    rfmv_quantiles = rfmv.iloc[:, 1:].quantile(q=[0.25, 0.5, 0.75]).to_dict()

    rfmv2 = rfmv.copy()
    rfmv2['R_q'] = rfmv2['Recency'].apply(RecencyScore, args=('Recency', rfmv_quantiles))
    rfmv2['F_q'] = rfmv2['Frequency'].apply(FMVScore, args=('Frequency', rfmv_quantiles))
    rfmv2['M_q'] = rfmv2['Monetary'].apply(FMVScore, args=('Monetary', rfmv_quantiles))
    rfmv2['V_q'] = rfmv2['Variety'].apply(FMVScore, args=('Variety', rfmv_quantiles))

    rfmv2 = rfmv2[['Customer ID', 'R_q', 'F_q', 'M_q', 'V_q']]

    ## Sum the component scores into a total score
    rfmv2['Total_Score'] = rfmv2['R_q'] + rfmv2['F_q'] + rfmv2['M_q'] + rfmv2['V_q']

    rfmv2 = rfmv2[['Customer ID', 'Total_Score']]

    # IMPORTANT: use the customer ID as the index
    rfmv2.index = rfmv2['Customer ID']
    # `columns=` keyword: a positional axis argument is rejected by pandas >= 2.0
    rfmv2 = rfmv2.drop(columns='Customer ID')

    # Elbow method for finding the best cluster count (this step needs tuning later).
    # NOTE(review): `wcss` is collected but never read afterwards.
    wcss = []
    for i in range(2, 10):
        kmeans = KMeans(n_clusters=i,
                        init='k-means++')
        kmeans.fit(rfmv2)
        wcss.append(kmeans.inertia_)

    # Apply KMeans with the cluster count taken from the "elbow" graph above
    kmeans = KMeans(n_clusters=4,
                    init='random',
                    random_state=None)

    clusters = kmeans.fit_predict(rfmv2)

    ### Attach the cluster labels to the original rfmv data
    rfmv['Clusters'] = clusters

    # ---- Recommendation weight ----

    # Recommend only to active customers: last purchase within one year (365 days)
    active_cust = rfmv[rfmv.Recency < 365]

    # Join the clustered customer features onto the master data
    cleaned_df = masterdf.merge(active_cust[['Customer ID', 'Clusters']], how='left', on='Customer ID')
    # Drop rows that have no cluster
    cleaned_df = cleaned_df[cleaned_df['Clusters'].notnull()]
    # Add customer data
    cleaned_df = cleaned_df.merge(custdf[['Customer ID', 'City Code']], how='left', on='Customer ID')
    # Join the SKU feature (product category) onto the master data
    cleaned_df = cleaned_df.merge(skudf[['Prod Cat', 'Prod Cat Code']], how='left', on='Prod Cat Code')

    # Keep only the required columns
    final_cleaned_df = cleaned_df[['Prod Cat', 'Material', 'Qty', 'Customer ID', 'Clusters']]

    # One row per customer / category / material / cluster, removing duplicates
    cust_grouped = final_cleaned_df.groupby(['Customer ID',
                                             'Prod Cat',
                                             'Material',
                                             'Clusters']).sum().reset_index()

    ## Interaction matrix
    interactions = cust_grouped.groupby(['Customer ID', 'Material'])['Qty'].sum().unstack().fillna(0)

    minmaxscaler = preprocessing.MinMaxScaler()
    interactions_scaled = minmaxscaler.fit_transform(interactions)
    interactions_scaled = pd.DataFrame(interactions_scaled)

    interactions_scaled.index = interactions.index
    interactions_scaled.columns = interactions.columns

    ## User features matrix
    cust_qty = cust_grouped.groupby(['Customer ID', 'Prod Cat'])['Qty'].sum().unstack().fillna(0)

    minmaxscaler = preprocessing.MinMaxScaler()
    cust_qty_scaled = minmaxscaler.fit_transform(cust_qty)
    cust_qty_scaled = pd.DataFrame(cust_qty_scaled)
    cust_qty_scaled.index = cust_qty.index
    cust_qty_scaled.columns = cust_qty.columns

    cust_clus = cust_grouped.groupby(['Customer ID', 'Clusters'])['Clusters'].nunique().unstack().fillna(0)

    customer_features = pd.merge(cust_qty_scaled, cust_clus, left_index=True, right_index=True, how='inner')
    customer_features = customer_features.rename(columns={0: 'Cluster 0',
                                                          1: 'Cluster 1',
                                                          2: 'Cluster 2',
                                                          3: 'Cluster 3',
                                                          4: 'Cluster 4'})

    ### Item features matrix
    item_category = pd.DataFrame(
        cust_grouped.groupby(['Material', 'Prod Cat'])['Qty']
        .sum().unstack().fillna(0).reset_index().set_index('Material'))

    minmaxscaler = preprocessing.MinMaxScaler()
    item_category_scaled = minmaxscaler.fit_transform(item_category)
    item_category_scaled = pd.DataFrame(item_category_scaled)
    item_category_scaled.index = item_category.index
    item_category_scaled.columns = item_category.columns

    interaction_f = sparse.coo_matrix(interactions_scaled)
    user_f = sparse.coo_matrix(customer_features)
    item_f = sparse.coo_matrix(item_category_scaled)

    mask_size = len(interaction_f.data)

    # NOTE(review): the sampled mask is discarded. The call is kept because it
    # still advances NumPy's global random state -- confirm whether
    # interaction_masking was meant to receive it.
    np.random.choice(a=[False, True],
                     size=mask_size,
                     p=[.2, .8])

    ## Train / test split
    train_interactions, test_interactions = interaction_masking(interaction_f)

    user_features = user_f
    item_features = item_f

    # ---- Train ----

    ## Model parameters
    epochs = 100
    alpha = 0.01
    n_components = 10

    learning_rate = 0.01
    n_sampled_items = int(item_features.shape[0] * .1)
    biased = False

    model = TensorRec(n_components=n_components,
                      user_repr_graph=DeepRepresentationGraph(),
                      item_repr_graph=NormalizedLinearRepresentationGraph(),
                      loss_graph=WMRBLossGraph(),
                      biased=biased)

    model.fit(train_interactions,
              user_features,
              item_features,
              epochs=epochs,
              verbose=False,
              alpha=alpha,
              n_sampled_items=n_sampled_items,
              learning_rate=learning_rate)

    predicted_ranks = model.predict_rank(user_features=user_features,
                                         item_features=item_features)

    r_at_k_test = recall_at_k(predicted_ranks, test_interactions, k=80)
    r_at_k_train = recall_at_k(predicted_ranks, train_interactions, k=80)
    print("Recall at @k: Train: {:.2f} Test: {:.2f}".format(r_at_k_train.mean(), r_at_k_test.mean()))

    # Put the ranking into a readable table: items as rows, customers as columns
    ranks_df = pd.DataFrame(predicted_ranks)
    ranks_df.columns = item_category_scaled.index
    ranks_df.index = customer_features.index
    ranks_df = ranks_df.T

    # Create the output directory first so a missing folder does not discard
    # the results of the whole training run.
    os.makedirs('./result', exist_ok=True)
    ranks_df.to_csv('./result/ranks_df.csv')
                                      shape=(n_users, n_items))

    # train collaborative filtering model
    epochs = 500
    alpha = 0.00001
    n_components = 10
    verbose = True
    learning_rate = 0.01
    n_sampled_items = int(n_items*0.01)
    fit_kwargs = {'epochs': epochs, 'alpha': alpha, 'verbose': verbose, 'learning_rate': learning_rate,
                  'n_sampled_items': n_sampled_items}

    cf_model = TensorRec(n_components=10,
                         user_repr_graph=NormalizedLinearRepresentationGraph(),
                         loss_graph=WMRBLossGraph())

    cf_model.fit(user_features=user_features, item_features=item_features,
                 interactions=train_interactions, **fit_kwargs)

    # calculate test ranks excluding training items
    predicted_ranks = cf_model.predict_rank(user_features=test_user_features, item_features=item_features)
    predicted_ranks[train.uid - 1, train.iid - 1] = n_items + 1
    predicted_ranks = predicted_ranks.argsort(axis=1).argsort(axis=1) + 1

    # evaluate precision and recall
    precision_results = precision_at_k(predicted_ranks, test_interactions, k=10)
    recall_results = recall_at_k(predicted_ranks, test_interactions, k=10)

    logging.info("Precision at 10: {}".format(np.mean(precision_results)))
    logging.info("Recall at 10: {}".format(np.mean(recall_results)))