def main():
    """Train the full content-based pipeline: TF-IDF -> DBN -> k-means.

    Reads item descriptions from ``input/items.csv``, fits (or reloads
    cached) models, and prints the final cluster labels.  Each stage is
    cached as a pickle under ``model/`` / ``result/`` and is skipped when
    its cache file already exists.
    """
    # 0. Read data and split it into 80% training / 20% testing.
    items = pd.read_csv('input/items.csv', sep=';', encoding='ISO-8859-1')
    print(items.shape)  # was a Python 2 print statement; fixed for Python 3
    items_train, items_test = train_test_split(items, train_size=0.8,
                                               random_state=0)
    print(items_train.shape, items_test.shape)

    # 1. Train the tf-idf model and cache it under model/tfidf_model.pickle.
    if not os.path.isfile('model/tfidf_model.pickle'):
        print('training tf-idf model ...')  # typo fix: 'traning'
        tfidf_model = TfidfVectorizer(norm='l2', min_df=0, use_idf=True,
                                      max_features=5000, smooth_idf=False,
                                      sublinear_tf=True, tokenizer=tokeniser)
        # NOTE: column name 'movie desription' (sic) matches the CSV header.
        item_feature_matrix = tfidf_model.fit_transform(
            items_train['movie desription'].values.astype('U'))
        print('#1. dimension of the item-feature matrix',
              item_feature_matrix.shape)
        # 1.1. Save the fitted tf-idf model.
        print('Saving tf-idf model ...')
        save_model('model/tfidf_model.pickle', tfidf_model)
    else:
        # BUGFIX: previously item_feature_matrix was left unbound on this
        # path, raising NameError below when only the model pickle existed.
        tfidf_model = load_model('model/tfidf_model.pickle')
        item_feature_matrix = tfidf_model.transform(
            items_train['movie desription'].values.astype('U'))

    if not os.path.isfile('result/item_feature_matrix.pickle'):
        # 1.2. Save the tf-idf matrix result.
        print('Saving tf-idf matrix result ...')
        save_model('result/item_feature_matrix.pickle', item_feature_matrix)

    # 2. Train the DBN model and cache it under model/dbn-model.pkl.
    # 2.1. Load the cached tf-idf result.
    print('loading item feature matrix ...')
    item_feature_matrix = load_model('result/item_feature_matrix.pickle')
    if not os.path.isfile('model/dbn-model.pkl'):
        dbn = UnsupervisedDBN(hidden_layers_structure=[5000, 400],
                              batch_size=10,
                              learning_rate_rbm=0.06,
                              n_epochs_rbm=20,
                              activation_function='sigmoid')
        # 2.2. Fit the DBN model (.A densifies the sparse tf-idf matrix).
        dbn.fit(item_feature_matrix.A)
        # 2.3. Save the DBN model.
        print('saving DBN model ...')
        dbn.save('model/dbn-model.pkl')
    print('Loading DBN model')  # typo fix: 'Loadin'
    dbn = UnsupervisedDBN.load('model/dbn-model.pkl')

    # 3. Cluster the DBN embeddings with k-means; cache model and results.
    if not os.path.isfile('model/kmeans-model.pkl'):
        kmeans = KMeans(n_clusters=5, random_state=0).fit(
            dbn.transform(item_feature_matrix.A))
        print('saving k-means model ...')
        save_model('model/kmeans-model.pkl', kmeans)
    else:
        kmeans = load_model('model/kmeans-model.pkl')
    print(kmeans.labels_)
header=None, encoding='ISO-8859-1') u_item_DF['movie desription'] = [val[2] for i, val in data_new.iterrows()] sklearn_tfidf = TfidfVectorizer(norm='l2', min_df=0, use_idf=True, max_features=5000, smooth_idf=False, sublinear_tf=True, tokenizer=tokeniser) item_feature_matrix = sklearn_tfidf.fit_transform( u_item_DF['movie desription'].values.astype('U')) print('dimension of the item-feature matrix', item_feature_matrix.shape) # Train DBN model from dbn.models import UnsupervisedDBN #[4604, 2000, 4000, 3000, 1000] dbn = UnsupervisedDBN(hidden_layers_structure=[5000, 400], batch_size=10, learning_rate_rbm=0.06, n_epochs_rbm=20, activation_function='sigmoid') dbn.fit(item_feature_matrix.A) # Save the model print('Saving Model ...') dbn.save('model-1.pkl') print('Model Saved')
def main(tfidfModel=None, tfidfMatrix=None, dbn_model=None, kmeans_model=None):
    """Train or reload the TF-IDF -> DBN -> k-means pipeline.

    Each parameter, when given, is a path to a previously saved artifact to
    load instead of retraining that stage:

    :param tfidfModel:   path to a pickled TfidfVectorizer, or None to fit one.
    :param tfidfMatrix:  path to a pickled tf-idf matrix, or None to compute it.
    :param dbn_model:    path to a saved UnsupervisedDBN, or None to fit one.
    :param kmeans_model: path to a pickled KMeans model, or None to fit one.
    """
    # 0. Read data and build the 80/20-style train/test movie-id splits
    #    from the MovieLens u1.base / u1.test rating files.
    items_info = pd.read_csv('input/items.csv', sep=';', encoding='ISO-8859-1')
    u_base1 = pd.read_csv('input/u1.base', sep='\t', header=None)
    train = pd.DataFrame(u_base1[1].drop_duplicates())
    u_test1 = pd.read_csv('input/u1.test', sep='\t', header=None)
    test = pd.DataFrame(u_test1[1].drop_duplicates())
    # Look up each movie's description by id (column names match the CSV
    # header, including the 'movie desription' misspelling).
    train_desc = [
        items_info[items_info['movie id'] == df[1]]
        ['movie desription'].values[0] for i, df in train.iterrows()
    ]
    # NOTE(review): test_desc is computed but never used below — kept for
    # behavioral parity; confirm whether it can be dropped.
    test_desc = [
        items_info[items_info['movie id'] == df[1]]
        ['movie desription'].values[0] for i, df in test.iterrows()
    ]

    # 1. Train the tf-idf model and save it with its result, or reload it.
    if not tfidfModel:
        print('training tf-idf model ...')  # typo fix: 'traning'
        tfidf_model = TfidfVectorizer(norm='l2', min_df=0, use_idf=True,
                                      max_features=5000, smooth_idf=False,
                                      sublinear_tf=True, tokenizer=tokeniser)
        tfidf_model.fit(train_desc)
        print('- Saving tf-idf model ...')
        save_model('model/tfidf_model.pickle', tfidf_model)
    else:
        print('# Loading tf-idf model ...')
        tfidf_model = load_model(tfidfModel)

    if not tfidfMatrix:
        item_feature_matrix = tfidf_model.transform(train_desc)
        # 1.2. Save the tf-idf matrix result.
        print('- Saving tf-idf matrix result ...')
        save_model('result/item_feature_matrix.pickle', item_feature_matrix)
    else:
        print('# Loading tf-idf matrix result ...')
        item_feature_matrix = load_model(tfidfMatrix)

    # 2. Train the DBN on the (densified) tf-idf matrix, or reload it.
    if not dbn_model:
        dbn = UnsupervisedDBN(hidden_layers_structure=[5000, 1000, 1000, 500],
                              batch_size=10,
                              learning_rate_rbm=0.06,
                              n_epochs_rbm=20,
                              activation_function='sigmoid')
        # 2.2. Fit the DBN model.
        dbn.fit(item_feature_matrix.A)
        # 2.3. Save the DBN model.
        print('saving DBN model ...')
        dbn.save('model/dbn-model.pkl')
    else:
        print('Loading DBN model')  # typo fix: 'Loadin'
        dbn = UnsupervisedDBN.load(dbn_model)

    # 3. Cluster the DBN embeddings with k-means; save or reload the model.
    if not kmeans_model:
        kmeans = KMeans(n_clusters=5, random_state=0).fit(
            dbn.transform(item_feature_matrix.A))
        print('saving k-means model ...')
        save_model('model/kmeans-model.pkl', kmeans)
    else:
        print('loading k-means model ...')
        kmeans = load_model(kmeans_model)
    print("Done!")