def loadData():
    """Load the movie, rating and embedding data and index the ratings.

    Everything is gathered in one function so that the whole load can be
    memoized by the Streamlit cache (per the original author's note).

    Returns:
        A tuple with, in this order:

        * movies_df -- result of ``Preprocess.loadFile("movies")``.
        * ratings_df -- result of ``Preprocess.loadFile("ratings")``,
          keeping the raw ``userId`` / ``movieId`` values.
        * final_vector_df -- object stored under ``'final_vector_df'``.
        * embeddings_matrix -- ``final_vector_df`` without any column
          named ``movieId``.
        * embedding_movie_list -- the ``movieId`` column of
          ``final_vector_df`` as a list.
        * ratings_df2 -- a second load of the ratings whose ``userId`` and
          ``movieId`` columns are replaced by 0-based positional indices.
        * users, movies -- the distinct user ids / movie ids of
          ``ratings_df`` (order is whatever ``list(set(...))`` yields).
        * users_dict, movies_dict -- raw id -> positional index.
        * movies_idx_dict -- positional index -> raw movie id.
    """
    movies_df = Preprocess.loadFile("movies")
    ratings_df = Preprocess.loadFile("ratings")

    final_vector_df = Util.loadObj('final_vector_df')
    is_feature_column = final_vector_df.columns != 'movieId'
    embeddings_matrix = final_vector_df.loc[:, is_feature_column]
    embedding_movie_list = final_vector_df['movieId'].tolist()

    # Loaded a second time so that ratings_df keeps the raw ids while this
    # copy is rewritten with positional indices below.
    ratings_df2 = Preprocess.loadFile("ratings")

    users = list(set(ratings_df['userId'].tolist()))
    movies = list(set(ratings_df['movieId'].tolist()))

    # The position inside `users` / `movies` is the index of each id.
    users_dict = dict(zip(users, range(len(users))))
    movies_dict = dict(zip(movies, range(len(movies))))  # movie id -> index
    movies_idx_dict = dict(enumerate(movies))  # index -> movie id

    ratings_df2['userId'] = ratings_df2['userId'].map(
        lambda user_id: users_dict[user_id])
    ratings_df2['movieId'] = ratings_df2['movieId'].map(
        lambda movie_id: movies_dict[movie_id])

    return (
        movies_df,
        ratings_df,
        final_vector_df,
        embeddings_matrix,
        embedding_movie_list,
        ratings_df2,
        users,
        movies,
        users_dict,
        movies_dict,
        movies_idx_dict,
    )
# Quick look at the grouped tags before vectorising them.
print(tags_grouped_df.head())
print(tags_grouped_df.shape)

# Build the TF-IDF matrix from the grouped tags and persist it.
# (To reuse a previous run instead of recomputing, load it back with
# `tfidf_df = Util.loadObj('tfidf_df')`.)
tfidf_df = Preprocess.createTFIDFMatrix(tags_grouped_df)
print(tfidf_df.shape)
Util.saveObj(tfidf_df, 'tfidf_df')

# The reduced TF-IDF matrix is not computed here; it is read back from the
# object stored under 'tfidf_reduced_df'.
tfidf_reduced_df = Util.loadObj('tfidf_reduced_df')
print(tfidf_reduced_df.shape)

# Sentence vectors for the IMDB data (spaCy-based according to the original
# comment) are computed and persisted as well.
# (A stored copy can be loaded with `vector_df = Util.loadObj('vector_df')`.)
vector_df = createSentenceVector(imdb_df)
print(vector_df.shape)
Util.saveObj(vector_df, 'vector_df')

# Preparation for merging the reduced TF-IDF frame with the vector frame:
# convert every movieId in vector_df to int.
vector_df['movieId'] = vector_df['movieId'].apply(int)
def call(self, input):
    """Run the autoencoder forward pass.

    The input is passed through the encoder, bottleneck, decoder and final
    dense layers in that order; the shared dropout layer is applied to the
    output of every one of them.

    Args:
        input: batch fed to ``EncoderDense1`` (the name shadows the builtin
            but is kept so existing keyword callers keep working).

    Returns:
        The output of ``FinalDense`` after dropout.
    """
    stages = (
        self.EncoderDense1,
        self.EncoderDense2,
        self.BottleNeckDense,
        self.DecoderDense1,
        self.DecoderDense2,
        self.FinalDense,
    )
    hidden = input
    for dense in stages:
        # NOTE(review): dropout is also applied to the FinalDense output,
        # i.e. to the reconstruction itself -- kept as in the original,
        # confirm this is intended.
        hidden = self.dropout_layer(dense(hidden))
    return hidden


NUM_EPOCHS = 100
BATCH_SIZE = 64

# The autoencoder is trained to reconstruct the stored TF-IDF matrix.
tfidf_matrix = Util.loadObj('tfidf_df')
X = tfidf_matrix.to_numpy()
features = X.shape[1]

model = AutoEncoder(features)
# `learning_rate` is the documented keyword; the legacy `lr` alias is
# deprecated and rejected by newer Keras releases.
optimizer = keras.optimizers.Adam(learning_rate=0.000003)


def loss(x, x_hat):
    """Return the sum of the mean squared errors between x and x_hat."""
    return tf.reduce_sum(keras.losses.mean_squared_error(x, x_hat))


model.compile(loss=loss, optimizer=optimizer, metrics=['mse'])
model.fit(x=X, y=X, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS)

# Reduced representation: encoder layers up to the bottleneck, called
# directly so that no dropout layer is involved.
reduced = model.BottleNeckDense(model.EncoderDense2(model.EncoderDense1(X)))
reduced_np = reduced.numpy()
indices = tfidf_matrix.index.tolist()