Ejemplo n.º 1
0
def loadData():
    '''
    Load the movie, rating and embedding data and derive the id/index lookups.

    Everything is bundled into one function so that the result can be cached
    by streamlit (per the original author's note).

    Returns a tuple with, in this order:
        movies_df            - result of Preprocess.loadFile("movies")
        ratings_df           - result of Preprocess.loadFile("ratings"), untouched
        final_vector_df      - object stored under 'final_vector_df'
        embeddings_matrix    - final_vector_df without its 'movieId' column
        embedding_movie_list - the 'movieId' column of final_vector_df as a list
        ratings_df2          - second load of the ratings whose 'userId' and
                               'movieId' values are replaced by positional indices
        users                - distinct user ids found in ratings_df
        movies               - distinct movie ids found in ratings_df
        users_dict           - user id -> index into `users`
        movies_dict          - movie id -> index into `movies`
        movies_idx_dict      - index into `movies` -> movie id
    '''
    movie_frame = Preprocess.loadFile("movies")
    rating_frame = Preprocess.loadFile("ratings")

    vector_frame = Util.loadObj('final_vector_df')
    keep_column = vector_frame.columns != 'movieId'
    embedding_frame = vector_frame.loc[:, keep_column]
    embedded_movie_ids = vector_frame['movieId'].tolist()

    # The ratings file is read a second time on purpose here: this frame is
    # re-indexed in place below, while `rating_frame` keeps the raw ids.
    indexed_ratings = Preprocess.loadFile("ratings")

    # The iteration order of these sets defines the index assigned to each id.
    user_ids = list(set(rating_frame['userId'].tolist()))
    movie_ids = list(set(rating_frame['movieId'].tolist()))

    user_to_idx = dict(zip(user_ids, range(len(user_ids))))
    idx_to_movie = dict(enumerate(movie_ids))  # Idx to movie Id
    movie_to_idx = {
        movie_id: idx for idx, movie_id in idx_to_movie.items()
    }  # Movie Id to Idx

    # Swap the raw ids for their positional indices.
    indexed_ratings['userId'] = indexed_ratings['userId'].apply(
        user_to_idx.__getitem__)
    indexed_ratings['movieId'] = indexed_ratings['movieId'].apply(
        movie_to_idx.__getitem__)

    return (
        movie_frame,
        rating_frame,
        vector_frame,
        embedding_frame,
        embedded_movie_ids,
        indexed_ratings,
        user_ids,
        movie_ids,
        user_to_idx,
        movie_to_idx,
        idx_to_movie,
    )
Ejemplo n.º 2
0
## quick look at the grouped tags: first rows, then the shape
print(tags_grouped_df.head(), tags_grouped_df.shape, sep="\n")

## build the TFIDF matrix from the grouped tags and persist it
tfidf_df = Preprocess.createTFIDFMatrix(tags_grouped_df)
print(tfidf_df.shape)
Util.saveObj(tfidf_df, 'tfidf_df')

## NOTE: once 'tfidf_df' has been saved, the computation above can be skipped
## by reading it back with Util.loadObj('tfidf_df') instead.

## the reduced TFIDF matrix is read from a previously saved object
tfidf_reduced_df = Util.loadObj('tfidf_reduced_df')
print(tfidf_reduced_df.shape)

## build the spacy sentence-vector df from the imdb data and persist it
vector_df = createSentenceVector(imdb_df)
print(vector_df.shape)
Util.saveObj(vector_df, 'vector_df')

## NOTE: likewise, a saved 'vector_df' can be read back with
## Util.loadObj('vector_df') instead of being recomputed.

## preparing to merge tfidf reduced df and vector df: cast movieId to int
vector_df['movieId'] = vector_df['movieId'].apply(int)
Ejemplo n.º 3
0
    def call(self, input):
        '''
        Forward pass: feed `input` through the dense layers in sequence.

        The layers run in the order EncoderDense1, EncoderDense2,
        BottleNeckDense, DecoderDense1, DecoderDense2, FinalDense, and
        `self.dropout_layer` is applied to the output of every one of them,
        the final layer included. The last dropout output is returned.
        '''
        dense_stack = (
            self.EncoderDense1,
            self.EncoderDense2,
            self.BottleNeckDense,
            self.DecoderDense1,
            self.DecoderDense2,
            self.FinalDense,
        )
        activation = input
        for dense in dense_stack:
            activation = self.dropout_layer(dense(activation))
        return activation


# Training hyper-parameters for the autoencoder fit below.
NUM_EPOCHS = 100
BATCH_SIZE = 64

# The saved TFIDF frame is both the input and the reconstruction target.
tfidf_matrix = Util.loadObj('tfidf_df')
X = tfidf_matrix.to_numpy()
features = X.shape[1]  # number of columns in the TFIDF matrix

model = AutoEncoder(features)
# `learning_rate` is the supported keyword; the `lr` alias used previously is
# deprecated and is rejected by newer Keras releases.
optimizer = keras.optimizers.Adam(learning_rate=0.000003)


def loss(x, x_hat):
    '''Sum of the mean squared errors between `x` and its reconstruction `x_hat`.'''
    return tf.reduce_sum(keras.losses.mean_squared_error(x, x_hat))


model.compile(loss=loss, optimizer=optimizer, metrics=['mse'])
model.fit(x=X, y=X, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS)

# Reduced representation: only the two encoder layers and the bottleneck are
# applied. The layers are invoked directly, so `dropout_layer` is not used here.
reduced = model.BottleNeckDense(model.EncoderDense2(model.EncoderDense1(X)))

reduced_np = reduced.numpy()
indices = tfidf_matrix.index.tolist()