# Give the ratings frame explicit column names.
ratings.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']

# data = ratings.merge(users, on=['UserID'])
# data = data.merge(movies, on=['MovieID'])

# %%
# Movies preprocessing.
# The four characters before the title's final character are parsed as the
# release year; this must run before Title is overwritten below.
movies['publish_date'] = movies['Title'].str.slice(-5, -1).astype(int)
# Drop the trailing 7 characters from each title and replace the remaining
# text with an integer label.
movies['Title'] = LabelEncoder().fit_transform(
    movies['Title'].str.slice(stop=-7)
)

from sklearn.preprocessing import MultiLabelBinarizer
# Multi-hot encode the pipe-separated genre strings: one 0/1 indicator column
# per genre label, named Genres_<i>.
genre_matrix = MultiLabelBinarizer().fit_transform(
    movies.Genres.map(lambda x: x.split('|')))
# Build the indicator frame on movies' own index. pd.concat(axis=1) aligns on
# index labels, so a fresh RangeIndex would misalign (or NaN-pad) rows whenever
# movies does not carry the default 0..n-1 index.
movie_genres = pd.DataFrame(
    genre_matrix,
    index=movies.index,
    columns=['Genres_%d' % i for i in range(genre_matrix.shape[1])],
)
movies = pd.concat([movies, movie_genres], axis=1)
# Users preprocessing: discard the zip code column and replace Gender with
# integer labels.
users = (
    users
    .drop(columns=['Zip-code'])
    .assign(Gender=lambda frame: LabelEncoder().fit_transform(frame['Gender']))
)
# Ratings preprocessing.
# Order each user's ratings by timestamp so the joined sequence below follows
# the order in which the movies were rated.
ratings = ratings.sort_values(['UserID', 'Timestamp'])
# Comma-joined MovieIDs per user, broadcast to every row of that user.
# The string cast is applied to a temporary Series, so the MovieID column is
# never left in a string state.
watching_seq = (
    ratings['MovieID'].astype(str)
    .groupby(ratings['UserID'])
    .transform(','.join)
)
# MovieID itself stays an integer column.
ratings['MovieID'] = ratings['MovieID'].astype(int)
# transform() returns a Series aligned with the rows of `ratings` (indexed by
# row label, not by UserID), so it is assigned directly. Looking it up with
# ratings['UserID'].map(...) would treat UserID values as row labels and attach
# another row's sequence (or NaN) to each rating.
ratings['watching_seq'] = watching_seq

# Derive calendar features from the rating time.
# NOTE(review): assumes Timestamp holds Unix epoch *seconds* (the MovieLens
# ratings format) -- confirm against the loading code. Without unit='s',
# pd.to_datetime reads integers as nanoseconds, which places every rating on
# 1970-01-01 and makes `day` constant.
dt = pd.to_datetime(ratings.Timestamp, unit='s').dt
ratings['day'] = dt.day