def __init__(self, cores):
    self.mongo_conn = MongoConnector()
    self.s3_conn = S3Connector()
    self.n_processes = self._set_n_processes(cores)
    self.movies_df = self._load_movies_df()
    self.makers_df = self._load_makers_df()
    self.cluster_df = self._load_cluster_df()
    self.movie_vectors = self._load_movie_vectors()
    try:
        mp.set_start_method('spawn')
    except RuntimeError:
        pass
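
# The try/except around mp.set_start_method('spawn') is the usual guard: the
# start method can only be set once per interpreter, and a second call raises
# RuntimeError. A minimal sketch of the idiom in isolation (the helper name
# below is made up for illustration):
import multiprocessing as mp

def ensure_spawn_context():
    try:
        mp.set_start_method('spawn')
    except RuntimeError:
        pass  # start method already fixed earlier in this process; keep it

# An alternative that avoids mutating the global setting altogether:
ctx = mp.get_context('spawn')  # local context object, no RuntimeError risk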
def __init__(self, all):
    self.mongo_conn = MongoConnector()
    self.s3_conn = S3Connector()
    self.all = all
    self.n_processes = (mp.cpu_count() // 2) - 1
    self.model_params = config['MODEL']['FASTTEXT_PARAMS']
    self.morphs_df = None
    self.model = None
    if self.all:
        logger.info('using all sentences')
        print('using all sentences')
    logger.info(f'using {self.n_processes} cores')
    print(f'using {self.n_processes} cores')
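
# The trainer above only stores config['MODEL']['FASTTEXT_PARAMS']; a minimal
# sketch of how such params might feed gensim's FastText. The parameter values
# and the train_fasttext helper are assumptions for illustration, not this
# project's actual config keys:
from gensim.models import FastText

fasttext_params = {'vector_size': 100, 'window': 5, 'min_count': 1, 'sg': 1}

def train_fasttext(sentences, n_workers):
    # sentences: iterable of token lists, e.g. [['great', 'acting'], ...]
    return FastText(sentences=sentences, workers=n_workers, **fasttext_params)

model = train_fasttext([['great', 'acting'], ['boring', 'plot']], n_workers=2)
vector = model.wv['acting']  # subword n-grams make OOV lookups possible too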
def __init__(self, chunk):
    self.mongo_conn = MongoConnector()
    self.s3_conn = S3Connector()
    self.chunk = chunk
    self.n_processes = (mp.cpu_count() // 2) - 1
    self.movie_id_q = Queue()
    self.model = None
    self.movie_vector_li = []
    self.morphs_df = None
    self.movie_vectors = None
    self._get_movie_ids()
    self._load_trained_model()
    logger.info(f'using {self.n_processes} cores')
    print(f'using {self.n_processes} cores')
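
# This class wires up a multiprocessing Queue of movie ids plus a worker
# count; the worker loop itself lives elsewhere. A self-contained sketch of
# the sentinel-terminated queue pattern it implies (all names below are
# illustrative, not the project's actual workers):
import multiprocessing as mp

def _worker(task_q, result_q):
    # Drain tasks until the None sentinel; one sentinel per worker.
    while True:
        item = task_q.get()
        if item is None:
            break
        result_q.put(item * item)

if __name__ == '__main__':
    task_q, result_q = mp.Queue(), mp.Queue()
    workers = [mp.Process(target=_worker, args=(task_q, result_q))
               for _ in range(2)]
    for w in workers:
        w.start()
    for i in range(10):
        task_q.put(i)
    for _ in workers:
        task_q.put(None)
    results = [result_q.get() for _ in range(10)]  # drain before joining
    for w in workers:
        w.join()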
def main(n_clusters):
    s3_conn = S3Connector()

    # load data
    movie_vectors = s3_conn.load_from_s3_byte(
        config['AWS']['S3_BUCKET'], config['MODEL']['MOVIE_VECTORS_PATH'])
    movies_df = s3_conn.load_from_s3_byte(config['AWS']['S3_BUCKET'],
                                          config['DATA']['MOVIE_INFO'])

    # keep only movies with at least 30 reviews
    movie_vectors = movie_vectors[movie_vectors.index.isin(
        movies_df[movies_df['review_count'] >= 30]['movie_id'])]

    # train
    X = Normalizer(norm='l2').fit_transform(list(movie_vectors['vector']))

    def train_kmeans_model(X, k):
        model = KMeans(init='k-means++', n_clusters=k, max_iter=10000,
                       tol=1e-12).fit(X)
        return model

    kmeans = train_kmeans_model(X, n_clusters)

    # cluster centroid vectors
    cluster_centroids = {
        idx: vector for idx, vector in enumerate(kmeans.cluster_centers_)
    }
    cluster_df = pd.DataFrame()
    cluster_df['vector'] = cluster_centroids.values()
    cluster_df = cluster_df.set_index(pd.Index(cluster_centroids.keys()))

    # map cluster to movie
    movie_vectors['cluster'] = kmeans.labels_
    movie_vectors = movie_vectors.sort_values(by=['cluster'])

    # upload results
    s3_conn.upload_to_s3_byte(movie_vectors, config['AWS']['S3_BUCKET'],
                              config['MODEL']['MOVIE_VECTORS_PATH'])
    s3_conn.upload_to_s3_byte(cluster_df, config['AWS']['S3_BUCKET'],
                              config['MODEL']['CLUSTER_PATH'])
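
# Because the vectors are L2-normalized first, Euclidean KMeans behaves like
# cosine clustering: on the unit sphere, ||u - v||^2 = 2 - 2 * (u . v), so
# minimizing Euclidean distance maximizes cosine similarity. A toy sketch of
# the same pipeline (random stand-ins for the real movie vectors):
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import Normalizer

rng = np.random.default_rng(0)
toy_vectors = rng.normal(size=(200, 16))

X = Normalizer(norm='l2').fit_transform(toy_vectors)
kmeans = KMeans(init='k-means++', n_clusters=5, max_iter=10000,
                tol=1e-12).fit(X)
print(np.bincount(kmeans.labels_))  # cluster sizes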
def __init__(self):
    self.mongo_conn = MongoConnector()
    self.s3_conn = S3Connector()
    self._load_data()
def __init__(self):
    self.mongo_conn = MongoConnector()
    self.s3_conn = S3Connector()
    self.cluster_rec = self.s3_conn.load_from_s3_byte(
        config['AWS']['S3_BUCKET'], config['REC']['FRONT_CLUSTER'])
    self.cluster_dic = self._make_cluster_dic()
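
# _make_cluster_dic is not shown in this section; presumably it indexes the
# cluster recommendation frame by cluster id. A hypothetical sketch, assuming
# cluster_rec has 'cluster' and 'movie_id' columns (an assumption, not the
# project's confirmed schema):
def _make_cluster_dic(self):
    return {cluster: group['movie_id'].tolist()
            for cluster, group in self.cluster_rec.groupby('cluster')}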
def __init__(self):
    self.mongo_conn = MongoConnector()
    self.s3_conn = S3Connector()
    logger.info('inner class test')
def main():
    mongo_conn = MongoConnector()
    s3_conn = S3Connector()

    def _load_movies_df():
        movies_df = s3_conn.load_from_s3_byte(config['AWS']['S3_BUCKET'],
                                              config['DATA']['MOVIE_INFO'])
        return movies_df

    def _load_makers_df():
        makers_df = pd.DataFrame(mongo_conn.makers.find())
        mongo_conn.close()
        makers_df = pd.merge(makers_df,
                             movies_df[['movie_id', 'review_count']],
                             on='movie_id', validate='many_to_one')
        roles = ['actor_main', 'director', 'writer', 'actor_sub']
        makers_df['role'] = pd.Categorical(makers_df['role'],
                                           categories=roles, ordered=True)
        makers_df = makers_df.rename(
            columns={'movie_poster_url': 'poster_url'})
        return makers_df

    def _load_cluster_df():
        cluster_df = s3_conn.load_from_s3_byte(config['AWS']['S3_BUCKET'],
                                               config['MODEL']['CLUSTER_PATH'])
        return cluster_df

    def _load_movie_vectors():
        movie_vectors = s3_conn.load_from_s3_byte(
            config['AWS']['S3_BUCKET'], config['MODEL']['MOVIE_VECTORS_PATH'])
        movie_vectors = movie_vectors[movie_vectors.index.isin(
            movies_df['movie_id'])]
        return movie_vectors

    movies_df = _load_movies_df()
    makers_df = _load_makers_df()
    cluster_df = _load_cluster_df()
    movie_vectors = _load_movie_vectors()

    # sort by review count, descending
    movies_df = movies_df.sort_values(by=['review_count'], ascending=False)
    # sort by role, then by review count descending
    makers_df = makers_df.sort_values(by=['role', 'review_count'],
                                      ascending=[True, False])

    # subword hash: map every prefix of each name to the names containing it
    def generate_hash(names):
        dic = dict()
        for name in names:
            if len(name) == 1:
                dic = _update_name_dic(name, name, dic)
                continue
            name_split = name.split(' ')
            name_split.append(name.replace(' ', ''))
            name_split.append(name)
            for word in name_split:
                length = len(word)
                if length < 2:
                    continue
                for i in range(2, length + 1):
                    subword = word[:i]
                    dic = _update_name_dic(name, subword, dic)
        for key in dic.keys():
            dic[key] = get_unique_ordered_list(dic.get(key))
        return dic

    def _update_name_dic(name, word, dic):
        if dic.get(word) is None:
            dic[word] = []
        dic[word].append(name)
        return dic

    def get_unique_ordered_list(li):
        seen = set()
        return [x for x in li if not (x in seen or seen.add(x))]

    # titles
    movie_names_kor = movies_df['title_kor']
    movie_names_hash = generate_hash(movie_names_kor)
    # people
    maker_names = makers_df['name']
    maker_names_hash = generate_hash(maker_names)
    # genres
    genre_names = set(flatten(movies_df['genre']))
    genre_hash = generate_hash(genre_names)
    # nations
    nation_names = set(flatten(movies_df['nations']))
    nation_names_hash = generate_hash(nation_names)
    # merge
    subword_hash = dict()
    subword_hash['movie_name'] = movie_names_hash
    subword_hash['maker'] = maker_names_hash
    subword_hash['genre'] = genre_hash
    subword_hash['nation'] = nation_names_hash

    # name-to-movie-id hash
    def generate_name_id_hash(names, ids):
        dic = dict()
        for i in range(len(names)):
            if dic.get(names[i]) is None:
                dic[names[i]] = []
            dic[names[i]].append(ids[i])
        for key in dic.keys():
            dic[key] = get_unique_ordered_list(dic.get(key))
        return dic

    # titles
    movie_names = list(movies_df['title_kor'])
    movie_ids = list(movies_df['movie_id'])
    movie_name_id_hash = generate_name_id_hash(movie_names, movie_ids)
    # people
    maker_names = list(makers_df['name'])
    maker_ids = list(makers_df['movie_id'])
    maker_id_hash = generate_name_id_hash(maker_names, maker_ids)
    # genres
    ex_movies_df = movies_df[['movie_id', 'genre']].explode('genre')
    genres = list(ex_movies_df['genre'])
    genre_ids = list(ex_movies_df['movie_id'])
    genre_id_hash = generate_name_id_hash(genres, genre_ids)
    # nations
    ex_movies_df = movies_df[['movie_id', 'nations']].explode('nations')
    nations = list(ex_movies_df['nations'])
    nation_ids = list(ex_movies_df['movie_id'])  # movie ids, not nation names
    nation_id_hash = generate_name_id_hash(nations, nation_ids)
    # merge
    name_id_hash = dict()
    name_id_hash['movie_name'] = movie_name_id_hash
    name_id_hash['maker'] = maker_id_hash
    name_id_hash['genre'] = genre_id_hash
    name_id_hash['nation'] = nation_id_hash

    # upload results
    s3_conn.upload_to_s3_byte(subword_hash, config['AWS']['S3_BUCKET'],
                              config['DATA']['SUBWORD_HASH'])
    s3_conn.upload_to_s3_byte(name_id_hash, config['AWS']['S3_BUCKET'],
                              config['DATA']['NAME_ID_HASH'])
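
# A worked example of the two hashes (calling the nested helpers as free
# functions for illustration; the sample names are made up). generate_hash
# maps each prefix of length >= 2 of every word, the space-stripped name, and
# the full name back to the names containing it, which supports incremental
# autocomplete lookups; generate_name_id_hash then resolves a matched name to
# its movie ids:
#
#   generate_hash(['iron man', 'iron fist'])
#   # {'ir': ['iron man', 'iron fist'], 'iro': ['iron man', 'iron fist'],
#   #  'iron': ['iron man', 'iron fist'], 'ma': ['iron man'],
#   #  'man': ['iron man'], 'ironman': ['iron man'],
#   #  'iron man': ['iron man'], 'fi': ['iron fist'], ...}
#
#   generate_name_id_hash(['iron man', 'iron fist'], [101, 102])
#   # {'iron man': [101], 'iron fist': [102]}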