def __init__(self):
    self.mongo_conn = MongoConnector()
    self.session = requests.Session()
    self.headers = config["SCRAPER"]["HEADERS"]
    self.update_checker = self._set_update_checker()
    self.queue = self._set_queue()
    self.current_target = None
    self.last_update_date = None
    self.pages = 0
    self.current_page = 1
    self.review_count = 0
    self.reviews = []
    self.valid = True
def job_estimater():
    mongo_conn = MongoConnector()
    user_reviews = mongo_conn.user_reviews
    row_count = user_reviews.count_documents(
        {'tokenized_okt': {'$in': [None, False]}})
    print(f'{row_count} rows are not tokenized.')
    if row_count == 0:
        print('all documents are tokenized. finishing process.')
        return 0
    n_iters = row_count // args.rows + 1
    print(f'need to process {n_iters} times')
    return n_iters
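# Worked example (assumption, not in the original source): with 1,234,567
# untokenized rows and a batch size (args.rows) of 100,000, job_estimater()
# would report 13 passes.
row_count, batch = 1234567, 100000
n_iters = row_count // batch + 1  # -> 13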
class MovieInfoPreprocessor(object):
    def __init__(self):
        self.mongo_conn = MongoConnector()
        self.s3_conn = S3Connector()
        self._load_data()

    def _load_data(self):
        self.movies_df = pd.DataFrame(self.mongo_conn.movies.find())
        self.makers_df = pd.DataFrame(self.mongo_conn.makers.find())
        self.reviews_df = pd.DataFrame(self.mongo_conn.user_reviews.find())
        self.mongo_conn.close()

    def filter_fault_rows(self):
        # drop rows missing a Korean title, poster, or stillcut URL
        self.movies_df = self.movies_df[~(
            self.movies_df['title_kor'].isna()
            | self.movies_df['poster_url'].isna()
            | self.movies_df['stillcut_url'].isna())]
        return self

    def preprocess(self):
        # genre filter: drop adult ('에로') and live-performance ('공연실황') titles
        self.movies_df = self.movies_df[~(
            self.movies_df['genre'].map(set(['에로']).issubset)
            | self.movies_df['genre'].map(set(['공연실황']).issubset))]
        # add review statistics
        self.add_review_stat()
        # keep only movies with at least one review
        self.movies_df = self.movies_df[self.movies_df['review_count'] > 0]
        # extract release year
        self.get_year_column()
        # fill in missing release dates
        self.compansate_release_date()
        # add staff columns
        self.merge_staff_columns()
        # drop unused columns
        self.movies_df = self.movies_df.drop(
            columns=['updated_at', 'review_checked_date'])
        # rename columns
        self.movies_df = self.movies_df.rename(columns={'_id': 'movie_id'})
        return self

    def add_review_stat(self):
        reviews_stat = self.reviews_df.groupby('movie_id').agg(
            {'rate': ['mean', 'count']})
        merged = pd.merge(self.movies_df,
                          reviews_stat.rate,
                          how='left',
                          left_on='_id',
                          right_index=True,
                          validate='one_to_one')
        merged = merged.rename(columns={
            'mean': 'avg_rate',
            'count': 'review_count'
        })
        self.movies_df = merged
        return self

    def get_year_column(self):
        def year_process(row):
            date = row['release_date']
            if type(date) is float or len(date) < 4:
                result = row['title_eng'].split(',')[-1].replace(' ', '')
                return result if result.isdigit() else ''
            return date[:4]

        self.movies_df.loc[:, 'release_year'] = self.movies_df.apply(
            year_process, axis=1)
        return self

    def compansate_release_date(self):
        def date_process(row):
            date = row['release_date']
            year = row['release_year']
            if type(date) is float:
                try:
                    date = f'{year}0101'
                except Exception as e:
                    logger.error(e)
            return date

        self.movies_df.loc[:, 'release_date'] = self.movies_df.apply(
            date_process, axis=1)
        return self

    def merge_staff_columns(self):
        directors = self.makers_df[self.makers_df['role'] == 'director'][[
            'movie_id', 'name', 'role'
        ]]
        writers = self.makers_df[self.makers_df['role'] == 'writer'][[
            'movie_id', 'name', 'role'
        ]]
        merged1 = pd.merge(self.movies_df,
                           directors,
                           left_on='_id',
                           right_on='movie_id',
                           how='left',
                           validate='one_to_one')
        merged2 = pd.merge(merged1,
                           writers,
                           left_on='_id',
                           right_on='movie_id',
                           how='left',
                           validate='one_to_one')
        merged2 = merged2.rename(columns={
            '_id': 'movie_id',
            'name_x': 'director',
            'name_y': 'writer'
        })
        merged2 = merged2.drop(
            columns=['movie_id_x', 'role_x', 'movie_id_y', 'role_y'])
        self.movies_df = merged2
        return self

    def upload_to_s3(self):
        self.s3_conn.upload_to_s3_byte(self.movies_df,
                                       config['AWS']['S3_BUCKET'],
                                       config['DATA']['MOVIE_INFO'])
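# Usage sketch (assumption, not in the original source): the preprocessing methods
# return self, so they can be chained before uploading the cleaned movie table.
preprocessor = MovieInfoPreprocessor()
preprocessor.filter_fault_rows().preprocess()
preprocessor.upload_to_s3()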
class KeywordExtractor(object):
    def __init__(self):
        self.mongo_conn = MongoConnector()
        self.s3_conn = S3Connector()
        self.cluster_rec = self.s3_conn.load_from_s3_byte(
            config['AWS']['S3_BUCKET'], config['REC']['FRONT_CLUSTER'])
        self.cluster_dic = self._make_cluster_dic()

    def _make_cluster_dic(self):
        cluster_dic = dict()
        for key, entry in self.cluster_rec.items():
            ids = [dic['movie_id'] for dic in entry]
            cluster_dic[key] = ids
        return cluster_dic

    def get_morphs(self):
        try:
            morphs = self.mongo_conn.user_review_morphs_okt.find(
                {}, {'_id': 0, 'movie_id': 1, 'adjectives': 1})
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()
        self.morphs_df = pd.DataFrame(morphs).set_index('movie_id').sort_index()
        self.morphs_df = self.morphs_df.rename(columns={'adjectives': 'morphs'})
        print(f'got {len(self.morphs_df)} comments.')
        return self

    def make_df_dic(self):
        # document frequency of each word across movies
        movie_word_set = self.morphs_df.groupby('movie_id')['morphs'].apply(
            lambda x: np.unique(np.hstack(x)))
        df_dic = dict()
        for idx in range(len(movie_word_set)):
            for word in movie_word_set.iloc[idx]:
                if df_dic.get(word) is None:
                    df_dic[word] = 0
                df_dic[word] += 1
        self.df_dic = df_dic
        return self

    def make_movie_prob_distribution(self, df_floor):
        self.morphs_df.loc[:, 'morphs'] = self.morphs_df['morphs'].map(
            lambda x: [word for word in x if self.df_dic.get(word) >= df_floor])
        self.morphs_df.loc[:, 'prob_dist'] = self.morphs_df['morphs'].map(
            lambda morphs: self._tf_to_prob_dist(self._make_tf_dic(morphs)))
        self.morphs_df = self.morphs_df.drop(columns=['morphs'])
        self.morphs_df = self.morphs_df.groupby(self.morphs_df.index).agg(
            {'prob_dist': lambda col: self._agg_to_movie(col)})
        self.morphs_df = pd.DataFrame(
            data=self.morphs_df['prob_dist'].to_list(),
            index=self.morphs_df.index).fillna(0)
        return self

    def extract_cluster_keywords(self, keyword_length):
        uni_entropy_series = self.morphs_df.apply(lambda x: entropy(x))
        uni_entropy_series = uni_entropy_series.sort_values(ascending=False)
        cluster_entropy_dic = self._make_cluster_entropy_dic(
            self.cluster_dic, self.morphs_df)
        cluster_score_dic = self._make_cluster_score_dic(
            cluster_entropy_dic, uni_entropy_series)
        self.cluster_keyword_dic = self._make_cluster_keyword_dic(
            cluster_score_dic, keyword_length)
        return self

    def tag_keywords(self):
        cluster_rec = dict()
        for keywords in self.cluster_keyword_dic:
            rec = self.cluster_rec[self.cluster_keyword_dic[keywords]]
            cluster_rec[keywords] = rec
        self.cluster_rec = cluster_rec
        return self

    def upload_cluster_rec(self):
        self.s3_conn.upload_to_s3_byte(self.cluster_rec,
                                       config['AWS']['S3_BUCKET'],
                                       config['REC']['FRONT_CLUSTER'])

    def _make_tf_dic(self, words):
        tf_dic = dict()
        for word in words:
            tf_dic = self._update_tf_dic(word, tf_dic)
        return tf_dic

    def _update_tf_dic(self, word, dic):
        if dic.get(word) is None:
            dic[word] = 0
        dic[word] += 1
        return dic

    def _tf_to_prob_dist(self, dic):
        total_freq = sum(dic.values())
        return {key: (item / total_freq) for key, item in dic.items()}

    def _agg_to_movie(self, col):
        size = len(col)
        agg_dic = dict()
        for dic in col:
            agg_dic.update(dic)
        return {key: (value / size) for key, value in agg_dic.items()}

    def _make_cluster_entropy_dic(self, cluster_dic, morphs):
        cluster_entropy_dic = dict()
        for key, entry in cluster_dic.items():
            cluster_morphs = morphs[morphs.index.isin(entry)]
            entropy_series = cluster_morphs.apply(lambda x: entropy(x))
            cluster_entropy_dic[key] = entropy_series
        return cluster_entropy_dic

    def _make_cluster_score_dic(self, cluster_entropy_dic, uni_entropy_series):
        score_dic = dict()
        for key, ent in cluster_entropy_dic.items():
            score_dic[key] = (ent / uni_entropy_series).replace(
                [np.inf, -np.inf], np.nan).dropna()
        return score_dic

    def _make_cluster_keyword_dic(self, cluster_score_dic, keyword_length):
        cluster_keyword_dic = dict()
        for key, keyword_series in cluster_score_dic.items():
            keyword_li = keyword_series.sort_values(
                ascending=False)[:keyword_length].index.to_list()
            keyword_li = [f'#{keyword}' for keyword in keyword_li]
            keywords = ' '.join(keyword_li)
            cluster_keyword_dic[keywords] = key
        return cluster_keyword_dic
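# Usage sketch (assumption, not in the original source): the df_floor and keyword
# length values are illustrative only.
extractor = KeywordExtractor()
extractor.get_morphs().make_df_dic()
extractor.make_movie_prob_distribution(df_floor=5)
extractor.extract_cluster_keywords(keyword_length=5)
extractor.tag_keywords().upload_cluster_rec()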
class WordEmbeddingModel:
    def __init__(self, all):
        self.mongo_conn = MongoConnector()
        self.s3_conn = S3Connector()
        self.all = all
        self.n_processes = (mp.cpu_count() // 2) - 1
        self.model_params = config['MODEL']['FASTTEXT_PARAMS']
        self.morphs_df = None
        self.model = None
        if self.all:
            logger.info('using all sentences')
            print('using all sentences')
        logger.info(f'using {self.n_processes} cores')
        print(f'using {self.n_processes} cores')

    def get_morphs(self):
        try:
            if self.all:
                morphs = self.mongo_conn.user_review_morphs.find()
            else:
                morphs = self.mongo_conn.user_review_morphs.find(
                    {'fasttext_trained': {'$in': [None, False]}})
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()
        df = pd.DataFrame(morphs)
        logger.info(f'got {len(df)} reviews.')
        print(f'got {len(df)} reviews.')
        self.document_updated = len(df) > 0
        self.morphs_df = df

    def load_trained_model(self):
        if self.all:
            return
        try:
            model = self.s3_conn.load_from_s3_byte(
                config['AWS']['S3_BUCKET'], config['MODEL']['MODEL_PATH'])
            logger.info('model loaded')
        except Exception:
            model = None
        if self._validate_model(model):
            self.model = model

    def _validate_model(self, model):
        return type(model) == gensim.models.fasttext.FastText

    def build_model(self):
        sentences = self.morphs_df['morphs']
        if len(sentences) == 0:
            return
        model = self.model
        if not model:
            logger.info('building new model.')
            logger.info(f'model params: {self.model_params}')
            model = FastText(
                vector_size=self.model_params['VECTOR_SIZE'],
                window=self.model_params['WINDOW'],
                sg=self.model_params['SG'],
                negative=self.model_params['NEGATIVE'],
                ns_exponent=self.model_params['NS_EXPONENT'],
                sample=self.model_params['SAMPLE'],
                min_n=self.model_params['MIN_N'],
                max_n=self.model_params['MAX_N'],
                min_count=self.model_params['MIN_COUNT'],
                bucket=self.model_params['BUCKET'],
                workers=self.n_processes)
            model.build_vocab(corpus_iterable=sentences)
        else:
            model.build_vocab(corpus_iterable=sentences, update=True)
        model.train(corpus_iterable=sentences,
                    total_examples=len(sentences),
                    epochs=self.model_params['EPOCHS'])
        self.model = model
        print('train model finished')

    def label_morphs_collection(self):
        morphs = self.mongo_conn.user_review_morphs
        try:
            for idx in range(len(self.morphs_df)):
                row = self.morphs_df.iloc[idx]
                morphs.update_one({'_id': row['_id']},
                                  {'$set': {'fasttext_trained': True}})
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()
        print('label finished')

    def upload_file_to_s3(self):
        self.s3_conn.upload_to_s3_byte(self.model, config['AWS']['S3_BUCKET'],
                                       config['MODEL']['MODEL_PATH'])
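# Usage sketch (assumption, not in the original source): with all=False the
# previously trained model is loaded and updated only on untrained morphs.
embedder = WordEmbeddingModel(all=False)
embedder.get_morphs()
embedder.load_trained_model()
if embedder.document_updated:
    embedder.build_model()
    embedder.label_morphs_collection()
    embedder.upload_file_to_s3()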
def __init__(self):
    self.mongo_conn = MongoConnector()
    self.s3_conn = S3Connector()
    logger.info('inner class test')
class Rec:
    def __init__(self, cores):
        self.mongo_conn = MongoConnector()
        self.s3_conn = S3Connector()
        self.n_processes = self._set_n_processes(cores)
        self.movies_df = self._load_movies_df()
        self.makers_df = self._load_makers_df()
        self.cluster_df = self._load_cluster_df()
        self.movie_vectors = self._load_movie_vectors()
        try:
            mp.set_start_method('spawn')
        except RuntimeError:
            pass

    def make_newest_rec(self, n):
        newest_df = self.movies_df.sort_values(by='release_date',
                                               ascending=False)
        newest_df = newest_df[['movie_id', 'poster_url']][:n]
        newest_rec = {'newest_rec': newest_df.to_dict('records')}
        self.s3_conn.upload_to_s3_byte(newest_rec, config['AWS']['S3_BUCKET'],
                                       config['REC']['FRONT_NEWEST'])

    def make_cluster_rec(self, n):
        cluster_movie_df = pd.merge(self.movie_vectors,
                                    self.movies_df,
                                    on='movie_id',
                                    sort=False,
                                    validate='one_to_one')
        cluster_movie_df = cluster_movie_df[[
            'cluster', 'movie_id', 'vector', 'poster_url', 'avg_rate',
            'review_count'
        ]]
        # keep well-reviewed, well-rated movies only
        cluster_movie_df = cluster_movie_df[
            (cluster_movie_df['review_count'] >= 50)
            & (cluster_movie_df['avg_rate'] >= 8)]
        # drop small clusters
        clusters = cluster_movie_df['cluster'].value_counts()
        clusters = clusters[clusters > 25]
        clusters = list(clusters.index)
        cluster_movie_df = cluster_movie_df[cluster_movie_df['cluster'].isin(
            clusters)]
        '''
        # group clusters (unused)
        cluster_df = self.cluster_df[self.cluster_df.index.isin(clusters)].copy()
        X = Normalizer().fit_transform(list(cluster_df['vector']))

        def train_kmeans_model(X, k):
            model = KMeans(
                init='k-means++',
                n_clusters=k,
                max_iter=10000,
                tol=1e-12
            ).fit(X)
            return model

        kmeans = train_kmeans_model(X, 20)
        cluster_df.loc[:, 'cluster_set'] = kmeans.labels_
        # pick one cluster per cluster set
        clusters = list(cluster_df.groupby('cluster_set').sample(n=1).index)
        '''
        # sort by distance to the cluster centroid
        cluster_movie_df = pd.merge(cluster_movie_df,
                                    self.cluster_df,
                                    left_on='cluster',
                                    right_index=True,
                                    copy=False,
                                    validate='many_to_one')
        cluster_movie_df.loc[:, 'vector_y'] = cluster_movie_df['vector_y'].map(
            lambda x: list(x))
        cluster_movie_df.loc[:, 'dist'] = cluster_movie_df.apply(
            lambda x: cosine(x.vector_x, x.vector_y), axis=1)
        cluster_movie_df = cluster_movie_df[['movie_id', 'cluster',
                                             'dist']].reset_index()
        cluster_rec = dict()
        for cluster in clusters:
            cluster_rec[cluster] = cluster_movie_df[
                cluster_movie_df['cluster'] == cluster][[
                    'movie_id'
                ]][:n].to_dict('records')
        self.s3_conn.upload_to_s3_byte(cluster_rec, config['AWS']['S3_BUCKET'],
                                       config['REC']['FRONT_CLUSTER'])

    def make_genre_rec(self, n):
        genres = self.movies_df['genre'].explode().value_counts()
        genres = genres[genres >= n]
        genres = list(genres.index)
        genre_df = self.movies_df[['movie_id', 'poster_url', 'genre']]
        rows = genre_df.to_dict('records')
        # expand multi-genre movies into one row per genre
        genre_rows = []
        for row in rows:
            for genre in row['genre']:
                genre_rows.append({
                    'movie_id': row['movie_id'],
                    'poster_url': row['poster_url'],
                    'genre': genre
                })
        genre_df = pd.DataFrame(genre_rows)
        # shuffle
        genre_df = genre_df.sample(frac=1)
        # group by genre
        genre_rec = dict()
        for genre in genres:
            genre_rec[genre] = genre_df[genre_df['genre'] == genre][[
                'movie_id', 'poster_url'
            ]][:n].to_dict('records')
        self.s3_conn.upload_to_s3_byte(genre_rec, config['AWS']['S3_BUCKET'],
                                       config['REC']['FRONT_GENRE'])

    def make_actor_rec(self, n):
        movie_ids = list(self.movies_df['movie_id'])
        movies_df = self.movies_df.set_index('movie_id').sort_index()
        makers_df = pd.merge(self.makers_df,
                             self.movies_df[['movie_id']],
                             on='movie_id',
                             validate='many_to_one')
        makers_df = makers_df.set_index(['movie_id', 'maker_id']).sort_index()
        actor_rec = dict()
        for movie_id in movie_ids:
            # main actors of this movie
            main_actor_ids = movies_df.loc[movie_id]['main_actor_ids']
            # other movies those actors appeared in
            actors_df = makers_df[
                ~makers_df.index.isin([movie_id], level='movie_id')
                & makers_df.index.isin(main_actor_ids, level='maker_id')]
            actors_df.reset_index(inplace=True)
            # order by main-actor billing - unused
            '''
            actors_df['maker_id'] = pd.Categorical(
                actors_df['maker_id'],
                categories=main_actor_ids,
                ordered=True
            )
            '''
            # sort by release date
            actors_df = actors_df.sort_values(by=['release_date'],
                                              ascending=[False])
            # drop duplicate movies
            actors_df = actors_df.drop_duplicates(subset=['movie_id'])
            # result
            rec = actors_df[['movie_id', 'poster_url']][:n].to_dict('records')
            actor_rec[movie_id] = rec
        self.s3_conn.upload_to_s3_byte(actor_rec, config['AWS']['S3_BUCKET'],
                                       config['REC']['DETAIL_ACTOR'])

    def make_director_rec(self, n):
        movie_ids = list(self.movies_df['movie_id'])
        makers_df = pd.merge(self.makers_df,
                             self.movies_df[['movie_id']],
                             on='movie_id',
                             validate='many_to_one')
        makers_df = makers_df.set_index(['movie_id', 'maker_id',
                                         'role']).sort_index()
        director_rec = dict()
        for movie_id in movie_ids:
            # directors and writers of this movie
            directors = makers_df[
                makers_df.index.isin([movie_id], level='movie_id')
                & makers_df.index.isin(['director', 'writer'], level='role')
            ].index.get_level_values('maker_id').to_list()
            # other movies they worked on
            directors_df = makers_df[
                ~makers_df.index.isin([movie_id], level='movie_id')
                & makers_df.index.isin(directors, level='maker_id')]
            directors_df.reset_index(inplace=True)
            # sort by release date
            directors_df = directors_df.sort_values(by=['release_date'],
                                                    ascending=[False])
            # drop duplicate movies
            directors_df = directors_df.drop_duplicates(subset=['movie_id'])
            # result
            rec = directors_df[:n][['movie_id',
                                    'poster_url']].to_dict('records')
            director_rec[movie_id] = rec
        self.s3_conn.upload_to_s3_byte(director_rec,
                                       config['AWS']['S3_BUCKET'],
                                       config['REC']['DETAIL_DIRECTOR'])

    def make_similar_rec(self, n):
        similar_rec_df = pd.merge(
            self.movie_vectors,
            self.movies_df[self.movies_df['review_count'] >= 30][[
                'movie_id', 'poster_url'
            ]],
            on='movie_id',
            sort=False,
            validate='one_to_one')
        movie_ids = list(self.movie_vectors.index)
        similar_rec = dict()
        for movie_id in movie_ids:
            movie = self.movie_vectors.loc[movie_id]
            # distance to each cluster centroid, nearest first
            cluster_distances = self.cluster_df['vector'].map(
                lambda x: cosine(x, movie['vector'])).sort_values(
                    ascending=True)
            clusters = list(cluster_distances.index)
            # add per-cluster movie lists to the candidate pool until the target count is reached
            df_li = []
            movie_count = 0
            for cluster in clusters:
                tmp_df = similar_rec_df[similar_rec_df['cluster'] == cluster]
                df_li.append(tmp_df)
                movie_count += len(tmp_df)
                if movie_count >= 100:
                    break
            similar_df = pd.concat(df_li)
            # sort by similarity
            distances = similar_df['vector'].map(
                lambda x: cosine(x, movie['vector']))
            similar_df = similar_df.assign(distance=distances).sort_values(
                by='distance')
            # exclude the movie itself
            similar_df = similar_df[similar_df['movie_id'] != movie.name]
            # top n
            similar_df = similar_df[:n]
            # result
            rec = similar_df[['movie_id', 'poster_url']].to_dict('records')
            similar_rec[movie_id] = rec
        self.s3_conn.upload_to_s3_byte(similar_rec, config['AWS']['S3_BUCKET'],
                                       config['REC']['DETAIL_SIMILAR'])

    def _set_n_processes(self, cores):
        if (not cores) or (cores >= (mp.cpu_count() // 2)):
            return (mp.cpu_count() // 2) - 1
        return cores

    def _load_movies_df(self):
        movies_df = self.s3_conn.load_from_s3_byte(
            config['AWS']['S3_BUCKET'], config['DATA']['MOVIE_INFO'])
        return movies_df

    def _load_makers_df(self):
        makers_df = pd.DataFrame(self.mongo_conn.makers.find())
        self.mongo_conn.close()
        # makers_df = makers_df[makers_df['movie_id'].isin(self.movies_df['movie_id'])]
        makers_df = makers_df.rename(
            columns={'movie_poster_url': 'poster_url'})
        return makers_df

    def _load_cluster_df(self):
        cluster_df = self.s3_conn.load_from_s3_byte(
            config['AWS']['S3_BUCKET'], config['MODEL']['CLUSTER_PATH'])
        return cluster_df

    def _load_movie_vectors(self):
        movie_vectors = self.s3_conn.load_from_s3_byte(
            config['AWS']['S3_BUCKET'], config['MODEL']['MOVIE_VECTORS_PATH'])
        movie_vectors = movie_vectors[movie_vectors.index.isin(
            self.movies_df['movie_id'])]
        return movie_vectors
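# Usage sketch (assumption, not in the original source): list sizes are illustrative.
rec = Rec(cores=4)
rec.make_newest_rec(20)
rec.make_cluster_rec(20)
rec.make_genre_rec(20)
rec.make_actor_rec(10)
rec.make_director_rec(10)
rec.make_similar_rec(10)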
class MorphExtractor:
    def __init__(self, pos_chunk, pos_target, all):
        self.mongo_conn = MongoConnector()
        self.pos_chunk = pos_chunk
        self.pos_target = pos_target
        if all:
            print('reset morph data')
            self._reset_morph_data()
        self._set_iter_count()
        print(pos_chunk)
        print(pos_target)

    def _reset_morph_data(self):
        tokens = self.mongo_conn.user_review_tokens_okt
        morphs = self.mongo_conn.user_review_morphs_okt
        okt_adjective_stat = self.mongo_conn.okt_adjective_stat
        try:
            morphs.drop()
            okt_adjective_stat.drop()
            tokens.update_many({}, {'$set': {'morphed': False}})
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()

    def _set_iter_count(self):
        tokens = self.mongo_conn.user_review_tokens_okt
        try:
            rows = tokens.count_documents({'morphed': {'$in': [None, False]}})
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()
        self.iter = (rows // self.pos_chunk + 1) if rows > 0 else 0

    def get_pos(self):
        tokens = self.mongo_conn.user_review_tokens_okt
        try:
            pos = tokens.find({'morphed': {
                '$in': [None, False]
            }})[:self.pos_chunk]
            pos_df = pd.DataFrame(pos)[['_id', 'movie_id', 'tokens', 'rate']]
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()
        print(len(pos_df))
        print(pos_df.iloc[0])
        self.pos_df = pos_df

    def get_morphs(self):
        self.pos_df.loc[:, 'nouns'] = self.pos_df['tokens'].map(
            lambda x: [pos[0] for pos in x if pos[1] == 'Noun'])
        self.pos_df.loc[:, 'adjectives'] = self.pos_df['tokens'].map(
            lambda x: [pos[0] for pos in x if pos[1] == 'Adjective'])
        morph_df = self.pos_df.drop(columns=['tokens'])
        self.morph_df = morph_df

    def save_morphs(self):
        tokens = self.mongo_conn.user_review_tokens_okt
        morphs = self.mongo_conn.user_review_morphs_okt
        okt_adjective_stat = self.mongo_conn.okt_adjective_stat
        morphs_dict = self.morph_df.to_dict('records')
        comment_count = len(morphs_dict)
        try:
            for doc in morphs_dict:
                for adj in doc['adjectives']:
                    okt_adjective_stat.update_one({'_id': adj}, {
                        '$set': {'_id': adj},
                        '$inc': {'count': 1}
                    }, upsert=True)
                morphs.replace_one({'_id': doc['_id']}, doc, upsert=True)
                tokens.update_one({'_id': doc['_id']},
                                  {'$set': {'morphed': True}})
            logger.info(f'{comment_count} comments are morphed.')
            print(f'{comment_count} comments are morphed.')
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()

    def _target_row_count(self):
        pos_path = self.mongo_conn.user_review_tokens_okt
        try:
            doc_count = pos_path.count_documents({})
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()
        return doc_count
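# Illustration (assumption, not in the original source): how get_morphs() splits
# Okt (token, tag) pairs into separate noun and adjective lists; sample tokens
# are made up.
sample_tokens = [('영화', 'Noun'), ('정말', 'Adverb'), ('재밌다', 'Adjective')]
nouns = [tok for tok, tag in sample_tokens if tag == 'Noun']            # ['영화']
adjectives = [tok for tok, tag in sample_tokens if tag == 'Adjective']  # ['재밌다']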
class MorphExtractor:
    def __init__(self, pos_chunk, pos_target, all):
        self.mongo_conn = MongoConnector()
        self.pos_chunk = pos_chunk
        self.pos_target = pos_target
        self.all = all
        print(pos_chunk)
        print(pos_target)
        if self.all:
            logger.info('get morphs from entire corpus. dropping morphs collection.')
            self._drop_morphs()
        self._set_iter_count()

    def _drop_morphs(self):
        tokens = self.mongo_conn.user_review_tokens
        morphs = self.mongo_conn.user_review_morphs
        try:
            morphs.drop()
            morphs.create_index('movie_id')
            tokens.update_many({}, {'$set': {'morphed': False}})
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()

    def _set_iter_count(self):
        tokens = self.mongo_conn.user_review_tokens
        try:
            rows = tokens.count_documents({'morphed': {'$in': [None, False]}})
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()
        self.iter = (rows // self.pos_chunk + 1) if rows > 0 else 0

    def get_pos(self):
        tokens = self.mongo_conn.user_review_tokens
        try:
            pos = tokens.find({'morphed': {
                '$in': [None, False]
            }})[:self.pos_chunk]
            pos_df = pd.DataFrame(pos)[['_id', 'movie_id', 'tokens']]
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()
        print(len(pos_df))
        print(pos_df.iloc[0])
        self.pos_df = pos_df

    def get_morphs(self):
        if self.pos_target is None:
            self.pos_df.loc[:, 'morphs'] = self.pos_df['tokens'].map(
                lambda x: [pos[0] for pos in x])
        else:
            self.pos_df.loc[:, 'morphs'] = self.pos_df['tokens'].map(
                lambda x: [pos[0] for pos in x if pos[1] in self.pos_target])
        morph_df = self.pos_df.drop(columns=['tokens'])
        self.morph_df = morph_df

    def save_morphs(self):
        tokens = self.mongo_conn.user_review_tokens
        morphs = self.mongo_conn.user_review_morphs
        morphs_dict = self.morph_df.to_dict('records')
        try:
            for doc in morphs_dict:
                tokens.update_one({'_id': doc['_id']},
                                  {'$set': {'morphed': True}})
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()
        # drop rows whose morph list came out empty
        self.morph_df = self.morph_df[
            self.morph_df['morphs'].astype('str') != '[]']
        morphs_dict = self.morph_df.to_dict('records')
        comment_count = len(morphs_dict)
        try:
            for doc in morphs_dict:
                morphs.replace_one({'_id': doc['_id']}, doc, upsert=True)
            logger.info(f'{comment_count} comments are morphed.')
            print(f'{comment_count} comments are morphed.')
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()

    def _target_row_count(self):
        pos_path = self.mongo_conn.user_review_tokens
        try:
            doc_count = pos_path.count_documents({})
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()
        return doc_count
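# Usage sketch (assumption, not in the original source): pos_target restricts the
# POS tags that are kept; the tag names and chunk size are illustrative only.
extractor = MorphExtractor(pos_chunk=10000,
                           pos_target=['Noun', 'Verb', 'Adjective'],
                           all=False)
for _ in range(extractor.iter):
    extractor.get_pos()
    extractor.get_morphs()
    extractor.save_morphs()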
class Tokenizer(object):
    def __init__(self, cores, rows):
        self.mongo_conn = MongoConnector()
        self.n_processes = self._set_n_processes(cores)
        self.rows = rows
        logger.info(f'using {self.n_processes} cores')
        print(f'using {self.n_processes} cores')

    def get_reviews(self):
        user_reviews = self.mongo_conn.user_reviews
        try:
            reviews = user_reviews.find(
                {'tokenized_okt': {'$in': [None, False]}})
            if self.rows != 0:
                reviews = reviews[:self.rows]
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()
        df = pd.DataFrame(reviews)[['_id', 'movie_id', 'review', 'rate']]
        logger.info(f'got {len(df)} reviews.')
        print(f'got {len(df)} reviews.')
        self._split_df(df)

    def tokenize(self):
        try:
            mp.set_start_method('spawn')
        except RuntimeError:
            pass
        with mp.Pool(processes=self.n_processes, maxtasksperchild=1) as p:
            for idx, chunk in enumerate(self.reviews_df_split):
                print(f'chunk {idx}: size {len(chunk)}')
                df_job = np.array_split(chunk, self.n_processes)
                print('getting tokens')
                df_job = p.map(_add_tokens, df_job)
                print('filtering empty rows')
                df_job = p.map(_filter_empty_token_row, df_job)
                print('dropping columns')
                func = partial(_drop_columns, ['review'])
                df_job = p.map(func, df_job)
                print('concatting chunk')
                tokens_df_chunk = pd.concat(df_job)
                print('updating result')
                self.reviews_df_split[idx] = tokens_df_chunk
        tokens_df = pd.concat(self.reviews_df_split)
        del self.reviews_df_split
        self.tokens_df = tokens_df

    def save_tokens(self):
        review_tokens = self.mongo_conn.user_review_tokens_okt
        user_reviews = self.mongo_conn.user_reviews
        tokens_li = self.tokens_df.to_dict('records')
        try:
            for tokens in tokens_li:
                if tokens['tokens'] != []:
                    review_tokens.replace_one({'_id': tokens['_id']},
                                              tokens,
                                              upsert=True)
                    user_reviews.update_one({'_id': tokens['_id']},
                                            {'$set': {'tokenized_okt': True}})
            logger.info(f'{len(tokens_li)} comments are tokenized.')
            print(f'{len(tokens_li)} comments are tokenized.')
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()

    def _set_n_processes(self, cores):
        if (not cores) or (cores >= (mp.cpu_count() // 2)):
            return (mp.cpu_count() // 2) - 1
        return cores

    def _split_df(self, df):
        split_to = len(df) // 500000 + 1
        self.reviews_df_split = np.array_split(df, split_to)
        logger.info(f'split into {split_to} chunks')
        print(f'split into {split_to} chunks')
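# Usage sketch (assumption, not in the original source): a driver that runs the
# whole tokenization pipeline, assuming `args` provides `cores` and `rows` via
# argparse as in job_estimater() above.
n_iters = job_estimater()
for _ in range(n_iters):
    tokenizer = Tokenizer(args.cores, args.rows)
    tokenizer.get_reviews()
    tokenizer.tokenize()
    tokenizer.save_tokens()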
class MorphPostProcessor(object):
    def __init__(self, pos_chunk, pos_target, all):
        self.mongo_conn = MongoConnector()
        self.pos_chunk = pos_chunk
        self.pos_target = pos_target
        self.adj_converter = self._load_adj_converter()
        self._set_iter_count()
        if all:
            print('reset update_checker')
            self._reset_update_checker()

    def _reset_update_checker(self):
        morphs = self.mongo_conn.user_review_morphs_okt
        try:
            morphs.update_many({}, {'$set': {'adj_converted': False}})
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()

    def _load_adj_converter(self):
        with open('processors/adj_converter.json', 'rb') as f:
            adj_converter = json.load(f)
        return adj_converter

    def _set_iter_count(self):
        morphs = self.mongo_conn.user_review_morphs_okt
        try:
            rows = morphs.count_documents(
                {'adj_converted': {'$in': [None, False]}})
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()
        self.iter = (rows // self.pos_chunk + 1) if rows > 0 else 0

    def get_adjs(self):
        morphs = self.mongo_conn.user_review_morphs_okt
        try:
            adjs = morphs.find({'adj_converted': {
                '$in': [None, False]
            }}, {
                '_id': 1,
                'adjectives': 1
            })[:self.pos_chunk]
            adj_df = pd.DataFrame(adjs)
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()
        print(len(adj_df))
        print(adj_df.iloc[0])
        self.adj_df = adj_df
        return self

    def convert_adjs(self):
        self.adj_df.loc[:, 'adjectives'] = self.adj_df['adjectives'].map(
            lambda x: self._convert_adjs(x))
        return self

    def _convert_adjs(self, adjs):
        return [
            self._convert_adj(adj) for adj in adjs
            if self._convert_adj(adj) is not None
        ]

    def _convert_adj(self, adj):
        if adj in self.adj_converter['stopwords']:
            return
        # normalize '-하다' adjectives to their '-한' form unless an explicit mapping exists
        if re.match('.*하다$', adj):
            return self.adj_converter['converter_hada'].get(
                adj) or f'{adj[:-2]}한'
        return self.adj_converter['converter'].get(adj) or adj

    def save_adjs(self):
        morphs = self.mongo_conn.user_review_morphs_okt
        try:
            for idx in range(len(self.adj_df)):
                doc = self.adj_df.iloc[idx]
                morphs.update_one({'_id': doc['_id']}, {
                    '$set': {
                        'adjectives': doc['adjectives'],
                        'adj_converted': True
                    }
                })
            print(f'{len(self.adj_df)} docs are converted.')
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()
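# Usage sketch (assumption, not in the original source): pos_target is accepted but
# unused here; iterate self.iter times, converting pos_chunk documents per pass.
processor = MorphPostProcessor(pos_chunk=10000, pos_target=None, all=False)
for _ in range(processor.iter):
    processor.get_adjs().convert_adjs().save_adjs()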
def main():
    mongo_conn = MongoConnector()
    s3_conn = S3Connector()

    def _load_movies_df():
        movies_df = s3_conn.load_from_s3_byte(config['AWS']['S3_BUCKET'],
                                              config['DATA']['MOVIE_INFO'])
        return movies_df

    def _load_makers_df():
        makers_df = pd.DataFrame(mongo_conn.makers.find())
        mongo_conn.close()
        makers_df = pd.merge(makers_df,
                             movies_df[['movie_id', 'review_count']],
                             on='movie_id',
                             validate='many_to_one')
        roles = ['actor_main', 'director', 'writer', 'actor_sub']
        makers_df['role'] = pd.Categorical(makers_df['role'],
                                           categories=roles,
                                           ordered=True)
        makers_df = makers_df.rename(
            columns={'movie_poster_url': 'poster_url'})
        return makers_df

    def _load_cluster_df():
        cluster_df = s3_conn.load_from_s3_byte(config['AWS']['S3_BUCKET'],
                                               config['MODEL']['CLUSTER_PATH'])
        return cluster_df

    def _load_movie_vectors():
        movie_vectors = s3_conn.load_from_s3_byte(
            config['AWS']['S3_BUCKET'], config['MODEL']['MOVIE_VECTORS_PATH'])
        movie_vectors = movie_vectors[movie_vectors.index.isin(
            movies_df['movie_id'])]
        return movie_vectors

    movies_df = _load_movies_df()
    makers_df = _load_makers_df()
    cluster_df = _load_cluster_df()
    movie_vectors = _load_movie_vectors()

    # sort by review count, descending
    movies_df = movies_df.sort_values(by=['review_count'], ascending=False)
    # sort by role, then by review count
    makers_df = makers_df.sort_values(by=['role', 'review_count'],
                                      ascending=[True, False])

    # subword hash
    def generate_hash(names):
        dic = dict()
        for name in names:
            if len(name) == 1:
                dic = _update_name_dic(name, name, dic)
                continue
            name_split = name.split(' ')
            name_split.append(name.replace(' ', ''))
            name_split.append(name)
            for word in name_split:
                length = len(word)
                if length < 2:
                    continue
                for i in range(2, length + 1):
                    subword = word[:i]
                    dic = _update_name_dic(name, subword, dic)
        for key in dic.keys():
            dic[key] = get_unique_ordered_list(dic.get(key))
        return dic

    def _update_name_dic(name, word, dic):
        if dic.get(word) is None:
            dic[word] = []
        dic[word].append(name)
        return dic

    def get_unique_ordered_list(li):
        seen = set()
        return [x for x in li if not (x in seen or seen.add(x))]

    # titles
    movie_names_kor = movies_df['title_kor']
    movie_names_hash = generate_hash(movie_names_kor)
    # people
    maker_names = makers_df['name']
    maker_names_hash = generate_hash(maker_names)
    # genres
    genre_names = set(flatten(movies_df['genre']))
    genre_hash = generate_hash(genre_names)
    # nations
    nation_names = set(flatten(movies_df['nations']))
    nation_names_hash = generate_hash(nation_names)
    # merge
    subword_hash = dict()
    subword_hash['movie_name'] = movie_names_hash
    subword_hash['maker'] = maker_names_hash
    subword_hash['genre'] = genre_hash
    subword_hash['nation'] = nation_names_hash

    # name-to-movie-id hash
    def generate_name_id_hash(names, ids):
        dic = dict()
        for i in range(len(names)):
            if dic.get(names[i]) is None:
                dic[names[i]] = []
            dic[names[i]].append(ids[i])
        for key in dic.keys():
            dic[key] = get_unique_ordered_list(dic.get(key))
        return dic

    # titles
    movie_names = list(movies_df['title_kor'])
    movie_ids = list(movies_df['movie_id'])
    movie_name_id_hash = generate_name_id_hash(movie_names, movie_ids)
    # people
    maker_names = list(makers_df['name'])
    maker_ids = list(makers_df['movie_id'])
    maker_id_hash = generate_name_id_hash(maker_names, maker_ids)
    # genres
    ex_movies_df = movies_df[['movie_id', 'genre']].explode('genre')
    genres = list(ex_movies_df['genre'])
    genre_ids = list(ex_movies_df['movie_id'])
    genre_id_hash = generate_name_id_hash(genres, genre_ids)
    # nations
    ex_movies_df = movies_df[['movie_id', 'nations']].explode('nations')
    nations = list(ex_movies_df['nations'])
    nation_ids = list(ex_movies_df['movie_id'])  # pair nation names with movie ids
    nation_id_hash = generate_name_id_hash(nations, nation_ids)
    # merge
    name_id_hash = dict()
    name_id_hash['movie_name'] = movie_name_id_hash
    name_id_hash['maker'] = maker_id_hash
    name_id_hash['genre'] = genre_id_hash
    name_id_hash['nation'] = nation_id_hash

    # upload
    s3_conn.upload_to_s3_byte(subword_hash, config['AWS']['S3_BUCKET'],
                              config['DATA']['SUBWORD_HASH'])
    s3_conn.upload_to_s3_byte(name_id_hash, config['AWS']['S3_BUCKET'],
                              config['DATA']['NAME_ID_HASH'])
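# Lookup sketch (assumption, not in the original source): how a search endpoint
# might resolve a typed prefix to movie ids once the two hashes are loaded; the
# query string is illustrative.
s3_conn = S3Connector()
subword_hash = s3_conn.load_from_s3_byte(config['AWS']['S3_BUCKET'],
                                         config['DATA']['SUBWORD_HASH'])
name_id_hash = s3_conn.load_from_s3_byte(config['AWS']['S3_BUCKET'],
                                         config['DATA']['NAME_ID_HASH'])
query = '기생'  # user-typed prefix
matched_titles = subword_hash['movie_name'].get(query, [])
matched_ids = [mid for title in matched_titles
               for mid in name_id_hash['movie_name'].get(title, [])]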
class MovieVectorProcessor:
    def __init__(self, chunk):
        self.mongo_conn = MongoConnector()
        self.s3_conn = S3Connector()
        self.chunk = chunk
        self.n_processes = (mp.cpu_count() // 2) - 1
        self.movie_id_q = Queue()
        self.model = None
        self.movie_vector_li = []
        self.morphs_df = None
        self.movie_vectors = None
        self._get_movie_ids()
        self._load_trained_model()
        logger.info(f'using {self.n_processes} cores')
        print(f'using {self.n_processes} cores')

    def _get_movie_ids(self):
        morphs = self.mongo_conn.user_review_morphs
        movie_ids = morphs.distinct('movie_id')
        for movie_id in movie_ids:
            self.movie_id_q.put(movie_id)
        self.mongo_conn.close()

    def _load_trained_model(self):
        try:
            model = self.s3_conn.load_from_s3_byte(
                config['AWS']['S3_BUCKET'], config['MODEL']['MODEL_PATH'])
        except Exception:
            model = None
        if self.__validate_model(model):
            self.model = model

    def __validate_model(self, model):
        return type(model) == gensim.models.fasttext.FastText

    def get_morphs(self):
        morphs = self.mongo_conn.user_review_morphs
        docu_count = 0
        df_li = []
        while (not self.movie_id_q.empty() and (docu_count < self.chunk)):
            movie_id = self.movie_id_q.get()
            try:
                morphs_df = pd.DataFrame(
                    morphs.find({'movie_id': movie_id}, {
                        '_id': 0,
                        'movie_id': 1,
                        'morphs': 1
                    }))
                df_li.append(morphs_df)
                docu_count += len(morphs_df)
            except Exception as e:
                logger.error(e)
                self.mongo_conn.close()
        self.mongo_conn.close()
        logger.info(f'got {docu_count} reviews.')
        print(f'got {docu_count} reviews.')
        self.morphs_df = pd.concat(df_li)

    def make_movie_vectors(self):
        word_vectors = self.model.wv
        movie_vectors = pd.DataFrame()
        movie_vectors['movie_id'] = self.morphs_df['movie_id']
        # get averaged comment vector
        movie_vectors.loc[:, 'vector'] = self.morphs_df['morphs'].map(
            lambda morphs: np.average(
                [word_vectors[morph] for morph in morphs], axis=0))
        # get movie vector
        movie_vectors = movie_vectors.groupby('movie_id').sum()
        self.movie_vector_li.append(movie_vectors)
        logger.info('make movie vectors finished')
        print('make movie vectors finished')

    def concat_vectors(self):
        self.movie_vectors = pd.concat(self.movie_vector_li)
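# Usage sketch (assumption, not in the original source): process review morphs in
# chunks until the movie-id queue is drained, then combine the partial results.
processor = MovieVectorProcessor(chunk=100000)
while not processor.movie_id_q.empty():
    processor.get_morphs()
    processor.make_movie_vectors()
processor.concat_vectors()
# processor.movie_vectors now holds one summed vector per movie_id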