Code example #1
    def __init__(self, cores, rows):
        self.mongo_conn = MongoConnector()
        self.n_processes = self._set_n_processes(cores)
        self.rows = rows

        logger.info(f'using {self.n_processes} cores')
        print(f'using {self.n_processes} cores')
Code example #2
    def __init__(self, pos_chunk, pos_target, all):
        self.mongo_conn = MongoConnector()
        self.pos_chunk = pos_chunk
        self.pos_target = pos_target
        self.adj_converter = self._load_adj_converter()
        self._set_iter_count()

        if all:
            print('reset update_checker')
            self._reset_update_checker()
Code example #3
    def __init__(self, cores):
        self.mongo_conn = MongoConnector()
        self.s3_conn = S3Connector()
        self.n_processes = self._set_n_processes(cores)
        self.movies_df = self._load_movies_df()
        self.makers_df = self._load_makers_df()
        self.cluster_df = self._load_cluster_df()
        self.movie_vectors = self._load_movie_vectors()

        try:
            mp.set_start_method('spawn')
        except RuntimeError:
            pass
Code example #4
    def __init__(self, pos_chunk, pos_target, all):
        self.mongo_conn = MongoConnector()
        self.pos_chunk = pos_chunk
        self.pos_target = pos_target

        if all:
            print('reset morph data')
            self._reset_morph_data()

        self._set_iter_count()

        print(pos_chunk)
        print(pos_target)
Code example #5
    def __init__(self, all):
        self.mongo_conn = MongoConnector()
        self.s3_conn = S3Connector()
        self.all = all
        self.n_processes = (mp.cpu_count() // 2) - 1
        self.model_params = config['MODEL']['FASTTEXT_PARAMS']
        self.morphs_df = None
        self.model = None

        if self.all:
            logger.info('using all sentences')
            print('using all sentences')
        logger.info(f'using {self.n_processes} cores')
        print(f'using {self.n_processes} cores')
Code example #6
    def __init__(self, pos_chunk, pos_target, all):
        self.mongo_conn = MongoConnector()
        self.pos_chunk = pos_chunk
        self.pos_target = pos_target
        self.all = all

        print(pos_chunk)
        print(pos_target)

        if self.all:
            logger.info('get morphs from entire corpus. dropping morphs collection.')
            self._drop_morphs()

        self._set_iter_count()
Code example #7
    def __init__(self, chunk):
        self.mongo_conn = MongoConnector()
        self.s3_conn = S3Connector()
        self.chunk = chunk
        self.n_processes = (mp.cpu_count() // 2) - 1
        self.movie_id_q = Queue()
        self.model = None
        self.movie_vector_li = []
        self.morphs_df = None
        self.movie_vectors = None

        self._get_movie_ids()
        self._load_trained_model()

        logger.info(f'using {self.n_processes} cores')
        print(f'using {self.n_processes} cores')
Code example #8
    def __init__(self):
        self.mongo_conn = MongoConnector()
        self.session = requests.Session()
        self.headers = config["SCRAPER"]["HEADERS"]
        self.update_checker = self._set_update_checker()
        self.queue = self._set_queue()
        self.current_target = None
        self.last_update_date = None
        self.pages = 0
        self.current_page = 1
        self.review_count = 0
        self.reviews = []
        self.valid = True
Code example #9
def job_estimater():
    mongo_conn = MongoConnector()
    user_reviews = mongo_conn.user_reviews

    row_count = user_reviews.count_documents(
        {'tokenized_okt': {
            '$in': [None, False]
        }})
    print(f'{row_count} rows are not tokenized.')

    if row_count == 0:
        print('all documents are tokenized. finish process.')
        return 0

    n_iters = row_count // args.rows + 1
    print(f'need to process {n_iters} times')
    return n_iters
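
The iteration count above is floor division plus one, which yields one extra (empty) pass whenever the row count is an exact multiple of the chunk size. A quick check of the arithmetic with made-up numbers (a ceiling division gives the tight count, if that is preferred):

import math

row_count, rows = 1_000_000, 250_000
print(row_count // rows + 1)        # 5 -> the fifth pass processes nothing
print(math.ceil(row_count / rows))  # 4 -> tight count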
Code example #10
    def __init__(self):
        self.mongo_conn = MongoConnector()
        self.s3_conn = S3Connector()
        self._load_data()
Code example #11
class MovieInfoPreprocessor(object):
    def __init__(self):
        self.mongo_conn = MongoConnector()
        self.s3_conn = S3Connector()
        self._load_data()

    def _load_data(self):
        self.movies_df = pd.DataFrame(self.mongo_conn.movies.find())
        self.makers_df = pd.DataFrame(self.mongo_conn.makers.find())
        self.reviews_df = pd.DataFrame(self.mongo_conn.user_reviews.find())
        self.mongo_conn.close()

    def filter_fault_rows(self):
        # filter
        self.movies_df = self.movies_df[~(
            self.movies_df['title_kor'].isna()
            | self.movies_df['poster_url'].isna()
            | self.movies_df['stillcut_url'].isna())]
        return self

    def preprocess(self):
        # filter genres
        self.movies_df = self.movies_df[~(
            self.movies_df['genre'].map(set(['에로']).issubset)
            | self.movies_df['genre'].map(set(['공연실황']).issubset))]

        # add review statistics
        self.add_review_stat()

        # filter by review count
        self.movies_df = self.movies_df[self.movies_df['review_count'] > 0]

        # extract release year
        self.get_year_column()

        # fix release dates
        self.compansate_release_date()

        # add staff columns
        self.merge_staff_columns()

        # drop unneeded columns
        self.movies_df = self.movies_df.drop(
            columns=['updated_at', 'review_checked_date'])

        # rename columns
        self.movies_df = self.movies_df.rename(columns={'_id': 'movie_id'})

        return self

    def add_review_stat(self):
        reviews_stat = self.reviews_df.groupby('movie_id').agg(
            {'rate': ['mean', 'count']})
        merged = pd.merge(self.movies_df,
                          reviews_stat.rate,
                          how='left',
                          left_on='_id',
                          right_index=True,
                          validate='one_to_one')
        merged = merged.rename(columns={
            'mean': 'avg_rate',
            'count': 'review_count'
        })
        self.movies_df = merged
        return self

    def get_year_column(self):
        def year_process(row):
            date = row['release_date']
            if type(date) is float or len(date) < 4:
                result = row['title_eng'].split(',')[-1].replace(' ', '')
                return result if result.isdigit() else ''
            return date[:4]

        self.movies_df.loc[:,
                           'release_year'] = self.movies_df.apply(year_process,
                                                                  axis=1)
        return self

    def compansate_release_date(self):
        def date_process(row):
            date = row['release_date']
            year = row['release_year']
            if type(date) is float:
                try:
                    date = f'{year}0101'
                except Exception as e:
                    logger.error(e)
            return date

        self.movies_df.loc[:,
                           'release_date'] = self.movies_df.apply(date_process,
                                                                  axis=1)
        return self

    def merge_staff_columns(self):
        directors = self.makers_df[self.makers_df['role'] == 'director'][[
            'movie_id', 'name', 'role'
        ]]
        writers = self.makers_df[self.makers_df['role'] == 'writer'][[
            'movie_id', 'name', 'role'
        ]]

        merged1 = pd.merge(self.movies_df,
                           directors,
                           left_on='_id',
                           right_on='movie_id',
                           how='left',
                           validate='one_to_one')
        merged2 = pd.merge(merged1,
                           writers,
                           left_on='_id',
                           right_on='movie_id',
                           how='left',
                           validate='one_to_one')
        merged2 = merged2.rename(columns={
            '_id': 'movie_id',
            'name_x': 'director',
            'name_y': 'writer'
        })
        merged2 = merged2.drop(
            columns=['movie_id_x', 'role_x', 'movie_id_y', 'role_y'])

        self.movies_df = merged2
        return self

    def upload_to_s3(self):
        self.s3_conn.upload_to_s3_byte(self.movies_df,
                                       config['AWS']['S3_BUCKET'],
                                       config['DATA']['MOVIE_INFO'])
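
Each public step of MovieInfoPreprocessor returns self, so the whole pipeline can be written as one chain. The real class needs MongoDB and S3 connections, so the following is only a minimal stand-alone sketch of the same filter-and-chain pattern; TinyPreprocessor and its toy columns are illustrative, not part of the project.

import pandas as pd

class TinyPreprocessor:
    """Minimal stand-in mirroring the return-self chaining used above."""
    def __init__(self, movies_df: pd.DataFrame):
        self.movies_df = movies_df

    def filter_fault_rows(self):
        # drop rows missing any required display field
        self.movies_df = self.movies_df[~(
            self.movies_df['title_kor'].isna()
            | self.movies_df['poster_url'].isna())]
        return self

    def rename_id(self):
        self.movies_df = self.movies_df.rename(columns={'_id': 'movie_id'})
        return self

df = pd.DataFrame({
    '_id': [1, 2, 3],
    'title_kor': ['영화 A', None, '영화 C'],
    'poster_url': ['a.jpg', 'b.jpg', None],
})
result = TinyPreprocessor(df).filter_fault_rows().rename_id().movies_df
print(result)  # only the first row survives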
Code example #12
    def __init__(self):
        self.mongo_conn = MongoConnector()
        self.s3_conn = S3Connector()
        self.cluster_rec = self.s3_conn.load_from_s3_byte(config['AWS']['S3_BUCKET'], config['REC']['FRONT_CLUSTER'])
        self.cluster_dic = self._make_cluster_dic()
Code example #13
class KeywordExtractor(object):
    def __init__(self):
        self.mongo_conn = MongoConnector()
        self.s3_conn = S3Connector()
        self.cluster_rec = self.s3_conn.load_from_s3_byte(config['AWS']['S3_BUCKET'], config['REC']['FRONT_CLUSTER'])
        self.cluster_dic = self._make_cluster_dic()


    def _make_cluster_dic(self):
        cluster_dic = dict()
        for key, entry in self.cluster_rec.items():
            ids = [dic['movie_id'] for dic in entry]
            cluster_dic[key] = ids
        return cluster_dic


    def get_morphs(self):
        try:
            morphs = self.mongo_conn.user_review_morphs_okt.find({}, {'_id': 0, 'movie_id': 1, 'adjectives': 1})
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()

        self.morphs_df = pd.DataFrame(morphs).set_index('movie_id').sort_index()
        self.morphs_df = self.morphs_df.rename(columns={'adjectives': 'morphs'})
        
        print(f'got {len(self.morphs_df)} comments.')
        return self

    
    def make_df_dic(self):
        movie_word_set = self.morphs_df.groupby('movie_id')['morphs'].apply(lambda x: np.unique(np.hstack(x)))
        
        df_dic = dict()
        for idx in range(len(movie_word_set)):
            for word in movie_word_set.iloc[idx]:
                if df_dic.get(word) is None:
                    df_dic[word] = 0
                df_dic[word] += 1
        
        self.df_dic = df_dic
        return self
    

    def make_movie_prob_distribution(self, df_floor):
        self.morphs_df.loc[:, 'morphs'] = self.morphs_df['morphs'].map(
            lambda x: [word for word in x if self.df_dic.get(word) >= df_floor]
        )

        self.morphs_df.loc[:, 'prob_dist'] = self.morphs_df['morphs'].map(
            lambda morphs: self._tf_to_prob_dist(self._make_tf_dic(morphs))
        )

        self.morphs_df = self.morphs_df.drop(columns=['morphs'])

        self.morphs_df = self.morphs_df.groupby(self.morphs_df.index).agg(
            {'prob_dist': lambda col: self._agg_to_movie(col)}
        )

        self.morphs_df = pd.DataFrame(
            data=self.morphs_df['prob_dist'].to_list(),
            index=self.morphs_df.index
        ).fillna(0)

        return self


    def extract_cluster_keywords(self, keyword_length):
        uni_entropy_series = self.morphs_df.apply(lambda x: entropy(x))
        uni_entropy_series = uni_entropy_series.sort_values(ascending=False)

        cluster_entropy_dic = self._make_cluster_entropy_dic(self.cluster_dic, self.morphs_df)
        cluster_score_dic = self._make_cluster_score_dic(cluster_entropy_dic, uni_entropy_series)
        self.cluster_keyword_dic = self._make_cluster_keyword_dic(cluster_score_dic, keyword_length)

        return self


    def tag_keywords(self):
        cluster_rec = dict()
        for keywords in self.cluster_keyword_dic:
            rec = self.cluster_rec[self.cluster_keyword_dic[keywords]]
            cluster_rec[keywords] = rec
        
        self.cluster_rec = cluster_rec
        return self


    def upload_cluster_rec(self):
        self.s3_conn.upload_to_s3_byte(self.cluster_rec, config['AWS']['S3_BUCKET'], config['REC']['FRONT_CLUSTER'])


    def _make_tf_dic(self, words):
        tf_dic = dict()
        for word in words:
            tf_dic = self._update_tf_dic(word, tf_dic)
        return tf_dic


    def _update_tf_dic(self, word, dic):
        if dic.get(word) is None:
            dic[word] = 0
        dic[word] += 1
        return dic


    def _tf_to_prob_dist(self, dic):
        total_freq = sum(dic.values())
        return {key: (item / total_freq) for key, item in dic.items()}


    def _agg_to_movie(self, col):
        size = len(col)
        agg_dic = dict()
        for dic in col:
            agg_dic.update(dic)
        
        return {key: (value / size) for key, value in agg_dic.items()}

    
    def _make_cluster_entropy_dic(self, cluster_dic, morphs):
        cluster_entropy_dic = dict()
        for key, entry in cluster_dic.items():
            cluster_morphs = morphs[morphs.index.isin(entry)]
            entropy_series = cluster_morphs.apply(lambda x: entropy(x))
            cluster_entropy_dic[key] = entropy_series
        return cluster_entropy_dic


    def _make_cluster_score_dic(self, cluster_entropy_dic, uni_entropy_series):
        score_dic = dict()
        for key, ent in cluster_entropy_dic.items():
            score_dic[key] = (ent / uni_entropy_series).replace([np.inf, -np.inf], np.nan).dropna()
        return score_dic

    
    def _make_cluster_keyword_dic(self, cluster_score_dic, keyword_length):
        cluster_keyword_dic = dict()
        for key, keyword_series in cluster_score_dic.items():
            keyword_li = keyword_series.sort_values(ascending=False)[:keyword_length].index.to_list()
            keyword_li = [f'#{keyword}' for keyword in keyword_li]
            keywords = ' '.join(keyword_li)
            cluster_keyword_dic[keywords] = key
        return cluster_keyword_dic
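
A compact, self-contained sketch of the scoring idea behind extract_cluster_keywords: each word's probability mass across movies is reduced to an entropy value, once over the whole corpus and once inside a single cluster, and the ratio ranks keyword candidates. The DataFrame below is toy data; only the entropy calls and the ratio mirror the class above.

import numpy as np
import pandas as pd
from scipy.stats import entropy

# per-movie word probability distributions (rows = movies, columns = words)
morphs_df = pd.DataFrame(
    {'scary': [0.0, 0.0, 0.7, 0.6],
     'funny': [0.6, 0.5, 0.0, 0.1],
     'long':  [0.4, 0.5, 0.3, 0.3]},
    index=['m1', 'm2', 'm3', 'm4'])

# word entropy over the whole corpus vs. within one cluster (movies m3 and m4)
uni_entropy = morphs_df.apply(entropy)
cluster_entropy = morphs_df.loc[['m3', 'm4']].apply(entropy)

# a ratio close to 1 means the word's mass sits almost entirely inside the
# cluster, so the highest-ratio words become that cluster's '#keyword' tags
score = (cluster_entropy / uni_entropy).replace([np.inf, -np.inf], np.nan).dropna()
print(score.sort_values(ascending=False))  # scary > long > funny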
Code example #14
class WordEmbeddingModel:
    def __init__(self, all):
        self.mongo_conn = MongoConnector()
        self.s3_conn = S3Connector()
        self.all = all
        self.n_processes = (mp.cpu_count() // 2) - 1
        self.model_params = config['MODEL']['FASTTEXT_PARAMS']
        self.morphs_df = None
        self.model = None

        if self.all:
            logger.info('using all sentences')
            print('using all sentences')
        logger.info(f'using {self.n_processes} cores')
        print(f'using {self.n_processes} cores')


    def get_morphs(self):
        try:
            if self.all:
                morphs = self.mongo_conn.user_review_morphs.find()
            else:
                morphs = self.mongo_conn.user_review_morphs.find({'fasttext_trained': {'$in': [None, False]}})
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()
        
        df = pd.DataFrame(morphs)
        logger.info(f'got {len(df)} reviews.')
        print(f'got {len(df)} reviews.')

        self.document_updated = len(df) > 0
        self.morphs_df = df


    def load_trained_model(self):
        if self.all:
            return

        try:
            model = self.s3_conn.load_from_s3_byte(config['AWS']['S3_BUCKET'], config['MODEL']['MODEL_PATH'])
            logger.info('model loaded')
        except Exception:
            model = None
        
        if self._validate_model(model):
            self.model = model


    def _validate_model(self, model):
        return type(model) == gensim.models.fasttext.FastText


    def build_model(self):
        sentences = self.morphs_df['morphs']
        if len(sentences) == 0:
            return
        model = self.model

        if not model:
            logger.info('building new model.')
            logger.info(f'model params: {self.model_params}')

            model = FastText(
                vector_size=self.model_params['VECTOR_SIZE'], 
                window=self.model_params['WINDOW'], 
                sg=self.model_params['SG'],
                negative=self.model_params['NEGATIVE'],
                ns_exponent=self.model_params['NS_EXPONENT'],
                sample=self.model_params['SAMPLE'],
                min_n=self.model_params['MIN_N'],
                max_n=self.model_params['MAX_N'],
                min_count=self.model_params['MIN_COUNT'],
                bucket=self.model_params['BUCKET'],
                workers=self.n_processes
            )

            model.build_vocab(corpus_iterable=sentences)

        else:
            model.build_vocab(
                corpus_iterable=sentences,
                update=True
            )

        model.train(
            corpus_iterable=sentences, 
            total_examples=len(sentences), 
            epochs=self.model_params['EPOCHS']
        )

        self.model = model
        print('train model finished')


    def label_morphs_collection(self):
        morphs = self.mongo_conn.user_review_morphs

        try:
            for idx in range(len(self.morphs_df)):
                row = self.morphs_df.iloc[idx]
                morphs.update_one({'_id': row['_id']}, {'$set': {'fasttext_trained': True}})
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()
        
        print('label finished')
    
    
    def upload_file_to_s3(self):
        self.s3_conn.upload_to_s3_byte(self.model, config['AWS']['S3_BUCKET'], config['MODEL']['MODEL_PATH'])       
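
build_model above either trains a new FastText model or keeps training a previously loaded one. A minimal gensim sketch of both paths; the sentences and parameter values are placeholders, not the project's FASTTEXT_PARAMS.

from gensim.models import FastText

sentences = [['재미', '있는', '영화'], ['배우', '연기', '좋다'], ['스토리', '지루하다']]

# first run: no saved model, so build the vocab from scratch and train
model = FastText(vector_size=32, window=3, min_count=1, sg=1, workers=2)
model.build_vocab(corpus_iterable=sentences)
model.train(corpus_iterable=sentences, total_examples=len(sentences), epochs=5)

# later run: extend the vocab with new reviews and continue training
new_sentences = [['음악', '좋다'], ['결말', '허무하다']]
model.build_vocab(corpus_iterable=new_sentences, update=True)
model.train(corpus_iterable=new_sentences, total_examples=len(new_sentences), epochs=5)

print(model.wv.most_similar('영화', topn=3))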
Code example #15
    def __init__(self):
        self.mongo_conn = MongoConnector()
        self.s3_conn = S3Connector()
        logger.info('inner class test')
Code example #16
class Rec:
    def __init__(self, cores):
        self.mongo_conn = MongoConnector()
        self.s3_conn = S3Connector()
        self.n_processes = self._set_n_processes(cores)
        self.movies_df = self._load_movies_df()
        self.makers_df = self._load_makers_df()
        self.cluster_df = self._load_cluster_df()
        self.movie_vectors = self._load_movie_vectors()

        try:
            mp.set_start_method('spawn')
        except RuntimeError:
            pass

    def make_newest_rec(self, n):
        newest_df = self.movies_df.sort_values(by='release_date',
                                               ascending=False)
        newest_df = newest_df[['movie_id', 'poster_url']][:n]
        newest_rec = {'newest_rec': newest_df.to_dict('records')}

        self.s3_conn.upload_to_s3_byte(newest_rec, config['AWS']['S3_BUCKET'],
                                       config['REC']['FRONT_NEWEST'])

    def make_cluster_rec(self, n):
        cluster_movie_df = pd.merge(self.movie_vectors,
                                    self.movies_df,
                                    on='movie_id',
                                    sort=False,
                                    validate='one_to_one')
        cluster_movie_df = cluster_movie_df[[
            'cluster', 'movie_id', 'vector', 'poster_url', 'avg_rate',
            'review_count'
        ]]

        # filter by conditions (rating, review count)
        cluster_movie_df = cluster_movie_df[
            (cluster_movie_df['review_count'] >= 50)
            & (cluster_movie_df['avg_rate'] >= 8)]

        # drop small clusters
        clusters = cluster_movie_df['cluster'].value_counts()
        clusters = clusters[clusters > 25]
        clusters = list(clusters.index)
        cluster_movie_df = cluster_movie_df[cluster_movie_df['cluster'].isin(
            clusters)]
        '''
        # group clusters
        cluster_df = self.cluster_df[self.cluster_df.index.isin(clusters)].copy()
        X = Normalizer().fit_transform(list(cluster_df['vector']))

        def train_kmeans_model(X, k):
            model = KMeans(
                init='k-means++', 
                n_clusters=k, 
                max_iter=10000, 
                tol=1e-12
            ).fit(X)
            
            return model

        kmeans = train_kmeans_model(X, 20)
        cluster_df.loc[:, 'cluster_set'] = kmeans.labels_

        # pick one cluster per cluster set
        clusters = list(cluster_df.groupby('cluster_set').sample(n=1).index)
        '''

        # sort by distance to the cluster centroid
        cluster_movie_df = pd.merge(cluster_movie_df,
                                    self.cluster_df,
                                    left_on='cluster',
                                    right_index=True,
                                    copy=False,
                                    validate='many_to_one')
        cluster_movie_df.loc[:, 'vector_y'] = cluster_movie_df['vector_y'].map(
            lambda x: list(x))
        cluster_movie_df.loc[:, 'dist'] = cluster_movie_df.apply(
            lambda x: cosine(x.vector_x, x.vector_y), axis=1)
        cluster_movie_df = cluster_movie_df[['movie_id', 'cluster',
                                             'dist']].reset_index()

        cluster_rec = dict()
        for cluster in clusters:
            cluster_rec[cluster] = cluster_movie_df[cluster_movie_df['cluster']
                                                    == cluster][[
                                                        'movie_id'
                                                    ]][:n].to_dict('records')

        self.s3_conn.upload_to_s3_byte(cluster_rec, config['AWS']['S3_BUCKET'],
                                       config['REC']['FRONT_CLUSTER'])

    def make_genre_rec(self, n):
        genres = self.movies_df['genre'].explode().value_counts()
        genres = genres[genres >= n]
        genres = list(genres.index)
        genre_df = self.movies_df[['movie_id', 'poster_url', 'genre']]
        rows = genre_df.to_dict('records')

        # expand multi-genre entries into one row per genre
        genre_rows = []
        for row in rows:
            for genre in row['genre']:
                genre_rows.append({
                    'movie_id': row['movie_id'],
                    'poster_url': row['poster_url'],
                    'genre': genre
                })
        genre_df = pd.DataFrame(genre_rows)

        # random shuffle
        genre_df = genre_df.sample(frac=1)

        # group by genre
        genre_rec = dict()
        for genre in genres:
            genre_rec[genre] = genre_df[genre_df['genre'] == genre][[
                'movie_id', 'poster_url'
            ]][:n].to_dict('records')

        self.s3_conn.upload_to_s3_byte(genre_rec, config['AWS']['S3_BUCKET'],
                                       config['REC']['FRONT_GENRE'])

    def make_actor_rec(self, n):
        movie_ids = list(self.movies_df['movie_id'])
        movies_df = self.movies_df.set_index('movie_id').sort_index()
        makers_df = pd.merge(self.makers_df,
                             self.movies_df[['movie_id']],
                             on='movie_id',
                             validate='many_to_one')
        makers_df = makers_df.set_index(['movie_id', 'maker_id']).sort_index()
        actor_rec = dict()

        for movie_id in movie_ids:
            # actors who appeared in this movie
            main_actor_ids = movies_df.loc[movie_id]['main_actor_ids']

            # filmography data per actor
            actors_df = makers_df[
                ~makers_df.index.isin([movie_id], level='movie_id')
                & makers_df.index.isin(main_actor_ids, level='maker_id')]
            actors_df.reset_index(inplace=True)

            # order by main-actor billing - not used
            '''
            actors_df['maker_id'] = pd.Categorical(
                actors_df['maker_id'],
                categories=main_actor_ids,
                ordered=True
            )
            '''

            # sort by release date
            actors_df = actors_df.sort_values(by=['release_date'],
                                              ascending=[False])

            # drop duplicate movies
            actors_df = actors_df.drop_duplicates(subset=['movie_id'])

            # result
            rec = actors_df[['movie_id', 'poster_url']][:n].to_dict('records')
            actor_rec[movie_id] = rec

        self.s3_conn.upload_to_s3_byte(actor_rec, config['AWS']['S3_BUCKET'],
                                       config['REC']['DETAIL_ACTOR'])

    def make_director_rec(self, n):
        movie_ids = list(self.movies_df['movie_id'])
        makers_df = pd.merge(self.makers_df,
                             self.movies_df[['movie_id']],
                             on='movie_id',
                             validate='many_to_one')
        makers_df = makers_df.set_index(['movie_id', 'maker_id',
                                         'role']).sort_index()
        director_rec = dict()

        for movie_id in movie_ids:
            # directors and writers
            directors = makers_df[
                makers_df.index.isin([movie_id], level='movie_id')
                & makers_df.index.isin(['director', 'writer'],
                                       level='role')].index.get_level_values(
                                           'maker_id').to_list()

            # filmography data per director and writer
            directors_df = makers_df[
                ~makers_df.index.isin([movie_id], level='movie_id')
                & makers_df.index.isin(directors, level='maker_id')]
            directors_df.reset_index(inplace=True)

            # sort by release date
            directors_df = directors_df.sort_values(by=['release_date'],
                                                    ascending=[False])

            # drop duplicate movies
            directors_df = directors_df.drop_duplicates(subset=['movie_id'])

            # result
            rec = directors_df[:n][['movie_id',
                                    'poster_url']].to_dict('records')
            director_rec[movie_id] = rec

        self.s3_conn.upload_to_s3_byte(director_rec,
                                       config['AWS']['S3_BUCKET'],
                                       config['REC']['DETAIL_DIRECTOR'])

    def make_similar_rec(self, n):
        similar_rec_df = pd.merge(
            self.movie_vectors,
            self.movies_df[self.movies_df['review_count'] >= 30][[
                'movie_id', 'poster_url'
            ]],
            on='movie_id',
            sort=False,
            validate='one_to_one')
        movie_ids = list(self.movie_vectors.index)

        similar_rec = dict()
        for movie_id in movie_ids:
            movie = self.movie_vectors.loc[movie_id]
            # compute distances to the cluster centroids, closest first
            cluster_distances = self.cluster_df['vector'].map(lambda x: cosine(
                x, movie['vector'])).sort_values(ascending=True)
            clusters = list(cluster_distances.index)

            # add each cluster's movies to the candidate list until the target count is reached
            df_li = []
            movie_count = 0
            for cluster in clusters:
                tmp_df = similar_rec_df[similar_rec_df['cluster'] == cluster]
                df_li.append(tmp_df)
                movie_count += len(tmp_df)
                if movie_count >= 100:
                    break

            similar_df = pd.concat(df_li)

            # sort by similarity
            distances = similar_df['vector'].map(
                lambda x: cosine(x, movie['vector']))
            similar_df = similar_df.assign(distance=distances).sort_values(
                by='distance')

            # exclude the movie itself
            similar_df = similar_df[similar_df['movie_id'] != movie.name]

            # keep top n
            similar_df = similar_df[:n]

            # result
            rec = similar_df[['movie_id', 'poster_url']].to_dict('records')
            similar_rec[movie_id] = rec

        self.s3_conn.upload_to_s3_byte(similar_rec, config['AWS']['S3_BUCKET'],
                                       config['REC']['DETAIL_SIMILAR'])

    def _set_n_processes(self, cores):
        if (not cores) or (cores >= (mp.cpu_count() // 2)):
            return (mp.cpu_count() // 2) - 1
        return cores

    def _load_movies_df(self):
        movies_df = self.s3_conn.load_from_s3_byte(
            config['AWS']['S3_BUCKET'], config['DATA']['MOVIE_INFO'])
        return movies_df

    def _load_makers_df(self):
        makers_df = pd.DataFrame(self.mongo_conn.makers.find())
        self.mongo_conn.close()
        # makers_df = makers_df[makers_df['movie_id'].isin(self.movies_df['movie_id'])]
        makers_df = makers_df.rename(
            columns={'movie_poster_url': 'poster_url'})
        return makers_df

    def _load_cluster_df(self):
        cluster_df = self.s3_conn.load_from_s3_byte(
            config['AWS']['S3_BUCKET'], config['MODEL']['CLUSTER_PATH'])
        return cluster_df

    def _load_movie_vectors(self):
        movie_vectors = self.s3_conn.load_from_s3_byte(
            config['AWS']['S3_BUCKET'], config['MODEL']['MOVIE_VECTORS_PATH'])
        movie_vectors = movie_vectors[movie_vectors.index.isin(
            self.movies_df['movie_id'])]
        return movie_vectors
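
Several Rec methods rank movies by cosine distance to a cluster centroid (make_cluster_rec, make_similar_rec). A small self-contained sketch of that ranking step, with toy two-dimensional vectors in place of the FastText-derived ones:

import pandas as pd
from scipy.spatial.distance import cosine

# toy cluster centroids
cluster_df = pd.DataFrame({'vector': [[1.0, 0.0], [0.0, 1.0], [0.7, 0.7]]},
                          index=['c0', 'c1', 'c2'])
movie_vector = [0.9, 0.1]

# distance to each centroid, closest cluster first
distances = cluster_df['vector'].map(lambda v: cosine(v, movie_vector)).sort_values()
print(distances)  # c0 closest, then c2, then c1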
Code example #17
class MorphExtractor:
    def __init__(self, pos_chunk, pos_target, all):
        self.mongo_conn = MongoConnector()
        self.pos_chunk = pos_chunk
        self.pos_target = pos_target

        if all:
            print('reset morph data')
            self._reset_morph_data()

        self._set_iter_count()

        print(pos_chunk)
        print(pos_target)

    def _reset_morph_data(self):
        tokens = self.mongo_conn.user_review_tokens_okt
        morphs = self.mongo_conn.user_review_morphs_okt
        okt_adjective_stat = self.mongo_conn.okt_adjective_stat

        try:
            morphs.drop()
            okt_adjective_stat.drop()
            tokens.update_many({}, {'$set': {'morphed': False}})
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()

    def _set_iter_count(self):
        tokens = self.mongo_conn.user_review_tokens_okt

        try:
            rows = tokens.count_documents({'morphed': {'$in': [None, False]}})
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()
        self.iter = (rows // self.pos_chunk + 1) if rows > 0 else 0

    def get_pos(self):
        tokens = self.mongo_conn.user_review_tokens_okt

        try:
            pos = tokens.find({'morphed': {
                '$in': [None, False]
            }})[:self.pos_chunk]
            pos_df = pd.DataFrame(pos)[['_id', 'movie_id', 'tokens', 'rate']]
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()

        print(len(pos_df))
        print(pos_df.iloc[0])
        self.pos_df = pos_df

    def get_morphs(self):
        self.pos_df.loc[:, 'nouns'] = self.pos_df['tokens'].map(
            lambda x: [pos[0] for pos in x if pos[1] == 'Noun'])
        self.pos_df.loc[:, 'adjectives'] = self.pos_df['tokens'].map(
            lambda x: [pos[0] for pos in x if pos[1] == 'Adjective'])

        morph_df = self.pos_df.drop(columns=['tokens'])
        self.morph_df = morph_df

    def save_morphs(self):
        tokens = self.mongo_conn.user_review_tokens_okt
        morphs = self.mongo_conn.user_review_morphs_okt
        okt_adjective_stat = self.mongo_conn.okt_adjective_stat

        morphs_dict = self.morph_df.to_dict('records')
        comment_count = len(morphs_dict)
        try:
            for doc in morphs_dict:
                for adj in doc['adjectives']:
                    okt_adjective_stat.update_one({'_id': adj}, {
                        '$set': {
                            '_id': adj
                        },
                        '$inc': {
                            'count': 1
                        }
                    },
                                                  upsert=True)
                morphs.replace_one({'_id': doc['_id']}, doc, upsert=True)
                tokens.update_one({'_id': doc['_id']},
                                  {'$set': {
                                      'morphed': True
                                  }})
            logger.info(f'{comment_count} comments are morphed.')
            print(f'{comment_count} comments are morphed.')
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()

    def _target_row_count(self):
        pos_path = self.mongo_conn.user_review_tokens_okt

        try:
            doc_count = pos_path.count_documents({})
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()

        return doc_count
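
get_morphs above keeps only Noun and Adjective tokens out of the (surface, tag) pairs saved by the tokenizer. A tiny illustration with a hand-written token list shaped like Okt pos() output:

import pandas as pd

pos_df = pd.DataFrame({
    'movie_id': ['m1'],
    'tokens': [[('영화', 'Noun'), ('정말', 'Adverb'), ('재미있다', 'Adjective')]],
})

pos_df['nouns'] = pos_df['tokens'].map(lambda x: [w for w, t in x if t == 'Noun'])
pos_df['adjectives'] = pos_df['tokens'].map(lambda x: [w for w, t in x if t == 'Adjective'])
print(pos_df.drop(columns=['tokens']).iloc[0].to_dict())
# {'movie_id': 'm1', 'nouns': ['영화'], 'adjectives': ['재미있다']}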
Code example #18
class MorphExtractor:
    def __init__(self, pos_chunk, pos_target, all):
        self.mongo_conn = MongoConnector()
        self.pos_chunk = pos_chunk
        self.pos_target = pos_target
        self.all = all

        print(pos_chunk)
        print(pos_target)

        if self.all:
            logger.info('get morphs from entire corpus. dropping morphs collection.')
            self._drop_morphs()

        self._set_iter_count()


    def _drop_morphs(self):
        tokens = self.mongo_conn.user_review_tokens
        morphs = self.mongo_conn.user_review_morphs

        try:
            morphs.drop()
            morphs.create_index('movie_id')
            tokens.update_many({}, {'$set': {'morphed': False}})
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()


    def _set_iter_count(self):
        tokens = self.mongo_conn.user_review_tokens

        try:
            rows = tokens.count_documents({'morphed': {'$in': [None, False]}})
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()
        self.iter = (rows // self.pos_chunk + 1) if rows > 0 else 0

    
    def get_pos(self):
        tokens = self.mongo_conn.user_review_tokens
        
        try:
            pos = tokens.find({'morphed': {'$in': [None, False]}})[:self.pos_chunk]
            pos_df = pd.DataFrame(pos)[['_id', 'movie_id', 'tokens']]
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()

        print(len(pos_df))
        print(pos_df.iloc[0])
        self.pos_df = pos_df


    def get_morphs(self):
        if self.pos_target is None:
            self.pos_df.loc[:, 'morphs'] = self.pos_df['tokens'].map(lambda x: [pos[0] for pos in x])
        else:
            self.pos_df.loc[:, 'morphs'] = self.pos_df['tokens'].map(lambda x: [pos[0] for pos in x if pos[1] in self.pos_target])
        morph_df = self.pos_df.drop(columns=['tokens'])
        
        self.morph_df = morph_df


    def save_morphs(self):
        tokens = self.mongo_conn.user_review_tokens
        morphs = self.mongo_conn.user_review_morphs
        
        morphs_dict = self.morph_df.to_dict('records')
        try:
            for doc in morphs_dict:
                tokens.update_one({'_id': doc['_id']}, {'$set': {'morphed': True}})
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()
        
        self.morph_df = self.morph_df[self.morph_df['morphs'].astype('str') != '[]']
        morphs_dict = self.morph_df.to_dict('records')
        comment_count = len(morphs_dict)
        try:
            for doc in morphs_dict:
                morphs.replace_one({'_id': doc['_id']}, doc, upsert=True)
            logger.info(f'{comment_count} comments are morphed.')
            print(f'{comment_count} comments are morphed.')
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()
        

    def _target_row_count(self):
        pos_path = self.mongo_conn.user_review_tokens

        try:
            doc_count = pos_path.count_documents({})
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()
        
        return doc_count
Code example #19
class Tokenizer(object):
    def __init__(self, cores, rows):
        self.mongo_conn = MongoConnector()
        self.n_processes = self._set_n_processes(cores)
        self.rows = rows

        logger.info(f'using {self.n_processes} cores')
        print(f'using {self.n_processes} cores')

    def get_reviews(self):
        user_reviews = self.mongo_conn.user_reviews

        try:
            reviews = user_reviews.find(
                {'tokenized_okt': {
                    '$in': [None, False]
                }})
            if self.rows != 0:
                reviews = reviews[:self.rows]
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()

        df = pd.DataFrame(reviews)[['_id', 'movie_id', 'review', 'rate']]

        logger.info(f'got {len(df)} reviews.')
        print(f'got {len(df)} reviews.')

        self._split_df(df)

    def tokenize(self):
        try:
            mp.set_start_method('spawn')
        except RuntimeError:
            pass

        with mp.Pool(processes=self.n_processes, maxtasksperchild=1) as p:
            for idx, chunk in enumerate(self.reviews_df_split):
                print(f'chunk {idx}: size {len(chunk)}')
                df_job = np.array_split(chunk, self.n_processes)
                print('getting tokens')
                df_job = p.map(_add_tokens, df_job)
                print('filtering empty rows')
                df_job = p.map(_filter_empty_token_row, df_job)
                print('dropping columns')
                func = partial(_drop_columns, ['review'])
                df_job = p.map(func, df_job)
                print('concatting chunk')
                tokens_df_chunk = pd.concat(df_job)
                print('updating result')
                self.reviews_df_split[idx] = tokens_df_chunk

        tokens_df = pd.concat(self.reviews_df_split)
        del self.reviews_df_split
        self.tokens_df = tokens_df

    def save_tokens(self):
        review_tokens = self.mongo_conn.user_review_tokens_okt
        user_reviews = self.mongo_conn.user_reviews

        tokens_li = self.tokens_df.to_dict('records')
        try:
            for tokens in tokens_li:
                if tokens['tokens'] != []:
                    review_tokens.replace_one({'_id': tokens['_id']},
                                              tokens,
                                              upsert=True)
                user_reviews.update_one({'_id': tokens['_id']},
                                        {'$set': {
                                            'tokenized_okt': True
                                        }})
            logger.info(f'{len(tokens_li)} comments are tokenized.')
            print(f'{len(tokens_li)} comments are tokenized.')
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()

    def _set_n_processes(self, cores):
        if (not cores) or (cores >= (mp.cpu_count() // 2)):
            return (mp.cpu_count() // 2) - 1
        return cores

    def _split_df(self, df):
        split_to = len(df) // 500000 + 1
        self.reviews_df_split = np.array_split(df, split_to)

        logger.info(f'split into {split_to} chunks')
        print(f'split into {split_to} chunks')
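
tokenize() above fans each review chunk out over a process pool and concatenates the partial results. A stripped-down, runnable sketch of that split / map / concat pattern, using a whitespace split as a stand-in for the real _add_tokens worker:

import multiprocessing as mp

import numpy as np
import pandas as pd

def _add_tokens(df):
    # placeholder for the Okt-based tokenizer: split each review on whitespace
    df = df.copy()
    df['tokens'] = df['review'].str.split()
    return df

def main():
    reviews_df = pd.DataFrame({'review': [f'review text {i}' for i in range(8)]})
    n_processes = 2

    with mp.Pool(processes=n_processes, maxtasksperchild=1) as p:
        df_job = np.array_split(reviews_df, n_processes)  # one slice per worker
        df_job = p.map(_add_tokens, df_job)               # tokenize in parallel
        tokens_df = pd.concat(df_job)

    print(tokens_df)

if __name__ == '__main__':
    main()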
Code example #20
class MorphPostProcessor(object):
    def __init__(self, pos_chunk, pos_target, all):
        self.mongo_conn = MongoConnector()
        self.pos_chunk = pos_chunk
        self.pos_target = pos_target
        self.adj_converter = self._load_adj_converter()
        self._set_iter_count()

        if all:
            print('reset update_checker')
            self._reset_update_checker()

    def _reset_update_checker(self):
        morphs = self.mongo_conn.user_review_morphs_okt

        try:
            morphs.update_many({}, {'$set': {'adj_converted': False}})
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()

    def _load_adj_converter(self):
        with open('processors/adj_converter.json', 'rb') as f:
            adj_converter = json.load(f)
        return adj_converter

    def _set_iter_count(self):
        morphs = self.mongo_conn.user_review_morphs_okt

        try:
            rows = morphs.count_documents(
                {'adj_converted': {
                    '$in': [None, False]
                }})
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()
        self.iter = (rows // self.pos_chunk + 1) if rows > 0 else 0

    def get_adjs(self):
        morphs = self.mongo_conn.user_review_morphs_okt

        try:
            adjs = morphs.find({'adj_converted': {
                '$in': [None, False]
            }}, {
                '_id': 1,
                'adjectives': 1
            })[:self.pos_chunk]
            adj_df = pd.DataFrame(adjs)
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()

        print(len(adj_df))
        print(adj_df.iloc[0])
        self.adj_df = adj_df

        return self

    def convert_adjs(self):
        self.adj_df.loc[:, 'adjectives'] = self.adj_df['adjectives'].map(
            lambda x: self._convert_adjs(x))
        return self

    def _convert_adjs(self, adjs):
        return [
            self._convert_adj(adj) for adj in adjs
            if self._convert_adj(adj) is not None
        ]

    def _convert_adj(self, adj):
        if adj in self.adj_converter['stopwords']:
            return
        if re.match('.*하다$', adj):
            return self.adj_converter['converter_hada'].get(
                adj) or f'{adj[:-2]}한'
        return self.adj_converter['converter'].get(adj) or adj

    def save_adjs(self):
        morphs = self.mongo_conn.user_review_morphs_okt

        try:
            for idx in range(len(self.adj_df)):
                doc = self.adj_df.iloc[idx]
                morphs.update_one({'_id': doc['_id']}, {
                    '$set': {
                        'adjectives': doc['adjectives'],
                        'adj_converted': True
                    }
                })
            print(f'{len(self.adj_df)} docs are converted.')
        except Exception as e:
            logger.error(e)
        finally:
            self.mongo_conn.close()
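
_convert_adj above normalizes Okt adjectives: stopwords are dropped, '...하다' forms are rewritten to '...한', and everything else goes through a lookup table. A self-contained sketch with tiny placeholder dictionaries standing in for processors/adj_converter.json:

import re

adj_converter = {
    'stopwords': ['그렇다'],
    'converter_hada': {'심심하다': '심심한'},
    'converter': {'멋지다': '멋진'},
}

def convert_adj(adj):
    if adj in adj_converter['stopwords']:
        return None
    if re.match('.*하다$', adj):
        # explicit mapping first, otherwise the generic '하다' -> '한' rewrite
        return adj_converter['converter_hada'].get(adj) or f'{adj[:-2]}한'
    return adj_converter['converter'].get(adj) or adj

print([convert_adj(a) for a in ['행복하다', '심심하다', '멋지다', '그렇다']])
# ['행복한', '심심한', '멋진', None]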
Code example #21
File: search_rec.py Project: cornandme/mirror-movie
def main():
    mongo_conn = MongoConnector()
    s3_conn = S3Connector()

    def _load_movies_df():
        movies_df = s3_conn.load_from_s3_byte(config['AWS']['S3_BUCKET'],
                                              config['DATA']['MOVIE_INFO'])
        return movies_df

    def _load_makers_df():
        makers_df = pd.DataFrame(mongo_conn.makers.find())
        mongo_conn.close()

        makers_df = pd.merge(makers_df,
                             movies_df[['movie_id', 'review_count']],
                             on='movie_id',
                             validate='many_to_one')
        roles = ['actor_main', 'director', 'writer', 'actor_sub']
        makers_df['role'] = pd.Categorical(makers_df['role'],
                                           categories=roles,
                                           ordered=True)
        makers_df = makers_df.rename(
            columns={'movie_poster_url': 'poster_url'})
        return makers_df

    def _load_cluster_df():
        cluster_df = s3_conn.load_from_s3_byte(config['AWS']['S3_BUCKET'],
                                               config['MODEL']['CLUSTER_PATH'])
        return cluster_df

    def _load_movie_vectors():
        movie_vectors = s3_conn.load_from_s3_byte(
            config['AWS']['S3_BUCKET'], config['MODEL']['MOVIE_VECTORS_PATH'])
        movie_vectors = movie_vectors[movie_vectors.index.isin(
            movies_df['movie_id'])]
        return movie_vectors

    movies_df = _load_movies_df()
    makers_df = _load_makers_df()
    cluster_df = _load_cluster_df()
    movie_vectors = _load_movie_vectors()

    # sort by review count, descending
    movies_df = movies_df.sort_values(by=['review_count'], ascending=False)

    # sort by role, then by review count descending
    makers_df = makers_df.sort_values(by=['role', 'review_count'],
                                      ascending=[True, False])

    # subword hash
    def generate_hash(names):
        dic = dict()
        for name in names:
            if len(name) == 1:
                dic = _update_name_dic(name, name, dic)
                continue

            name_split = name.split(' ')
            name_split.append(name.replace(' ', ''))
            name_split.append(name)
            for word in name_split:
                length = len(word)
                if length < 2:
                    continue
                for i in range(2, length + 1):
                    subword = word[:i]
                    dic = _update_name_dic(name, subword, dic)

        for key in dic.keys():
            dic[key] = get_unique_ordered_list(dic.get(key))
        return dic

    def _update_name_dic(name, word, dic):
        if dic.get(word) is None:
            dic[word] = []
        dic[word].append(name)
        return dic

    def get_unique_ordered_list(li):
        seen = set()
        return [x for x in li if not (x in seen or seen.add(x))]

    # titles
    movie_names_kor = movies_df['title_kor']
    movie_names_hash = generate_hash(movie_names_kor)

    # people
    maker_names = makers_df['name']
    maker_names_hash = generate_hash(maker_names)

    # genres
    genre_names = set(flatten(movies_df['genre']))
    genre_hash = generate_hash(genre_names)

    # nations
    nation_names = set(flatten(movies_df['nations']))
    nation_names_hash = generate_hash(nation_names)

    # merge
    subword_hash = dict()
    subword_hash['movie_name'] = movie_names_hash
    subword_hash['maker'] = maker_names_hash
    subword_hash['genre'] = genre_hash
    subword_hash['nation'] = nation_names_hash

    # name-to-movie-id hash
    def generate_name_id_hash(names, ids):
        dic = dict()
        for i in range(len(names)):
            if dic.get(names[i]) is None:
                dic[names[i]] = []
            dic[names[i]].append(ids[i])
        for key in dic.keys():
            dic[key] = get_unique_ordered_list(dic.get(key))
        return dic

    # titles
    movie_names = list(movies_df['title_kor'])
    movie_ids = list(movies_df['movie_id'])
    movie_name_id_hash = generate_name_id_hash(movie_names, movie_ids)

    # names
    maker_names = list(makers_df['name'])
    maker_ids = list(makers_df['movie_id'])
    maker_id_hash = generate_name_id_hash(maker_names, maker_ids)

    # genres
    ex_movies_df = movies_df[['movie_id', 'genre']].explode('genre')
    genres = list(ex_movies_df['genre'])
    genre_ids = list(ex_movies_df['movie_id'])
    genre_id_hash = generate_name_id_hash(genres, genre_ids)

    # nations
    ex_movies_df = movies_df[['movie_id', 'nations']].explode('nations')
    nations = list(ex_movies_df['nations'])
    nation_ids = list(ex_movies_df['movie_id'])
    nation_id_hash = generate_name_id_hash(nations, nation_ids)

    # merge
    name_id_hash = dict()
    name_id_hash['movie_name'] = movie_name_id_hash
    name_id_hash['maker'] = maker_id_hash
    name_id_hash['genre'] = genre_id_hash
    name_id_hash['nation'] = nation_id_hash

    # unload
    s3_conn.upload_to_s3_byte(subword_hash, config['AWS']['S3_BUCKET'],
                              config['DATA']['SUBWORD_HASH'])
    s3_conn.upload_to_s3_byte(name_id_hash, config['AWS']['S3_BUCKET'],
                              config['DATA']['NAME_ID_HASH'])
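
generate_hash above maps every prefix of length two or more of a title or name, including its space-stripped form, back to the full name, so partial search queries resolve with a single dictionary lookup. A condensed, runnable version of the same idea with the deduplication folded in; the sample titles are arbitrary:

def generate_prefix_hash(names):
    dic = {}
    for name in names:
        parts = name.split(' ') + [name.replace(' ', ''), name]
        for word in parts:
            for i in range(2, len(word) + 1):
                dic.setdefault(word[:i], []).append(name)
    # keep each name once per prefix, preserving insertion order
    return {key: list(dict.fromkeys(value)) for key, value in dic.items()}

subword_hash = generate_prefix_hash(['기생충', '올드 보이'])
print(subword_hash['기생'])  # ['기생충']
print(subword_hash['올드'])  # ['올드 보이']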
Code example #22
class MovieVectorProcessor:
    def __init__(self, chunk):
        self.mongo_conn = MongoConnector()
        self.s3_conn = S3Connector()
        self.chunk = chunk
        self.n_processes = (mp.cpu_count() // 2) - 1
        self.movie_id_q = Queue()
        self.model = None
        self.movie_vector_li = []
        self.morphs_df = None
        self.movie_vectors = None

        self._get_movie_ids()
        self._load_trained_model()

        logger.info(f'using {self.n_processes} cores')
        print(f'using {self.n_processes} cores')

    def _get_movie_ids(self):
        morphs = self.mongo_conn.user_review_morphs

        movie_ids = morphs.distinct('movie_id')

        for movie_id in movie_ids:
            self.movie_id_q.put(movie_id)

        self.mongo_conn.close()

    def _load_trained_model(self):
        try:
            model = self.s3_conn.load_from_s3_byte(
                config['AWS']['S3_BUCKET'], config['MODEL']['MODEL_PATH'])
        except Exception:
            model = None

        if self.__validate_model(model):
            self.model = model

    def __validate_model(self, model):
        return type(model) == gensim.models.fasttext.FastText

    def get_morphs(self):
        morphs = self.mongo_conn.user_review_morphs

        docu_count = 0
        df_li = []
        while (not self.movie_id_q.empty() and (docu_count < self.chunk)):
            movie_id = self.movie_id_q.get()
            try:
                morphs_df = pd.DataFrame(
                    morphs.find({'movie_id': movie_id}, {
                        '_id': 0,
                        'movie_id': 1,
                        'morphs': 1
                    }))
                df_li.append(morphs_df)
                docu_count += len(morphs_df)
            except Exception as e:
                logger.error(e)
                self.mongo_conn.close()

        self.mongo_conn.close()

        logger.info(f'got {docu_count} reviews.')
        print(f'got {docu_count} reviews.')

        self.morphs_df = pd.concat(df_li)

    def make_movie_vectors(self):

        word_vectors = self.model.wv

        movie_vectors = pd.DataFrame()
        movie_vectors['movie_id'] = self.morphs_df['movie_id']

        # get averaged comment vector
        movie_vectors.loc[:, 'vector'] = self.morphs_df['morphs'].map(
            lambda morphs: np.average(
                [word_vectors[morph] for morph in morphs], axis=0))

        # get movie vector
        movie_vectors = movie_vectors.groupby('movie_id').sum()

        self.movie_vector_li.append(movie_vectors)

        logger.info('make movie vectors finished')
        print('make movie vectors finished')

    def concat_vectors(self):
        self.movie_vectors = pd.concat(self.movie_vector_li)
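
make_movie_vectors above averages the FastText vectors of each review's morphs and then combines every review vector belonging to the same movie. A self-contained sketch with random vectors standing in for model.wv; an explicit groupby-apply sum is used here so the toy runs on any pandas version, where the original sums the object column via groupby().sum():

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
word_vectors = {w: rng.normal(size=4) for w in ['좋다', '지루하다', '배우', '연기']}

morphs_df = pd.DataFrame({
    'movie_id': ['m1', 'm1', 'm2'],
    'morphs': [['좋다', '배우'], ['연기', '좋다'], ['지루하다']],
})

# one averaged vector per review
review_vectors = pd.DataFrame({'movie_id': morphs_df['movie_id']})
review_vectors['vector'] = morphs_df['morphs'].map(
    lambda morphs: np.average([word_vectors[m] for m in morphs], axis=0))

# combine each movie's review vectors
movie_vectors = review_vectors.groupby('movie_id')['vector'].apply(
    lambda col: np.sum(list(col), axis=0))
print(movie_vectors)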