def make_word_cluster_label_data():
    """Build and persist a word -> cluster-label mapping.

    e.g. {'将棋': 748, '囲碁': 748, ...}

    Loads the word2vec model and the trained KMeans clusterer, zips the
    model vocabulary with the cluster labels, and pickles the resulting
    dict to config['cluster']['labels'].
    """
    # NOTE: the original also created an unused local `word_label = {}`;
    # removed as dead code.
    word2vec = Model('word2vec')
    clf = Cluster(config['cluster']['kmeans'])
    _pickle(dict(zip(word2vec.vocab, clf.labels)), config['cluster']['labels'])
    _logger.info(
        f"{len(word2vec.vocab)} word label data saved in {config['cluster']['labels']}"
    )
def cluster():
    """Group similar interests in the existing fun2vec corpus and build a new corpus.

    (Original docstring: 既存のfun2vec corpusから似た興味をグループ化して、
    新たなcorpusを作る)
    """
    model = Model.load_model('word2vec')
    source_corpus = _unpickle(config['corpus']['fun2vec'])
    result = []
    for idx, profile_funs in enumerate(source_corpus, 1):
        grouped = cluster_funs(model, profile_funs)
        # keep only profiles that still have at least two interests
        if len(grouped) >= 2:
            result.append(grouped)
        if idx % 10000 == 0:
            _logger.info(f'Finished {idx} profiles')
    _pickle(result, config['corpus']['fun2vec_clustered'])
    _logger.info(f"Saved corpus of {len(result)} profiles in {config['corpus']['fun2vec_clustered']}")
def create():
    """Build the fun2vec corpus from unverified users' descriptions.

    Each corpus entry is a shuffled list of unique "fun" keywords
    extracted from one user's profile description; users yielding no
    keywords are skipped. The corpus is pickled to
    config['corpus']['fun2vec'] even if iteration aborts early.
    """
    corpus = []
    session = create_session()
    fc = Fun2vecCorpus()
    try:
        # yield_per keeps memory bounded while scanning the whole table
        for idx, user in enumerate(session.query(User).filter(User.verified == 0).yield_per(500), 1):
            funs = list(set(fc.extract(user.description)))
            if funs:  # skip users with no extracted interests
                shuffle(funs)
                corpus.append(funs)
            if idx % 10000 == 0:
                _logger.info(f'{idx} profiles')
    except Exception as e:
        # best-effort: log with full traceback, then save whatever was collected
        _logger.exception(e)
    finally:
        session.close()
    _pickle(corpus, config['corpus']['fun2vec'])
    _logger.info(f"Saved corpus of {len(corpus)} sentences in {config['corpus']['fun2vec']}")
def create():
    """Build the word2vec corpus from unverified users' descriptions.

    Each corpus entry is the list of words extracted from one user's
    profile description; entries with fewer than two words are skipped.
    The corpus is pickled even if iteration aborts early.
    """
    corpus = []
    session = create_session()
    wc = Word2vecCorpus()
    try:
        # yield_per keeps memory bounded while scanning the whole table
        for idx, user in enumerate(session.query(User.description).filter(User.verified == 0).yield_per(500), 1):
            words = wc.extract(user.description)
            if len(words) >= 2:
                corpus.append(words)
            if idx % 10000 == 0:
                # use the module-level logger, consistent with the other
                # corpus builders (was reaching into private wc._logger)
                _logger.info(f'Finished {idx} profiles')
    except Exception as e:
        # best-effort: log with full traceback, then save whatever was collected
        _logger.exception(e)
    finally:
        session.close()
    # NOTE(review): output path still comes from the Word2vecCorpus
    # private config — cannot confirm an equivalent key in the module
    # `config`, so left unchanged.
    _pickle(corpus, wc._config_file['word2vec'])
    _logger.info(f"Saved corpus of {len(corpus)} sentences in {wc._config_file['word2vec']}")
def cluster_by_kmeans():
    """Use KMeans to group similar words within each profile.

    For every profile, words are bucketed by their KMeans cluster label;
    when several words share a label, one is chosen at random and the
    others dropped. Profiles retaining >= 2 words are saved as the
    clustered corpus.
    """
    import os, sys
    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../')))
    from cluster import Cluster
    from collections import defaultdict
    import random
    clustered_corpus = []
    corpus = _unpickle(config['corpus']['fun2vec'])
    clf = Cluster(config['cluster']['kmeans'])
    m = Model('word2vec')
    # word -> cluster label lookup for every word in the model vocabulary
    cluster = dict(zip(m.vocab, clf.predict(m.vector)))
    del m, clf  # memory friendly: release the large model/classifier early
    for i, words in enumerate(corpus, 1):
        centroids = defaultdict(list)
        for word in words:
            centroids[cluster.get(word)].append(word)
        # If several words share a label, randomly keep one of them.
        # NOTE(review): all out-of-vocabulary words fall into the None
        # bucket, where only v[0] survives — confirm this is intended.
        clustered_words = [random.choice(v) if k is not None and len(v) >= 2 else v[0]
                           for k, v in centroids.items()]
        if len(clustered_words) >= 2:
            clustered_corpus.append(clustered_words)
        if i < 100:
            # was a bare print() debug dump; demoted to debug logging
            _logger.debug(f'{words} -> {clustered_words}')
        if i % 10000 == 0:
            _logger.info(f'Finished {i} profiles')
    _pickle(clustered_corpus, config['corpus']['fun2vec_clustered'])
    _logger.info(f"Saved corpus of {len(clustered_corpus)} profiles in {config['corpus']['fun2vec_clustered']}")
def _save_model(model, file_path):
    """Pickle *model* to *file_path* and log the destination."""
    # TODO: adjust the pickle protocol later if an error occurs
    _pickle(model, file_path)
    _logger.info(f'Saved model in {file_path}')