コード例 #1
0
ファイル: main.py プロジェクト: maomao905/fun2vec
def make_word_cluster_label_data():
    """Build and pickle a mapping from each word to its cluster label.

    The mapping pairs ``word2vec.vocab`` with ``clf.labels`` positionally,
    e.g. ``{'shogi': 748, 'go': 748, ...}``, and is written to the path
    stored at ``config['cluster']['labels']``.
    """
    word2vec = Model('word2vec')
    clf = Cluster(config['cluster']['kmeans'])
    # NOTE(review): assumes clf.labels is ordered like word2vec.vocab -- confirm.
    word_labels = dict(zip(word2vec.vocab, clf.labels))
    labels_path = config['cluster']['labels']
    _pickle(word_labels, labels_path)
    # Report what was actually written: zip() stops at the shorter input, so
    # the mapping can be smaller than the vocabulary.
    _logger.info(f"{len(word_labels)} word label data saved in {labels_path}")
コード例 #2
0
def cluster():
    """
    Group similar interests in the existing fun2vec corpus and build a new corpus.

    Every profile in the pickled corpus is passed through ``cluster_funs``
    together with the loaded word2vec model; results with fewer than two
    entries are discarded. The new corpus is pickled to the path stored at
    ``config['corpus']['fun2vec_clustered']``.
    """
    grouped_profiles = []
    profiles = _unpickle(config['corpus']['fun2vec'])
    model = Model.load_model('word2vec')
    for count, profile in enumerate(profiles, start=1):
        grouped = cluster_funs(model, profile)
        if len(grouped) > 1:
            grouped_profiles.append(grouped)
        # Progress report every 10000 profiles.
        if not count % 10000:
            _logger.info(f'Finished {count} profiles')
    target = config['corpus']['fun2vec_clustered']
    _pickle(grouped_profiles, target)
    _logger.info(f"Saved corpus of {len(grouped_profiles)} profiles in {target}")
コード例 #3
0
def create():
    """Build the fun2vec corpus from user descriptions and pickle it.

    Streams the users whose ``verified`` column is 0, runs
    ``Fun2vecCorpus.extract`` on each description, de-duplicates and shuffles
    the extracted items, and keeps every non-empty list. The result is
    pickled to the path stored at ``config['corpus']['fun2vec']``.

    A failure while reading users is logged with its traceback; the session
    is always closed and whatever was collected so far is still saved.
    """
    corpus = []
    session = create_session()
    fc = Fun2vecCorpus()
    try:
        # Only the description is read from each row, so fetch that column
        # alone instead of hydrating full User entities.
        users = session.query(User.description).filter(User.verified == 0)
        for idx, user in enumerate(users.yield_per(500), 1):
            funs = list(set(fc.extract(user.description)))
            if funs:
                shuffle(funs)
                corpus.append(funs)
            if idx % 10000 == 0:
                _logger.info(f'{idx} profiles')
    except Exception as e:
        # Best-effort: keep the partial corpus, but record the traceback so
        # the failure can be diagnosed.
        _logger.exception(e)
    finally:
        session.close()
    corpus_path = config['corpus']['fun2vec']
    _pickle(corpus, corpus_path)
    _logger.info(f"Saved corpus of {len(corpus)} sentences in {corpus_path}")
コード例 #4
0
ファイル: corpus_word2vec.py プロジェクト: maomao905/fun2vec
def create():
    """Build the word2vec corpus from user descriptions and pickle it.

    Streams the descriptions of users whose ``verified`` column is 0, runs
    ``Word2vecCorpus.extract`` on each one, and keeps the results that
    contain at least two words. The corpus is pickled to the path stored at
    ``wc._config_file['word2vec']``.

    A failure while reading users is logged with its traceback; the session
    is always closed and whatever was collected so far is still saved.
    """
    corpus = []
    session = create_session()
    wc = Word2vecCorpus()

    try:
        descriptions = session.query(User.description).filter(User.verified == 0)
        for idx, user in enumerate(descriptions.yield_per(500), 1):
            words = wc.extract(user.description)
            if len(words) >= 2:
                corpus.append(words)

            if idx % 10000 == 0:
                wc._logger.info(f'Finished {idx} profiles')
    except Exception as e:
        # Best-effort: keep the partial corpus, but record the traceback so
        # the failure can be diagnosed.
        wc._logger.exception(e)
    finally:
        session.close()
    corpus_path = wc._config_file['word2vec']
    _pickle(corpus, corpus_path)
    wc._logger.info(f"Saved corpus of {len(corpus)} sentences in {corpus_path}")
コード例 #5
0
def cluster_by_kmeans():
    """
    Use KMeans labels to keep a single word per cluster in every profile.

    Each vocabulary word is mapped to the label predicted for its vector.
    Within a profile, words sharing a label are reduced to one randomly
    chosen word. Words with no entry in the mapping all fall into the
    ``None`` bucket, of which only the first word is kept. Profiles left
    with at least two words are collected and pickled to the path stored at
    ``config['corpus']['fun2vec_clustered']``.
    """
    import os
    import sys
    # Presumably needed so the `cluster` module one directory up is importable.
    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../')))
    from cluster import Cluster
    import random

    grouped_corpus = []
    profiles = _unpickle(config['corpus']['fun2vec'])
    kmeans = Cluster(config['cluster']['kmeans'])
    w2v = Model('word2vec')
    word_to_label = dict(zip(w2v.vocab, kmeans.predict(w2v.vector)))
    del w2v, kmeans  # drop the references before the long loop to save memory

    for count, profile_words in enumerate(profiles, start=1):
        # Bucket the profile's words by cluster label (None for unknown words).
        by_label = {}
        for term in profile_words:
            by_label.setdefault(word_to_label.get(term), []).append(term)

        # One representative per bucket: a random member for labelled buckets
        # holding several words, otherwise the first word.
        representatives = []
        for label, members in by_label.items():
            if label is None or len(members) < 2:
                representatives.append(members[0])
            else:
                representatives.append(random.choice(members))

        if len(representatives) > 1:
            grouped_corpus.append(representatives)
            # Show the before/after of the first profiles for a quick visual check.
            if count < 100:
                print('----------------------------')
                print(profile_words)
                print(representatives)

        if not count % 10000:
            _logger.info(f'Finished {count} profiles')
    target = config['corpus']['fun2vec_clustered']
    _pickle(grouped_corpus, target)
    _logger.info(f"Saved corpus of {len(grouped_corpus)} profiles in {target}")
コード例 #6
0
 def _save_model(model, file_path):
     """Pickle ``model`` to ``file_path`` and log where it was saved."""
     # Revisit the pickle protocol here if saving ever fails.
     _pickle(model, file_path)
     saved_message = f'Saved model in {file_path}'
     _logger.info(saved_message)