예제 #1
0
파일: Trainer.py 프로젝트: JohnProg/22
def decompose_and_cluster(tasks, word2vec, output_file, method='KMeans', option=10):
    """
    Vectorize tasks, reduce them with PCA, cluster them and dump the pipeline.

    Args:
        tasks: Passed unchanged to ``tasks_to_vectors``.
        word2vec: Passed unchanged to ``tasks_to_vectors``.
        output_file: Path the fitted pipeline is written to with
            ``joblib.dump``. An existing file at this path is removed first.
        method: ``'KMeans'`` selects K-means; any other value selects DBSCAN.
        option: ``n_clusters`` when K-means is used, ``eps`` when DBSCAN
            is used.

    Returns:
        A ``(pipe, labels)`` tuple. ``pipe`` is a ``Pipeline`` holding the
        fitted PCA and the fitted clustering estimator; ``labels`` holds the
        cluster label assigned to each reduced task vector.
    """
    # Parenthesised single-argument print gives the same output on Python 2
    # and is also valid on Python 3.
    print('Get task vector')
    whole_vector = tasks_to_vectors(tasks, word2vec)
    print('Down dimension...')
    pca = PCA(5)  # keep 5 principal components
    d_vector = pca.fit_transform(whole_vector)

    use_kmeans = method == 'KMeans'
    if use_kmeans:
        print('Training K-means...')
        # NOTE(review): newer scikit-learn releases dropped KMeans' n_jobs
        # argument -- confirm against the pinned scikit-learn version.
        cluster = KMeans(n_clusters=option, n_jobs=3)
    else:
        print('Training DBSCAN ... ')
        cluster = DBSCAN(eps=option)

    cluster.fit(d_vector)
    if use_kmeans:
        labels = cluster.predict(d_vector)
    else:
        # DBSCAN has no predict(); the training-set assignments are only
        # available through labels_ after fit (noise points are labelled -1).
        labels = cluster.labels_

    # Both estimators are already fitted; the pipeline only bundles them so
    # that they can be dumped and reloaded as one object.
    # NOTE: with DBSCAN as the final step, pipe.predict() is unavailable.
    pipe = Pipeline(steps=[
        ('w2v_200_to_5_PCA', pca), ('clustering', cluster)
    ])
    if os.path.exists(output_file):
        os.remove(output_file)
    joblib.dump(pipe, output_file, compress=3)
    print('Complete dumping')

    return pipe, labels
예제 #2
0
def decompose_and_cluster(tasks, word2vec, output_file, n_clusters):
    """
    Vectorize tasks, reduce them with PCA, run K-means and dump the pipeline.

    Args:
        tasks: Passed unchanged to ``tasks_to_vectors``.
        word2vec: Passed unchanged to ``tasks_to_vectors``.
        output_file: Path the fitted pipeline is written to with
            ``joblib.dump``. An existing file at this path is removed first.
        n_clusters: Number of clusters requested from K-means.

    Returns:
        A ``(pipe, labels)`` tuple: the ``Pipeline`` bundling the fitted PCA
        and K-means estimators, and the K-means prediction for every reduced
        task vector.
    """
    print("Get task vector")
    task_matrix = tasks_to_vectors(tasks, word2vec)

    print("Down dimension...")
    reducer = PCA(5)  # keep 5 principal components
    reduced = reducer.fit_transform(task_matrix)

    print("Training K-means...")
    # fit() returns the estimator itself, so the fitted model can be bound
    # directly.
    model = KMeans(n_clusters=n_clusters, n_jobs=3).fit(reduced)
    labels = model.predict(reduced)

    # The step names are stored inside the dumped pipeline; keep them stable.
    stages = [
        ("w2v_200_to_5_PCA", reducer),
        ("Kmeans_1000", model),
    ]
    pipe = Pipeline(steps=stages)

    # Clear any previous dump before writing the new one.
    if os.path.exists(output_file):
        os.remove(output_file)
    joblib.dump(pipe, output_file, compress=3)
    print("Complete dumping")

    return pipe, labels