Exemple #1
0
def clustering_word2vec(data_features):
    """Cluster feature vectors with KMeans and return one label per sample.

    The number of clusters is chosen by ``silhoutte.number_clusters`` from
    candidate values that start at half the sample count and go up to (but
    exclude) the sample count minus five, in steps of five.

    Args:
        data_features: sized collection of feature vectors, one per sample,
            in a form accepted by ``KMeans`` (presumably word2vec-based,
            going by the function name -- confirm against the caller).

    Returns:
        The cluster labels produced by ``KMeans.fit_predict``.
    """
    n_samples = len(data_features)
    # NOTE(review): with 10 or fewer samples this candidate range is empty;
    # how silhoutte.number_clusters reacts to that is not visible here --
    # confirm before calling with small inputs.
    n_cls_range = range(n_samples // 2, n_samples - 5, 5)
    n_cls = silhoutte.number_clusters(data_features, n_cls_range)
    # fit_predict fits the model itself; the earlier separate .fit() call
    # trained KMeans twice on the same data and threw the first fit away.
    return KMeans(n_clusters=n_cls).fit_predict(data_features)
Exemple #2
0
def clustering_word2vec(data_features):
    """Assign each feature vector to a KMeans cluster.

    Candidate cluster counts run from half the number of samples up to
    (excluding) the number of samples minus five, five apart;
    ``silhoutte.number_clusters`` selects one of them.

    Args:
        data_features: sized collection of per-sample feature vectors that
            ``KMeans`` can consume (presumably word2vec-derived, judging by
            the function name -- verify against the caller).

    Returns:
        The label array returned by ``KMeans.fit_predict``.
    """
    sample_count = len(data_features)
    first_candidate = sample_count // 2
    last_candidate = sample_count - 5  # exclusive upper bound
    # NOTE(review): the candidate range is empty when there are 10 or fewer
    # samples; the behavior of silhoutte.number_clusters for an empty range
    # cannot be seen from here -- confirm.
    candidates = range(first_candidate, last_candidate, 5)
    n_cls = silhoutte.number_clusters(data_features, candidates)
    # A single fit_predict call replaces the previous fit() + fit_predict()
    # pair, which ran the full KMeans training twice.
    model = KMeans(n_clusters=n_cls)
    return model.fit_predict(data_features)
Exemple #3
0
def clustering(list_perproblem, lang):
    """Vectorize documents with character n-gram TF-IDF and cluster them.

    Args:
        list_perproblem: sequence of text documents to cluster.
        lang: not used by this function; kept so existing callers continue
            to work.

    Returns:
        A tuple ``(data_features, label)`` where ``data_features`` is the
        dense TF-IDF matrix (one row per document) and ``label`` is the
        result of ``KMeans.fit_predict`` on that matrix.
    """
    # Character n-grams of length 3 to 8, limited to 5000 features, and
    # dropping n-grams that occur in fewer than 2 documents.
    vectorizer = TfidfVectorizer(analyzer="char", tokenizer=None,
                                 preprocessor=None, stop_words=None,
                                 max_features=5000, min_df=2,
                                 ngram_range=(3, 8))

    # Shallow copy of the documents; replaces the index-based xrange loop.
    clean_text = list(list_perproblem)
    data_features = vectorizer.fit_transform(clean_text).toarray()

    n_docs = len(data_features)
    # NOTE(review): with 10 or fewer documents this candidate range is
    # empty; how silhoutte.number_clusters handles that is not visible
    # here -- confirm before calling with small inputs.
    n_cls_range = range(n_docs // 2, n_docs - 5, 5)
    n_cls = silhoutte.number_clusters(data_features, n_cls_range)

    # fit_predict fits the model itself; the earlier separate .fit() call
    # trained KMeans twice on the same data and discarded the first result.
    label = KMeans(n_clusters=n_cls).fit_predict(data_features)
    return data_features, label
Exemple #4
0
def clustering(list_perproblem, lang):
    """Build char n-gram TF-IDF features for the documents and run KMeans.

    Args:
        list_perproblem: sequence of text documents to cluster.
        lang: currently ignored; retained for interface compatibility.

    Returns:
        ``(data_features, label)``: the dense TF-IDF array with one row per
        document, and the labels returned by ``KMeans.fit_predict`` for
        those rows.
    """
    vectorizer = TfidfVectorizer(analyzer="char",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=5000,
                                 min_df=2,
                                 ngram_range=(3, 8))

    # list() makes the same shallow copy the old xrange/append loop built.
    documents = list(list_perproblem)
    tfidf = vectorizer.fit_transform(documents)
    data_features = tfidf.toarray()

    doc_count = len(data_features)
    first_candidate = doc_count // 2
    last_candidate = doc_count - 5  # exclusive upper bound
    # NOTE(review): the candidate range is empty for 10 or fewer documents;
    # what silhoutte.number_clusters does with an empty range cannot be
    # determined from this code -- confirm.
    candidates = range(first_candidate, last_candidate, 5)
    n_cls = silhoutte.number_clusters(data_features, candidates)

    # One fit_predict call instead of fit() followed by fit_predict(),
    # which ran the complete KMeans training twice.
    model = KMeans(n_clusters=n_cls)
    label = model.fit_predict(data_features)
    return data_features, label