def clustering_word2vec(data_features):
    """Cluster word2vec feature vectors with KMeans.

    The number of clusters is selected by silhouette analysis over a
    range of candidates derived from the dataset size.

    Args:
        data_features: 2-D array-like of per-sample feature vectors.

    Returns:
        1-D array of cluster labels, one per input sample.
    """
    # Candidate cluster counts: half the sample count up to
    # (sample count - 5), stepping by 5.
    n_start = int(len(data_features) / 2)
    n_stop = len(data_features) - 5
    step = 5
    n_cls_range = range(n_start, n_stop, step)
    # Best candidate by silhouette score (project helper; the module
    # name is spelled "silhoutte" in this codebase).
    n_cls = silhoutte.number_clusters(data_features, n_cls_range)
    # Fit exactly once. The original called fit() and then fit_predict(),
    # which re-ran the whole clustering and, with a non-fixed random_state,
    # could even return labels from a different run than the fitted model.
    kmeans = KMeans(n_clusters=n_cls)
    label = kmeans.fit_predict(data_features)
    return label
def clustering_word2vec(data_features):
    """Cluster word2vec feature vectors with KMeans.

    Cluster count is chosen by silhouette analysis over a size-derived
    candidate range.

    Args:
        data_features: 2-D array-like of per-sample feature vectors.

    Returns:
        1-D array of cluster labels, one per input sample.
    """
    # Candidate cluster counts: n/2 .. n-5 in steps of 5.
    n_start = int(len(data_features) / 2)
    n_stop = len(data_features) - 5
    step = 5
    n_cls_range = range(n_start, n_stop, step)
    # Project helper (module name spelled "silhoutte" in this codebase)
    # picks the candidate with the best silhouette score.
    n_cls = silhoutte.number_clusters(data_features, n_cls_range)
    # Single fit. The original fit() + fit_predict() clustered twice and
    # discarded the first model's result.
    kmeans = KMeans(n_clusters=n_cls)
    label = kmeans.fit_predict(data_features)
    return label
def clustering(list_perproblem, lang):
    """Vectorize raw texts with character n-gram TF-IDF and cluster them.

    Args:
        list_perproblem: list of raw text strings to cluster.
        lang: unused in this function; kept so the caller-facing
            signature is unchanged.

    Returns:
        Tuple of (data_features, label): the dense TF-IDF feature matrix
        and the per-sample KMeans cluster labels.
    """
    vectorizer = TfidfVectorizer(
        analyzer="char",
        tokenizer=None,
        preprocessor=None,
        stop_words=None,
        max_features=5000,
        min_df=2,
        ngram_range=(3, 8),
    )
    # The original copied the input element-by-element with a Python-2-only
    # xrange loop; a plain list copy is equivalent.
    clean_text = list(list_perproblem)
    data_features = vectorizer.fit_transform(clean_text)
    data_features = data_features.toarray()
    # Candidate cluster counts: n/2 .. n-5 in steps of 5.
    n_start = int(len(data_features) / 2)
    n_stop = len(data_features) - 5
    step = 5
    n_cls_range = range(n_start, n_stop, step)
    # Project helper (module name spelled "silhoutte" in this codebase)
    # picks the candidate with the best silhouette score.
    n_cls = silhoutte.number_clusters(data_features, n_cls_range)
    # Fit exactly once. The original called fit() and then fit_predict(),
    # clustering twice and discarding the first model's result.
    kmeans = KMeans(n_clusters=n_cls)
    label = kmeans.fit_predict(data_features)
    return data_features, label
def clustering(list_perproblem, lang):
    """TF-IDF (char 3-8 grams) vectorization followed by KMeans clustering.

    Args:
        list_perproblem: list of raw text strings to cluster.
        lang: unused here; retained for interface compatibility with
            existing callers.

    Returns:
        Tuple of (data_features, label): dense TF-IDF feature matrix and
        per-sample cluster labels.
    """
    vectorizer = TfidfVectorizer(
        analyzer="char",
        tokenizer=None,
        preprocessor=None,
        stop_words=None,
        max_features=5000,
        min_df=2,
        ngram_range=(3, 8),
    )
    # Replaces the Python-2-only xrange append loop with an equivalent
    # plain list copy.
    clean_text = list(list_perproblem)
    data_features = vectorizer.fit_transform(clean_text)
    data_features = data_features.toarray()
    # Candidate cluster counts: n/2 .. n-5 in steps of 5.
    n_start = int(len(data_features) / 2)
    n_stop = len(data_features) - 5
    step = 5
    n_cls_range = range(n_start, n_stop, step)
    # Best candidate by silhouette score (project helper; module name is
    # spelled "silhoutte" in this codebase).
    n_cls = silhoutte.number_clusters(data_features, n_cls_range)
    # Single fit; the original fit() + fit_predict() ran KMeans twice.
    kmeans = KMeans(n_clusters=n_cls)
    label = kmeans.fit_predict(data_features)
    return data_features, label