from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2


def to_weka_arff(ngram, number_of_features):
    # TF-IDF features over word n-grams of length 1..ngram.
    count_vect = TfidfVectorizer(ngram_range=(1, ngram), norm='l2', sublinear_tf=True)
    label_list = get_labels()
    tweet_list = get_labelled_tweets()
    features = count_vect.fit_transform(tweet_list)
    # Keep the k features with the highest chi-squared scores.
    features = SelectKBest(chi2, k=number_of_features).fit_transform(features, label_list)
    print(features.shape)

    # Emit a Weka ARFF file: header first, then one CSV row per tweet.
    arff_data = []
    arff_data.append("@RELATION sport")
    for i in range(features.shape[1]):
        arff_data.append("@ATTRIBUTE feature" + str(i) + " REAL")
    arff_data.append("@ATTRIBUTE sportclass {neutral,neg,pos}")
    arff_data.append("@DATA")
    array_features = features.toarray()
    for i in range(len(array_features)):
        feature = array_features[i]
        label = label_list[i]
        csv_feature = ",".join(str(x) for x in feature)
        csv_feature = csv_feature + "," + label
        arff_data.append(csv_feature)
    with open('data/sport.arff', 'w') as arff_file:
        for item in arff_data:
            arff_file.write("%s\n" % item)

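# A minimal usage sketch for to_weka_arff above. get_labels() and
# get_labelled_tweets() are project helpers not shown in the source; the
# stubs below are hypothetical stand-ins, and the data/ directory is assumed
# to exist.
def get_labels():
    return ['pos', 'neg', 'neutral']


def get_labelled_tweets():
    return ['great comeback win today', 'awful refereeing all match', 'kickoff moved to nine']


# Unigrams + bigrams; keep the 2 highest-scoring features.
to_weka_arff(ngram=2, number_of_features=2)
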
import json
import os

import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import KFold

# lrf_config, utility, classifier, pre_process and pre_process_lst are
# project-local helpers assumed importable from the surrounding package.


def main():
    locations = lrf_config.get_locations()
    ref_data_dir = locations['REF_DATA_PATH']
    x_filename = 'sentiment_data/tweets.txt'
    y_filename = 'sentiment_data/labels.txt'

    # Load and preprocess the samples.
    print('start loading and processing samples...')
    tweets = []
    microblog_features = []
    lexicon_features = []
    tweets_lst = []
    with open(os.path.join(ref_data_dir, x_filename)) as f:
        for i, line in enumerate(f):
            tweet_obj = json.loads(line.strip())
            # Tweet text content.
            content = tweet_obj['text'].replace("\n", " ")
            tweets_lst.append(pre_process_lst(content))
            postprocessed_tweet, microblogging_features, mpqa_sentiment_score = pre_process(content)
            tweets.append(postprocessed_tweet)
            microblog_features.append(microblogging_features)
            lexicon_features.append(mpqa_sentiment_score)

    lexicon_features = np.asarray(lexicon_features)
    microblog_features = np.asarray(microblog_features)

    tf_idf_vectorizer = utility.get_tf_idf_vectorizer(tweets_lst, ngram_range=2)
    transformed_data_rahul = tf_idf_vectorizer.fit_transform(tweets_lst)
    # tf_idf_vectorizer = utility.get_tf_idf_vectorizer(tweets, ngram_range=2)
    # transformed_data_mine = tf_idf_vectorizer.fit_transform(tweets)

    with open(os.path.join(ref_data_dir, y_filename)) as f:
        y_data = f.readlines()
    y_data = [y.strip('\n') for y in y_data]
    y_data = np.asarray(y_data)

    num_of_features = 50
    accuracy_in_each_turn = []
    while num_of_features <= 3000:
        X_new = SelectKBest(chi2, k=num_of_features).fit_transform(transformed_data_rahul, y_data)
        extended_features_1 = np.append(X_new.toarray(), lexicon_features, axis=1)
        extended_features_2 = np.append(extended_features_1, microblog_features, axis=1)

        sentiment_map = lrf_config.get_sentiment_map()
        inv_sentiment_map = {str(v): k for k, v in sentiment_map.items()}

        X_data = X_new.toarray()
        kf = KFold(n_splits=5)
        kf.get_n_splits(X_data)
        train_list = []
        test_list = []
        for train_index, test_index in kf.split(X_data):
            X_train = X_data[train_index]
            Y_train = y_data[train_index]
            X_test = X_data[test_index]
            Y_test = y_data[test_index]
            Y_pred, train_acc, test_acc = classifier.classify(
                'svc',
                X_train=X_train, Y_train=Y_train,
                X_test=X_test, Y_test=Y_test,
                class_map=inv_sentiment_map, is_X_text=False)
            train_list.append(train_acc)
            test_list.append(test_acc)

        # Average over the five folds; the original averaged the last fold's
        # scalar scores, which is a no-op.
        accuracy_in_each_turn.append([np.mean(train_list), np.mean(test_list)])
        # The original never incremented num_of_features, so the loop could
        # not terminate; the step of 50 is an assumption.
        num_of_features += 50

    for elem in accuracy_in_each_turn:
        print(elem)

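# Standard entry-point guard for the script above.
if __name__ == '__main__':
    main()
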
from sklearn import metrics
from sklearn.cluster import KMeans, AgglomerativeClustering

# Fragment: assumes X_new1 (the selected feature matrix) and
# classification_labels (ground-truth labels) are defined upstream. The
# cluster range and list initialisations below are assumptions added so the
# fragment runs; the original begins mid-loop.
n_cluster = range(2, 26)
KSilhouette = []
Knormalized_mutual_info_score = []
HSilhouette = []
Hnormalized_mutual_info_score = []

for k in n_cluster:
    kmeans_model = KMeans(n_clusters=k).fit(X_new1)
    Kclustering_labels = kmeans_model.labels_
    # silhouette_score returns the mean Silhouette Coefficient over all
    # samples: best value 1, worst -1; values near 0 indicate overlapping
    # clusters.
    Silhouette = metrics.silhouette_score(X_new1, Kclustering_labels, metric='euclidean')
    KSilhouette.append(Silhouette)
    # normalized_mutual_info_score lies between 0.0 and 1.0;
    # 1.0 stands for a perfectly complete labeling.
    normalized_mutual_info_score = metrics.normalized_mutual_info_score(
        classification_labels, Kclustering_labels, average_method='arithmetic')
    Knormalized_mutual_info_score.append(normalized_mutual_info_score)

    # Hierarchical bottom-up (agglomerative) clustering; ward linkage,
    # despite the "single_linkage" variable name.
    single_linkage_model = AgglomerativeClustering(n_clusters=k, linkage='ward').fit(
        X_new1.toarray())
    Hclustering_labels = single_linkage_model.labels_
    Silhouette = metrics.silhouette_score(X_new1, Hclustering_labels, metric='euclidean')
    HSilhouette.append(Silhouette)
    normalized_mutual_info_score = metrics.normalized_mutual_info_score(
        classification_labels, Hclustering_labels, average_method='arithmetic')
    Hnormalized_mutual_info_score.append(normalized_mutual_info_score)

print("Cluster range", n_cluster)
print("Silhouette scores for K-means clustering", KSilhouette)
print("Silhouette scores for AgglomerativeClustering (hierarchical clustering)",
      HSilhouette)

from sklearn import metrics
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.datasets import load_svmlight_file
from sklearn.feature_selection import SelectKBest, chi2
import matplotlib.pyplot as plt


def cluster(training_file):
    feature_vectors, targets = load_svmlight_file(training_file)
    X = feature_vectors
    y = targets
    # Keep the 1000 best features by chi-squared score.
    X_new1 = SelectKBest(chi2, k=1000).fit_transform(X, y)
    X_new1 = X_new1.toarray()

    range_clusters = list(range(2, 26))
    silhouette_score_kmeans = []
    mutual_information_score_kmeans = []
    silhouette_score_agglomerative = []
    mutual_information_score_agglomerative = []

    # Note: both metrics could be computed in one pass; the two loops below
    # refit the same models twice, as in the original.
    for n in range_clusters:
        print("for value " + str(n))
        kmeans_model = KMeans(n_clusters=n).fit(X_new1)
        clustering_labels = kmeans_model.labels_
        silhouette_score = metrics.silhouette_score(X_new1, clustering_labels, metric='euclidean')
        silhouette_score_kmeans.append(silhouette_score)
        print("Kmeans clustering")
        print("silhouette score is " + str(silhouette_score))

        single_linkage_model = AgglomerativeClustering(n_clusters=n, linkage='ward').fit(X_new1)
        clustering_labels = single_linkage_model.labels_
        silhouette_score = metrics.silhouette_score(X_new1, clustering_labels, metric='euclidean')
        silhouette_score_agglomerative.append(silhouette_score)
        print("Hierarchical clustering")
        print("silhouette score is " + str(silhouette_score))

    for n in range_clusters:
        print("for value " + str(n))
        kmeans_model = KMeans(n_clusters=n).fit(X_new1)
        clustering_labels = kmeans_model.labels_
        mutual_information_score = metrics.normalized_mutual_info_score(y, clustering_labels)
        mutual_information_score_kmeans.append(mutual_information_score)
        print("Kmeans clustering")
        print("mutual information score is " + str(mutual_information_score))

        single_linkage_model = AgglomerativeClustering(n_clusters=n, linkage='ward').fit(X_new1)
        clustering_labels = single_linkage_model.labels_
        mutual_information_score = metrics.normalized_mutual_info_score(y, clustering_labels)
        mutual_information_score_agglomerative.append(mutual_information_score)
        print("Hierarchical clustering")
        print("mutual information score is " + str(mutual_information_score))

    # Plot both metrics against the number of clusters.
    f, axarr = plt.subplots(2, sharex=True)
    axarr[0].plot(range_clusters, silhouette_score_kmeans, label="kmeans")
    axarr[0].plot(range_clusters, silhouette_score_agglomerative, label="agglomerative")
    axarr[1].plot(range_clusters, mutual_information_score_kmeans, label="kmeans")
    axarr[1].plot(range_clusters, mutual_information_score_agglomerative, label="agglomerative")
    axarr[1].set_xlabel("Number of clusters")
    axarr[0].set_ylabel("Silhouette score")
    axarr[1].set_ylabel("Mutual information score")
    axarr[0].set_title("silhouette score")
    axarr[1].set_title("mutual information score")
    f.legend(loc="upper right")
    plt.show()

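# Hypothetical invocation of cluster(); the SVMlight-format file name is an
# assumption mirroring the neighboring snippets.
cluster('training_data_file.TFIDF')
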
from sklearn import metrics
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.datasets import load_svmlight_file
import matplotlib.pyplot as pyplot
from sklearn.feature_selection import chi2, mutual_info_classif
from sklearn.feature_selection import SelectKBest

feature_vectors3, targets3 = load_svmlight_file("training_data_file.TFIDF")

numberOfClusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                    14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]
kMeansSilhouette = []
kMeansMutualInformation = []
agglomerativeClusteringSilhoutte = []
agglomerativeClusteringMutualInformation = []

# Keep the 100 best features ranked by mutual information.
topHundredFeatures = SelectKBest(mutual_info_classif, k=100).fit_transform(feature_vectors3, targets3)
topHundredFeatures = topHundredFeatures.toarray()

for i in numberOfClusters:
    kmeans_model = KMeans(n_clusters=i).fit(topHundredFeatures)
    clustering_labels = kmeans_model.labels_
    silhouettescore = metrics.silhouette_score(topHundredFeatures, clustering_labels, metric='euclidean')
    mutualInformationscore = metrics.normalized_mutual_info_score(targets3, clustering_labels)
    kMeansSilhouette.append(silhouettescore)
    kMeansMutualInformation.append(mutualInformationscore)

    # Agglomerative clustering is evaluated in the same loop.
    single_linkage_model = AgglomerativeClustering(n_clusters=i, linkage='ward').fit(topHundredFeatures)
    clustering_labels2 = single_linkage_model.labels_
    silhouettescore2 = metrics.silhouette_score(topHundredFeatures, clustering_labels2, metric='euclidean')
    mutualInformationscore2 = metrics.normalized_mutual_info_score(targets3, clustering_labels2)
    # The original cuts off before storing these scores; the two appends
    # below follow the lists declared above.
    agglomerativeClusteringSilhoutte.append(silhouettescore2)
    agglomerativeClusteringMutualInformation.append(mutualInformationscore2)

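# A hedged plotting sketch: pyplot is imported above but the original snippet
# cuts off before any figure is drawn, so this layout is an assumption.
pyplot.plot(numberOfClusters, kMeansSilhouette, label="kmeans")
pyplot.plot(numberOfClusters, agglomerativeClusteringSilhoutte, label="agglomerative")
pyplot.xlabel("Number of clusters")
pyplot.ylabel("Silhouette score")
pyplot.legend()
pyplot.show()
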
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn import metrics
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.datasets import load_svmlight_file
import warnings
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

# Load the training data file (SVMlight format).
feature_vectors, targets = load_svmlight_file('training_data_file')

# Select features using the chi-squared method.
select_features = SelectKBest(chi2, k=1000).fit_transform(feature_vectors, targets)
select_features = select_features.toarray()

# List of cluster counts.
num_clusters = []
# Silhouette and Normalized Mutual Information measures for KMeans and
# Agglomerative clustering.
Silhoutte_Kmeans = []
NMI_Kmeans = []
Silhoutte_Agg = []
NMI_Agg = []

# Compute Silhouette and NMI for KMeans and Agglomerative clustering with
# cluster counts ranging from 2 to 25. (The original snippet cuts off after
# the first append; the remaining appends are assumptions that follow the
# lists declared above.)
for n in range(2, 26):
    num_clusters.append(n)
    kmeans_model = KMeans(n_clusters=n).fit(select_features)
    clustering_labels = kmeans_model.labels_
    Silhoutte_Kmeans.append(
        metrics.silhouette_score(select_features, clustering_labels, metric='euclidean'))
    NMI_Kmeans.append(
        metrics.normalized_mutual_info_score(targets, clustering_labels))

    agg_model = AgglomerativeClustering(n_clusters=n, linkage='ward').fit(select_features)
    agg_labels = agg_model.labels_
    Silhoutte_Agg.append(
        metrics.silhouette_score(select_features, agg_labels, metric='euclidean'))
    NMI_Agg.append(
        metrics.normalized_mutual_info_score(targets, agg_labels))

import time
import warnings

from sklearn import metrics
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.datasets import load_svmlight_file
from sklearn.feature_selection import SelectKBest, chi2

start = time.time()
# Despite the names, load_svmlight_file returns the feature matrix and the
# target labels; test_tfidf below is the label vector, not a test split.
training_tfidf, test_tfidf = load_svmlight_file('training_data_file.TFIDF')
featureSize = 20000
training_tfidf_chi2 = SelectKBest(chi2, k=featureSize).fit_transform(
    training_tfidf, test_tfidf)

clusterSize = []
kmeans_model_silhouette_score = []
kmeans_model_normalized_mutual_info_score = []
single_linkage_model_silhouette_score = []
single_linkage_model_normalized_mutual_info_score = []

for cluster in range(2, 25):
    clusterSize.append(cluster)
    kmeans_model = KMeans(n_clusters=cluster).fit(training_tfidf_chi2)
    single_linkage_model = AgglomerativeClustering(
        n_clusters=cluster, linkage='ward').fit(training_tfidf_chi2.toarray())
    warnings.filterwarnings("ignore")
    kmeans_model_silhouette_score.append(
        metrics.silhouette_score(training_tfidf_chi2, kmeans_model.labels_, metric='euclidean'))
    kmeans_model_normalized_mutual_info_score.append(
        metrics.normalized_mutual_info_score(test_tfidf, kmeans_model.labels_,
                                             average_method='arithmetic'))
    single_linkage_model_silhouette_score.append(
        metrics.silhouette_score(training_tfidf_chi2, single_linkage_model.labels_,
                                 metric='euclidean'))
    # The original cuts off here; the matching NMI append is an assumption
    # following the list declared above.
    single_linkage_model_normalized_mutual_info_score.append(
        metrics.normalized_mutual_info_score(test_tfidf, single_linkage_model.labels_,
                                             average_method='arithmetic'))

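# `start` is captured above but never used in the fragment; a likely intended
# elapsed-time report (an assumption):
print("elapsed seconds:", time.time() - start)
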
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, VarianceThreshold, chi2
from sklearn.model_selection import train_test_split

# Fragment: assumes a vectorizer `vector` (e.g. a CountVectorizer) and the
# raw samples x with labels y are defined upstream.
x = vector.fit_transform(x)
print(x.shape)

# Remove features with zero variance.
selector = VarianceThreshold(threshold=0)
x = selector.fit_transform(x)
print(x.shape)

# Select the best 300 features using chi2 stats.
x = SelectKBest(chi2, k=300).fit_transform(x, y)
print(x.shape)

# NumPy arrays are easy to work with, so convert the result to an array;
# it is also what we feed to the SVM.
x = x.toarray()
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True)

test_dataset = pd.DataFrame(columns=['SampleType'])
test_dataset['SampleType'] = y_test
test_dataset.to_csv('test_file_y.csv', sep=',', encoding='utf-8')
np.savetxt('test_file_x.txt', X_test, fmt='%d')

# TF-IDF weighting.
transformer = TfidfTransformer(smooth_idf=False)
tfidf_train = transformer.fit_transform(X_train).toarray()
# Reuse the idf weights learned on the training split; the original refit
# the transformer on the test split, which gives inconsistent weighting.
tfidf_test = transformer.transform(X_test).toarray()

# Target names required for per-class metrics.
target_names = ["Cardiovascular / Pulmonary", "Orthopedic", "Gastroenterology",
                "Neurology", "Urology", "Obstetrics / Gynecology",
                "ENT - Otolaryngology", "Hematology - Oncology",
                "Ophthalmology", "Nephrology"]

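# A minimal sketch of the SVM step the comments above point to. LinearSVC and
# classification_report are standard scikit-learn calls, but their use here is
# an assumption rather than the original author's code.
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

svm = LinearSVC()
svm.fit(tfidf_train, y_train)
y_pred = svm.predict(tfidf_test)
# target_names must follow the sorted order of the labels in y.
print(classification_report(y_test, y_pred, target_names=target_names))
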
user="******", password="******", port=3306, charset='utf8mb4') # 链接数据库 cur = db.cursor() cur.execute(insert_data) db.commit() vectorizer = CountVectorizer( max_df=0.6, min_df=5) # 将文本中的词语转换为词频矩阵 矩阵元素a[i][j] 表示j词在i类文本下的词频 X = vectorizer.fit_transform(corpus) word = vectorizer.get_feature_names() # 获取词袋模型中的所有词语 # print(word) print(word.__len__()) # 词典的长度 # print(word.__getitem__(1))#获得词典中特定位置的词 pd.DataFrame(X.toarray(), columns=word) # print(pd.DataFrame(X.toarray(), columns=word)) #输出词频矩阵 transformer = TfidfTransformer( ) # 该类会统计每个词语的tf-idf权值,设置当设置为浮点数时,过滤出现在超过max_df/低于min_df比例的句子中的词语;正整数时,则是超过max_df句句子。这样就可以帮助我们过滤掉出现太多的无意义词语。 tfidf = transformer.fit_transform(vectorizer.fit_transform( corpus)) # 第一个fit_transform是计算tf-idf 第二个fit_transform是将文本转为词频矩阵 weight = tfidf.toarray() # 将tf-idf矩阵抽取出来,元素w[i][j]表示j词在i类文本中的tf-idf权重 # print(weight.__len__()) weight_data = pd.DataFrame(weight, columns=word) print('Start Kmeans:') clf = KMeans(n_clusters=33, random_state=1, algorithm='auto', n_init=10,
from sklearn.datasets import load_svmlight_file
from sklearn import metrics
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import matplotlib.pyplot as plt

feature_vectors, targets = load_svmlight_file("training_data_file.IDF")

# Keep the 1000 best features by chi-squared score.
X_new1 = SelectKBest(chi2, k=1000).fit_transform(feature_vectors, targets)

kmeans_silhoutte_scores = []
kmeans_normalized_mutual_scores = []
krange = range(2, 25)
for n in krange:
    kmeans_model = KMeans(n_clusters=n).fit(X_new1.toarray())
    clustering_labels = kmeans_model.labels_
    kmeans_silhoutte_scores.append(
        metrics.silhouette_score(X_new1.toarray(), clustering_labels, metric='euclidean'))
    kmeans_normalized_mutual_scores.append(
        metrics.normalized_mutual_info_score(targets, clustering_labels))

single_linkage_silhoutte_scores = []
single_linkage_normalized_mutual_scores = []
for n in krange:
    single_linkage_model = AgglomerativeClustering(n_clusters=n, linkage='ward').fit(
        X_new1.toarray())
    # The original cuts off after the fit; the appends below are assumptions
    # following the two lists declared above.
    clustering_labels = single_linkage_model.labels_
    single_linkage_silhoutte_scores.append(
        metrics.silhouette_score(X_new1.toarray(), clustering_labels, metric='euclidean'))
    single_linkage_normalized_mutual_scores.append(
        metrics.normalized_mutual_info_score(targets, clustering_labels))

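# A hedged plotting sketch: plt is imported above but the original snippet
# ends before any figure is drawn, so this layout is an assumption.
plt.plot(list(krange), kmeans_normalized_mutual_scores, label="kmeans")
plt.plot(list(krange), single_linkage_normalized_mutual_scores, label="agglomerative")
plt.xlabel("Number of clusters")
plt.ylabel("Normalized mutual information")
plt.legend()
plt.show()
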