def testClusteringTopics():
    D = noPreprocessTopics('topics.txt')
    print(len(D))
    description, clusters, vectorspace, labels, distances = clustering(D, 'topics', number_clusters=38)
    clusterToInterpret = list(filter(lambda x: 11 in x, clusters))
    print(clusterToInterpret)
    medoid, mean = interpret(np.array(clusterToInterpret), D, distances, 'topics')
    print("Medoid: ", medoid)
    print("Mean: ", mean)
    sil_score, avgCohesion, avgSeparation = evaluation(D, vectorspace, labels, distances)
    print("Silhouette: ", sil_score)
    print("AvgCohesion: ", avgCohesion)
    print("AvgSeparation: ", avgSeparation)
    cohesionValues = cohesion(labels, distances)
    separationValues = separation(labels, distances)
    plt.title('Cohesion values distribution for topics')
    plt.xlabel('Cohesion')
    plt.ylabel('Count')
    plt.hist(list(cohesionValues.values()), bins=38)
    plt.show()
    plt.title('Separation values distribution for topics')
    plt.xlabel('Separation')
    plt.ylabel('Count')
    plt.hist(list(separationValues.values()), bins=38)
    plt.show()
def testClustering2():
    print("Getting docs...")
    #D = list(getEvaledDocs('qrels.train'))
    D = getAllFiles('./rcv1/D_train/')
    description, clusters, vectorspace, labels, distances = clustering(D, 'docs', number_clusters=100)
    medoid, mean = interpret(clusters[0], D, distances, 'docs')
    print("Medoid: ", medoid)
    print("Mean: ", mean)
    sil_score, avgCohesion, avgSeparation = evaluation(D, vectorspace, labels, distances)
    print("Silhouette: ", sil_score)
    print("AvgCohesion: ", avgCohesion)
    print("AvgSeparation: ", avgSeparation)
    cohesionValues = cohesion(labels, distances)
    separationValues = separation(labels, distances)
    plt.title('Cohesion values distribution for Dtrain')
    plt.xlabel('Cohesion')
    plt.ylabel('Count')
    plt.hist(list(cohesionValues.values()), bins=38)
    plt.show()
    plt.title('Separation values distribution for Dtrain')
    plt.xlabel('Separation')
    plt.ylabel('Count')
    plt.hist(list(separationValues.values()), bins=38)
    plt.show()
def testClustering1():
    D = []
    for i in ['R101', 'R121', 'R150', 'R170', 'R180']:
        D += getEvaledDocsForTopic('qrels.test', i, 'test')
    description, clusters, vectorspace, labels, distances = clustering(D, 'docs', number_clusters=11)
    medoid, mean = interpret(clusters[0], D, distances, 'docs')
    print("Medoid: ", medoid)
    print("Mean: ", mean)
    sil_score, avgCohesion, avgSeparation = evaluation(D, vectorspace, labels, distances)
    print("Silhouette: ", sil_score)
    print("AvgCohesion: ", avgCohesion)
    print("AvgSeparation: ", avgSeparation)
    cohesionValues = cohesion(labels, distances)
    separationValues = separation(labels, distances)
    print(cohesionValues)
    print(separationValues)
    plt.title('Cohesion values distribution for document collection')
    plt.xlabel('Cohesion')
    plt.ylabel('Count')
    plt.hist(list(cohesionValues.values()), bins=11)
    plt.show()
    plt.title('Separation values distribution for document collection')
    plt.xlabel('Separation')
    plt.ylabel('Count')
    plt.hist(list(separationValues.values()), bins=11)
    plt.show()
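# The test functions above rely on cohesion() and separation() helpers that are
# defined elsewhere in this project. The sketch below is NOT the project's
# implementation; it is a hedged, self-contained illustration of one common
# definition, assuming `labels` is an array of cluster ids and `distances` is a
# square pairwise-distance matrix: per-cluster cohesion = mean intra-cluster
# distance, per-cluster separation = mean distance to points outside the cluster.
import numpy as np

def _cohesion_separation_sketch(labels, distances):
    labels = np.asarray(labels)
    cohesion_values, separation_values = {}, {}
    for c in np.unique(labels):
        inside = labels == c
        intra = distances[np.ix_(inside, inside)]
        inter = distances[np.ix_(inside, ~inside)]
        # Mean pairwise distance within the cluster (0.0 for singleton clusters).
        cohesion_values[c] = float(intra.mean()) if inside.sum() > 1 else 0.0
        # Mean distance from the cluster's members to every point outside it.
        separation_values[c] = float(inter.mean()) if (~inside).any() else 0.0
    return cohesion_values, separation_values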
def main():
    cluster.clustering(os.path.join(messages_path, '附件3_clean.xlsx'),
                       os.path.join(results_path, 'result_last.xlsx'))
    hot_points.hotpoints_classify(os.path.join(results_path, 'result_last.xlsx'),
                                  os.path.join(results_path, 'result_hotpoints.xlsx'))
    max_label = merge_xlsx(os.path.join(results_path, 'result_last.xlsx'),
                           os.path.join(results_path, 'result_hotpoints.xlsx'),
                           os.path.join(results_path, 'result_merge.xlsx'))
    messages_dict = filter_data(os.path.join(results_path, 'result_merge.xlsx'),
                                os.path.join(results_path, 'result_filter.xlsx'),
                                max_label=max_label)
    ans = get_score(os.path.join(results_path, 'result_filter.xlsx'))
    score = sorted(ans.items(), key=lambda x: x[1], reverse=True)
    print(score)
    generate_hotpoints_xlsx(messages_dict, score)
def predictionLabel(data_test):
    label_pred = cluster.clustering()
    # Plot the 24-hour load curves of the samples in each predicted cluster,
    # one figure per cluster, with a different line colour per cluster.
    colors = ['m', 'g', 'r', 'k']
    for k in range(4):
        for i in range(len(label_pred)):
            if label_pred[i] == k:
                x = list(range(24))
                plt.axis([0, 24, 0, 1.2])
                plt.xlabel('Hour of day')
                plt.ylabel('Load')
                plt.title('Load trend of cluster {}'.format(k + 1))
                plt.plot(x, data1[i], colors[k])
        plt.show()
    model = decisionTree(date_rowDate, tem_rowDate, label_pred)
    label = model.predict(data_test)
    #print("This day belongs to cluster " + str(label))
    #print("This day belongs to cluster " + str(label + 1))
    #print(label)
    return label
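# decisionTree() above is defined elsewhere in this project. The sketch below is a
# hedged guess at its shape, NOT the project's code: it assumes the helper stacks
# the date and temperature rows into a feature matrix and fits a scikit-learn
# decision tree against the cluster labels produced by the clustering step.
import numpy as np
from sklearn.tree import DecisionTreeClassifier

def decision_tree_sketch(date_rows, temperature_rows, cluster_labels):
    # One feature row per day: (date feature, temperature feature).
    features = np.column_stack((date_rows, temperature_rows))
    model = DecisionTreeClassifier(random_state=0)
    model.fit(features, cluster_labels)
    return model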
def main(argv):
    evaluation_dir = ''
    output_dir = ''
    if len(sys.argv) == 5:
        try:
            opts, args = getopt.getopt(argv, "hc:o:", ["cfile=", "ofile="])
        except getopt.GetoptError:
            print 'error'
            sys.exit(2)
        for opt, arg in opts:
            if opt == '-h':
                print 'main.py -c /path/to/evaluation/directory -o path/to/output/directory'
                sys.exit()
            elif opt in ("-c", "--cfile"):
                evaluation_dir = arg
            elif opt in ("-o", "--ofile"):
                output_dir = arg
    filelog = "Log.out"
    logging.basicConfig(filename=filelog, filemode='a', level=logging.DEBUG)
    info_path = os.path.join(evaluation_dir, "info.json")
    problem_folder = evaluation_dir
    dict_file = read_data.file_info(info_path)
    print dict_file
    # create folder to store csv files
    csv_folder = r"./csv"
    if not os.path.exists(csv_folder):
        os.makedirs(csv_folder)
    print "convert to csv.."
    # convert to csv
    read_data.convert_to_csv(problem_folder, csv_folder, dict_file)
    # create folder to store output files
    output_folder = output_dir
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    produce_output.create_output_folder_problem(output_folder, dict_file)
    print "processing each file.."
    # read each csv file and feed it to the clustering algorithm
    for k, v in dict_file.iteritems():
        csv_path = "./csv/" + k + "." + v + ".csv"
        print csv_path
        out_path = os.path.join(output_folder, k)
        print out_path
        text = pd.read_csv(csv_path, header=0, quoting=csv.QUOTE_MINIMAL)
        labels = []
        vec_features = []
        dict_output = {}
        dict_features = {}
        if v == 'en':
            model = gensim.models.word2vec.Word2Vec.load_word2vec_format(
                'GoogleNews-vectors-negative300.bin', binary=True)
            vec_features = word2vec_average.getAvgFeatureVecs(
                word2vec_average.getCleanReviews(text, v), model, 300)
            labels = cluster.clustering_word2vec(vec_features)
        elif v == "nl":
            model = gensim.models.word2vec.Word2Vec.load(
                'nl-word2vec-model-300-word.bin')
            vec_features = word2vec_average.getAvgFeatureVecs(
                word2vec_average.getCleanReviews(text, v), model, 300)
            labels = cluster.clustering_word2vec(vec_features)
        elif v == 'gr':
            all_sent = []
            for article in text['article']:
                all_sent.append(article)
            vec_features, labels = cluster.clustering(all_sent, dict_file[k])
        i = 0
        for id in text["id"]:
            dict_output[id] = labels[i]
            dict_features[id] = vec_features[i]
            i += 1
        list_all = produce_output.write_cluster(dict_output, out_path)
        # similarity between documents
        list_comb, all_sim = cluster.similarity_score(list_all, dict_features)
        list_sim = produce_output.write_ranking(list_comb, all_sim, out_path)
    return xm.dot(inverse_cov).dot(xm.T)


function_id = 8
dimension = 2
obj = CEC2005(dimension)[function_id].objective_function
xlim = [1.5, 3.25]
ylim = [-2, -1]
positions = np.random.uniform([xlim[0], ylim[0]], [xlim[1], ylim[1]], size=(20, 2))
fitnesses = np.array([obj(x) for x in positions])
draw(obj, 'test.png', xlim=xlim, ylim=ylim, scatter=positions)

from cluster import weighted_gaussian
mean, cov = weighted_gaussian(positions, fitnesses)

from scipy.stats import multivariate_normal
rv = multivariate_normal(mean, cov)

from cluster import clustering
labels = clustering(positions, fitnesses)

print('mean:', mean)
print('cov:\n', cov)
print(labels)

x, y = np.mgrid[xlim[0]:xlim[1]:((xlim[1] - xlim[0]) / 100),
                ylim[0]:ylim[1]:((ylim[1] - ylim[0]) / 100)]
pos = np.empty(x.shape + (2,))
pos[:, :, 0] = x
pos[:, :, 1] = y
plt.contourf(x, y, rv.pdf(pos))
plt.savefig('contourf.png')
plt.close()

new_p = np.array([2.35, -1.6])
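# The dangling "return xm.dot(inverse_cov).dot(xm.T)" at the top of this snippet is
# the tail of a helper whose header was lost; the expression it returns is a squared
# Mahalanobis distance. Below is a hedged, self-contained sketch consistent with
# that return line (the function name and signature are assumptions, not the
# project's actual code).
import numpy as np

def squared_mahalanobis_sketch(point, mean, cov):
    # Difference vector between the point and the distribution mean.
    xm = np.asarray(point) - np.asarray(mean)
    inverse_cov = np.linalg.inv(cov)
    # d^2 = (x - mu) . Sigma^{-1} . (x - mu)^T
    return xm.dot(inverse_cov).dot(xm.T)

# Illustrative use with the Gaussian fitted above, e.g. for the probe point new_p:
# squared_mahalanobis_sketch(new_p, mean, cov)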
# Transform projection in space between (0.1, 0.9)
for j in range(X_projected.shape[1]):
    _min = numpy.min(X_projected[:, j])
    _max = numpy.max(X_projected[:, j])
    if _min < 0:
        X_projected[:, j] += numpy.abs(_min)
        _max += numpy.abs(_min)
    X_projected[:, j] = 0.1 + 0.8 * (X_projected[:, j] / _max)

# Plot the original data.
for lbl in numpy.unique(y_truth[:, i]):
    ax[0, i].plot(X_projected[:, 0][y_truth[:, i] == lbl], \
        X_projected[:, 1][y_truth[:, i] == lbl], 'o', color=PLOTCOLS[lbl])

# Cluster the data.
y = clustering(X[:, :, i], mode="KMEANS", n_clusters=k)

# Compute the silhouette score.
sil = silhouette_score(X[:, :, i], y, metric="euclidean")

# Compute overlap. Note that clusters might overlap, but be labelled with a
# different value. Values are arbitrary, so this should be corrected.
# Run through all possible permutations of labels.
perms = list(itertools.permutations(range(k)))
overlap = numpy.zeros(y.shape[0], dtype=bool)
max_overlap = 0
closest_perm = list(numpy.unique(y))
for perm in perms:
    # Create permutated array.
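# The truncated loop above matches cluster labels to ground-truth labels by trying
# every permutation of the k label values (cluster ids are arbitrary, so "overlap"
# with the truth can only be measured after relabelling). The sketch below is NOT
# the original continuation; it is a hedged, self-contained illustration of that
# permutation-matching idea, assuming `y` holds predicted labels and `truth` holds
# ground-truth labels, both coded as integers 0..k-1.
import itertools
import numpy

def best_label_permutation_sketch(y, truth, k):
    best_overlap = -1
    best_perm = tuple(range(k))
    for perm in itertools.permutations(range(k)):
        # Relabel predictions: cluster c becomes perm[c].
        y_perm = numpy.array([perm[c] for c in y])
        overlap = numpy.sum(y_perm == truth)
        if overlap > best_overlap:
            best_overlap = overlap
            best_perm = perm
    return best_perm, best_overlap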
# Load data.
X = numpy.loadtxt(fpath_in, dtype=float, delimiter=",", \
    skiprows=1, unpack=False)
y_truth = X[:, 0].astype(int)
X = X[:, 1:]

# Create empty matrices to hold the outcomes in.
y = numpy.zeros((X.shape[0], len(CLUSTER_K[cluster_method])), dtype=int) \
    * numpy.NaN
s = numpy.zeros((X.shape[0], len(CLUSTER_K[cluster_method])), dtype=float) \
    * numpy.NaN

# Run through the requested numbers of clusters.
for ki, k in enumerate(CLUSTER_K[cluster_method]):
    # Run cluster analysis.
    y[:, ki] = clustering(X, mode=cluster_method, n_clusters=k)
    # Compute a silhouette coefficient for each sample, but only if
    # more than one cluster was detected.
    if len(numpy.unique(y[:, ki])) == 1:
        s[:, ki] = 0.0
    else:
        s[:, ki] = silhouette_samples(X, y[:, ki], metric='euclidean')

# Write to file.
with open(fpath_out, "w") as f:
    header = ["cluster"]
    for ki, k in enumerate(CLUSTER_K[cluster_method]):
        header.append("y_k={}".format(k))
        header.append("s_k={}".format(k))
    f.write(",".join(map(str, header)))
    for i in range(X.shape[0]):
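# The loop above stores a per-sample silhouette column for every candidate k.
# The helper below is NOT part of the original script; it is a hedged sketch of
# how such a matrix could be summarised to pick a number of clusters, assuming
# `s` has one column per candidate k and `candidate_ks` lists those k values.
import numpy

def pick_k_by_mean_silhouette_sketch(s, candidate_ks):
    # Average the silhouette over samples for each candidate k and take the best.
    mean_scores = numpy.nanmean(s, axis=0)
    best_index = int(numpy.nanargmax(mean_scores))
    return candidate_ks[best_index], mean_scores[best_index]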
def main(videoPath, expect, showIt):
    start = time.time()
    '''
    Arguments:
      disHor       -- horizontal distance from the ring centre to the robot
      disVer       -- vertical distance from the ring centre to the robot
      ringActual   -- actual radius of the ring
      cameraHeight -- height of the camera
      ballR        -- actual radius of the ball
    '''
    disHor = 600
    disVer = 200
    robotR = 20
    ringActual = 40
    cameraHeight = 20
    ballR = 17
    disHor -= robotR
    disVer += ringActual
    #pathL, pathR = videoPath
    camera = cv2.VideoCapture(videoPath)
    # Read two frames; only the second capture is kept.
    res, image = camera.read()
    res, image = camera.read()
    if not res:
        print("Image not captured!")
        return
    camera.release()
    x, y, r, points, isDetect = imageToPoints(image, disHor, videoPath, showIt)
    if not isDetect:
        return decide(0, expect)
    if len(points) > 7:
        points = cluster.clustering(points, showIt)
    # If too few balls are detected, the curve cannot be fitted, so stop here.
    print('points number:', len(points))
    if len(points) < 4:
        print("Detected Ball not enough, Exit System")
        return decide(0, expect)
    coordinate, ringXYZ = obtainCoordinate(image, disHor, disVer, ringActual,
                                           cameraHeight, ballR, points, x, y, r)
    if max(coordinate[1]) < disHor / 3:
        return decide(0, expect)
    line.drawGraph(coordinate.tolist(), showIt)
    bp = leastsq.draw3DLine(coordinate, showIt)
    bx, bz = leastsq.predictBallPos(disHor, bp)
    print("Ball position around ring\n (x,y,z)=({:.2f},{:.2f},{:.2f})".format(
        bx, disHor, bz))
    print("Ring position:\n (x,y,z)=({:.2f},{:.2f},{:.2f})".format(
        ringXYZ[0], ringXYZ[1], ringXYZ[2]))
    if disVer - ringActual - 10 < bz < disVer + ringActual + 10:
        result = 1
    else:
        result = 0
    print("Use {:.2f} seconds".format(time.time() - start))
    return decide(result, expect), (bx, disHor, bz), ringXYZ
def cluster(df, NGroups=2, category_dic={PROD: [], COUNTRY: ['Ethiopia']},
            mode=0, Alg=0, init_mode=0, norm=False, PCA=False, dim=10):
    """
    Cluster the dataframe according to category_dic and mode.
    category_dic defines what is categorised on; an empty list means everything.
    mode:
        - 0 uses PRICE as the value
        - 1 uses the Gradient as the value
        - 2 uses both PRICE and the Gradient as the value
    Alg:
        - 0 uses the distance as the clustering method
        - 1 uses the cosine as the clustering method
    init_mode:
        - 0 assigns clusters to the categories by repeating range(NGroups) and then
          padding with 0, e.g. 0 1 2 0 1 2 0 0
        - 1 assigns clusters by giving the first n categories cluster 0, the next n
          cluster 1, and so on, e.g. 0 0 0 1 1 1 2 2
        - 2 assigns clusters to the categories at random.
    norm:
        - False uses the data as-is (not normalised)
        - True normalises the data
    PCA:
        - False does not apply PCA
        - True applies PCA (plotting is then no longer possible, unless dim is not used)
    """
    value = PRICE
    if mode == 1:
        value = 'Gradient'

    # Build the dataset for k-means.
    dates, categories, data = df_to_np_date_price(df, category_dic, value=value)
    # print(categories)  # print the categories that are being clustered
    # print(len(categories))

    if norm:
        data = (data - np.nanmin(data, axis=1)[:, None]) / (
            np.nanmax(data, axis=1) - np.nanmin(data, axis=1))[:, None]

    if mode == 2:
        _, _, data2 = df_to_np_date_price(df, category_dic, value='Gradient')
        tmp_data = data
        if norm:
            data2 = (data2 - np.nanmin(data2, axis=1)[:, None]) / (
                np.nanmax(data2, axis=1) - np.nanmin(data2, axis=1))[:, None]
        data = np.concatenate((data, data2), axis=1)

    # Clustering: once the difference between the new and old group means has
    # converged, the clustering is done.
    i = 0
    if PCA and norm:
        data = clus.PCA(data, dim)
    datagroup = clus.clustering(data, NGroups, init_mode)
    while np.max(np.sqrt(np.nansum((datagroup.GroupAvg - datagroup.NewGroupAvg)**2,
                                   axis=1))) > 0.01 and i < 100:
        # print(datagroup.data[:, -1])  # print the intermediate k-means steps
        if Alg == 0:
            datagroup.Clustering()
        elif Alg == 1:
            datagroup.Clustering2()
        i += 1

    # Restore the data with only PRICE.
    if mode == 2:
        data = tmp_data

    # Build a dictionary with the cluster groups.
    dic = {}
    for cat, group in zip(categories, datagroup.data[:, -1]):
        if group in dic:
            dic[group].append(cat)
        else:
            dic[group] = [cat]

    # Print the dictionary
    # i = 0
    # for group, catLst in dic.items():
    #     print(group, len(catLst))
    #     print(catLst)
    #     print(np.nanmean(datagroup.NewGroupAvg[i]))
    #     i += 1

    # Optional plotting of the clustered series (left commented out):
    # plt.rcParams['axes.prop_cycle'] = "cycler('ls', ['-','--','-.',':']) * cycler(u'color', ['r','g','b','c','k','y','m','934c00'])"  # changes the colour of the graph lines
    # for i, row in enumerate(data):
    #     # if i == 0:
    #     #     continue
    #     # if i > 3:
    #     #     break
    #     D = [float(date.split("-")[0]) + (float(date.split("-")[1]) - 1) / 12 for date in dates]
    #     plt.plot(D, row, label=categories[i])
    #
    # # Plot the cluster means.
    # # for i in range(NGroups):
    # #     D = [float(date.split("-")[0]) + (float(date.split("-")[1]) - 1) / 12 for date in dates]
    # #     if mode == 2:
    # #         plt.plot(D, datagroup.NewGroupAvg[i, :data.shape[1]], label=i)
    # #     else:
    # #         plt.plot(D, datagroup.NewGroupAvg[i, :], label=i)
    #
    # # Plot settings.
    # plt.rcParams.update({'font.size': 16})
    # plt.rcParams['legend.fontsize'] = 16
    # plt.legend(fancybox=True, loc="best", framealpha=0.8)
    # plt.ylabel('Affordability index', fontsize=18)
    # plt.xlabel('Date (years)', fontsize=16)
    # plt.show(True)

    return dic, data
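# A hedged usage sketch for the cluster() function above (not part of the original
# module). It assumes `df`, PROD, and COUNTRY come from the surrounding project and
# simply exercises the options documented in the docstring: two groups, PRICE plus
# Gradient as features (mode=2), cosine-based clustering (Alg=1), random initial
# assignment (init_mode=2), and normalised input.
def cluster_usage_sketch(df):
    groups, series = cluster(df,
                             NGroups=2,
                             category_dic={PROD: [], COUNTRY: ['Ethiopia']},
                             mode=2,
                             Alg=1,
                             init_mode=2,
                             norm=True)
    # Report which categories ended up in which cluster.
    for group_id, category_list in groups.items():
        print(group_id, len(category_list), category_list)
    return groups, series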