def cluster_text_job():
    logger.info('cluster_text_job() start...')
    # Government news items carry a publish time of 00:00
    compare_time_gap = 1.5
    start_time = time_utils.n_days_ago_milli_time(compare_time_gap)
    end_time = time_utils.current_milli_time()
    logger.info("load text data start_time: {}".format(start_time))
    logger.info("load text data end_time: {}".format(end_time))

    original_data_file = 'logs/original_data.txt'
    extradata_file = 'logs/extra_data.txt'
    # Data arrives incrementally by publish time, so we have to do the diff ourselves
    data_file, new_file_data = load_text_data.get_extradata_from_api(
        start_time, end_time, original_data_file, extradata_file)
    logger.info("load text data file path...: {}".format(data_file))

    # Two versions of the pipeline (cluster vs. cluster2)
    #ner_content_data, raw_data = cluster.fetch_data(data_file)
    ner_content_data, word_content_data, text_data, word_title_data, raw_data = cluster2.fetch_data(
        data_file)
    length_data = len(raw_data)
    logger.info('cluster corpus size: ' + str(length_data))

    # Skip clustering when the corpus is too small
    if length_data < 100:
        logger.info("corpus size is too small only update end_time: {}".format(
            end_time))
        return

    # Overwrite original_data_file
    load_text_data.update_original_data_file(new_file_data, original_data_file)
    new_file_data = None

    # Load the previous clustering result (incremental clustering)
    origin_cluster_file_path = 'logs/origin_cluster.txt'
    n_reserve_days_for_1size_cluster = 1
    n_reserve_days = 1
    # Two versions, same as above
    #origin_cluster_result = cluster.get_origin_cluster_result(origin_cluster_file_path, end_time, n_reserve_days_for_1size_cluster, n_reserve_days)
    origin_cluster_result = cluster2.get_origin_cluster_result(
        origin_cluster_file_path, end_time,
        n_reserve_days_for_1size_cluster, n_reserve_days)

    # Start clustering
    # Two versions, same as above
    #cluster_result = cluster.cluster(origin_cluster_result, ner_content_data, raw_data)
    cluster_result = cluster2.cluster(origin_cluster_result, ner_content_data,
                                      word_content_data, text_data,
                                      word_title_data, raw_data)

    # Save the updated clustering result locally (e.g. keep only the last day and drop
    # older clusters, so that what remains feeds the next run as the base of the next
    # incremental clustering, i.e. only events from the last day are considered for merging)
    cluster_result, cluster_already_merged = save_cluster_result.cluster_result_futher_merge(
        cluster_result, origin_cluster_file_path)
    save_cluster_result.dele_already_merged_cluster(cluster_already_merged)
    save_cluster_result.save_cluster_result(cluster_result, 1)
    logger.info('cluster_text_job() end...')
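The job above relies on two epoch-millisecond helpers from time_utils that are not shown here. A minimal sketch, assuming plain UNIX milliseconds (the project's actual module may differ):

import time

def current_milli_time():
    # current UNIX time in milliseconds
    return int(time.time() * 1000)

def n_days_ago_milli_time(n_days):
    # UNIX time in milliseconds, n_days (possibly fractional) before now
    return int((time.time() - n_days * 24 * 3600) * 1000)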
def main():
    #--------Read in old variables data-------
    # read in preprocessed values
    d = pd.read_csv("newProcessed.csv")
    dataframe = d.loc[:, ['PriceChange', 'VolumeChange']]
    # designated weight
    weight = [0.2, 0.78, 0.015, 0.005]

    #--------Read in new variables data--------
    # # Uncomment this to run on new variable data
    # # read in preprocessed values
    # d = pd.read_csv("SandPMarch31.csv")
    # dataframe = d.loc[:, ['PriceChange', 'frac']]
    # # designated weight
    # weight = [0.09, 0.1, 0.1, 0.71]

    #--------Pre-process data------------------
    X = np.array(dataframe.to_numpy())
    # summarize 30-day windows and put them into a matrix
    data = sum30Day(X)

    np.set_printoptions(precision=4, suppress=True)
    np.random.seed(2)
    # convert data to np.array
    raw_data = np.asarray(data, dtype=np.float32)
    # define the number of clusters
    k = 7

    #---------Optimization of parameters--------------
    ### Uncomment to optimize the weight either manually or with scipy
    # # get the silhouette score
    # def rosen(weight):
    #     return opt.opt_helper(k, weight, raw_data)
    # # manually test different weights
    # opt.manual_minimize(rosen)
    # # calculate distance depending on the weight
    # def distance(item1, item2):
    #     return cluster.distance(weight, item1, item2)
    # # use the scipy minimization to find optimal parameters
    # opt.minimizeHelper(rosen, weight)

    #-------------K-means cluster once-----------
    # Perform k-means clustering once with the designated weight, in order
    # to generate the Markov chain plot
    print("\nBegin k-means clustering demo \n")
    # normalize the raw data so that all values are in the range (0, 1)
    (norm_data, mins, maxs) = cluster.mm_normalize(raw_data)
    # perform clustering
    print("\nClustering normalized data with k=" + str(k))
    clustering = cluster.cluster(weight, norm_data, k)
    print("\nDone. Clustering:")
    print("\nRaw data grouped by cluster: ")
    clusters = cluster.display(norm_data, clustering, k)
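sum30Day is defined elsewhere in the project; judging from the inline loop in the PCA version of main() further below, it summarizes each sliding 30-row window as a 5-element feature vector. A sketch along those lines (the helper's real body may differ):

import math
import numpy as np
from sklearn.decomposition import PCA

def sum30Day(X):
    # Summarize each sliding 30-row window as
    # [mean % price change, mean % volume change,
    #  principal eigenvalue, secondary eigenvalue, theta (degrees)]
    data = []
    pca = PCA()
    for i in range(len(X) - 30):
        oneMonth = X[i:i + 30]
        pca.fit(oneMonth)
        evals = pca.explained_variance_
        y = pca.components_[0][1]
        x = pca.components_[0][0]
        theta = math.atan(y / x) * 180 / math.pi
        data.append([np.mean(oneMonth[:, 0]), np.mean(oneMonth[:, 1]),
                     evals[0], evals[1], theta])
    return data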
def processing(start_time, end_time):
    dir_path = './data1/'
    day = get_standard_time(end_time)
    logger.info("day processing... : {}".format(day))
    logger.info("load text data start_time: {}".format(start_time))
    logger.info("load text data end_time: {}".format(end_time))

    data_file = dir_path + day + '.txt'
    data_file_ = dir_path + day + '_.txt'
    cluster_result_file = dir_path + day + '_cluster_result.txt'
    cluster_triple_file = dir_path + day + '_cluster_triple.txt'
    triple_cluster_file = dir_path + day + '_triple_cluster.txt'

    load_text_data.load_data_from_api(start_time, end_time, data_file)
    logger.info("load text data file path...: {}".format(data_file))

    data_file_ = cluster2.data_event_process(data_file, data_file_)
    ner_content_data, word_content_data, text_data, word_title_data, raw_data = cluster2.fetch_data(
        data_file_)
    length_data = len(raw_data)
    logger.info('cluster corpus size: ' + str(length_data))

    # Warn when the corpus is too small for meaningful clustering
    if length_data < 100:
        logger.info("corpus size is too small only update end_time: {}".format(
            end_time))

    origin_cluster_result = []
    cluster_result = cluster2.cluster(origin_cluster_result, ner_content_data,
                                      word_content_data, text_data,
                                      word_title_data, raw_data)

    with io.open(cluster_result_file, 'w', encoding='utf-8') as f1:
        for x in cluster_result:
            f1.write(json.dumps(x, ensure_ascii=False) + "\n")

    all_cluster_event_infos = load_cluster_and_process.load_cluster_info_process(
        cluster_result_file, cluster_triple_file, hot_filter=0)
    load_cluster_and_process.all_cluster_event_infos_process(
        all_cluster_event_infos, triple_cluster_file)
def opt_helper(k, weight_, raw_data):
    """
    Calculate the silhouette score for a given weight and k.
    """
    # normalize the raw data so that all values are in the range (0, 1)
    (norm_data, mins, maxs) = cluster.mm_normalize(raw_data)
    #weight_ = [weight[0], weight[1], weight[2], 0]
    print("weight:", weight_)

    # distance used by the silhouette metric, parameterized by the weight
    def distance(item1, item2):
        return cluster.distance(weight_, item1, item2)

    clustering = cluster.cluster(weight_, norm_data, k)
    # clusters = cluster.display(norm_data, clustering, k)

    # negate the score so that a minimizer effectively maximizes the silhouette
    result = -metrics.silhouette_score(norm_data, clustering, metric=distance,
                                       sample_size=1000, random_state=2)
    print("result: ", result)
    return result
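cluster.distance is not shown in this file; for the silhouette computation above it only needs to be a per-feature weighted distance with one weight per feature. A plausible sketch, under that assumption (the real cluster module may differ):

import numpy as np

def distance(weight, item1, item2):
    # weighted Euclidean distance: each squared feature difference is
    # scaled by its weight before summing
    w = np.asarray(weight, dtype=np.float64)
    diff = np.asarray(item1, dtype=np.float64) - np.asarray(item2, dtype=np.float64)
    return float(np.sqrt(np.sum(w * diff * diff)))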
def main():
    # read in preprocessed values
    d = pd.read_csv("newProcessed.csv")
    dataframe = d.loc[:, ['PriceChange', 'VolumeChange']]
    X = np.array(dataframe.to_numpy())

    # data is the matrix that holds all the PCA 5-element lists;
    # it has dimension (n, 5) where n is the number of windows
    data = []

    # Perform PCA on every 30 data points using the shifting strategy
    i = 0
    pca = PCA()  # declare PCA object with constructor
    '''
    Summarizing 30 days of data in the following format:
    [mean of percent price change, mean of percent volume change,
     principal eigenvalue, secondary principal eigenvalue, theta]
    '''
    while i < len(X) - 30:
        oneMonth = X[i:i+30]
        returnList = []
        # Append mean of % change price and % change volume
        returnList.append(mean(oneMonth[:, 0]))
        returnList.append(mean(oneMonth[:, 1]))
        # Append the two eigenvalues; the larger eigenvalue goes first
        # and carries more weight
        pca.fit(oneMonth)
        evals = pca.explained_variance_
        returnList.append(evals[0])
        returnList.append(evals[1])
        # calculate theta, the angle of the principal component in degrees
        y = pca.components_[0][1]
        x = pca.components_[0][0]
        theta = math.atan(y/x) * 180 / math.pi
        # append theta
        returnList.append(theta)
        # increment i
        i = i + 1
        # append returnList to data
        data.append(returnList)

    print("\nBegin k-means clustering demo \n")
    np.set_printoptions(precision=4, suppress=True)
    np.random.seed(2)
    # convert data to np.array
    raw_data = np.asarray(data, dtype=np.float32)
    # normalize the raw data so that all values are in the range (0, 1)
    (norm_data, mins, maxs) = cluster.mm_normalize(raw_data)
    # define the number of clusters
    k = 7
    # perform clustering
    print("\nClustering normalized data with k=" + str(k))
    clustering = cluster.cluster(norm_data, k)
    # print results
    print("\nDone. Clustering:")
    print(clustering)
    print("\nRaw data grouped by cluster: ")
    clusters = cluster.display(norm_data, clustering, k)

    # Uncomment to visualize the result by plotting all ellipses on the same plot
    # # draw_ellipse(data, clustering)

    ### Uncomment to visualize the optimal k value; must comment out the block above
    # # Find the optimal k value by calculating the average distance associated with each k
    # distance_L = []
    # for k in range(1, 8):
    #     print("k = " + str(k))
    #     clustering = cluster.cluster(norm_data, k)
    #     clusters = cluster.display(norm_data, clustering, k)
    #     distance = cluster_distance(clusters)
    #     distance_L.append([k, distance])
    # threshold_plot(distance_L)

    print("\nEnd k-means demo ")

    # Split the cluster labels 9 Xtrain : 1 Xtest for the Markov model
    splitPt = int(len(clustering) * 0.9)
    Xtrain = clustering[0:splitPt]
    Xtest = clustering[splitPt:]

    # Training and testing the Markov model
    dictionary = get_cluster_dict(Xtrain)
    correctness, not_found = test_markov(dictionary, Xtest)
    print("Accuracy is " + str(correctness*100) + "%")
    print("Cases not found: ", not_found)
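get_cluster_dict and test_markov are defined elsewhere in the project; based on how they are called above, one plausible reading is a first-order Markov predictor over cluster labels. A hedged sketch, assuming the dictionary stores the most frequent successor of each cluster and correctness is reported as a fraction:

from collections import Counter, defaultdict

def get_cluster_dict(labels):
    # count first-order transitions between consecutive cluster labels,
    # then keep the most frequent successor of each state
    transitions = defaultdict(Counter)
    for cur, nxt in zip(labels[:-1], labels[1:]):
        transitions[cur][nxt] += 1
    return {state: counts.most_common(1)[0][0]
            for state, counts in transitions.items()}

def test_markov(dictionary, labels):
    # predict each next label from the current one; report accuracy and
    # how many current states were never seen during training
    correct, total, not_found = 0, 0, 0
    for cur, nxt in zip(labels[:-1], labels[1:]):
        if cur not in dictionary:
            not_found += 1
            continue
        total += 1
        if dictionary[cur] == nxt:
            correct += 1
    return (correct / total if total else 0.0), not_found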
import pandas as pd
import vector as v
import preprocessing as p
import cluster2 as c
import classifier as r

a = pd.read_csv("Z:/TermPaper/twitter_cred-master/data.csv")

print("cleaning....")
doc, id1 = p.clean(a)

print("vectorizing....")
dvec, global_vector = v.vectorize(doc)

print("clustering....")
g, t = c.cluster(dvec, global_vector, id1)

cnt = 0
x = []
print(len(t))

print("credibility calculating")
r.classifier(g)
logger.info('cluster corpus size: ' + str(length_data))

# Warn when the corpus is too small for meaningful clustering
if length_data < 100:
    logger.info("corpus size is too small only update end_time: {}".format(
        end_time))

# Overwrite original_data_file
#load_text_data.update_original_data_file(new_file_data, original_data_file)
#new_file_data = None

# Load the previous clustering result (incremental clustering)
origin_cluster_file_path = 'logs/origin_cluster.txt'
n_reserve_days_for_1size_cluster = 1
n_reserve_days = 1
# Two versions, same as above
#origin_cluster_result = cluster.get_origin_cluster_result(origin_cluster_file_path, end_time, n_reserve_days_for_1size_cluster, n_reserve_days)
origin_cluster_result = cluster2.get_origin_cluster_result(
    origin_cluster_file_path, end_time,
    n_reserve_days_for_1size_cluster, n_reserve_days)

# Start clustering
# Two versions, same as above
#cluster_result = cluster.cluster(origin_cluster_result, ner_content_data, raw_data)
cluster_result = cluster2.cluster(origin_cluster_result, ner_content_data,
                                  word_content_data, text_data,
                                  word_title_data, raw_data)

with io.open('logs1/cluster_result.txt', 'w', encoding='utf-8') as f1:
    for x in cluster_result:
        f1.write(json.dumps(x, ensure_ascii=False) + "\n")