def task2A():
    """Render the eigenvalue (scree-plot) data for the original and sampled data sets.

    For each of the three data sets returned by the ``sampling`` module the
    'A15' column is removed, the remaining columns are standardised, and the
    normalised eigenvalues of ``x.T @ x`` plus their running sum are computed.

    Returns:
        The result of ``render_template('task.html', taskJS="task2a", data=...)``
        where ``data`` is the JSON of a DataFrame whose three rows hold the
        per-data-set JSON strings (original, random, stratified).
    """
    # 'A15' is deleted in place on whatever object the sampling module returns.
    dataOriginal = sampling.originalData()
    del dataOriginal['A15']
    dataRandom = sampling.randomSampling()
    del dataRandom['A15']
    dataStrat = sampling.stratifiedSampling()
    del dataStrat['A15']

    def intrinsicDim(data):
        """Return JSON with the dimensions, eigenvalue ratios and cumulative ratios."""
        x = StandardScaler().fit_transform(data)
        gram = np.dot(x.T, x)
        # gram is symmetric positive semi-definite, so its singular values ARE
        # its eigenvalues (the squared singular values of x). They must not be
        # squared a second time before normalising.
        S = np.linalg.svd(gram, compute_uv=False)
        eigVals = S / np.sum(S)
        # Derive the component count from the data instead of hard-coding 14.
        nDims = len(eigVals)
        cumulative = [sum(eigVals[:i]) for i in range(1, nDims + 1)]
        # Each value is wrapped in a one-element list so the frame has a
        # single row whose cells hold the whole sequence.
        summary = pd.DataFrame({
            "dimension": [np.arange(1, nDims + 1)],
            "eigenValues": [eigVals],
            "cumulativeEigVals": [cumulative]
        })
        return summary.to_json()

    intrinsicDimOrg = intrinsicDim(dataOriginal)
    intrinsicDimRand = intrinsicDim(dataRandom)
    intrinsicDimStrat = intrinsicDim(dataStrat)
    data = pd.DataFrame([intrinsicDimOrg, intrinsicDimRand, intrinsicDimStrat])
    data = data.to_json()
    return render_template('task.html', taskJS="task2a", data=data)
def task1B():
    """Render the k-means elbow data for the original and sampled data sets.

    For every data set (original, random sample, stratified sample) the
    'A15' column is removed and KMeans is fitted for k = 1..9; the fitted
    model's ``inertia_`` is recorded per k.

    Returns:
        The result of ``render_template('task.html', taskJS="task1b", data=...)``
        where ``data`` is the JSON of a DataFrame with one row per data set
        and one column per cluster count.
    """
    # Load every data set first (same order as always), then cluster.
    frames = []
    for load in (sampling.originalData, sampling.randomSampling,
                 sampling.stratifiedSampling):
        frame = load()
        del frame['A15']
        frames.append(frame)

    def kmeansElbow(data):
        """Map each cluster count k in 1..9 to the inertia of KMeans(k) on data."""
        return {
            k: KMeans(n_clusters=k).fit(data).inertia_
            for k in range(1, 10)
        }

    elbowCurves = [kmeansElbow(frame) for frame in frames]
    payload = pd.DataFrame(elbowCurves).to_json()
    return render_template('task.html', taskJS="task1b", data=payload)
def task3A():
    """Render the 2-component PCA projection of the original and sampled data sets.

    Each data set has its 'A15' column split off, the remaining columns are
    standardised, and a 2-component PCA projection is computed.

    Returns:
        The result of ``render_template('task.html', taskJS="task3a", data=...)``
        where ``data`` is the JSON of a DataFrame built from the list
        [projection, 'A15' column] repeated for original, random and
        stratified data.
    """

    def _split_and_scale(frame):
        """Remove 'A15' from frame in place; return (scaled features, 'A15' column)."""
        labels = frame.pop('A15')
        return StandardScaler().fit_transform(frame), labels

    scaledOriginal, yOriginal = _split_and_scale(sampling.originalData())
    scaledRandom, yRandom = _split_and_scale(sampling.randomSampling())
    scaledStrat, yStrat = _split_and_scale(sampling.stratifiedSampling())

    # One PCA object is re-fitted on each data set in turn.
    reducer = PCA(n_components=2)
    projOriginal = reducer.fit_transform(scaledOriginal)
    projRandom = reducer.fit_transform(scaledRandom)
    projStrat = reducer.fit_transform(scaledStrat)
    print(projOriginal[:, :2])

    rows = [
        projOriginal, yOriginal,
        projRandom, yRandom,
        projStrat, yStrat,
    ]
    payload = pd.DataFrame(rows).to_json()
    return render_template('task.html', taskJS="task3a", data=payload)
def task3c():
    """Render the 3-component PCA projection of the original and sampled data sets.

    Each data set has its 'A15' column split off, the remaining columns are
    standardised, and a 3-component PCA projection is computed. Every
    projected row is turned into a record with the keys "target", "PCA1",
    "PCA2" and "PCA3".

    Returns:
        The result of ``render_template('task3c.html', taskJS="task3c", data=...)``
        where ``data`` is the JSON of a one-column DataFrame whose three
        cells are JSON strings of the form ``{"values": [record, ...]}``
        (original, random, stratified). The records are therefore
        JSON-encoded twice.
    """

    def _split_and_scale(frame):
        """Remove 'A15' from frame in place; return (scaled features, 'A15' column)."""
        labels = frame.pop('A15')
        return StandardScaler().fit_transform(frame), labels

    dataOriginal, dataOriginalY = _split_and_scale(sampling.originalData())
    dataRandom, dataRandomY = _split_and_scale(sampling.randomSampling())
    dataStrat, dataStratY = _split_and_scale(sampling.stratifiedSampling())

    pca = PCA(n_components=3)

    def return_dict_arr(data, yVal):
        """Pair each projected row with its target value as a JSON-ready dict."""
        # .tolist() converts NumPy scalars to native Python values. NumPy
        # integer/bool scalars are rejected by json.dumps, so an integer
        # 'A15' column would otherwise raise TypeError below.
        targets = np.asarray(yVal).tolist()
        points = np.asarray(data).tolist()
        return [
            {
                "target": target,
                "PCA1": point[0],
                "PCA2": point[1],
                "PCA3": point[2]
            }
            for target, point in zip(targets, points)
        ]

    # The single PCA object is re-fitted on each data set in turn.
    originalPCA = {
        "values": return_dict_arr(pca.fit_transform(dataOriginal), dataOriginalY)
    }
    randomPCA = {
        "values": return_dict_arr(pca.fit_transform(dataRandom), dataRandomY)
    }
    stratPCA = {
        "values": return_dict_arr(pca.fit_transform(dataStrat), dataStratY)
    }

    data = [
        json.dumps(originalPCA),
        json.dumps(randomPCA),
        json.dumps(stratPCA)
    ]
    data = pd.DataFrame(data)
    data = data.to_json()
    return render_template('task3c.html', taskJS="task3c", data=data)
def task3B():
    """Render 2-D MDS embeddings of the original and sampled data sets.

    Each data set has its 'A15' column split off and the remaining columns
    standardised. MDS is then run on a precomputed pairwise-distance matrix,
    first with the 'euclidean' metric for all three data sets and then with
    the 'correlation' metric.

    Returns:
        The result of ``render_template('task.html', taskJS="task3b", data=...)``
        where ``data`` is the JSON of a DataFrame built from twelve entries:
        [embedding, 'A15' column] for original, random and stratified data
        under the euclidean metric, followed by the same six entries under
        the correlation metric.
    """

    def _split_and_scale(frame):
        """Remove 'A15' from frame in place; return (scaled features, 'A15' column)."""
        labels = frame.pop('A15')
        return StandardScaler().fit_transform(frame), labels

    datasets = [
        _split_and_scale(sampling.originalData()),
        _split_and_scale(sampling.randomSampling()),
        _split_and_scale(sampling.stratifiedSampling()),
    ]

    # A single MDS object is re-fitted for every (metric, data set) pair.
    embedder = manifold.MDS(n_components=2, dissimilarity='precomputed')

    rows = []
    for metric in ('euclidean', 'correlation'):
        for features, labels in datasets:
            distances = pairwise_distances(features, metric=metric)
            rows.append(embedder.fit_transform(distances))
            rows.append(labels)

    payload = pd.DataFrame(rows).to_json()
    return render_template('task.html', taskJS="task3b", data=payload)
def txt2tuple(use_sampling=True):
    """Segment the records of every matching CSV file and write tuple files.

    Every file matching ``../data/csv/<FIELD>*.csv`` is loaded through
    ``txt2list`` (tab-separated). For each processed record the title
    (column 0), keywords (column 2) and abstract (column 3, text after the
    ``<正>`` marker) are segmented with ``pynlpir`` and handed to
    ``write_tuple``.

    Args:
        use_sampling: When true (the default, and the only mode the previous
            version could reach), records are sampled per publication year
            2013-2017 and one file per year is written to
            ``../data/tuple/<FIELD>-<name>-<year>.txt``. When false, every
            record is written to ``../data/tuple/Full/<FIELD>-<name>.txt``.

    Raises:
        Exception: If no CSV file matches the pattern.
    """
    csv_list = glob.glob("../data/csv/{}*.csv".format(FIELD))
    if not csv_list:
        raise Exception("No matching file!")

    # Literal "<正>" marker; the abstract text is whatever follows its last
    # occurrence. A character class such as [<正>] would instead split on any
    # single '<', '正' or '>' and truncate abstracts containing them.
    abstract_marker = r'<[正]>'

    for count, input_file in enumerate(csv_list):
        # Part of the file name between FIELD and ".csv", minus its first
        # character (presumably a separator - verify against the file names).
        file_name = re.search('{}(.+?).csv'.format(FIELD),
                              input_file).group(1)[1:]
        start = time.time()
        print("Generating {} tuple ... ".format(file_name), flush=True)
        file = txt2list.txt2list(input_file)
        records = file.convert("\t")
        rows = records.shape[0]

        if use_sampling:
            # With sampling: 40 records per year.
            print(count)
            sampling_labels = np.arange(2013, 2018)
            # One [record index, publication year] pair per record.
            sampling_data = [[i, int(records[i][1])] for i in range(rows)]
            sampling_input = np.array(sampling_data)
            sampling_index = 0  # column of the record index in sampling_input
            label_index = 1  # column of the publication year in sampling_input
            # The number of samples per year should be the same.
            sampling_type = 'rs'
            scale = 40 * len(sampling_labels)
            sampling_output = sampling.stratifiedSampling(
                sampling_input, sampling_labels, label_index, sampling_index,
                sampling_type, scale)

            for j, py in enumerate(sampling_labels):
                # sampling_output[j] is iterated as the record indices
                # selected for year sampling_labels[j].
                result = sampling_output[j]
                output_path = "../data/tuple/{}-{}-{}.txt".format(
                    FIELD, file_name, str(py))
                # The context manager closes the file even if segmentation
                # or write_tuple raises.
                with open(output_path, 'w+', encoding='utf-8') as output_file:
                    output_file.write('bh,py,src,speech,word\n')
                    for bh in result:
                        record = records[bh]
                        TIss = pynlpir.segment(record[0])
                        # Keywords are segmented too, not just split on '|'.
                        KWs = pynlpir.segment(record[2])
                        AB = re.split(abstract_marker, record[3])[-1]
                        ABss = pynlpir.segment(AB)
                        # 4 / 2 / 1 tag title / keywords / abstract
                        # (presumably the "src" column - confirm in write_tuple).
                        write_tuple(bh, py, TIss, 4, output_file, file_name)
                        write_tuple(bh, py, KWs, 2, output_file, file_name)
                        write_tuple(bh, py, ABss, 1, output_file, file_name)
        else:
            # Without sampling: full records.
            output_path = "../data/tuple/Full/{}-{}.txt".format(
                FIELD, file_name)
            with open(output_path, 'w+', encoding='utf-8') as output_file:
                output_file.write('bh,py,src,speech,word' + '\n')
                for bh in range(rows):
                    record = records[bh]
                    TIss = pynlpir.segment(record[0])
                    py = record[1]
                    KWs = pynlpir.segment(record[2])
                    AB = re.split(abstract_marker, record[3])[-1]
                    ABss = pynlpir.segment(AB)
                    # NOTE(review): unlike the sampling branch, file_name is
                    # not passed here - confirm write_tuple has a default.
                    write_tuple(bh, py, TIss, 4, output_file)
                    write_tuple(bh, py, KWs, 2, output_file)
                    write_tuple(bh, py, ABss, 1, output_file)

        print("finished in {:.2f} sec.".format(time.time() - start),
              flush=True)
def task1A():
    """Render the stratified and random samples as JSON.

    Returns:
        The result of ``render_template('task.html', taskJS="task1a", data=...)``
        where ``data`` is a dict with the keys "stratifiedSampling" and
        "randomSampling", each holding the ``to_json()`` string of the
        corresponding sample.
    """
    # The stratified sample is drawn before the random one.
    stratifiedJson = sampling.stratifiedSampling().to_json()
    randomJson = sampling.randomSampling().to_json()
    payload = dict(stratifiedSampling=stratifiedJson,
                   randomSampling=randomJson)
    return render_template('task.html', taskJS="task1a", data=payload)