def Ttest(name):
    cluster_dim = request.get_json()  # request body decoded to JSON
    data = []
    Tnum = float(cluster_dim["Tnum"])  # hypothesised population mean
    fileN = db.searchFile(name)
    fileN = fileN[0]['filename']
    if fileN == "dont_have_file":
        ret = {"route": "nofile"}
        return json.dumps(ret)
    with open(name + '/' + fileN, 'r', newline='') as csvFile:
        csv_file = csv.reader(csvFile)
        for content in csv_file:
            content = list(map(float, content))
            if len(content) != 0:
                data.append(content)
    result = stats.ttest_1samp(data, Tnum)
    print("result")
    print(result)
    with open(name + '/' + 'Ttest_' + fileN, 'w', newline='') as new_file:
        csv_writer = csv.writer(new_file)
        csv_writer.writerow(result[1])  # p-value(s), one per column
        # The significance check assumes the uploaded CSV holds a single column,
        # so result[1] contains exactly one p-value.
        if result[1] < 0.05:
            csv_writer.writerow(["The difference is significant"])
        else:
            csv_writer.writerow(["The difference is not significant"])
    ret = {"route": 'Ttest_' + fileN}
    return json.dumps(ret)

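# The sketch below is not part of the handler; it only illustrates, under the assumption of a
# single-column sample, what stats.ttest_1samp returns (a statistic and a p-value per column),
# which is what Ttest() writes into Ttest_<file>. The sample values are made up.
def _ttest_example():
    from scipy import stats
    sample = [[1.2], [0.9], [1.1], [1.3]]
    result = stats.ttest_1samp(sample, 1.0)   # 1.0 stands in for the client-supplied Tnum
    print(result.statistic, result.pvalue)    # length-1 arrays, one entry per column
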
def pdf2word(name):
    # Requires: pip install pdfminer3k, pip install python-docx
    fileN = db.searchFile(name)
    fileN = fileN[0]['filename']
    if fileN == "dont_have_file":
        ret = {"route": "nofile"}
        return json.dumps(ret)
    pdfname = fileN
    document = Document()
    fn = open(name + '/' + pdfname, 'rb')
    parser = PDFParser(fn)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    resource = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(resource, laparams=laparams)
    interpreter = PDFPageInterpreter(resource, device)
    for page in doc.get_pages():
        interpreter.process_page(page)
        layout = device.get_result()
        for out in layout:
            if hasattr(out, "get_text"):
                content = out.get_text().replace(u'\n', u'')
                document.add_paragraph(content, style='ListBullet')
    document.save(name + '/' + pdfname[0:-4] + '.docx')
    fn.close()
    print('Processing complete')
    ret = {"route": pdfname[0:-4] + '.docx'}
    return json.dumps(ret)

def nb(name):
    fileN = db.searchFile(name)
    print(fileN)
    isfile = fileN[0]['filename']
    if isfile == "dont_have_file":
        ret = {"route": "nofile"}
        return json.dumps(ret)
    if len(fileN) < 2:
        ret = {"route": 'A training set and a test set must be uploaded for model training and testing'}
        return json.dumps(ret)
    train_file = fileN[0]['filename']
    test_file = fileN[1]['filename']
    print(train_file)
    print(test_file)
    train = get_nb_train_dataset(name, train_file)
    test = get_nb_test_dataset(name, test_file)
    print('train=', train)
    print('test=', test)
    train_data, train_target, valid_data, valid_target = nb_train_and_valid_data(train)
    nbc_6 = Pipeline([
        ('vect', TfidfVectorizer()),
        ('clf', MultinomialNB(alpha=1.0)),
    ])
    nbc_6.fit(train_data, train_target)  # train the multinomial naive Bayes classifier
    valid = nbc_6.predict(valid_data)  # predict on the validation split
    print(valid)
    count = 0  # number of correct predictions
    for left, right in zip(valid, valid_target):
        if left == right:
            count += 1
    print(count / len(valid_target))  # validation accuracy
    predict = nbc_6.predict(test)
    print(predict)
    p_list = [[p] for p in predict]  # one prediction per output row
    print(p_list)
    with open(name + '/' + 'NaiveBayes.csv', 'w', newline='') as new_file:
        csv_writer = csv.writer(new_file)
        for line in p_list:
            csv_writer.writerow(list(line))
    ret = {"route": 'NaiveBayes.csv'}
    return json.dumps(ret)

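# get_nb_train_dataset / nb_train_and_valid_data are defined elsewhere in this project.
# The sketch below is only an assumption about their contract, inferred from nb():
# rows of (text, label) pairs split into parallel text/label lists for fitting the pipeline.
def _nb_train_and_valid_data_sketch(rows, valid_ratio=0.2):
    texts = [row[0] for row in rows]
    labels = [row[1] for row in rows]
    split = int(len(rows) * (1 - valid_ratio))
    return texts[:split], labels[:split], texts[split:], labels[split:]
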
def workCloud(name):
    fileN = db.searchFile(name)
    fileN = fileN[0]['filename']
    if fileN == "dont_have_file":
        ret = {"route": "nofile"}
        return json.dumps(ret)
    txtfile = fileN
    print(txtfile)
    # Read the whole text.
    f = open(name + '/' + txtfile, encoding='gb18030')
    text = f.read()
    print(text)
    f.close()
    # Segment the text with jieba
    wordlist = jieba.cut(text, cut_all=True)
    wl = " ".join(wordlist)
    print(wl)  # segmented text
    # Mask image that defines the word-cloud shape
    coloring = np.array(Image.open("wordcloud/background.jpg"))
    # Stop words
    stopwords = set(STOPWORDS)
    stopwords.add("said")
    # The mask parameter controls the shape of the word cloud
    wc = WordCloud(background_color="white", max_words=2000, mask=coloring,
                   max_font_size=50, random_state=42, stopwords=stopwords,
                   font_path='Hiragino Sans GB.ttc')
    wc.generate(wl)
    wc.to_file(name + '/' + "workcloud.png")
    ret = {"route": 'workcloud.png'}
    return json.dumps(ret)

def MIC(name):
    # Requires: pip install minepy
    fileN = db.searchFile(name)
    isfile = fileN[0]['filename']
    if isfile == "dont_have_file":
        ret = {"route": "nofile"}
        return json.dumps(ret)
    if len(fileN) < 2:
        ret = {"route": 'Two files must be uploaded to compute the MIC'}
        return json.dumps(ret)
    x = []
    y = []
    file1 = fileN[0]['filename']
    file2 = fileN[1]['filename']
    csvFile1 = open(name + '/' + file1, encoding='utf-8-sig')
    csv_file1 = csv.reader(csvFile1)
    for content in csv_file1:
        print(content)
        content = list(map(float, content))
        if len(content) != 0:
            x.append(float(content[0]))
    csvFile1.close()
    print('x=', x)
    csvFile2 = open(name + '/' + file2, encoding='utf-8-sig')
    csv_file2 = csv.reader(csvFile2)
    for content in csv_file2:
        content = list(map(float, content))
        if len(content) != 0:
            y.append(float(content[0]))
    csvFile2.close()
    print('y=', y)
    mine = MINE(alpha=0.6, c=15)
    mine.compute_score(x, y)
    print("MIC", mine.mic())
    # Write the MIC value to a result file
    with open(name + '/' + 'MIC_result.csv', 'w', newline='') as new_file:
        csv_writer = csv.writer(new_file)
        csv_writer.writerow(["MIC result"])
        csv_writer.writerow([str(mine.mic())])
    ret = {"route": 'MIC_result.csv'}
    return json.dumps(ret)

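# Standalone sanity check of the minepy call used above (requires minepy to be installed);
# the sample data here is made up purely for illustration.
def _mic_example():
    from minepy import MINE
    mine = MINE(alpha=0.6, c=15)
    mine.compute_score([1, 2, 3, 4, 5], [1, 4, 9, 16, 25])
    print(mine.mic())  # close to 1 for a clean functional relationship
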
def randomForest(name):
    fileN = db.searchFile(name)
    isfile = fileN[0]['filename']
    if isfile == "dont_have_file":
        ret = {"route": "nofile"}
        return json.dumps(ret)
    if len(fileN) < 2:
        ret = {"route": 'A training set and a test set must be uploaded for model training and testing'}
        return json.dumps(ret)
    train_file = fileN[0]['filename']
    test_file = fileN[1]['filename']
    print(train_file)
    print(test_file)
    train = get_train_dataset(name, train_file)
    test = get_test_dataset(name, test_file)
    print('train=', train)
    print('test=', test)
    train_data, train_target, valid_data, valid_target = train_valid_data(train)
    # Create the model with 100 trees
    model = RandomForestClassifier(n_estimators=100,
                                   bootstrap=True,
                                   max_features='sqrt')
    # Fit on training data
    model.fit(train_data, train_target)
    print("train score:", model.score(train_data, train_target))
    print("valid score:", model.score(valid_data, valid_target))
    predict = model.predict(test)
    print(predict)
    p_list = [[p] for p in predict]  # one prediction per output row
    print(p_list)
    with open(name + '/' + 'RandomForest.csv', 'w', newline='') as new_file:
        csv_writer = csv.writer(new_file)
        for line in p_list:
            csv_writer.writerow(list(line))
    ret = {"route": 'RandomForest.csv'}
    return json.dumps(ret)

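# get_train_dataset / get_test_dataset / train_valid_data are shared helpers defined elsewhere
# in this project. The sketch below is an assumption about their contract, inferred from the
# callers in this file: each training row is a feature vector with the label in the last
# column, and train_valid_data carves off a validation slice.
def _train_valid_data_sketch(rows, valid_ratio=0.2):
    split = int(len(rows) * (1 - valid_ratio))
    features = [row[:-1] for row in rows]
    targets = [row[-1] for row in rows]
    return features[:split], targets[:split], features[split:], targets[split:]
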
def linearRegression(name):
    fileN = db.searchFile(name)
    isfile = fileN[0]['filename']
    if isfile == "dont_have_file":
        ret = {"route": "nofile"}
        return json.dumps(ret)
    if len(fileN) < 2:
        ret = {"route": 'A training set and a test set must be uploaded for model training and testing'}
        return json.dumps(ret)
    train_file = fileN[0]['filename']
    test_file = fileN[1]['filename']
    print(train_file)
    print(test_file)
    train = get_train_dataset(name, train_file)
    test = get_test_dataset(name, test_file)
    print('train=', train)
    train_data, train_target, valid_data, valid_target = train_valid_data(train)
    linreg = LinearRegression()
    model = linreg.fit(train_data, train_target)
    print(model)
    print(linreg.intercept_)  # fitted intercept
    print(linreg.coef_)  # fitted coefficients, one per feature
    predict = linreg.predict(test)
    p_list = [[p] for p in predict]  # one prediction per output row
    print(p_list)
    with open(name + '/' + 'linearRegression.csv', 'w', newline='') as new_file:
        csv_writer = csv.writer(new_file)
        for line in p_list:
            csv_writer.writerow(list(line))
    ret = {"route": 'linearRegression.csv'}
    return json.dumps(ret)

def svr(name):
    fileN = db.searchFile(name)
    isfile = fileN[0]['filename']
    if isfile == "dont_have_file":
        ret = {"route": "nofile"}
        return json.dumps(ret)
    if len(fileN) < 2:
        ret = {"route": 'A training set and a test set must be uploaded for model training and testing'}
        return json.dumps(ret)
    train_file = fileN[0]['filename']
    test_file = fileN[1]['filename']
    print(train_file)
    print(test_file)
    train = get_train_dataset(name, train_file)
    test = get_test_dataset(name, test_file)
    print('train=', train)
    print('test=', test)
    train_data, train_target, valid_data, valid_target = train_valid_data(train)
    # scikit-learn's regularisation parameter is capital C (the lowercase c in the
    # original raised a TypeError)
    svr_model = SVR(kernel='rbf', C=20)
    # Fit on training data
    svr_model.fit(train_data, train_target)
    print("train score:", svr_model.score(train_data, train_target))
    print("valid score:", svr_model.score(valid_data, valid_target))
    predict = svr_model.predict(test)
    print(predict)
    p_list = [[p] for p in predict]  # one prediction per output row
    print(p_list)
    with open(name + '/' + 'svr.csv', 'w', newline='') as new_file:
        csv_writer = csv.writer(new_file)
        for line in p_list:
            csv_writer.writerow(list(line))
    ret = {"route": 'svr.csv'}
    return json.dumps(ret)

def decisionTree(name):
    fileN = db.searchFile(name)
    isfile = fileN[0]['filename']
    if isfile == "dont_have_file":
        ret = {"route": "nofile"}
        return json.dumps(ret)
    if len(fileN) < 2:
        ret = {"route": 'A training set and a test set must be uploaded for model training and testing'}
        return json.dumps(ret)
    train_file = fileN[0]['filename']
    test_file = fileN[1]['filename']
    print(train_file)
    print(test_file)
    train = get_train_dataset(name, train_file)
    test = get_test_dataset(name, test_file)
    train_data, train_target, valid_data, valid_target = train_valid_data(train)
    model = tree.DecisionTreeClassifier(criterion='gini')
    model.fit(train_data, train_target)
    print("train score:", model.score(train_data, train_target))
    print("valid score:", model.score(valid_data, valid_target))
    predict = model.predict(test)
    print(predict)
    p_list = [[p] for p in predict]  # one prediction per output row
    print(p_list)
    with open(name + '/' + 'DecisionTree.csv', 'w', newline='') as new_file:
        csv_writer = csv.writer(new_file)
        for line in p_list:
            csv_writer.writerow(list(line))
    ret = {"route": 'DecisionTree.csv'}
    return json.dumps(ret)

def pca(name):
    pca_dim = request.get_json()  # request body decoded to JSON
    data = []
    traffic_feature = []
    traffic_target = []
    fileN = db.searchFile(name)
    fileN = fileN[0]['filename']
    if fileN == "dont_have_file":
        ret = {"route": "nofile"}
        return json.dumps(ret)
    csvFile = open(name + '/' + fileN)
    csv_file = csv.reader(csvFile)
    for content in csv_file:
        content = list(map(float, content))
        if len(content) != 0:
            data.append(content)
            traffic_feature.append(content[0:-2])
            traffic_target.append(content[-1])
    csvFile.close()
    min_max_scaler = preprocessing.MinMaxScaler()
    traffic_feature = min_max_scaler.fit_transform(traffic_feature)
    dim = int(pca_dim["pca"])  # number of components requested by the client
    sklearn_pca = sklearnPCA(n_components=dim)
    sklearn_transf = sklearn_pca.fit_transform(traffic_feature)
    with open(name + '/' + 'PCA_' + fileN, 'w', newline='') as new_file:
        csv_writer = csv.writer(new_file)
        for line in sklearn_transf:
            csv_writer.writerow(line)
    ret = {"route": 'PCA_' + fileN}
    return json.dumps(ret)

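# Standalone sketch of the same reduction step, useful for checking how much variance the
# requested number of components keeps (scikit-learn only; the rows are made-up examples).
def _pca_example():
    from sklearn import preprocessing
    from sklearn.decomposition import PCA
    rows = [[1.0, 2.0, 3.0], [2.0, 4.1, 5.9], [3.0, 6.2, 9.1]]
    scaled = preprocessing.MinMaxScaler().fit_transform(rows)
    pca_model = PCA(n_components=2)
    reduced = pca_model.fit_transform(scaled)
    print(reduced.shape)                        # (3, 2)
    print(pca_model.explained_variance_ratio_)  # share of variance kept per component
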
def kmeans(name):
    cluster_dim = request.get_json()  # request body decoded to JSON
    data = []
    cluster = int(cluster_dim["kmeans"])  # number of clusters requested by the client
    fileN = db.searchFile(name)
    fileN = fileN[0]['filename']
    if fileN == "dont_have_file":
        ret = {"route": "nofile"}
        return json.dumps(ret)
    with open(name + '/' + fileN, 'r', newline='') as csvFile:
        csv_file = csv.reader(csvFile)
        for content in csv_file:
            content = list(map(float, content))
            if len(content) != 0:
                data.append(content)
    estimator = KMeans(n_clusters=cluster)  # build the clusterer
    estimator.fit(data)  # run the clustering
    label_pred = estimator.labels_  # cluster label of each sample
    centroids = estimator.cluster_centers_  # cluster centres
    with open(name + '/' + 'kmeans_' + fileN, 'w', newline='') as new_file:
        csv_writer = csv.writer(new_file)
        csv_writer.writerow(["label"])
        for line in label_pred:
            csv_writer.writerow([str(line)])
        csv_writer.writerow(["clusterCenter"])
        for line in centroids:
            csv_writer.writerow(line)
    ret = {"route": 'kmeans_' + fileN}
    return json.dumps(ret)

def polydata(name):
    fileN = db.searchFile(name)
    fileN = fileN[0]['filename']
    if fileN == "dont_have_file":
        ret = {"route": "nofile"}
        return json.dumps(ret)
    csv_file = fileN
    csv_data = pd.read_csv(name + '/' + csv_file, low_memory=False)  # avoid dtype warnings
    data = pd.DataFrame(csv_data)
    for i in data.columns:
        for j in range(len(data)):
            if np.isnan(data[i][j]):
                data[i][j] = ploy(data[i], j)  # fill the missing value by interpolation
    data.to_csv(name + '/' + csv_file[0:-4] + '.csv', index=False, header=False)
    print('Processing complete')
    ret = {"route": csv_file[0:-4] + '.csv'}
    return json.dumps(ret)

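# ploy() is defined elsewhere in this project; the sketch below is only a guess at its
# intent (estimate a missing value from the column's known values by polynomial fitting)
# and is not the project's actual implementation.
def _ploy_sketch(column, j, degree=2):
    import numpy as np
    known = [k for k in range(len(column)) if not np.isnan(column[k])]
    values = [column[k] for k in known]
    coeffs = np.polyfit(known, values, min(degree, len(known) - 1))
    return float(np.polyval(coeffs, j))
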
def dtw(name):
    fileN = db.searchFile(name)
    isfile = fileN[0]['filename']
    if isfile == "dont_have_file":
        ret = {"route": "nofile"}
        return json.dumps(ret)
    if len(fileN) < 2:
        ret = {"route": 'Two files must be uploaded for time-series DTW analysis'}
        return json.dumps(ret)
    data1 = []
    data2 = []
    file1 = fileN[0]['filename']
    file2 = fileN[1]['filename']
    csvFile1 = open(name + '/' + file1)
    csv_file1 = csv.reader(csvFile1)
    for content in csv_file1:
        content = list(map(float, content))
        if len(content) != 0:
            data1.append(float(content[0]))
    csvFile1.close()
    print('data1=', data1)
    csvFile2 = open(name + '/' + file2)
    csv_file2 = csv.reader(csvFile2)
    for content in csv_file2:
        content = list(map(float, content))
        if len(content) != 0:
            data2.append(float(content[0]))
    csvFile2.close()
    print('data2=', data2)
    r, c = len(data1), len(data2)
    D0 = zeros((r + 1, c + 1))
    D0[0, 1:] = inf
    D0[1:, 0] = inf
    D1 = D0[1:, 1:]  # view into D0 (shallow copy)
    for i in range(r):
        for j in range(c):
            # raw distance matrix; for scalar samples the Euclidean distance is the
            # absolute difference
            D1[i, j] = abs(data1[i] - data2[j])
    M = D1.copy()
    for i in range(r):
        for j in range(c):
            # core recurrence: accumulate the cheapest warping path into each cell
            D1[i, j] += min(D0[i, j], D0[i, j + 1], D0[i + 1, j])
    # trace back the shortest warping path
    i, j = array(D0.shape) - 2
    p, q = [i], [j]
    while i > 0 or j > 0:
        tb = argmin((D0[i, j], D0[i, j + 1], D0[i + 1, j]))
        if tb == 0:
            i -= 1
            j -= 1
        elif tb == 1:
            i -= 1
        else:
            j -= 1
        p.insert(0, i)
        q.insert(0, j)
    print(M)  # raw distance matrix
    print(list(zip(p, q)))  # warping path
    print(D1)  # cost matrix (accumulated distances)
    print(D1[-1, -1])
    dis = [D1[-1, -1]]  # DTW distance between the two sequences
    print(dis)
    with open(name + '/' + 'DTW_result.csv', 'w', newline='') as new_file:
        csv_writer = csv.writer(new_file)
        csv_writer.writerow(["Cost Matrix"])
        for line in D1:
            csv_writer.writerow(line)
        csv_writer.writerow(["Sequence distance"])
        csv_writer.writerow(dis)
    ret = {"route": 'DTW_result.csv'}
    return json.dumps(ret)

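# Tiny self-contained version of the same DTW recurrence (numpy only), handy for checking
# the accumulated cost matrix that dtw() writes to DTW_result.csv; the example sequences
# in the trailing comment are made up.
def _dtw_example(a, b):
    import numpy as np
    D = np.full((len(a) + 1, len(b) + 1), np.inf)
    D[0, 0] = 0.0
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            cost = abs(a[i - 1] - b[j - 1])
            D[i, j] = cost + min(D[i - 1, j - 1], D[i - 1, j], D[i, j - 1])
    return D[-1, -1]  # e.g. _dtw_example([1.0, 2.0, 3.0], [1.0, 2.0, 2.0, 3.0]) == 0.0
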