def deleteViewData():
    """Delete one saved view-data JSON file and return the refreshed file list.

    Form params: viewsName (English view id), projectName, fileName (without
    the .json suffix). Returns the remaining saved files on success, or an
    error message string.
    """
    viewsName = request.form.get("viewsName")
    projectName = request.form.get("projectName")
    fileName = request.form.get("fileName")
    print('viewsName: {}, projectName: {}, fileName: {}'.format(viewsName, projectName, fileName))
    urls = getProjectCurrentDataUrl(projectName)
    # Map the English view identifier onto its on-disk (Chinese) directory name.
    folders = {
        'FullTableStatisticsView': '全表统计',
        'FrequencyStatisticsView': '频次统计',
        'CorrelationCoefficientView': '相关系数',
        'ScatterPlot': '散点图',
    }
    viewsName = folders.get(viewsName, viewsName)
    file_path = urls['projectAddress'] + '/' + viewsName + '/' + fileName + '.json'
    print('删除文件的路径:', file_path)
    try:
        if not os.path.exists(file_path):
            print('文件不存在')
            return '文件不存在'
        os.remove(file_path)
        print('删除成功')
        return getfileListFun(viewsName, projectName)
    except Exception as e:
        print('删除失败', e)
        return '删除失败' + str(e)
def executeAgain():
    """Re-run the stored processing flow of a project on its current data.

    Form params: projectName, userId. Loads the project's persisted operate
    list, replays each supported operation on the CSV data via Spark, writes
    the processed data out, and returns the row count plus the first 50 rows.
    """
    projectName = request.form.get('projectName')
    userId = request.form.get('userId')
    project = getProjectByNameAndUserId(projectName, userId)
    processflow = getProcessFlowByProjectId(project.id)
    operates = json.loads(processflow.operates)
    fileUrl = getProjectCurrentDataUrl(projectName)['fileUrl']
    spark = SparkSession \
        .builder \
        .master("local") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.read.format("CSV").option("header", "true").load(fileUrl)
    for item in operates:
        if item['type'] == '1':
            # Type '1' is a filter step: parse the stored condition, then apply it.
            condition = parsingFilterParameters(item['operate'])
            df = process.filterCore(spark, df, condition)
            df.show()
    # Persist the processed data.
    # NOTE(review): output path is hard-coded to a developer machine —
    # presumably it should come from the project configuration; confirm.
    df.toPandas().to_csv("/home/zk/data/test.csv", header=True)
    # Fix: build the preview by parsing each JSON row individually instead of
    # assembling a JSON array by string concatenation.
    preview = [json.loads(row) for row in df.limit(50).toJSON().collect()]
    return jsonify({'length': df.count(), 'data': preview})
def currentDataPreview():
    """Return rows [start, end) of the project's current data as JSON.

    Params: start, end (row indices), projectName. Returns {'length', 'data'}
    on success, "error" if the project path cannot be resolved, or
    "error read" if the CSV cannot be read.
    """
    if request.method == 'GET':
        # Bug fix: GET parameters arrive in the query string (request.args),
        # not the form body — the original read request.form in both branches.
        start = request.args.get('start')
        end = request.args.get('end')
        projectName = request.args.get('projectName')
    else:
        start = request.form.get('start')
        end = request.form.get('end')
        projectName = request.form.get('projectName')
    print('start: {}, end: {}, projectName: {}'.format(start, end, projectName))
    try:
        urls = getProjectCurrentDataUrl(projectName)
        fileUrl = urls['fileUrl']
    except Exception:
        return "error"
    try:
        data = pd.read_csv(fileUrl, encoding='utf-8')
        data2 = data[int(start):int(end)].to_json(orient='records', force_ascii=False)
        return jsonify({'length': len(data), 'data': json.loads(data2)})
    except Exception:
        return "error read"
def pcaCore(requestStr):
    """Run PCA over the selected columns of a project's current CSV data.

    requestStr is a JSON string carrying projectName, columnNames and
    optionally newColumnName and k (target dimensionality, default 3).
    Returns the transformed Spark DataFrame, or an error code string.
    """
    params = json.loads(requestStr)
    projectName = params['projectName']
    columnNames = params['columnNames']
    # Output column name defaults to "降维结果" unless the caller supplies one.
    try:
        newColumnName = params['newColumnName']
    except:
        newColumnName = "降维结果"
    spark = (SparkSession.builder
             .master("local")
             .config("spark.some.config.option", "some-value")
             .getOrCreate())
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        # Project name or project path could not be resolved.
        return 'error_projectUrl'
    df = spark.read.csv(urls['fileUrl'], header=True, inferSchema=True)
    # Target dimensionality defaults to 3 when absent or not an integer.
    try:
        k = int(params['k'])
    except:
        k = 3
    # PCA can only reduce: k must stay strictly below the input dimensionality.
    if k >= len(columnNames):
        return "error_targetDimensions"
    # Assemble the inputs into one vector column; fails for non-numeric columns.
    assembler = VectorAssembler(inputCols=columnNames, outputCol="features")
    try:
        df = assembler.transform(df)
    except utils.IllegalArgumentException:
        return "error_numerical"
    model = PCA(k=k, inputCol="features", outputCol=newColumnName)
    df = model.fit(df).transform(df).drop("features")
    df.show()
    # Record this operation in the processing flow (type 12 = PCA).
    addProcessingFlow(projectName, "admin", {'type': '12', 'operate': requestStr})
    return df
def fullTableStatistics2():
    """Compute per-column descriptive statistics for a hard-coded demo project.

    Numeric (int64/float64) columns get the full statistic set; all other
    columns are typed "text". Returns the list of per-column statistic dicts.
    """
    columnNames = [
        "行 ID", "订单 ID", "订购日期", "装运日期", "装运方式", "客户 ID", "客户名称", "细分市场",
        "邮政编码 (Postal Code)", "城市 (City)", "省/市/自治区 (State/Province)",
        "国家/地区 (Country)", "地区", "市场", "产品 ID", "类别", "子类别", "产品名称",
        "销售额", "数量", "折扣", "利润", "装运成本", "订单优先级"]
    projectName = "爱德信息分析项目"
    # Bug fix: getProjectCurrentDataUrl returns a dict — extract the file URL
    # before slicing its extension (the original sliced the dict itself).
    fileUrl = getProjectCurrentDataUrl(projectName)['fileUrl']
    if fileUrl[-4:] == ".csv":
        df_excel = pd.read_csv(fileUrl, encoding="utf-8")
    else:
        df_excel = pd.read_excel(fileUrl, encoding="utf-8")
    res = []
    statistics = ['字段名', '类型', '总数', '最小值', '最小值位置', '25%分位数', '中位数',
                  '75%分位数', '均值', '最大值', '最大值位置', '平均绝对偏差', '方差',
                  '标准差', '偏度', '峰度']
    for columnName in columnNames:
        info = {}.fromkeys(statistics)
        info['字段名'] = columnName
        info['类型'] = df_excel[columnName].dtype
        if info['类型'] == 'int64' or info['类型'] == 'float64':
            col = df_excel[columnName]
            info['总数'] = col.count()
            info['最小值'] = col.min()
            info['最小值位置'] = col.idxmin()
            info['25%分位数'] = col.quantile(.25)
            info['中位数'] = col.median()
            info['75%分位数'] = col.quantile(.75)
            info['均值'] = col.mean()
            info['最大值'] = col.max()
            info['最大值位置'] = col.idxmax()
            info['平均绝对偏差'] = col.mad()
            info['方差'] = col.var()
            info['标准差'] = col.std()
            info['偏度'] = col.skew()
            info['峰度'] = col.kurt()
        else:
            info['类型'] = "text"
        res.append(info)
    # Bug fix: the computed result was never returned.
    return res
def vectorIndexerCore(requestStr):
    """Add a vector-index-encoded version of a single numeric column.

    requestStr is a JSON string with projectName, columnName, optional
    newColumnName and optional maxCategories (default 20). Returns the
    transformed Spark DataFrame or an error code string.
    """
    params = json.loads(requestStr)
    projectName = params['projectName']
    columnName = params['columnName']
    # Exactly one input column is allowed.
    if len(columnName.split(",")) != 1:
        return "error_columnInputNumSingle"
    # Output column name defaults to "<column>(向量索引转换)".
    try:
        newColumnName = params['newColumnName']
    except:
        newColumnName = columnName + "(向量索引转换)"
    spark = (SparkSession.builder
             .master("local")
             .config("spark.some.config.option", "some-value")
             .getOrCreate())
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        # Project name or project path could not be resolved.
        return 'error_projectUrl'
    df = spark.read.csv(urls['fileUrl'], header=True, inferSchema=True)
    # Category threshold defaults to 20 when absent or not an integer.
    try:
        maxCategories = int(params['maxCategories'])
    except:
        maxCategories = 20
    # The input column must be numeric to be assembled into a vector.
    assembler = VectorAssembler(inputCols=[columnName], outputCol="features")
    try:
        df = assembler.transform(df)
    except utils.IllegalArgumentException:
        return "error_numerical"
    indexer = VectorIndexer(maxCategories=maxCategories, inputCol="features", outputCol=newColumnName)
    df = indexer.fit(df).transform(df).drop("features")
    df.show()
    # Record this operation in the processing flow (type 10 = vector indexer).
    addProcessingFlow(projectName, "admin", {'type': '10', 'operate': requestStr})
    return df
def getColumnNames():
    """Return the column names of the project's current CSV data.

    Reads projectName from the query string (GET) or form body (other
    methods); returns the list of column names, or "error read" on failure.
    """
    source = request.args if request.method == 'GET' else request.form
    projectName = source.get('projectName')
    fileUrl = getProjectCurrentDataUrl(projectName)['fileUrl']
    try:
        data = pd.read_csv(fileUrl, encoding='utf-8')
        return list(data.columns)
    except:
        return "error read"
def scatterPlot():
    """Build scatter-plot data for exactly two numeric columns and persist it.

    Form params: projectName, columnNames (stringified list of two numeric
    column names). Samples at most 50 points, writes the result under the
    project's 散点图 directory, and returns it as a CORS-enabled JSON response.
    """
    projectName = request.form.get("projectName")
    raw = request.form.get("columnNames").strip("[]")
    columnNames = [name.strip('""') for name in raw.split(',')]
    print('projectName: {}, columnNames: {}'.format(projectName, columnNames))
    # Exactly two columns are required for a 2-D scatter plot.
    if len(columnNames) != 2:
        return "请选择两列,目前的选择为" + str(columnNames)
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        return '项目名或项目路径有误'
    fileUrl = urls['fileUrl']
    projectAddress = urls['projectAddress']
    if fileUrl[-4:] == ".csv":
        df = pd.read_csv(fileUrl, encoding="utf-8")
    else:
        df = pd.read_excel(fileUrl, encoding="utf-8")
    # Both columns must be numeric to plot.
    acceptTypes = ['int64', 'float64']
    for columnName in columnNames:
        if df[columnName].dtype not in acceptTypes:
            return "只能画出数值型列的散点图,但是列 <" + columnName + "> 的类型为 " + str(df[columnName].dtype)
    col1, col2 = columnNames
    pair = df[[col1, col2]]
    # Cap the payload at 50 sampled points.
    if len(df) > 50:
        values = pair.sample(n=50).values.tolist()
    else:
        values = pair.values.tolist()
    res = {"keys": [col1, col2], "values": values}
    # Persist the plot data under the project's 散点图 directory.
    mkdir(projectAddress + '/散点图')
    json_str = json.dumps(res, ensure_ascii=False)
    with open(projectAddress + '/散点图/' + jsonFileName, "w", encoding="utf-8") as f:
        json.dump(json_str, f, ensure_ascii=False)
    print("加载入文件完成...")
    response = jsonify(res)
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
def correlationCoefficient():
    """Compute the correlation matrix of the selected numeric columns.

    Form params: projectName, columnNames (stringified list of numeric column
    names). Writes the matrix (as a nested dict) under the project's 相关系数
    directory and returns it as a CORS-enabled JSON response.
    """
    projectName = request.form.get("projectName")
    raw = request.form.get("columnNames").strip("[]")
    columnNames = [name.strip('""') for name in raw.split(',')]
    print('projectName: {}, columnNames: {}'.format(projectName, columnNames))
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        return '项目名或项目路径有误'
    fileUrl = urls['fileUrl']
    projectAddress = urls['projectAddress']
    if fileUrl[-4:] == ".csv":
        df = pd.read_csv(fileUrl, encoding="utf-8")
    else:
        df = pd.read_excel(fileUrl, encoding="utf-8")
    # Only numeric columns can be correlated.
    acceptTypes = ['int64', 'float64']
    for columnName in columnNames:
        if df[columnName].dtype not in acceptTypes:
            return "只能计算数值型列的相关系数,但是 <" + columnName + "> 的类型为 " + str(df[columnName].dtype)
    # Perf fix: restrict to the selected columns BEFORE computing the
    # correlation matrix, instead of correlating every numeric column of the
    # frame and filtering rows/columns afterwards.
    corr = df[columnNames].corr()
    print(corr)
    res = {index: {column: corr.loc[index, column] for column in corr.columns}
           for index in corr.index}
    print(res)
    # Persist the matrix under the project's 相关系数 directory.
    mkdir(projectAddress + '/相关系数')
    json_str = json.dumps(res, ensure_ascii=False)
    with open(projectAddress + '/相关系数/' + jsonFileName, "w", encoding="utf-8") as f:
        json.dump(json_str, f, ensure_ascii=False)
    print("加载入文件完成...")
    response = jsonify(res)
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
def polynomialExpansionCore(requestStr):
    """Apply polynomial expansion to a set of numeric columns.

    requestStr is a JSON string with projectName, columnNames (comma-joined)
    and optional newColumnName. Returns the transformed Spark DataFrame or an
    error code string.
    """
    params = json.loads(requestStr)
    projectName = params['projectName']
    columnNamesStr = params['columnNames']
    columnNames = columnNamesStr.split(",")
    # Output column name defaults to "多项式扩展(<columns>)".
    try:
        newColumnName = params['newColumnName']
    except:
        newColumnName = "多项式扩展" + "(" + columnNamesStr + ")"
    spark = (SparkSession.builder
             .master("local")
             .config("spark.some.config.option", "some-value")
             .getOrCreate())
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        # Project name or project path could not be resolved.
        return 'error_projectUrl'
    df = spark.read.csv(urls['fileUrl'], header=True, inferSchema=True)
    # All input columns must be numeric to be assembled into a vector.
    assembler = VectorAssembler(inputCols=columnNames, outputCol="features")
    try:
        df = assembler.transform(df)
    except utils.IllegalArgumentException:
        return "error_numerical"
    expander = PolynomialExpansion(inputCol="features", outputCol=newColumnName)
    df = expander.transform(df).drop("features")
    df.show()
    # Record this operation in the processing flow (type 9 = polynomial expansion).
    addProcessingFlow(projectName, "admin", {'type': '9', 'operate': requestStr})
    return df
def getColumnNameWithNumberType():
    """Return the names of numeric (int64/float64) columns of the current data.

    Reads projectName from the query string (GET) or form body (other
    methods); returns the list of numeric column names, or "error read".
    """
    source = request.args if request.method == 'GET' else request.form
    projectName = source.get('projectName')
    fileUrl = getProjectCurrentDataUrl(projectName)['fileUrl']
    try:
        data = pd.read_csv(fileUrl, encoding='utf-8')
        return [col for col in data.columns
                if data[col].dtype == 'int64' or data[col].dtype == 'float64']
    except:
        return "error read"
def quantileDiscretization():
    """HTTP endpoint: quantile-discretize one numeric column of a project's data.

    Accepts requestStr, a JSON string such as
    {"projectName": ..., "columnName": ..., "newColumnName": ..., "numBuckets": 10}
    (numBuckets defaults to 5 inside the core). Delegates the transformation to
    quantileDiscretizationCore, persists the result, and returns all data as JSON.
    """
    if request.method == 'GET':
        requestStr = request.args.get("requestStr")
    else:
        requestStr = request.form.get("requestStr")
    requestDict = json.loads(requestStr)
    projectName = requestDict['projectName']
    spark = (SparkSession.builder
             .master("local")
             .config("spark.some.config.option", "some-value")
             .getOrCreate())
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        # Project name or project path could not be resolved.
        return 'error_projectUrl'
    df = spark.read.csv(urls['fileUrl'], header=True, inferSchema=True)
    # Run the actual discretization; the core returns either a DataFrame or
    # one of the error codes mapped below.
    df = quantileDiscretizationCore(requestStr, df)
    errorMessages = {
        "error_projectUrl": "error: 项目名或项目路径有误",
        "error_columnInputNumSingle": "error: 只能选择一列进行分位数离散化",
        "error_numerical": "error: 只能离散化数值型的列,请检查列名输入是否有误",
    }
    if isinstance(df, str) and df in errorMessages:
        return errorMessages[df]
    df.show()
    # Persist and return the processed data (via pandas).
    df_pandas = df.toPandas()
    df_pandas.to_csv(save_dir, header=True)
    # Record this operation in the processing flow (type 8 = quantile discretization).
    addProcessingFlow(projectName, "admin", {'type': '8', 'operate': requestStr})
    return jsonify({'length': df.count(), 'data': df_pandas.to_json(force_ascii=False)})
def getColumnNamesAndViews():
    """Return the current data's column names plus saved files per view type.

    Reads projectName from the query string (GET) or form body (other
    methods). Returns a dict with 'columnNames' and one saved-file list per
    view type, or "error read" on failure.
    """
    source = request.args if request.method == 'GET' else request.form
    projectName = source.get('projectName')
    fileUrl = getProjectCurrentDataUrl(projectName)['fileUrl']
    result = {}
    try:
        data = pd.read_csv(fileUrl, encoding='utf-8')
        result['columnNames'] = data.columns.values.tolist()
        for view in ('FullTableStatisticsView', 'FrequencyStatisticsView',
                     'CorrelationCoefficientView', 'ScatterPlot'):
            result[view] = getfileListFun(view, projectName)
        return result
    except:
        return "error read"
def oneHotEncoderCore(requestStr):
    """One-hot encode a single (integer-valued) column.

    requestStr is a JSON string with projectName, columnName and optional
    newColumnName. Returns the transformed Spark DataFrame or an error code.
    """
    params = json.loads(requestStr)
    projectName = params['projectName']
    columnName = params['columnName']
    # Exactly one input column is allowed.
    if len(columnName.split(",")) != 1:
        return "error_columnInputNumSingle"
    # Output column name defaults to "<column>(独热编码)".
    try:
        newColumnName = params['newColumnName']
    except:
        newColumnName = columnName + "(独热编码)"
    spark = (SparkSession.builder
             .master("local")
             .config("spark.some.config.option", "some-value")
             .getOrCreate())
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        # Project name or project path could not be resolved.
        return 'error_projectUrl'
    df = spark.read.csv(urls['fileUrl'], header=True, inferSchema=True)
    encoder = OneHotEncoderEstimator(inputCols=[columnName], outputCols=[newColumnName])
    # Fitting fails unless the column holds integer category indices.
    try:
        df = encoder.fit(df).transform(df)
    except:
        return "error_intOnly"
    df.show()
    # Record this operation in the processing flow (type 7 = one-hot encoding).
    addProcessingFlow(projectName, "admin", {'type': '7', 'operate': requestStr})
    return df
def stringIndexerCore(requestStr):
    """Label-encode a single column by value frequency (0 = most frequent).

    requestStr is a JSON string with projectName, columnName and optional
    newColumnName. Returns the transformed Spark DataFrame or an error code.
    """
    params = json.loads(requestStr)
    projectName = params['projectName']
    columnName = params['columnName']
    # Output column name defaults to "<column>(标签化,按频率排序,0为频次最高)".
    try:
        newColumnName = params['newColumnName']
    except:
        newColumnName = columnName + "(标签化,按频率排序,0为频次最高)"
    # Exactly one input column is allowed.
    if len(columnName.split(",")) != 1:
        return "error_columnInputNumSingle"
    spark = (SparkSession.builder
             .master("local")
             .config("spark.some.config.option", "some-value")
             .getOrCreate())
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        # Project name or project path could not be resolved.
        return 'error_projectUrl'
    df = spark.read.csv(urls['fileUrl'], header=True, inferSchema=True)
    indexer = StringIndexer(inputCol=columnName, outputCol=newColumnName)
    df = indexer.fit(df).transform(df)
    df.show()
    # Record this operation in the processing flow (type 16 = string indexer).
    addProcessingFlow(projectName, "admin", {'type': '16', 'operate': requestStr})
    return df
def getViewData():
    """Load a previously saved view-data JSON file and return its contents.

    Form params: viewsName (English view id), projectName, viewFileName
    (without the .json suffix).
    """
    viewsName = request.form.get("viewsName")
    projectName = request.form.get("projectName")
    viewFileName = request.form.get("viewFileName")
    print('viewsName: {}, projectName: {}, viewFileName: {}'.format(viewsName, projectName, viewFileName))
    urls = getProjectCurrentDataUrl(projectName)
    # Map the English view identifier onto its on-disk (Chinese) directory name.
    folders = {
        'FullTableStatisticsView': '全表统计',
        'FrequencyStatisticsView': '频次统计',
        'CorrelationCoefficientView': '相关系数',
        'ScatterPlot': '散点图',
    }
    viewsName = folders.get(viewsName, viewsName)
    viewFileUrl = urls['projectAddress'] + '/' + viewsName + '/' + viewFileName + '.json'
    # Saved view files hold a JSON-encoded string, hence the double decode.
    with open(viewFileUrl, 'r') as load_f:
        payload = json.load(load_f)
    return json.loads(payload)
def frequencyStatistics():
    """Count value frequencies for a single column, persist and return them.

    Form params: projectName, columnNames (exactly one column). Writes the
    counts under the project's 频次统计 directory and returns them as a
    CORS-enabled JSON response.
    """
    projectName = request.form.get("projectName")
    columnName = request.form.get("columnNames")
    # Exactly one column may be selected.
    if len(columnName.split(",")) > 1:
        print(len(columnName.split(",")))
        return "频次统计只能选择一列,请勿多选"
    columnName = columnName.strip("[]")
    columnName = columnName.strip('""')
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        return '项目名或项目路径有误'
    fileUrl = urls['fileUrl']
    projectAddress = urls['projectAddress']
    if fileUrl[-4:] == ".csv":
        df = pd.read_csv(fileUrl, encoding="utf-8")
    else:
        df = pd.read_excel(fileUrl, encoding="utf-8")
    # Perf fix: compute value_counts() once instead of three times per loop
    # iteration; also dropped the stray `i += 1` inside the for loop.
    counts = df[columnName].value_counts()
    res = {value: str(count) for value, count in counts.items()}
    # Persist the counts under the project's 频次统计 directory.
    mkdir(projectAddress + '/频次统计')
    json_str = json.dumps(res, ensure_ascii=False)
    with open(projectAddress + '/频次统计/' + jsonFileName, "w", encoding="utf-8") as f:
        json.dump(json_str, f, ensure_ascii=False)
    print("加载入文件完成...")
    response = jsonify(res)
    print(res)
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
def getfileListFun(viewsName, projectName):
    """List saved view files (without the .json suffix) for one view type.

    Maps the English view identifier onto its on-disk (Chinese) directory,
    then returns the names of the files under it, excluding the current
    working file jsonFileName. Returns [] when the directory does not exist.
    """
    urls = getProjectCurrentDataUrl(projectName)
    folders = {
        'FullTableStatisticsView': '全表统计',
        'FrequencyStatisticsView': '频次统计',
        'CorrelationCoefficientView': '相关系数',
        'ScatterPlot': '散点图',
    }
    viewsName = folders.get(viewsName, viewsName)
    projectAddress = urls['projectAddress'] + '/' + viewsName
    if not os.path.exists(projectAddress):
        return []
    # Only the first walk() level is used: the files directly in the directory.
    for _, _, files in os.walk(projectAddress):
        print(files)
        # Strip the ".json" suffix (5 chars); skip the working file itself.
        return [name[:-5] for name in files if name != jsonFileName]
def saveViewData():
    """Save the current working view file under a user-chosen name.

    Form params: viewsName (English view id), projectName, newFileName
    (without the .json suffix). Copies the working file jsonFileName to the
    new name and returns the refreshed file list, or an error string.
    """
    viewsName = request.form.get("viewsName")
    projectName = request.form.get("projectName")
    newFileName = request.form.get("newFileName")
    print('viewsName: {}, projectName: {}, newFileName: {}'.format(viewsName, projectName, newFileName))
    urls = getProjectCurrentDataUrl(projectName)
    # Map the English view identifier onto its on-disk (Chinese) directory name.
    folders = {
        'FullTableStatisticsView': '全表统计',
        'FrequencyStatisticsView': '频次统计',
        'CorrelationCoefficientView': '相关系数',
        'ScatterPlot': '散点图',
    }
    viewsName = folders.get(viewsName, viewsName)
    base = urls['projectAddress'] + '/' + viewsName + '/'
    viewFileUrl = base + jsonFileName
    newfile_path = base + newFileName + '.json'
    try:
        # Snapshot the working file under the new name.
        shutil.copyfile(viewFileUrl, newfile_path)
        print('保存成功')
        return getfileListFun(viewsName, projectName)
    except Exception as e:
        print('保存失败', e)
        return '保存失败' + str(e)
def fullTableStatistics():
    """Compute per-column descriptive statistics for the selected columns.

    Form params: projectName, columnNames (stringified list). Numeric
    (int64/float64) columns get the full statistic set; other columns are
    typed "text" with only a count. The result is written under the project's
    全表统计 directory and returned as a CORS-enabled JSON response.
    """
    print(request.form)
    columnNameStr = request.form.get('columnNames')
    projectName = request.form.get("projectName")
    # Strip the surrounding brackets, then the quotes around each name.
    columnNameStr = columnNameStr[1:len(columnNameStr) - 1]
    columnNames = columnNameStr.split(',')
    for i in range(len(columnNames)):
        columnNames[i] = columnNames[i][1:len(columnNames[i]) - 1]
    print('projectName: {}, columnNames: {}'.format(projectName, columnNames))
    urls = getProjectCurrentDataUrl(projectName)
    fileUrl = urls['fileUrl']
    projectAddress = urls['projectAddress']
    if fileUrl[-4:] == ".csv":
        df_excel = pd.read_csv(fileUrl, encoding="utf-8")
    else:
        df_excel = pd.read_excel(fileUrl, encoding="utf-8")
    res = []
    statistics = [' 字段名', ' 类型', '总数', '最小值', '最小值位置', '25%分位数', '中位数',
                  '75%分位数', '均值', '最大值', '最大值位置', '平均绝对偏差', '方差',
                  '标准差', '偏度', '峰度']
    for columnName in columnNames:
        info = {}.fromkeys(statistics)
        info[' 字段名'] = columnName
        info[' 类型'] = df_excel[columnName].dtype
        if info[' 类型'] == 'int64' or info[' 类型'] == 'float64':
            col = df_excel[columnName]
            info[' 类型'] = 'number'
            info['总数'] = str(col.count())
            info['最小值'] = str(col.min())
            info['最小值位置'] = str(col.idxmin())
            info['25%分位数'] = str(col.quantile(.25))
            info['中位数'] = str(col.median())
            info['75%分位数'] = str(col.quantile(.75))
            info['均值'] = str(col.mean())
            info['最大值'] = str(col.max())
            info['最大值位置'] = str(col.idxmax())
            info['平均绝对偏差'] = str(col.mad())
            info['方差'] = str(col.var())
            info['标准差'] = str(col.std())
            info['偏度'] = str(col.skew())
            info['峰度'] = str(col.kurt())
            print('int')
        else:
            info[' 类型'] = "text"
            info['总数'] = str(df_excel[columnName].count())
            print("text")
        res.append(info)
    # Persist the result under the project's 全表统计 directory.
    # Fix: removed the unused function-local `from app.constFile import const`
    # and `save_dir` binding that this function never read.
    mkdir(projectAddress + '/全表统计')
    json_str = json.dumps(res, ensure_ascii=False)
    with open(projectAddress + '/全表统计/' + jsonFileName, "w", encoding="utf-8") as f:
        json.dump(json_str, f, ensure_ascii=False)
    print("加载入文件完成...")
    result = {}
    result['fileName'] = jsonFileName
    result['data'] = res
    response = jsonify(result)
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
def chiSqSelectorCore(requestStr):
    """Select the top numTopFeatures columns most related (chi-squared) to a label.

    requestStr is a JSON string with projectName, columnNames (comma-joined,
    at least two), columnName_label (a single numeric column) and optional
    numTopFeatures (default 1) and newColumnName. Returns the transformed
    Spark DataFrame or an error code string.
    """
    requestDict = json.loads(requestStr)
    projectName = requestDict['projectName']
    columnNamesStr = requestDict['columnNames']
    columnName_label = requestDict['columnName_label']
    # The label must be exactly one column.
    if len(columnName_label.split(",")) != 1:
        return "error_columnInputNumSingle"
    # Consistency fix: cast numTopFeatures to int like the k / maxCategories
    # parameters of the sibling *Core functions — the JSON value may arrive
    # as a string, which ChiSqSelector would reject.
    try:
        numTopFeatures = int(requestDict['numTopFeatures'])
    except:
        numTopFeatures = 1
    columnNames = columnNamesStr.split(",")
    # Selecting from fewer than two candidate columns is meaningless.
    if len(columnNames) < 2:
        return "error_columnInputNumMultiple"
    # Output column name defaults to a description of the selection.
    try:
        newColumnName = requestDict['newColumnName']
    except:
        newColumnName = "卡方选择" + "(与 [" + str(columnName_label) + "] 相关的前 " + str(numTopFeatures) + " 个特征列)"
    spark = SparkSession \
        .builder \
        .master("local") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        # Project name or project path could not be resolved.
        return 'error_projectUrl'
    fileUrl = urls['fileUrl']
    df = spark.read.csv(fileUrl, header=True, inferSchema=True)
    # Assemble the candidate columns into one vector; all must be numeric.
    vecAssembler = VectorAssembler(inputCols=columnNames, outputCol="features")
    try:
        df = vecAssembler.transform(df)
    except utils.IllegalArgumentException:
        return "error_numerical"
    # Materialize the label column expected by ChiSqSelector.
    df = df.withColumn("label", df[columnName_label])
    selector = ChiSqSelector(numTopFeatures=numTopFeatures, outputCol=newColumnName)
    # Fitting fails when the label column is not numeric.
    try:
        df = selector.fit(df).transform(df)
    except utils.IllegalArgumentException:
        return "error_numerical"
    df = df.drop("features")
    df.show()
    # Record this operation in the processing flow (type 13 = chi-sq selection).
    operateParameter = {}
    operateParameter['type'] = '13'
    operateParameter['operate'] = requestStr
    addProcessingFlow(projectName, "admin", operateParameter)
    return df