Example #1
def deleteViewData():
    viewsName = request.form.get("viewsName")
    projectName = request.form.get("projectName")
    fileName = request.form.get("fileName")
    print('viewsName: {}, projectName: {}, fileName: {}'.format(viewsName, projectName, fileName))
    urls = getProjectCurrentDataUrl(projectName)
    if (viewsName == 'FullTableStatisticsView'):
        viewsName = '全表统计'
    elif (viewsName == 'FrequencyStatisticsView'):
        viewsName = '频次统计'
    elif (viewsName == 'CorrelationCoefficientView'):
        viewsName = '相关系数'
    elif (viewsName == 'ScatterPlot'):
        viewsName = '散点图'
    file_path = urls['projectAddress']+'/'+viewsName+'/' + fileName + '.json'
    print('删除文件的路径:',file_path)
    # Delete the file
    try:
        if os.path.exists(file_path):
            os.remove(file_path)
            print('删除成功')
            return getfileListFun(viewsName, projectName)
        else:
            print('文件不存在')
            return '文件不存在'
    except Exception as e:
        print('删除失败', e)
        return '删除失败' + str(e)
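The English-to-Chinese view-name mapping above recurs in Examples #16, #18, and #19. A minimal table-driven sketch that could replace the four if/elif chains (VIEW_NAME_MAP and toChineseViewName are assumed names, not part of the original code):

# Hypothetical shared lookup; unknown names fall through unchanged.
VIEW_NAME_MAP = {
    'FullTableStatisticsView': '全表统计',
    'FrequencyStatisticsView': '频次统计',
    'CorrelationCoefficientView': '相关系数',
    'ScatterPlot': '散点图',
}

def toChineseViewName(viewsName):
    return VIEW_NAME_MAP.get(viewsName, viewsName)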
Example #2
def executeAgain():
    projectName = request.form.get('projectName')
    userId = request.form.get('userId')
    project = getProjectByNameAndUserId(projectName, userId)
    # print(project)
    processflow = getProcessFlowByProjectId(project.id)
    operates = json.loads(processflow.operates)
    fileUrl = getProjectCurrentDataUrl(projectName)['fileUrl']
    # print(operates)
    spark = SparkSession \
        .builder \
        .master("local") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.read.format("CSV").option("header", "true").load(fileUrl)
    for item in operates:
        if item['type'] == '1':
            # Parse the filter parameters
            condition = parsingFilterParameters(item['operate'])
            # Apply the filter function
            df = process.filterCore(spark, df, condition)
            df.show()
    # Write the processed data to a file (hardcoded output path)
    df.toPandas().to_csv("/home/zk/data/test.csv", header=True)
    # Return the first 50 rows
    data2 = df.limit(50).toJSON().collect()
    print(data2)
    data3 = ",".join(data2)
    print(data3)
    data4 = '[' + data3 + ']'
    print(data4)
    return jsonify({'length': df.count(), 'data': json.loads(data4)})
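The three intermediate strings above only wrap the per-row JSON strings from toJSON().collect() into one JSON array. A sketch of an equivalent one-step version of the handler's tail (variable names are assumptions):

    rows = df.limit(50).toJSON().collect()         # one JSON string per row
    data = json.loads('[' + ','.join(rows) + ']')  # parse as a single JSON array
    return jsonify({'length': df.count(), 'data': data})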
Example #3
def currentDataPreview():
    if request.method == 'GET':
        # GET parameters arrive in the query string, not the form body
        start = request.args.get('start')
        end = request.args.get('end')
        projectName = request.args.get('projectName')
    else:
        start = request.form.get('start')
        end = request.form.get('end')
        projectName = request.form.get('projectName')
    print('start: {}, end: {}, projectName: {}'.format(start, end,
                                                       projectName))
    try:
        urls = getProjectCurrentDataUrl(projectName)
        # print(urls)
        fileUrl = urls['fileUrl']
        # print(fileUrl)
    except Exception:
        return "error"
    try:
        data = pd.read_csv(fileUrl, encoding='utf-8')
        data2 = data[int(start):int(end)].to_json(orient='records',
                                                  force_ascii=False)
        return jsonify({'length': len(data), 'data': json.loads(data2)})
    except Exception:
        return "error read"
Example #4
def pcaCore(requestStr):
    # Convert the request parameters from JSON to a dict and unpack them
    requestDict = json.loads(requestStr)
    projectName = requestDict['projectName']
    columnNames = requestDict['columnNames']
    # The new column name defaults to "降维结果" unless the user specifies one
    try:
        newColumnName = requestDict['newColumnName']
    except KeyError:
        newColumnName = "降维结果"

    # Spark session
    spark = SparkSession \
        .builder \
        .master("local") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    # Resolve the project path and read the CSV
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        return 'error_projectUrl'  # error type: bad project name or path
    fileUrl = urls['fileUrl']
    df = spark.read.csv(fileUrl, header=True, inferSchema=True)

    # The target dimension k defaults to 3 unless the user specifies one
    try:
        k = int(requestDict['k'])
    except (KeyError, ValueError):
        k = 3

    # The target dimension k must be smaller than the original dimension
    if k >= len(columnNames):
        return "error_targetDimensions"

    # Assemble the input columns into a vector; they must all be numeric
    vecAssembler = VectorAssembler(inputCols=columnNames, outputCol="features")
    try:
        df = vecAssembler.transform(df)
    except utils.IllegalArgumentException:
        return "error_numerical"

    # Configure the PCA model
    pca = PCA(k=k, inputCol="features", outputCol=newColumnName)

    # Fit and transform
    df = pca.fit(df).transform(df)

    df = df.drop("features")
    df.show()

    # Append a processing-flow record
    operateParameter = {}
    operateParameter['type'] = '12'
    operateParameter['operate'] = requestStr
    addProcessingFlow(projectName, "admin", operateParameter)

    return df
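A sketch of the requestStr payload pcaCore expects. Note that columnNames must arrive as a list here (it is passed straight to VectorAssembler and measured with len), while k and newColumnName are optional; the column values below are illustrative:

import json

requestStr = json.dumps({
    "projectName": "订单分析",
    "columnNames": ["销售额", "数量", "折扣", "利润"],  # numeric columns only
    "k": 2,                                              # optional, defaults to 3
    "newColumnName": "降维结果",                          # optional
}, ensure_ascii=False)
df = pcaCore(requestStr)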
Example #5
def fullTableStatistics2():
    # columnNames = request.form.getlist("columns")
    # projectName = request.form.getlist("projectName")
    columnNames = [ "行 ID",
                    "订单 ID",
                    "订购日期",
                    "装运日期",
                    "装运方式",
                    "客户 ID",
                    "客户名称",
                    "细分市场",
                    "邮政编码 (Postal Code)",
                    "城市 (City)",
                    "省/市/自治区 (State/Province)",
                    "国家/地区 (Country)",
                    "地区",
                    "市场",
                    "产品 ID",
                    "类别",
                    "子类别",
                    "产品名称",
                    "销售额",
                    "数量",
                    "折扣",
                    "利润",
                    "装运成本",
                    "订单优先级"]
    projectName = "爱德信息分析项目"
    fileUrl = getProjectCurrentDataUrl(projectName)['fileUrl']
    if fileUrl[-4:] == ".csv":
        df_excel = pd.read_csv(fileUrl, encoding="utf-8")
    else:
        df_excel = pd.read_excel(fileUrl, encoding="utf-8")
    res = []
    statistics = ['字段名','类型','总数','最小值','最小值位置','25%分位数','中位数','75%分位数','均值','最大值','最大值位置','平均绝对偏差','方差','标准差','偏度','峰度']
    for columnName in columnNames:
        info = {}.fromkeys(statistics)
        info['字段名'] = columnName
        info['类型'] = df_excel[columnName].dtype
        if info['类型'] == 'int64' or info['类型'] == 'float64':
            info['总数'] = df_excel[columnName].count()
            info['最小值'] = df_excel[columnName].min()
            info['最小值位置'] = df_excel[columnName].idxmin()
            info['25%分位数'] = df_excel[columnName].quantile(.25)
            info['中位数'] = df_excel[columnName].median()
            info['75%分位数'] = df_excel[columnName].quantile(.75)
            info['均值'] = df_excel[columnName].mean()
            info['最大值'] = df_excel[columnName].max()
            info['最大值位置'] = df_excel[columnName].idxmax()
            info['平均绝对偏差'] = df_excel[columnName].mad()  # Series.mad() was removed in pandas 2.0; see the sketch after this example
            info['方差'] = df_excel[columnName].var()
            info['标准差'] = df_excel[columnName].std()
            info['偏度'] = df_excel[columnName].skew()
            info['峰度'] = df_excel[columnName].kurt()
        else:
            info['类型'] = "text"
        res.append(info)
    return res
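Series.mad() used above was removed in pandas 2.0. A drop-in sketch for newer pandas (the helper name is an assumption):

def mad(series):
    # mean absolute deviation around the mean, matching the old Series.mad()
    return (series - series.mean()).abs().mean()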
Example #6
def vectorIndexerCore(requestStr):
    # Convert the request parameters from JSON to a dict and unpack them
    requestDict = json.loads(requestStr)
    projectName = requestDict['projectName']
    columnName = requestDict['columnName']
    # Exactly one input column is allowed
    if len(columnName.split(",")) != 1:
        return "error_columnInputNumSingle"
    # The new column name defaults to columnName + "(向量索引转换)" unless the user specifies one
    try:
        newColumnName = requestDict['newColumnName']
    except KeyError:
        newColumnName = columnName + "(向量索引转换)"

    # Spark session
    spark = SparkSession \
        .builder \
        .master("local") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    # Resolve the project path and read the CSV
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        return 'error_projectUrl'  # error type: bad project name or path
    fileUrl = urls['fileUrl']
    df = spark.read.csv(fileUrl, header=True, inferSchema=True)

    # The categorical threshold maxCategories defaults to 20 unless the user specifies one
    try:
        maxCategories = int(requestDict['maxCategories'])
    except (KeyError, ValueError):
        maxCategories = 20

    # Assemble the input column into a vector; it must be numeric
    vecAssembler = VectorAssembler(inputCols=[columnName], outputCol="features")
    try:
        df = vecAssembler.transform(df)
    except utils.IllegalArgumentException:
        return "error_numerical"

    # Configure the VectorIndexer (vector-index transformation model)
    indexer = VectorIndexer(maxCategories=maxCategories, inputCol="features", outputCol=newColumnName)

    # Fit and transform
    df = indexer.fit(df).transform(df)

    df = df.drop("features")
    df.show()

    # Append a processing-flow record
    operateParameter = {}
    operateParameter['type'] = '10'
    operateParameter['operate'] = requestStr
    addProcessingFlow(projectName, "admin", operateParameter)

    return df
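A sketch of the requestStr payload vectorIndexerCore expects (values are illustrative; maxCategories is optional):

import json

requestStr = json.dumps({
    "projectName": "订单分析",
    "columnName": "数量",   # exactly one numeric column
    "maxCategories": 20,    # optional; up to this many distinct values counts as categorical
}, ensure_ascii=False)
df = vectorIndexerCore(requestStr)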
Example #7
def getColumnNames():
    if request.method == 'GET':
        projectName = request.args.get('projectName')
    else:
        projectName = request.form.get('projectName')
    fileUrl = getProjectCurrentDataUrl(projectName)['fileUrl']
    try:
        data = pd.read_csv(fileUrl, encoding='utf-8')
        return data.columns.values.tolist()
    except Exception:
        return "error read"
Example #8
def scatterPlot():
    # Accept request parameters: projectName and columnNames (exactly two numeric columns)
    # e.g. projectName=订单分析, columnNames=
    projectName = request.form.get("projectName")
    columnNameStr = request.form.get("columnNames")
    columnNameStr = columnNameStr.strip("[]")
    columnNames = columnNameStr.split(',')
    for i in range(len(columnNames)):
        columnNames[i] = columnNames[i].strip('""')
    print('projectName: {}, columnNames: {}'.format(projectName, columnNames))

    # Error out unless exactly two columns are selected
    if len(columnNames) != 2:
        return "请选择两列,目前的选择为" + str(columnNames)

    # Read the project's current data
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        return '项目名或项目路径有误'
    fileUrl = urls['fileUrl']
    projectAddress = urls['projectAddress']
    if fileUrl[-4:] == ".csv":
        df = pd.read_csv(fileUrl, encoding="utf-8")
    else:
        df = pd.read_excel(fileUrl, encoding="utf-8")

    # The selected columns must be numeric; otherwise return an error
    acceptTypes = ['int64', 'float64']
    for columnName in columnNames:
        if df[columnName].dtype not in acceptTypes:
            return "只能画出数值型列的散点图,但是列 <" + columnName + "> 的类型为 " + str(df[columnName].dtype)
    # Assemble the scatter data (at most 50 sampled points)
    col1 = columnNames[0]
    col2 = columnNames[1]
    res = {}
    res.setdefault("keys", [col1, col2])
    if len(df) > 50:
        data = df[[col1, col2]].sample(n=50).values.tolist()
    else:
        data = df[[col1, col2]].values.tolist()
    res.setdefault("values", data)

    # Write to file
    mkdir(projectAddress + '/散点图')
    json_str = json.dumps(res, ensure_ascii=False)
    with open(projectAddress + '/散点图/' + jsonFileName, "w", encoding="utf-8") as f:
        json.dump(json_str, f, ensure_ascii=False)
        print("加载入文件完成...")

    response = jsonify(res)
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
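Note that json.dump(json_str, f) above serializes an already-serialized string, so the file stores a JSON-encoded string rather than a JSON object; getViewData (Example #16) mirrors this with json.load followed by json.loads. A sketch of the read-back half (path is a placeholder):

with open(path, encoding="utf-8") as f:
    load_dict = json.loads(json.load(f))  # unwrap the string, then parse it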
Example #9
def correlationCoefficient():
    # Accept request parameters: projectName and columnNames (multi-select, numeric columns only)
    # e.g. projectName=订单分析; columnNames=销售额,折扣,装运成本
    projectName = request.form.get("projectName")
    columnNameStr = request.form.get("columnNames")
    columnNameStr = columnNameStr.strip("[]")
    columnNames = columnNameStr.split(',')
    for i in range(len(columnNames)):
        columnNames[i] = columnNames[i].strip('""')
    print('projectName: {}, columnNames: {}'.format(projectName, columnNames))

    # Read the project's current data
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        return '项目名或项目路径有误'
    fileUrl = urls['fileUrl']
    projectAddress = urls['projectAddress']
    if fileUrl[-4:] == ".csv":
        df = pd.read_csv(fileUrl, encoding="utf-8")
    else:
        df = pd.read_excel(fileUrl, encoding="utf-8")

    # The selected columns must be numeric; otherwise return an error
    acceptTypes = ['int64', 'float64']
    for columnName in columnNames:
        if df[columnName].dtype not in acceptTypes:
            return "只能计算数值型列的相关系数,但是 <" + columnName + "> 的类型为 " + str(df[columnName].dtype)

    # Compute the correlation-coefficient matrix
    df = df.corr()
    res = {}
    print(df)
    # Convert to a dict, keeping only the columns the user selected in columnNames
    for index in df.index:
        if index in columnNames:
            temp = {}
            for column in df.columns:
                if column in columnNames:
                    temp.setdefault(column, df.loc[index, column])
            res.setdefault(index, temp)
    print(res)

    # Write to file
    mkdir(projectAddress + '/相关系数')
    json_str = json.dumps(res, ensure_ascii=False)
    with open(projectAddress + '/相关系数/' + jsonFileName, "w", encoding="utf-8") as f:
        json.dump(json_str, f, ensure_ascii=False)
        print("加载入文件完成...")
    response = jsonify(res)
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
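df.corr() computes the full Pearson correlation matrix over every numeric column, and the loop above then filters it down to the selection; restricting the frame first is equivalent and avoids the extra work (a sketch):

res = df[columnNames].corr().to_dict(orient='index')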
Example #10
def polynomialExpansionCore(requestStr):
    # Convert the request parameters from JSON to a dict and unpack them
    requestDict = json.loads(requestStr)
    projectName = requestDict['projectName']
    columnNamesStr = requestDict['columnNames']
    columnNames = columnNamesStr.split(",")
    # The new column name defaults to "多项式扩展(columnNames)" unless the user specifies one
    try:
        newColumnName = requestDict['newColumnName']
    except KeyError:
        newColumnName = "多项式扩展" + "(" + columnNamesStr + ")"

    # Spark session
    spark = SparkSession \
        .builder \
        .master("local") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    # Resolve the project path and read the CSV
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        return 'error_projectUrl'  # error type: bad project name or path
    fileUrl = urls['fileUrl']
    df = spark.read.csv(fileUrl, header=True, inferSchema=True)

    # Assemble the input columns into a vector; they must all be numeric
    vecAssembler = VectorAssembler(inputCols=columnNames, outputCol="features")
    try:
        df = vecAssembler.transform(df)
    except utils.IllegalArgumentException:
        return "error_numerical"

    # Configure the polynomial-expansion model (degree is left at the PolynomialExpansion default of 2)
    px = PolynomialExpansion(inputCol="features", outputCol=newColumnName)

    # Transform (PolynomialExpansion has no fit step)
    df = px.transform(df)

    df = df.drop("features")
    df.show()

    # Append a processing-flow record
    operateParameter = {}
    operateParameter['type'] = '9'
    operateParameter['operate'] = requestStr
    addProcessingFlow(projectName, "admin", operateParameter)

    return df
Example #11
def getColumnNameWithNumberType():
    if request.method == 'GET':
        projectName = request.args.get('projectName')
    else:
        projectName = request.form.get('projectName')
    fileUrl = getProjectCurrentDataUrl(projectName)['fileUrl']
    try:
        data = pd.read_csv(fileUrl, encoding='utf-8')
        res = []
        for col in data.columns.values.tolist():
            if data[col].dtype == 'int64' or data[col].dtype == 'float64':
                res.append(col)
        return res
    except Exception:
        return "error read"
Example #12
def quantileDiscretization():
    # Accept request parameters, e.g. {"projectName":"订单分析","columnName":"装运成本","newColumnName":"装运成本(分位数离散化)","numBuckets":10}
    # numBuckets sets the number of bins and defaults to 5
    if request.method == 'GET':
        requestStr = request.args.get("requestStr")
    else:
        requestStr = request.form.get("requestStr")

    # Convert the request parameters from JSON to a dict and unpack them
    requestDict = json.loads(requestStr)
    projectName = requestDict['projectName']

    # Spark session
    spark = SparkSession \
        .builder \
        .master("local") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    # Resolve the project path and read the CSV
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        return 'error_projectUrl'  # error type: bad project name or path
    fileUrl = urls['fileUrl']
    df = spark.read.csv(fileUrl, header=True, inferSchema=True)

    # Run the core function to get the processed df (Spark format)
    df = quantileDiscretizationCore(requestStr, df)
    if df == "error_projectUrl":
        return "error: 项目名或项目路径有误"
    elif df == "error_columnInputNumSingle":
        return "error: 只能选择一列进行分位数离散化"
    elif df == "error_numerical":
        return "error: 只能离散化数值型的列,请检查列名输入是否有误"

    df.show()
    # Write the processed data to a file via pandas (save_dir is a module-level path; see Example #20)
    df_pandas = df.toPandas()
    df_pandas.to_csv(save_dir, header=True)
    # Append a processing-flow record
    operateParameter = {}
    operateParameter['type'] = '8'
    operateParameter['operate'] = requestStr
    addProcessingFlow(projectName, "admin", operateParameter)

    return jsonify({'length': df.count(), 'data': df_pandas.to_json(force_ascii=False)})
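A client-side sketch using the payload from the comment above (route and host are assumptions):

import requests, json

payload = {"projectName": "订单分析", "columnName": "装运成本",
           "newColumnName": "装运成本(分位数离散化)", "numBuckets": 10}
resp = requests.post('http://localhost:5000/quantileDiscretization',
                     data={'requestStr': json.dumps(payload, ensure_ascii=False)})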
Example #13
def getColumnNamesAndViews():
    if request.method == 'GET':
        projectName = request.args.get('projectName')
    else:
        projectName = request.form.get('projectName')
    fileUrl = getProjectCurrentDataUrl(projectName)['fileUrl']
    result = {}
    try:
        data = pd.read_csv(fileUrl, encoding='utf-8')
        result['columnNames'] = data.columns.values.tolist()
        result['FullTableStatisticsView'] = getfileListFun('FullTableStatisticsView', projectName)
        result['FrequencyStatisticsView'] = getfileListFun('FrequencyStatisticsView', projectName)
        result['CorrelationCoefficientView'] = getfileListFun('CorrelationCoefficientView', projectName)
        result['ScatterPlot'] = getfileListFun('ScatterPlot', projectName)
        return result
    except Exception:
        return "error read"
Example #14
def oneHotEncoderCore(requestStr):
    # Convert the request parameters from JSON to a dict and unpack them
    requestDict = json.loads(requestStr)
    projectName = requestDict['projectName']
    columnName = requestDict['columnName']
    # Exactly one input column is allowed
    if len(columnName.split(",")) != 1:
        return "error_columnInputNumSingle"
    # The new column name defaults to columnName + "(独热编码)" unless the user specifies one
    try:
        newColumnName = requestDict['newColumnName']
    except KeyError:
        newColumnName = columnName + "(独热编码)"

    # Spark session
    spark = SparkSession \
        .builder \
        .master("local") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    # Resolve the project path and read the CSV
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        return 'error_projectUrl'  # error type: bad project name or path
    fileUrl = urls['fileUrl']
    df = spark.read.csv(fileUrl, header=True, inferSchema=True)

    # Configure the one-hot-encoding model (OneHotEncoderEstimator is the Spark 2.x name; Spark 3.0 renamed it OneHotEncoder)
    ohe = OneHotEncoderEstimator(inputCols=[columnName], outputCols=[newColumnName])

    # Fit and transform; the column must contain integer category indices
    try:
        df = ohe.fit(df).transform(df)
    except:
        return "error_intOnly"

    df.show()

    # Append a processing-flow record
    operateParameter = {}
    operateParameter['type'] = '7'
    operateParameter['operate'] = requestStr
    addProcessingFlow(projectName, "admin", operateParameter)

    return df
Example #15
def stringIndexerCore(requestStr):
    # Convert the request parameters from JSON to a dict and unpack them
    requestDict = json.loads(requestStr)
    projectName = requestDict['projectName']
    columnName = requestDict['columnName']
    # The new column name defaults to columnName + "(标签化,按频率排序,0为频次最高)" unless the user specifies one
    try:
        newColumnName = requestDict['newColumnName']
    except KeyError:
        newColumnName = columnName + "(标签化,按频率排序,0为频次最高)"
    # Exactly one input column is allowed
    if len(columnName.split(",")) != 1:
        return "error_columnInputNumSingle"

    # Spark session
    spark = SparkSession \
        .builder \
        .master("local") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    # Resolve the project path and read the CSV
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        return 'error_projectUrl'  # error type: bad project name or path
    fileUrl = urls['fileUrl']
    df = spark.read.csv(fileUrl, header=True, inferSchema=True)

    # Configure the StringIndexer (string-to-label model)
    si = StringIndexer(inputCol=columnName, outputCol=newColumnName)

    # Fit and transform
    df = si.fit(df).transform(df)

    df.show()

    # Append a processing-flow record
    operateParameter = {}
    operateParameter['type'] = '16'
    operateParameter['operate'] = requestStr
    addProcessingFlow(projectName, "admin", operateParameter)

    return df
Example #16
def getViewData():
    viewsName = request.form.get("viewsName")
    projectName = request.form.get("projectName")
    viewFileName = request.form.get("viewFileName")
    print('viewsName: {}, projectName: {}, viewFileName: {}'.format(viewsName, projectName, viewFileName))
    urls = getProjectCurrentDataUrl(projectName)
    if (viewsName == 'FullTableStatisticsView'):
        viewsName = '全表统计'
    elif (viewsName == 'FrequencyStatisticsView'):
        viewsName = '频次统计'
    elif (viewsName == 'CorrelationCoefficientView'):
        viewsName = '相关系数'
    elif (viewsName == 'ScatterPlot'):
        viewsName = '散点图'
    viewFileUrl = urls['projectAddress']+'/'+viewsName+'/' + viewFileName +'.json'
    # Read the saved view data; the file stores a JSON-encoded string (the
    # writers double-encode with json.dumps + json.dump), so json.load
    # unwraps the string and json.loads parses it
    with open(viewFileUrl, 'r') as load_f:
        load_dict = json.load(load_f)
        load_dict = json.loads(load_dict)
    return load_dict
Example #17
def frequencyStatistics():
    # Accept request parameters: projectName and columnName (exactly one column)
    # e.g. projectName=医药病例分类分析; columnName=Item
    projectName = request.form.get("projectName")
    columnName = request.form.get("columnNames")
    # Error out if more than one column is selected
    if len(columnName.split(",")) > 1:
        print(len(columnName.split(",")))
        return "频次统计只能选择一列,请勿多选"
    columnName = columnName.strip("[]")
    columnName = columnName.strip('""')

    # Read the project's current data
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        return '项目名或项目路径有误'
    fileUrl = urls['fileUrl']
    projectAddress = urls['projectAddress']
    if fileUrl[-4:] == ".csv":
        df = pd.read_csv(fileUrl, encoding="utf-8")
    else:
        df = pd.read_excel(fileUrl, encoding="utf-8")

    # Frequency statistics: compute value_counts once, then copy it into a dict
    res = {}
    counts = df[columnName].value_counts()
    for i in range(len(counts.index)):
        res.setdefault(counts.index[i], str(counts.values[i]))

    # Write to file
    mkdir(projectAddress + '/频次统计')
    json_str = json.dumps(res, ensure_ascii=False)
    with open(projectAddress + '/频次统计/' + jsonFileName, "w", encoding="utf-8") as f:
        json.dump(json_str, f, ensure_ascii=False)
        print("加载入文件完成...")

    response = jsonify(res)
    print(res)
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
Example #18
def getfileListFun(viewsName,projectName):
    urls = getProjectCurrentDataUrl(projectName)
    if(viewsName == 'FullTableStatisticsView'):
        viewsName = '全表统计'
    elif(viewsName == 'FrequencyStatisticsView'):
        viewsName = '频次统计'
    elif (viewsName == 'CorrelationCoefficientView'):
        viewsName = '相关系数'
    elif (viewsName == 'ScatterPlot'):
        viewsName = '散点图'
    projectAddress = urls['projectAddress']+'/'+viewsName
    if not os.path.exists(projectAddress):
        return []
    for root, dirs, files in os.walk(projectAddress):
        # print(root)  # current directory path
        # print(dirs)  # subdirectories under the current path
        print(files)  # non-directory files under the current path
    # After the loop, `files` holds the listing of the last directory visited
    # (the view directory itself when it has no subdirectories)
    result = []
    for i in range(len(files)):
        if files[i] != jsonFileName:
            le = len(files[i])
            result.append(files[i][0:le - 5])  # strip the ".json" suffix
    return result
Example #19
def saveViewData():
    viewsName = request.form.get("viewsName")
    projectName = request.form.get("projectName")
    newFileName = request.form.get("newFileName")
    print('viewsName: {}, projectName: {}, newFileName: {}'.format(viewsName, projectName, newFileName))
    urls = getProjectCurrentDataUrl(projectName)
    if (viewsName == 'FullTableStatisticsView'):
        viewsName = '全表统计'
    elif (viewsName == 'FrequencyStatisticsView'):
        viewsName = '频次统计'
    elif (viewsName == 'CorrelationCoefficientView'):
        viewsName = '相关系数'
    elif (viewsName == 'ScatterPlot'):
        viewsName = '散点图'
    viewFileUrl = urls['projectAddress']+'/'+viewsName+'/' + jsonFileName
    newfile_path = urls['projectAddress']+'/'+viewsName+'/' + newFileName + '.json'
    # Copy the file
    try:
        shutil.copyfile(viewFileUrl, newfile_path)
        print('保存成功')
        return getfileListFun(viewsName, projectName)
    except Exception as e:
        print('保存失败', e)
        return '保存失败' + str(e)
Example #20
def fullTableStatistics():
    # Receive and unpack the request parameters
    print(request.form)
    columnNameStr = request.form.get('columnNames')
    projectName = request.form.get("projectName")
    columnNameStr = columnNameStr[1:len(columnNameStr)-1]  # strip the surrounding brackets
    columnNames = columnNameStr.split(',')
    for i in range(len(columnNames)):
        columnNames[i] = columnNames[i][1:len(columnNames[i])-1]  # strip the surrounding quotes
    print('projectName: {}, columnNames: {}'.format(projectName, columnNames))
    # Read the project's current data
    urls = getProjectCurrentDataUrl(projectName)
    fileUrl = urls['fileUrl']
    projectAddress = urls['projectAddress']
    if fileUrl[-4:] == ".csv":
        df_excel = pd.read_csv(fileUrl, encoding="utf-8")
    else:
        df_excel = pd.read_excel(fileUrl, encoding="utf-8")
    # Full-table statistics
    res = []
    statistics = [' 字段名',' 类型','总数','最小值','最小值位置','25%分位数','中位数','75%分位数','均值','最大值','最大值位置','平均绝对偏差','方差','标准差','偏度','峰度']
    for columnName in columnNames:
        info = {}.fromkeys(statistics)
        info[' 字段名'] = columnName
        info[' 类型'] = df_excel[columnName].dtype
        if info[' 类型'] == 'int64' or info[' 类型'] == 'float64':
            info[' 类型'] = 'number'
            info['总数'] = str(df_excel[columnName].count())
            info['最小值'] = str(df_excel[columnName].min())
            info['最小值位置'] = str(df_excel[columnName].idxmin())
            info['25%分位数'] = str(df_excel[columnName].quantile(.25))
            info['中位数'] = str(df_excel[columnName].median())
            info['75%分位数'] = str(df_excel[columnName].quantile(.75))
            info['均值'] = str(df_excel[columnName].mean())
            info['最大值'] = str(df_excel[columnName].max())
            info['最大值位置'] = str(df_excel[columnName].idxmax())
            info['平均绝对偏差'] = str(df_excel[columnName].mad())  # Series.mad() was removed in pandas 2.0; see the sketch after Example #5
            info['方差'] = str(df_excel[columnName].var())
            info['标准差'] = str(df_excel[columnName].std())
            info['偏度'] = str(df_excel[columnName].skew())
            info['峰度'] = str(df_excel[columnName].kurt())
            print('int')
        else:
            info[' 类型'] = "text"
            info['总数'] = str(df_excel[columnName].count())
            print("text")
        res.append(info)
    # Write to file
    mkdir(projectAddress+'/全表统计')
    from app.constFile import const

    save_dir = const.SAVEDIR  # module constant; used for CSV output in other handlers
    # jsonFileName = str(int(time.time()))+'.json'
    json_str = json.dumps(res, ensure_ascii=False)
    with open(projectAddress+'/全表统计/' + jsonFileName, "w", encoding="utf-8") as f:
        json.dump(json_str, f, ensure_ascii=False)
        print("加载入文件完成...")
    result = {}
    result['fileName'] = jsonFileName
    result['data'] = res
    response = jsonify(result)
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
Example #21
def chiSqSelectorCore(requestStr):
    # Convert the request parameters from JSON to a dict and unpack them
    requestDict = json.loads(requestStr)
    projectName = requestDict['projectName']
    columnNamesStr = requestDict['columnNames']
    columnName_label = requestDict['columnName_label']
    # columnName_label must be a single column
    if len(columnName_label.split(",")) != 1:
        return "error_columnInputNumSingle"

    # Number of top features kept by the chi-square selection; defaults to 1
    try:
        numTopFeatures = int(requestDict['numTopFeatures'])
    except (KeyError, ValueError):
        numTopFeatures = 1

    columnNames = columnNamesStr.split(",")
    # At least two feature columns are required
    if len(columnNames) < 2:
        return "error_columnInputNumMultiple"
    # The new column name defaults to "卡方选择(与 [label] 相关的前 N 个特征列)" unless the user specifies one
    try:
        newColumnName = requestDict['newColumnName']
    except KeyError:
        newColumnName = "卡方选择" + "(与 [" + str(columnName_label) + "] 相关的前 " + str(numTopFeatures) + " 个特征列)"

    # Spark session
    spark = SparkSession \
        .builder \
        .master("local") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    # Resolve the project path and read the CSV
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        return 'error_projectUrl'  # error type: bad project name or path
    fileUrl = urls['fileUrl']
    df = spark.read.csv(fileUrl, header=True, inferSchema=True)

    # Assemble the input columns into a vector; they must all be numeric
    vecAssembler = VectorAssembler(inputCols=columnNames, outputCol="features")
    try:
        df = vecAssembler.transform(df)
    except utils.IllegalArgumentException:
        return "error_numerical"

    # Set the label column
    df = df.withColumn("label", df[columnName_label])

    # Configure the chi-square selector (featuresCol and labelCol use the defaults "features" and "label")
    selector = ChiSqSelector(numTopFeatures=numTopFeatures, outputCol=newColumnName)

    # Fit and transform; fails if the label column is not numeric
    try:
        df = selector.fit(df).transform(df)
    except utils.IllegalArgumentException:
        return "error_numerical"

    df = df.drop("features")
    df.show()

    # Append a processing-flow record
    operateParameter = {}
    operateParameter['type'] = '13'
    operateParameter['operate'] = requestStr
    addProcessingFlow(projectName, "admin", operateParameter)

    return df
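A sketch of the requestStr payload chiSqSelectorCore expects. Unlike pcaCore, columnNames is a comma-separated string here (the function splits it itself); values are illustrative:

import json

requestStr = json.dumps({
    "projectName": "订单分析",
    "columnNames": "销售额,数量,折扣",  # at least two numeric feature columns
    "columnName_label": "利润",         # exactly one numeric label column
    "numTopFeatures": 2,                # optional, defaults to 1
}, ensure_ascii=False)
df = chiSqSelectorCore(requestStr)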