import os

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor

# csv_util and response_util are project-local helpers (CSV upload and
# response wrappers); the import path below is an assumption.
from . import csv_util, response_util


def randomForest(request):
    # Load the uploaded CSV.
    f = request.FILES.get("csv_file")
    filename = csv_util.upload(f)
    data_train = pd.read_csv(filename)
    os.remove(filename)
    # `target` is the name of the column whose missing values will be filled.
    target = request.POST['target']
    # `ref` is a comma-separated list of feature columns used to fit the
    # model, e.g. "SibSp,Pclass,Fare,Parch".
    ref_str = request.POST['ref']
    ref = ref_str.split(',')
    ref.insert(0, target)  # put the target column first
    target_df = data_train[ref]  # keep only these columns
    # Split rows into those where the target is known and those where it is
    # missing (.values replaces the removed DataFrame.as_matrix()).
    known_data = target_df[target_df[target].notnull()].values
    unknown_data = target_df[target_df[target].isnull()].values
    y = known_data[:, 0]   # target values
    X = known_data[:, 1:]  # feature values
    # Fit a RandomForestRegressor on the known rows.
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)
    # Predict the missing target values from the feature columns.
    predicted = rfr.predict(unknown_data[:, 1:])
    # Write the predictions back into the original DataFrame.
    data_train.loc[data_train[target].isnull(), target] = predicted
    return response_util.csv_info(data_train)
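# A minimal sketch (not part of the views) of how the endpoint above might be
# exercised with the `requests` library; the route, file name, and column
# names are assumptions to adjust against the project's urls.py.
def _demo_randomforest_request():
    import requests  # assumed to be installed; not needed by the views
    with open('train.csv', 'rb') as fh:
        resp = requests.post(
            'http://localhost:8000/randomForest',  # hypothetical route
            files={'csv_file': fh},
            data={
                'target': 'Age',                   # column with missing values
                'ref': 'SibSp,Pclass,Fare,Parch',  # feature columns
            },
        )
    print(resp.text)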
def scale_(request):
    if "POST" == request.method:
        # Load the uploaded CSV.
        f = request.FILES.get("csv_file")
        filename = csv_util.upload(f)
        data_train = pd.read_csv(filename)
        os.remove(filename)
        # Comma-separated list of columns to rescale.
        target_str = request.POST['target']
        target = target_str.split(',')
        # Scaling method: one of log2, log10, ln, abs, sqrt.
        scale = request.POST['scale']
        for each in target:
            target_df = data_train[each]
            if scale == 'log2':
                data_train[each] = np.log2(target_df)
            elif scale == 'log10':
                data_train[each] = np.log10(target_df)
            elif scale == 'ln':
                data_train[each] = np.log(target_df)
            elif scale == 'abs':
                data_train[each] = np.abs(target_df)
            elif scale == 'sqrt':
                data_train[each] = np.sqrt(target_df)
            else:
                return response_util.wrong_info(
                    'Method must be one of log2, log10, ln, abs, sqrt')
        return response_util.csv_info(data_train)
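# A standalone sketch of the transforms above on a toy column. Note the log
# variants return -inf for 0 and NaN for negatives (with a RuntimeWarning),
# and sqrt returns NaN for negatives, so columns may need cleaning first.
def _demo_scale():
    s = pd.Series([1.0, 4.0, 0.0, -2.0])
    print(np.log2(s).tolist())  # [0.0, 2.0, -inf, nan]
    print(np.sqrt(s).tolist())  # [1.0, 2.0, 0.0, nan]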
def setId(request):
    if "POST" == request.method:
        # Load the uploaded CSV.
        f = request.FILES.get("csv_file")
        filename = csv_util.upload(f)
        data_train = pd.read_csv(filename)
        os.remove(filename)
        # Prepend a sequential id column (a range avoids both the manual
        # append loop and shadowing the built-in `list`).
        data_train.insert(0, 'id', range(len(data_train)))
        return response_util.csv_info(data_train)
def standard(request):
    # Load the uploaded CSV.
    f = request.FILES.get("csv_file")
    filename = csv_util.upload(f)
    data_train = pd.read_csv(filename)
    os.remove(filename)
    # Comma-separated list of columns to standardize.
    target_str = request.POST['target']
    target = target_str.split(',')
    scaler = preprocessing.StandardScaler()
    for each in target:
        # Assign back in place; dropping and re-appending the column, as the
        # original did, would move it to the end of the frame.
        data_train[each] = scaler.fit_transform(
            data_train[each].values.reshape(-1, 1))
    return response_util.csv_info(data_train)
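# A quick check of what StandardScaler computes: (x - mean) / std with the
# population (ddof=0) standard deviation, which np.std uses by default.
def _demo_standard():
    col = np.array([1.0, 2.0, 3.0, 4.0]).reshape(-1, 1)
    scaled = preprocessing.StandardScaler().fit_transform(col)
    manual = (col - col.mean()) / col.std()
    assert np.allclose(scaled, manual)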
def dummy(request):
    if "POST" == request.method:
        # Load the uploaded CSV.
        f = request.FILES.get("csv_file")
        filename = csv_util.upload(f)
        data_train = pd.read_csv(filename)
        os.remove(filename)
        # `target_str` is a comma-separated list of columns to one-hot
        # encode, e.g. "cabin,sex,pclass".
        target_str = request.POST['target']
        target = target_str.split(',')
        for each in target:
            dummies = pd.get_dummies(data_train[each], prefix=each)
            data_train = pd.concat([data_train, dummies], axis=1)
            data_train.drop([each], axis=1, inplace=True)
        return response_util.csv_info(data_train)
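# Toy illustration of the encoding above: get_dummies expands one categorical
# column into one indicator column per category (0/1 or boolean values,
# depending on the pandas version).
def _demo_dummy():
    df = pd.DataFrame({'sex': ['male', 'female', 'male']})
    print(pd.get_dummies(df['sex'], prefix='sex'))
    # Columns: sex_female, sex_male; one truthy cell per row.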
def soften(request):
    if "POST" == request.method:
        # Load the uploaded CSV.
        f = request.FILES.get("csv_file")
        filename = csv_util.upload(f)
        data_train = pd.read_csv(filename)
        os.remove(filename)
        # Column to smooth.
        target = request.POST['target']
        # Smoothing method: 'per' (percent of the value range) or 'thresh'
        # (fixed thresholds).
        soften_method = request.POST['soften_method']
        # Lower/upper cut points (percentages for 'per', absolute bounds for
        # 'thresh').
        min_value = int(request.POST['min'])
        max_value = int(request.POST['max'])
        target_df = data_train[target].values.tolist()
        if soften_method == 'per':
            # Clip to bounds placed at min%/max% of the value range.
            if min_value < 0:
                return response_util.wrong_info(
                    'For percent smoothing, min must be at least 0')
            elif max_value > 100:
                return response_util.wrong_info(
                    'For percent smoothing, max must be at most 100')
            else:
                min_value /= 100
                max_value /= 100
                data_min = min(target_df)
                data_max = max(target_df)
                # Bounds interpolated across the [min, max] range of the data.
                min_num = data_min + min_value * (data_max - data_min)
                max_num = data_min + max_value * (data_max - data_min)
                for i in range(len(target_df)):
                    if target_df[i] < min_num:
                        target_df[i] = min_num
                    elif target_df[i] > max_num:
                        target_df[i] = max_num
                data_train[target] = target_df
        elif soften_method == 'thresh':
            # Clip to the fixed min/max thresholds.
            for i in range(len(target_df)):
                if target_df[i] < min_value:
                    target_df[i] = min_value
                elif target_df[i] > max_value:
                    target_df[i] = max_value
            data_train[target] = target_df
        else:
            return response_util.wrong_info(
                'Method must be one of per/thresh')
        return response_util.csv_info(data_train)
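# The 'thresh' branch is equivalent to pandas' built-in Series.clip; a
# minimal sketch:
def _demo_soften_thresh():
    s = pd.Series([1, 5, 12, 30, 55])
    print(s.clip(lower=10, upper=50).tolist())  # [10, 10, 12, 30, 50]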
def normalize(request):
    if "POST" == request.method:
        # Load the uploaded CSV.
        f = request.FILES.get("csv_file")
        filename = csv_util.upload(f)
        data_train = pd.read_csv(filename)
        os.remove(filename)
        # Comma-separated list of columns to min-max normalize (the original
        # read request.POST['target'] twice; once is enough).
        target_str = request.POST['target']
        target = target_str.split(',')
        mm = preprocessing.MinMaxScaler()
        for each in target:
            # Assign back in place rather than dropping and re-appending,
            # which would move the column to the end of the frame.
            data_train[each] = mm.fit_transform(
                data_train[each].values.reshape(-1, 1))
        return response_util.csv_info(data_train)
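# MinMaxScaler maps each column linearly onto [0, 1]:
#   x' = (x - x.min()) / (x.max() - x.min())
def _demo_normalize():
    col = np.array([10.0, 15.0, 20.0]).reshape(-1, 1)
    print(preprocessing.MinMaxScaler().fit_transform(col).ravel())
    # -> [0.  0.5 1. ]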
def discrete(request):
    if "POST" == request.method:
        # Load the uploaded CSV.
        f = request.FILES.get("csv_file")
        filename = csv_util.upload(f)
        data_train = pd.read_csv(filename)
        os.remove(filename)
        # Column to discretize.
        target = request.POST['target']
        # Discretization method: equal-frequency 'frequency', equal-width
        # 'metric', or cluster-based 'cluster'.
        discrete_method = request.POST['discrete_method']
        # Number of bins.
        num = int(request.POST['num'])
        target_df = data_train[target]
        if discrete_method == 'metric':
            # Equal-width binning.
            data_train[target] = pd.cut(target_df, num, labels=range(num))
        elif discrete_method == 'frequency':
            # Equal-frequency binning: use the num + 1 quantile values from
            # describe() as bin edges.
            w = [1.0 * i / num for i in range(num + 1)]
            w = target_df.describe(percentiles=w)[4:4 + num + 1]
            # Nudge the lowest edge down so the minimum value lands inside
            # the first bin.
            w.iloc[0] = w.iloc[0] * (1 - 1e-10)
            data_train[target] = pd.cut(target_df, w, labels=range(num))
        elif discrete_method == 'cluster':
            # Cluster-based binning: one-dimensional k-means, with bin edges
            # at the midpoints between adjacent cluster centres. (KMeans'
            # n_jobs parameter was removed from recent scikit-learn.)
            kmodel = KMeans(n_clusters=num)
            kmodel.fit(target_df.values.reshape((len(target_df), 1)))
            c = pd.DataFrame(kmodel.cluster_centers_).sort_values(0)
            # rolling(2).mean() averages each centre with its predecessor;
            # the first row becomes NaN, so drop it with .iloc[1:].
            w = c.rolling(2).mean().iloc[1:]
            # Add the outer edges: 0 at the low end, the column maximum at
            # the high end.
            w = [0] + list(w[0]) + [target_df.max()]
            # pd.cut assigns each value to the bin whose edges enclose it.
            data_train[target] = pd.cut(target_df, w, labels=range(num))
        else:
            return response_util.wrong_info(
                'Method must be one of frequency/metric/cluster')
        return response_util.csv_info(data_train)
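# For reference, pandas ships both binning styles directly: pd.cut for
# equal-width and pd.qcut for equal-frequency bins, closely mirroring the
# 'metric' and 'frequency' branches above.
def _demo_discrete():
    s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 100])
    print(pd.cut(s, 3, labels=range(3)).tolist())   # equal-width bins
    print(pd.qcut(s, 3, labels=range(3)).tolist())  # equal-frequency bins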