def df_filter_row_by_id():
    """Filter rows of dataframe B, keeping only rows whose DESYNPUF_ID
    appears in the id dataframe A.

    Request args:
        origin_samples     : name of the dataframe to filter (B)
        id_candicate       : name of the dataframe providing the ids (A)
        new_dataframe_name : name under which the filtered result is stored

    Returns a JSON document reporting the number of affected rows, or the
    error page on failure.
    """
    ret = {}
    store = None
    try:
        # Fetch the two dataframes from the HDF5 store.
        store = pd.HDFStore(HDF5_path)
        df_name = request.args.get('origin_samples', None)
        assert HDF5_PREF+df_name+DATA_SUFF in store.keys(), \
            "dataframe %s not in store %s"%(HDF5_PREF+df_name+DATA_SUFF, store.filename)
        df_id_name = request.args.get('id_candicate', None)
        assert HDF5_PREF+df_id_name+DATA_SUFF in store.keys(), \
            "dataframe %s not in store %s"%(HDF5_PREF+df_id_name+DATA_SUFF, store.filename)
        new_dataframe_name = request.args.get('new_dataframe_name', None)
        df = store[df_name+DATA_SUFF]
        ix.tag_meta_auto(df)
        # Build the id set ONCE: set membership is O(1) per row. The previous
        # code rebuilt a list inside the predicate, making the filter O(n*m).
        id_set = set(store[df_id_name+DATA_SUFF]["DESYNPUF_ID"])
        def f1(*args):
            # args[0] is the row id candidate supplied by select_rows_by_function.
            return args[0] in id_set
        cnt_rows = ix.select_rows_by_function(df, f1)
        df = ix.filter_rows(df, cnt_rows)
        update_df_in_HDFStore_by_name(store, new_dataframe_name, df)
        ret['info'] = 'affects %s number of rows'%(str(sum(cnt_rows)))
        store.close()
        # The original fell off the end and returned None, which Flask
        # rejects; report the result like the sibling ajax endpoints do.
        return json.dumps(ret)
    except Exception as e:
        # Close the store on failure too; guard against HDFStore() itself failing.
        if store is not None:
            store.close()
        return render_template('dc_error.html', e_message=e)
def deal_feature_engineering(store, df_name, feature_list, action, derive_prefix, new_dataframe_name):
    """Explode factor-typed columns into derived feature columns.

    Parameters:
        store              : open HDFStore handle
        df_name            : name of the dataframe inside the store to process
        feature_list       : columns to explode
        action             : requested column action; must be a key of
                             app.config['COL_ACTION']
        derive_prefix      : prefix for the names of the derived columns
        new_dataframe_name : target dataframe name; when empty, nothing is saved
    """
    assert action in app.config['COL_ACTION'].keys(), \
        "columns action not in app.config['COL_ACTION']"
    assert HDF5_PREF+df_name+DATA_SUFF in store.keys(), \
        "dataframe %s not in store %s"%(HDF5_PREF+df_name+DATA_SUFF, store.filename)
    for feature in feature_list:
        assert feature in store[df_name+DATA_SUFF].columns, \
            "column %s not in dataframe %s"%(feature, df_name+DATA_SUFF)
    source_df = store[df_name+DATA_SUFF]
    ix.tag_meta_auto(source_df)
    # Derive one frame per requested feature, then glue them side by side,
    # aligned on the source dataframe's index.
    derived_frames = [
        ix.derive_columns_from_factor(source_df, feature, derive_prefix=derive_prefix)
        for feature in feature_list
    ]
    combined = pd.concat(derived_frames, axis=1, join_axes=[source_df.index])
    # Persist only when a target name was supplied; otherwise do nothing.
    if new_dataframe_name != '':
        update_df_in_HDFStore_by_name(store, new_dataframe_name, combined)
def dc_data_exploration():
    """Data-exploration page: build per-dataframe meta tables and render them.

    Only handles POST. NOTE(review): a non-POST request falls through and
    returns None, which Flask rejects — confirm the route is POST-only.
    """
    paras = {}
    if request.method == 'POST':
        store = None
        try:
            # Structure pushed to the frontend is meta_list:
            # [(df_name1, (meta_column_names1, meta_data1)),
            #  (df_name2, (meta_column_names2, meta_data2)), ...]
            # where df_name carries neither prefix nor suffix.
            meta_list = []
            # 1. Read the dataframes from the HDFStore and split them into
            #    those that still need exploration and those that do not.
            store = pd.HDFStore(HDF5_path)
            df_list_need_exploration = []
            df_list_not_exploration = []
            divide_dataframe_for_exploration(store, df_list_need_exploration, df_list_not_exploration)
            # 2. Dataframes needing exploration: tag meta, rebuild the
            #    whole-table meta info and record it in meta_list.
            for df in df_list_need_exploration:
                ix.tag_meta_auto(df[1], num2factor_threshold=2)
                meta_column_names, meta_df = trans_peek_to_2Dtable(peek(df[1], meta=True), df[1], df[0])
                meta_column_names = [ meta_show_name_map.get(meta, u'未设置显示名称') for meta in meta_column_names ]
                meta_list.append( (df[0], (meta_column_names, meta_df.values)) )
            # 3. Dataframes already explored: reuse the stored meta table.
            for df in df_list_not_exploration:
                meta_df = store.get(df[0]+META_SUFF)
                meta_column_names = meta_df.columns
                meta_column_names = [ meta_show_name_map.get(meta, u'未设置显示名称') for meta in meta_column_names ]
                meta_list.append( (df[0], (meta_column_names, meta_df.values)) )
            store.close()
            # 4. Push meta_list to the frontend — currently disabled upstream;
            #    left as-is deliberately (do not re-enable without checking the template).
            # paras['meta_list'] = meta_list
            return render_template('dc_data_exploration_content.html', **paras)
        except Exception as e:
            # Close the store on failure as well; previously the HDF5 file
            # handle leaked until process exit.
            if store is not None:
                store.close()
            return render_template('dc_error.html', e_message=e)
def dc_pro_data_cleansing():
    """Advanced column filtering.

    Reads filter conditions from the ajax request, selects the surviving
    columns of the chosen dataframe, and writes the result back to the
    HDFStore — either as a new dataframe ('CREATE') or in place ('REMAIN').
    """
    paras = {}
    store = None
    try:
        # The df_name sent back by ajax must exist in the HDFStore.
        store = pd.HDFStore(HDF5_path)
        df_name = request.args.get('df_name', None)
        assert HDF5_PREF+df_name+DATA_SUFF in store.keys(), \
            "dataframe %s not in store %s"%(HDF5_PREF+df_name+DATA_SUFF, store.filename)
        # Collect the select_columns_by_condition parameters from the request;
        # col_filter_condition_paras whitelists which keys are accepted.
        received_condition = {}
        for k in col_filter_condition_paras.keys():
            if request.args.get(k) != '':
                received_condition[k] = request.args.get(k)
        # Validate what ajax sent before acting on it.
        assert validate_pro_data_cleansing(received_condition), "recived paras from ajax are not valid"
        # Run the filter; it returns the names of the surviving columns.
        df = store[df_name+DATA_SUFF]
        ix.tag_meta_auto(df)
        # Temporary workaround: restore the column datatypes recorded in the
        # meta table of an already-explored dataframe.
        assert HDF5_PREF+df_name+META_SUFF in store.keys(), \
            "dataframe %s not in store %s"%(HDF5_PREF+df_name+META_SUFF, store.filename)
        df_meta = store[HDF5_PREF+df_name+META_SUFF]
        for i in range(len(df_meta.index)):
            col_name = df_meta.ix[i]['col_name']
            col_datatype = df_meta.ix[i]['col_datatype']
            ix.update_meta(df, [col_name], "col_datatype", col_datatype)
        remained_col_names = ix.select_columns_by_condition(df, **received_condition)
        # Update DATA and META in the HDFStore (both assumed present):
        # 1) 'CREATE': store the result under the supplied new dataframe name
        # 2) 'REMAIN': update the original dataframe, keeping matching columns
        if request.args.get('new_dataframe_name', '') != '':
            paras['meta_table'] = deal_pro_column_filtering(store, df_name, \
                remained_col_names, 'CREATE', request.args.get('new_dataframe_name'))
        else:
            paras['meta_table'] = deal_pro_column_filtering(store, df_name, remained_col_names, 'REMAIN')
        store.close()
        return render_template('dc_pro_data_cleansing.html', **paras)
    except Exception as e:
        # Previously the store stayed open on failure, leaking the HDF5 handle.
        if store is not None:
            store.close()
        return render_template('dc_error.html', e_message=e)
def dc_feature_engineering_datetime():
    """Datetime feature engineering.

    Columns reaching this endpoint are guaranteed to already be datetime
    data: cast them with the expected format, derive numeric columns
    relative to a base date, and store the result in a new dataframe.
    Returns a JSON document with the number of impacted columns.
    """
    ret = {}
    store = None
    try:
        store = pd.HDFStore(HDF5_path)
        df_name = request.args.get('df_name', None)
        new_dataframe_name = request.args.get('new_dataframe_name', None)
        value_as_base = pd.to_datetime(request.args.get('value_as_base'))
        derive_prefix = request.args.get('derive_prefix', None)
        assert HDF5_PREF+df_name+DATA_SUFF in store.keys(), \
            "dataframe %s not in store %s"%(HDF5_PREF+df_name+DATA_SUFF, store.filename)
        df = store[df_name+DATA_SUFF]
        # Column names arrive as "<df>.<col>"; keep only the column part.
        # (Removed leftover debugger breakpoint and debug print.)
        col_name_list = []
        for col_name in request.args.getlist('col_names', None):
            col_name_list.append(col_name.split('.')[1])
        # Cast the selected columns to datetime and persist the conversion.
        ix.tag_meta_auto(df)
        ix.update_meta(df, col_name_list, "col_datatype", "datetime")
        ix.type_casting(df, col_name_list, dt_format="%Y%m%d")
        store[df_name+DATA_SUFF] = df
        # Derive numeric columns relative to value_as_base
        # (presumably date offsets — confirm semantics in ix).
        t_df = ix.derive_columns_from_datetime(
            df, col_name_list,
            value_as_base=value_as_base, inverse=True, derive_prefix=derive_prefix)
        if new_dataframe_name != '':
            # No md5 refresh needed here: the data format has already changed.
            update_df_in_HDFStore_by_name(store, new_dataframe_name, t_df)
        ret['impact_columns'] = str(len(col_name_list))
        store.close()
        return json.dumps(ret)
    except Exception as e:
        # Close the store on failure too; previously it leaked the handle.
        if store is not None:
            store.close()
        return render_template('dc_error.html', e_message=e)
def dc_feature_engineering_one_to_one():
    """One-to-one feature derivation: map one input column through a
    predefined lambda into one output column, stored as a new dataframe.

    Currently only the 'isnull' mapping (True where the value is present)
    is supported.
    """
    paras = {}
    store = None
    try:
        # Collect ajax parameters.
        store = pd.HDFStore(HDF5_path)
        df_name = request.args.get('df_name', None)
        assert HDF5_PREF+df_name+DATA_SUFF in store.keys(), \
            "dataframe %s not in store %s"%(HDF5_PREF+df_name+DATA_SUFF, store.filename)
        new_dataframe_name = request.args.get('new_dataframe_name', None)
        input_col = request.args.get('input_col', None)
        lambda_function_type = request.args.get('lambda_function_type', None)
        output_col = request.args.get('output_col', None)
        df = store[df_name+DATA_SUFF]
        ix.tag_meta_auto(df)
        # Fail loudly on an unsupported mapping type: previously any type
        # other than 'isnull' left new_dataframe unbound and the store.put
        # below crashed with a NameError.
        assert lambda_function_type == 'isnull', \
            "unsupported lambda_function_type %s" % lambda_function_type
        new_dataframe = ix.derive_one_to_one(df, input_col, lambda a: not pd.isnull(a), output_col)
        store.put(new_dataframe_name+DATA_SUFF, new_dataframe)
        store.close()
        return redirect(url_for('dc_feature_engineering'))
    except Exception as e:
        # Close the store on failure too; previously it leaked the handle.
        if store is not None:
            store.close()
        return render_template('dc_error.html', e_message=e)
def dc_select_row_by_expr():
    """Filter rows by a user-supplied expression.

    Columns are referenced in the expression via the '$' symbol; rows
    matching the expression are kept and written to a new dataframe.
    Returns a JSON document reporting the number of affected rows, or the
    error page on failure.
    """
    ret = {}
    store = None
    try:
        store = pd.HDFStore(HDF5_path)
        # Parameters sent back by ajax.
        df_name = request.args.get('df_name', None)
        assert HDF5_PREF+df_name+DATA_SUFF in store.keys(), \
            "dataframe %s not in store %s"%(HDF5_PREF+df_name+DATA_SUFF, store.filename)
        new_dataframe_name = request.args.get('new_dataframe_name', None)
        rval_expr = request.args.get('rval_expr', None)
        # non_NA_percent = request.args.get('non_NA_percent',None)
        non_NA_percent = 0
        # expr_symbol = urllib2.unquote(request.args.get('expr_symbol', None))
        expr_symbol = '\$'
        # Filter the data by the supplied condition.
        df = store[df_name+DATA_SUFF]
        ix.tag_meta_auto(df)
        cnt_rows = ix.select_rows_by_expr(
            df,
            expr_symbol=expr_symbol,
            non_NA_percent=non_NA_percent,
            rval_expr=rval_expr
        )
        df = ix.filter_rows(df, cnt_rows)
        # Store the filtered result as a new dataframe.
        update_df_in_HDFStore_by_name(store, new_dataframe_name, df)
        ret['info'] = 'affects %s number of rows'%(str(sum(cnt_rows)))
        store.close()
        # The original fell off the end and returned None, which Flask
        # rejects; report the result so the ajax caller gets a valid response.
        return json.dumps(ret)
    except Exception as e:
        # Guard against HDFStore() itself failing before `store` is bound.
        if store is not None:
            store.close()
        return render_template('dc_error.html', e_message=e)