# Example #1
def df_filter_row_by_id():
    """
    Filter the rows of dataframe B by the IDs supplied by dataframe A.

    Request args:
        origin_samples: name of the dataframe whose rows are filtered.
        id_candicate: name of the dataframe supplying the allowed IDs.
        new_dataframe_name: name under which the filtered result is stored.

    Returns a JSON payload describing how many rows matched, or renders
    the error page on failure.
    """
    ret = {}
    store = None
    try:
        # Fetch both dataframes and check that they exist in the store.
        store = pd.HDFStore(HDF5_path)
        df_name = request.args.get('origin_samples', None)
        assert HDF5_PREF+df_name+DATA_SUFF in store.keys(), \
            "dataframe %s not in store %s"%(HDF5_PREF+df_name+DATA_SUFF, store.filename)
        df_id_name = request.args.get('id_candicate', None)
        assert HDF5_PREF+df_id_name+DATA_SUFF in store.keys(), \
            "dataframe %s not in store %s"%(HDF5_PREF+df_id_name+DATA_SUFF, store.filename)
        new_dataframe_name = request.args.get('new_dataframe_name', None)
        df = store[df_name+DATA_SUFF]
        ix.tag_meta_auto(df)
        # Build the allowed-ID set ONCE: O(1) membership per row instead of
        # rebuilding and scanning the whole ID column for every row.
        allowed_ids = set(store[df_id_name+DATA_SUFF]["DESYNPUF_ID"])
        def keep_row(*args):
            return args[0] in allowed_ids
        cnt_rows = ix.select_rows_by_function(df, keep_row)
        df = ix.filter_rows(df, cnt_rows)
        update_df_in_HDFStore_by_name(store, new_dataframe_name, df)
        ret['info'] = 'affects %s number of rows'%(str(sum(cnt_rows)))
        store.close()
        # BUG FIX: the success path previously returned None, which is not a
        # valid Flask response; return the summary like sibling views do.
        return json.dumps(ret)
    except Exception as e:
        # Close the store only if it was actually opened.
        if store is not None:
            store.close()
        return render_template('dc_error.html', e_message=e)
# Example #2
def deal_feature_engineering(store, df_name, feature_list, action, derive_prefix, new_dataframe_name):
    """
    Expand factor features into derived columns and store the result.

    Args:
        store: open pandas HDFStore handle.
        df_name: name of the dataframe inside the store to process.
        feature_list: columns to expand.
        action: requested column action; must be a key of app.config['COL_ACTION'].
        derive_prefix: prefix for the columns derived from each factor column.
        new_dataframe_name: name for the resulting dataframe; when empty,
            the result is not written back to the store.

    Raises:
        AssertionError: on unknown action, missing dataframe, or missing column.
    """
    assert action in app.config['COL_ACTION'].keys(), \
            "columns action not in app.config['COL_ACTION']"
    assert HDF5_PREF+df_name+DATA_SUFF in store.keys(), \
            "dataframe %s not in store %s"%(HDF5_PREF+df_name+DATA_SUFF, store.filename)
    # Read the dataframe once instead of hitting the store per feature.
    df = store[df_name+DATA_SUFF]
    for feature in feature_list:
        assert feature in df.columns, \
                "column %s not in dataframe %s"%(feature, df_name+DATA_SUFF)
    ix.tag_meta_auto(df)
    derived = [ix.derive_columns_from_factor(df, feature, derive_prefix=derive_prefix)
               for feature in feature_list]
    # BUG FIX: pd.concat's join_axes keyword was removed in pandas 1.0;
    # concatenating and then reindexing to the original row index is the
    # documented equivalent.
    df = pd.concat(derived, axis=1).reindex(df.index)
    # Persist only when a target name was supplied.
    if new_dataframe_name != '':
        update_df_in_HDFStore_by_name(store, new_dataframe_name, df)
# Example #3
def dc_data_exploration():
    """
    Render the data-exploration page.

    Reads every dataframe from the HDFStore, splits them into those that
    still need exploration and those already explored, builds a meta table
    for each, and hands the combined list to the template as `meta_list`:
    [(df_name, (meta_column_names, meta_data)), ...] where df_name carries
    no prefix/suffix.
    """
    paras = {}
    if request.method=='POST':
        store = None
        try:
            meta_list = []
            # Split stored dataframes into explored / not-yet-explored.
            store = pd.HDFStore(HDF5_path)
            df_list_need_exploration = []
            df_list_not_exploration = []
            divide_dataframe_for_exploration(store, df_list_need_exploration, df_list_not_exploration)
            # Dataframes that still need exploration: tag meta, then build
            # the 2D meta table from a fresh peek.
            for df in df_list_need_exploration:
                ix.tag_meta_auto(df[1], num2factor_threshold=2)
                meta_column_names, meta_df = trans_peek_to_2Dtable(peek(df[1], meta=True), df[1], df[0])
                meta_column_names = [ meta_show_name_map.get(meta, u'未设置显示名称') for meta in meta_column_names ]
                meta_list.append( (df[0], (meta_column_names, meta_df.values)) )
            # Already-explored dataframes: reuse the stored meta table.
            for df in df_list_not_exploration:
                meta_df = store.get(df[0]+META_SUFF)
                meta_column_names = [ meta_show_name_map.get(meta, u'未设置显示名称') for meta in meta_df.columns ]
                meta_list.append( (df[0], (meta_column_names, meta_df.values)) )
            store.close()
            paras['meta_list'] = meta_list
            return render_template('dc_data_exploration_content.html', **paras)
        except Exception as e:
            # BUG FIX: the store was previously left open on failure.
            if store is not None:
                store.close()
            return render_template('dc_error.html', e_message=e)
# Example #4
def dc_pro_data_cleansing():
    """
    Handle advanced ("pro") column filtering.

    Reads filter conditions from the ajax request, selects the columns of
    the target dataframe that satisfy them, then either creates a new
    dataframe ('CREATE' mode, when new_dataframe_name is supplied) or keeps
    only the matching columns in place ('REMAIN' mode).
    """
    paras = {}
    store = None
    try:
        # Confirm the dataframe named by the ajax request exists in the store.
        store = pd.HDFStore(HDF5_path)
        df_name = request.args.get('df_name', None)
        assert HDF5_PREF+df_name+DATA_SUFF in store.keys(), \
            "dataframe %s not in store %s"%(HDF5_PREF+df_name+DATA_SUFF, store.filename)
        # Collect only the filter parameters declared in
        # col_filter_condition_paras, skipping empty values.
        received_condition = {}
        for k in col_filter_condition_paras.keys():
            if request.args.get(k)!='':
                received_condition[k] = request.args.get(k)
        assert validate_pro_data_cleansing(received_condition), "received paras from ajax are not valid"
        df = store[df_name+DATA_SUFF]
        ix.tag_meta_auto(df)
        # Temporary workaround: restore the datatypes recorded in the meta
        # table of an already-explored dataframe.
        assert HDF5_PREF+df_name+META_SUFF in store.keys(), \
            "dataframe %s not in store %s"%(HDF5_PREF+df_name+META_SUFF, store.filename)
        df_meta = store[HDF5_PREF+df_name+META_SUFF]
        # BUG FIX: DataFrame.ix was removed from pandas; iterate rows instead.
        for _, meta_row in df_meta.iterrows():
            ix.update_meta(df, [meta_row['col_name']], "col_datatype", meta_row['col_datatype'])
        remained_col_names = ix.select_columns_by_condition(df, **received_condition)
        # 'CREATE' when a new name is supplied, otherwise 'REMAIN' in place;
        # both DATA and META entries are assumed to exist and get updated.
        if request.args.get('new_dataframe_name','') != '':
            paras['meta_table'] = deal_pro_column_filtering(store, df_name, \
                    remained_col_names, 'CREATE', request.args.get('new_dataframe_name'))
        else:
            paras['meta_table'] = deal_pro_column_filtering(store, df_name, remained_col_names, 'REMAIN')
        store.close()
        return render_template('dc_pro_data_cleansing.html', **paras)
    except Exception as e:
        # BUG FIX: the store was previously left open on failure.
        if store is not None:
            store.close()
        return render_template('dc_error.html', e_message=e)
# Example #5
def dc_feature_engineering_datetime():
    """
    Convert selected datetime columns and derive numeric features.

    Columns arriving here are guaranteed to already be datetime-typed.
    They are cast with format %Y%m%d, then turned into derived columns
    relative to `value_as_base`; the result is stored under
    `new_dataframe_name` when one is given.

    Returns a JSON payload with the number of affected columns, or renders
    the error page on failure.
    """
    ret = {}
    store = None
    try:
        store = pd.HDFStore(HDF5_path)
        df_name = request.args.get('df_name', None)
        new_dataframe_name = request.args.get('new_dataframe_name',None)
        value_as_base = pd.to_datetime(request.args.get('value_as_base'))
        derive_prefix = request.args.get('derive_prefix',None)
        assert HDF5_PREF+df_name+DATA_SUFF in store.keys(), \
            "dataframe %s not in store %s"%(HDF5_PREF+df_name+DATA_SUFF, store.filename)
        df = store[df_name+DATA_SUFF]
        # Incoming values look like "<df>.<col>"; keep only the column part.
        col_name_list = [col_name.split('.')[1]
                         for col_name in request.args.getlist('col_names', None)]
        # Cast the selected columns to datetime and persist the cast.
        ix.tag_meta_auto(df)
        ix.update_meta(df, col_name_list, "col_datatype","datetime")
        ix.type_casting(df, col_name_list, dt_format="%Y%m%d")
        store[df_name+DATA_SUFF] = df
        # Derive new columns measured against the base value.
        t_df = ix.derive_columns_from_datetime(
                df,
                col_name_list,
                value_as_base=value_as_base,
                inverse=True,
                derive_prefix=derive_prefix)
        if new_dataframe_name!='':
            update_df_in_HDFStore_by_name(store, new_dataframe_name, t_df)
        # No md5 refresh here: the format has already changed.
        ret['impact_columns'] = str(len(col_name_list))
        store.close()
        return json.dumps(ret)
    except Exception as e:
        # BUG FIX: the store was previously left open on failure.
        if store is not None:
            store.close()
        return render_template('dc_error.html', e_message=e)
# Example #6
def dc_feature_engineering_one_to_one():
    """
    Derive a one-to-one feature column and store it as a new dataframe.

    Currently only the 'isnull' transform is supported, mapping each value
    of `input_col` to whether it is non-null. Redirects back to the
    feature-engineering page on success.
    """
    paras = {}
    store = None
    try:
        # Read the ajax parameters and validate the source dataframe.
        store = pd.HDFStore(HDF5_path)
        df_name = request.args.get('df_name', None)
        assert HDF5_PREF+df_name+DATA_SUFF in store.keys(), \
            "dataframe %s not in store %s"%(HDF5_PREF+df_name+DATA_SUFF, store.filename)
        new_dataframe_name = request.args.get('new_dataframe_name', None)
        input_col = request.args.get('input_col', None)
        lambda_function_type = request.args.get('lambda_function_type', None)
        output_col = request.args.get('output_col', None)
        # BUG FIX: an unsupported lambda_function_type previously fell
        # through to a NameError on `new_dataframe`; fail with a clear
        # message instead.
        assert lambda_function_type=='isnull', \
            "unsupported lambda_function_type %s"%lambda_function_type
        df = store[df_name+DATA_SUFF]
        ix.tag_meta_auto(df)
        new_dataframe = ix.derive_one_to_one(df, input_col, lambda a: not pd.isnull(a), output_col)
        store.put(new_dataframe_name+DATA_SUFF, new_dataframe)
        store.close()
        return redirect(url_for('dc_feature_engineering'))
    except Exception as e:
        # BUG FIX: the store was previously left open on failure.
        if store is not None:
            store.close()
        return render_template('dc_error.html', e_message=e)
# Example #7
def dc_select_row_by_expr():
    """
    Filter dataframe rows by a user-supplied expression.

    Rows of the requested dataframe are matched against `rval_expr`
    (columns referenced through the '$' expression symbol); the matching
    rows are stored under `new_dataframe_name`.

    Returns a JSON payload with the number of affected rows, or renders
    the error page on failure.
    """
    ret = {}
    store = None
    try:
        store = pd.HDFStore(HDF5_path)
        # Read the ajax parameters and validate the source dataframe.
        df_name = request.args.get('df_name', None)
        assert HDF5_PREF+df_name+DATA_SUFF in store.keys(), \
            "dataframe %s not in store %s"%(HDF5_PREF+df_name+DATA_SUFF, store.filename)
        new_dataframe_name = request.args.get('new_dataframe_name', None)
        rval_expr = request.args.get('rval_expr', None)
        # NA tolerance and the expression symbol are currently fixed rather
        # than read from the request.
        non_NA_percent = 0
        expr_symbol = '\$'
        # Select rows matching the expression, then keep only those rows.
        df = store[df_name+DATA_SUFF]
        ix.tag_meta_auto(df)
        cnt_rows = ix.select_rows_by_expr(
                    df,
                    expr_symbol=expr_symbol,
                    non_NA_percent=non_NA_percent,
                    rval_expr=rval_expr
                    )
        df = ix.filter_rows(df, cnt_rows)
        update_df_in_HDFStore_by_name(store, new_dataframe_name, df)
        ret['info'] = 'affects %s number of rows'%(str(sum(cnt_rows)))
        store.close()
        # BUG FIX: the success path previously returned None, which is not
        # a valid Flask response; return the summary like sibling views do.
        return json.dumps(ret)
    except Exception as e:
        # Close the store only if it was actually opened.
        if store is not None:
            store.close()
        return render_template('dc_error.html', e_message=e)