def approxi_quantile(data_instances, params, cols_dict, abnormal_list, header, is_sparse):
    """
    Build one compressed quantile summary per selected column.

    Parameters
    ----------
    data_instances : DTable
        The input data. It is handed unchanged to
        ``QuantileBinning.insert_datas``.

    params : FeatureBinningParam object
        User-set parameters. Only ``compress_thres``, ``head_size`` and
        ``error`` are read here.

    cols_dict : dict
        Maps each column name to its column index. Only the names are used
        in this function; the whole dict is forwarded to ``insert_datas``.

    abnormal_list : list
        Forwarded to every summary under the ``'abnormal_list'`` key.
        Presumably the values to be skipped while collecting statistics;
        confirm against the summary implementation.

    header : list
        Header information, forwarded to ``insert_datas``.

    is_sparse : bool
        Whether the data is in sparse format. Passed to the summary factory
        and to ``insert_datas``.

    Returns
    -------
    summary_dict : dict
        ``{'col_name1': summary1, 'col_name2': summary2, ...}`` where each
        summary has had ``compress()`` called on it.
    """
    # A single parameter dict is shared by every factory call.
    summary_param = {
        'compress_thres': params.compress_thres,
        'head_size': params.head_size,
        'error': params.error,
        'abnormal_list': abnormal_list,
    }

    summary_dict = {
        col_name: quantile_summary_factory(is_sparse=is_sparse,
                                           param_dict=summary_param)
        for col_name in cols_dict
    }

    # insert_datas receives the freshly created summaries together with the data.
    QuantileBinning.insert_datas(data_instances, summary_dict, cols_dict, header, is_sparse)

    for summary in summary_dict.values():
        summary.compress()

    return summary_dict
def feature_summary(data_iter, params, cols_dict, abnormal_list, header, is_sparse):
    """
    Feed every row of ``data_iter`` into per-column quantile summaries.

    Parameters
    ----------
    data_iter : iterable
        Yields ``(key, instance)`` pairs. The key is ignored.

    params : FeatureBinningParam object
        Only ``compress_thres``, ``head_size`` and ``error`` are read here.

    cols_dict : dict
        Maps each column name to its column index. Only these columns are
        summarised.

    abnormal_list : list
        Forwarded to every summary under the ``'abnormal_list'`` key.

    header : list
        Used in the sparse branch to translate a column index into a
        column name.

    is_sparse : bool
        If False, each row is indexed directly by column index (through its
        ``features`` attribute when the row's class is named ``Instance``).
        If True, ``instance.features.get_all_data()`` is iterated as
        ``(col_idx, col_value)`` pairs.

    Returns
    -------
    result : list
        ``[(col_name, summary), ...]`` in ``cols_dict`` order, each summary
        having had ``compress()`` called on it.
    """
    # A single parameter dict is shared by every factory call.
    summary_param = {
        'compress_thres': params.compress_thres,
        'head_size': params.head_size,
        'error': params.error,
        'abnormal_list': abnormal_list,
    }

    summary_dict = {
        col_name: quantile_summary_factory(is_sparse=is_sparse,
                                           param_dict=summary_param)
        for col_name in cols_dict
    }

    # Resolve column indices once rather than once per row in the dense branch.
    dense_targets = [
        (cols_dict[col_name], summary)
        for col_name, summary in summary_dict.items()
    ]

    for _, instant in data_iter:
        if is_sparse:
            for col_idx, col_value in instant.features.get_all_data():
                col_name = header[col_idx]
                # Columns that were not selected for binning are skipped.
                if col_name not in cols_dict:
                    continue
                summary_dict[col_name].insert(col_value)
            continue

        # Name-based type check kept from the original; presumably it avoids
        # importing the Instance class here -- confirm before switching to
        # isinstance().
        if type(instant).__name__ == 'Instance':
            features = instant.features
        else:
            features = instant

        for col_index, summary in dense_targets:
            summary.insert(features[col_index])

    result = []
    for features_name, summary_obj in summary_dict.items():
        summary_obj.compress()
        result.append((features_name, summary_obj))

    return result