def upgrade_category(self, source_id, category_no, be_att_id):
    # Resolve the business-entity id and its core table names for this attribute.
    be_id = self.get_be_id_by_be_att_id(be_att_id)
    core_tables = get_be_core_table_names(self.dnx_config.config_db_url,
                                          self.dnx_config.org_business_entities_collection,
                                          be_id)
    bt_current_dataset = self.dnx_db_path + core_tables[0]
    if is_dir_exists(bt_current_dataset):
        next_cat = self.get_next_be_att_id_category(source_id, be_att_id, category_no)

        # Hive-style partition paths for the current and the next DQ category.
        current_category_dataset = (bt_current_dataset +
                                    "\\SourceID=" + str(source_id) +
                                    "\\AttributeID=" + str(be_att_id) +
                                    "\\ResetDQStage=" + str(category_no) +
                                    "\\process_no=" + str(self.process_no))
        next_category_dataset = (bt_current_dataset +
                                 "\\SourceID=" + str(source_id) +
                                 "\\AttributeID=" + str(be_att_id) +
                                 "\\ResetDQStage=" + str(next_cat) +
                                 "\\process_no=" + str(self.process_no))

        dq_result_dataset = self.result_db_path + core_tables[3]
        partitioned_dq_result_dataset = (dq_result_dataset +
                                         "\\SourceID=" + str(source_id) +
                                         "\\AttributeID=" + str(be_att_id) +
                                         "\\ResetDQStage=" + str(category_no) +
                                         "\\process_no=" + str(self.process_no) +
                                         "\\is_issue=0")

        if is_dir_exists(partitioned_dq_result_dataset):
            # RowKeys with is_issue=0 passed the rule and are eligible for the next category.
            rowkeys = read_all_from_parquet(partitioned_dq_result_dataset, ['RowKey'], True,
                                            filter=None)
            suffix = "_old"
            if is_dir_exists(current_category_dataset):
                # Rename the current partition aside, then rewrite it batch by batch.
                bt_dataset_old = self.switch_dataset(current_category_dataset, suffix)
                rowkeys = rowkeys.set_index('RowKey')
                for bt_current in read_batches_from_parquet(
                        bt_dataset_old, None,
                        int(self.parameters_dict['bt_batch_size']), True):
                    self.upgrade_rowkeys(bt_current, rowkeys,
                                         current_category_dataset, next_category_dataset)
                    # A dask.delayed variant of this loop (collecting delayed(self.upgrade_rowkeys)
                    # calls and computing them with num_workers=self.cpu_num_workers) is disabled.
                delete_dataset(bt_dataset_old)
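# --- Illustrative sketch (not part of the pipeline) ---------------------------
# The partition paths above are built by hand as "\\SourceID=...\\AttributeID=...".
# An equivalent way to select one partition of a Hive-partitioned parquet dataset
# is a pyarrow filter list; this assumes the dataset is partitioned on exactly
# these columns and that dask + pyarrow are installed.
import dask.dataframe as dd

def read_category_partition(root_path, source_id, be_att_id, category_no, process_no):
    # Each tuple is (column, operator, value); non-matching partitions are skipped.
    filters = [("SourceID", "==", source_id),
               ("AttributeID", "==", be_att_id),
               ("ResetDQStage", "==", category_no),
               ("process_no", "==", process_no)]
    return dd.read_parquet(root_path, engine="pyarrow", filters=filters)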
def etl_be(self, source_id, bt_current_collection, bt_collection, source_collection,
           process_no, cpu_num_workers):
    base_bt_current_data_set = self.dnx_db_path + bt_current_collection
    bt_data_set = self.dnx_db_path + bt_collection
    base_source_data_set = self.src_db_path + source_collection
    source_data_set = (base_source_data_set + '\\SourceID=' + str(source_id) +
                       '\\' + self.dnx_config.process_no_column_name + '=' + process_no)

    bt_current_data_df = pd.DataFrame()
    bt_current_collection_old = base_bt_current_data_set + "_old"

    if int(self.parameters_dict['get_delta']) == 1:
        if is_dir_exists(bt_current_collection_old):
            # Delta mode: the previous current-BT snapshot ("_old") is read lazily
            # inside load_data/get_delta rather than being loaded in full here.
            pass

    if is_dir_exists(source_data_set):
        parallel_delayed_load_data = []
        for batch_no, get_source_data in enumerate(
                self.get_chunks_from_source_data(source_id, source_data_set)):
            bt_current_data_set = base_bt_current_data_set
            # bt_ids is currently unused; a bt_id-filtered read of the old snapshot is disabled.
            source_data_df, bt_ids = delayed(get_source_data[0]), delayed(get_source_data[1])

            # One delayed load task per source chunk.
            delayed_load_data = delayed(self.load_data)(source_data_df,
                                                        bt_current_data_df,
                                                        bt_data_set,
                                                        bt_current_data_set,
                                                        bt_current_collection_old,
                                                        None)
            parallel_delayed_load_data.append(delayed_load_data)

        with ProgressBar():
            compute(*parallel_delayed_load_data, num_workers=cpu_num_workers)
        delete_dataset(bt_current_collection_old)
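# --- Illustrative sketch (not part of the pipeline) ---------------------------
# etl_be fans out one dask.delayed task per source chunk and computes them all at
# once under a ProgressBar. A toy version of that pattern (load_chunk stands in
# for self.load_data):
from dask import delayed, compute
from dask.diagnostics import ProgressBar

def _demo_delayed_fanout():
    def load_chunk(chunk):
        return sum(chunk)                         # placeholder work per chunk

    tasks = [delayed(load_chunk)(chunk) for chunk in ([1, 2], [3, 4], [5, 6])]
    with ProgressBar():
        results = compute(*tasks, num_workers=2)  # same call shape as etl_be
    return results                                # (3, 7, 11)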
def execute_data_rules(self, category_no, be_att_dr_id, source_id, join_with_f):
    # Fetch the ordered levels of this attribute data rule.
    be_data_rule_lvls_query = ("select be_att_id, rule_id, next_pass, next_fail, kwargs from " +
                               self.dnx_config.be_attributes_data_rules_lvls_collection +
                               " where active = 1 and be_att_dr_id = " + str(be_att_dr_id) +
                               " order by level_no")
    be_data_rule_lvls = get_all_data_from_source(self.dnx_config.config_db_url, None,
                                                 be_data_rule_lvls_query)
    no_of_lvls = len(be_data_rule_lvls.index)

    for current_lvl_no, data_rule_lvls in enumerate(be_data_rule_lvls.iterrows(), start=1):
        data_rule_lvls = data_rule_lvls[1]
        be_att_id = data_rule_lvls['be_att_id']
        rule_id = data_rule_lvls['rule_id']
        next_pass = data_rule_lvls['next_pass']
        next_fail = data_rule_lvls['next_fail']
        kwargs = data_rule_lvls['kwargs']
        # Only the last level writes the global (final) result.
        g_result = 1 if no_of_lvls == current_lvl_no else 0

        be_id = self.get_be_id_by_be_att_id(str(be_att_id))
        core_tables = get_be_core_table_names(self.dnx_config.config_db_url,
                                              self.dnx_config.org_business_entities_collection,
                                              be_id)
        bt_current_collection = core_tables[0]
        source_collection = core_tables[2]
        dq_result_collection = core_tables[3]

        base_bt_current_data_set = self.dnx_db_path + bt_current_collection
        src_f_data_set = self.src_f_db_path + source_collection + "\\SourceID=" + str(source_id)
        result_data_set = self.result_db_path + dq_result_collection
        result_data_set_tmp = result_data_set + "_tmp"

        # The full source (F) data is only needed at the first level, and only when
        # the rule asks to join with it.
        if current_lvl_no == 1 and join_with_f == 1:
            src_f_data = read_all_from_parquet_delayed(src_f_data_set)
        else:
            src_f_data = None

        if is_dir_exists(base_bt_current_data_set):
            self.execute_lvl_data_rules(src_f_data, base_bt_current_data_set, result_data_set,
                                        result_data_set_tmp, source_id, be_att_dr_id, category_no,
                                        be_att_id, rule_id, g_result, current_lvl_no,
                                        next_pass, next_fail, join_with_f, kwargs)
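# --- Illustrative sketch (not part of the pipeline) ---------------------------
# The level query above is assembled by string concatenation. For illustration
# only, the same query with a bound parameter, assuming config_db_url is an
# SQLAlchemy-compatible URL (the project's get_all_data_from_source helper is
# left unchanged):
import pandas as pd
from sqlalchemy import create_engine, text

def read_rule_levels(config_db_url, lvls_table, be_att_dr_id):
    engine = create_engine(config_db_url)
    query = text("select be_att_id, rule_id, next_pass, next_fail, kwargs "
                 "from " + lvls_table +
                 " where active = 1 and be_att_dr_id = :dr_id order by level_no")
    return pd.read_sql(query, engine, params={"dr_id": int(be_att_dr_id)})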
def load_data(self, p_source_data, p_current_data, bt_data_set, bt_current_data_set,
              bt_current_collection_old, p_bt_ids=None):
    # p_bt_ids is reserved for a bt_id-filtered delta variant that is currently disabled.
    if int(self.parameters_dict['get_delta']) == 1 and is_dir_exists(bt_current_collection_old):
        # Delta load: compare the incoming chunk with the previous current-BT snapshot.
        get_delta_result = self.get_delta(p_source_data, p_current_data)

        # Unchanged rows stay in the current dataset.
        same_df = get_delta_result[5]
        save_to_parquet(same_df, bt_current_data_set, bt_partition_cols, bt_object_cols)

        if get_delta_result[3] in (0, 2):  # modifications occurred
            assert len(get_delta_result[0]) == len(get_delta_result[1])
            modified_df = get_delta_result[0]
            expired_df = get_delta_result[1]
            # New versions go to the current dataset, expired versions to the history dataset.
            save_to_parquet(modified_df, bt_current_data_set, bt_partition_cols, bt_object_cols)
            save_to_parquet(expired_df, bt_data_set, bt_partition_cols, bt_object_cols)

        if get_delta_result[3] in (1, 2):  # new rows occurred
            new_data_df = get_delta_result[2]
            save_to_parquet(new_data_df, bt_current_data_set, bt_partition_cols, bt_object_cols)
    else:
        # Full load: write the whole source chunk as the current data.
        save_to_parquet(p_source_data, bt_current_data_set, bt_partition_cols, bt_object_cols)
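# --- Illustrative sketch (not part of the pipeline) ---------------------------
# self.get_delta is not shown in this section. The toy below only illustrates the
# kind of new / modified / unchanged split its return tuple implies, using a
# pandas outer merge with an indicator column:
import pandas as pd

def _demo_delta_split(source_df, current_df, key="bt_id", value="AttributeValue"):
    merged = source_df.merge(current_df, on=key, how="outer",
                             suffixes=("_src", "_cur"), indicator=True)
    new_rows = merged[merged["_merge"] == "left_only"]              # only in the source
    both = merged[merged["_merge"] == "both"]
    modified = both[both[value + "_src"] != both[value + "_cur"]]   # value changed
    unchanged = both[both[value + "_src"] == both[value + "_cur"]]  # value identical
    return new_rows, modified, unchanged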
def execute_lvl_data_rules(self, src_f_data, base_bt_current_data_set, result_data_set,
                           result_data_set_tmp, source_id, be_att_dr_id, category_no, be_att_id,
                           rule_id, g_result, current_lvl_no, next_pass, next_fail, join_with_f,
                           kwargs):
    result_data_set_tmp = (result_data_set_tmp + "_" + str(be_att_dr_id) +
                           "_process_no_" + str(self.process_no))
    suffix = "_old"
    # Keep the previous level's temporary results under an "_old" name while this level runs.
    result_data_set_tmp_old = self.switch_dataset(result_data_set_tmp, suffix)

    for bt_current_data_df in self.get_bt_current_data(src_f_data, base_bt_current_data_set,
                                                       source_id, category_no, be_att_id,
                                                       join_with_f):
        if not bt_current_data_df.empty:
            if current_lvl_no > 1:
                result_df = pd.DataFrame()
                if is_dir_exists(result_data_set_tmp_old):
                    # Levels after the first only validate rows routed here by the previous level.
                    for row_keys_df in self.get_tmp_rowkeys(result_data_set_tmp_old):
                        # filter with level number too!
                        bt_nxt_lvl_current_data_df = bt_current_data_df[
                            bt_current_data_df['RowKey'].isin(row_keys_df)]
                        if not bt_nxt_lvl_current_data_df.empty:
                            result_lvl_df = self.validate_data_rule(bt_nxt_lvl_current_data_df,
                                                                    be_att_dr_id, rule_id, kwargs)
                            result_df = result_df.append(result_lvl_df)
            else:
                result_df = self.validate_data_rule(bt_current_data_df, be_att_dr_id, rule_id,
                                                    kwargs)
            self.insert_result_df(result_df, g_result, result_data_set, next_pass, next_fail,
                                  result_data_set_tmp, source_id, category_no, be_att_id)
    delete_dataset(result_data_set_tmp_old)
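# --- Illustrative sketch (not part of the pipeline) ---------------------------
# The level loop above grows result_df with DataFrame.append, which is deprecated
# in pandas 1.4 and removed in 2.0. A behavior-equivalent way to accumulate the
# per-batch frames is to collect them in a list and concatenate once:
import pandas as pd

def _demo_concat_results(result_parts):
    # result_parts: iterable of per-level result DataFrames (possibly empty)
    parts = [part for part in result_parts if not part.empty]
    return pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()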
def get_bt_current_data(self, bt_dataset, columns, bt_ids):
    bt_df = pd.DataFrame()
    if is_dir_exists(bt_dataset):
        # Restrict the batched read to the requested bt_ids.
        filter_bt_ids = [['bt_id', bt_ids], ]
        for df in read_batches_from_parquet(bt_dataset, columns,
                                            int(self.parameters_dict['bt_batch_size']),
                                            True,
                                            filter=filter_bt_ids):
            if not df.empty:
                bt_df = bt_df.append(df)
    return bt_df
def show_results(config_db_url, result_db_path, org_business_entities_collection):
    print("**********************************************************************")
    be_ids = bt.StartBT.get_be_ids(config_db_url)
    for i, be_id in be_ids.iterrows():
        be_id = be_id['be_id']
        core_tables = get_be_core_table_names(config_db_url, org_business_entities_collection,
                                              be_id)
        dq_result_collection = core_tables[3]
        result_path = result_db_path + dq_result_collection
        if is_dir_exists(result_path):
            df1 = dd.read_parquet(path=result_path, engine='pyarrow')[[
                'p_SourceID', 'p_AttributeID', 'p_ResetDQStage', 'p_be_att_dr_id',
                'p_data_rule_id', 'p_is_issue', 'bt_id']]
            df2 = df1.reset_index()
            df2.columns = ['indx', 'SourceID', 'AttributeID', 'Category_no', 'be_att_dr_id',
                           'rule_id', 'is_issue', 'bt_id']
            # Count the number of cells per rule outcome.
            df2 = df2.groupby(['SourceID', 'AttributeID', 'Category_no', 'be_att_dr_id',
                               'rule_id', 'is_issue']).agg({'bt_id': ['count']})
            df2.columns = ["cells#"]
            with ProgressBar():
                print(df2.compute())
            print("----------------------------------------------------------------------")
    print("**********************************************************************")
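# --- Illustrative invocation (placeholders only) -------------------------------
# The URL and paths below are placeholders for this environment's actual config
# database and parquet result root; substitute real values before running.
if __name__ == "__main__":
    show_results(config_db_url="sqlite:///C:\\dc\\config\\dnx_config.db",      # placeholder
                 result_db_path="C:\\dc\\parquet_db\\Result\\",                # placeholder
                 org_business_entities_collection="org_business_entities")     # placeholder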
def get_bt_current_data(self, src_f_data, bt_dataset, source_id, category_no, be_att_id,
                        join_with_f):
    complete_dataset = (bt_dataset +
                        "\\SourceID=" + str(source_id) +
                        "\\AttributeID=" + str(be_att_id) +
                        "\\ResetDQStage=" + str(category_no) +
                        "\\process_no=" + str(self.process_no))
    if is_dir_exists(complete_dataset):
        for file_name in get_files_in_dir(complete_dataset):
            pa_file_path = complete_dataset + "\\" + file_name
            bt_current_df = read_all_from_parquet_delayed(
                dataset=pa_file_path,
                columns=['bt_id', 'RowKey', 'AttributeValue'],
                filter=None)
            if join_with_f == 1:
                # Attach the source (F) attributes to the current BT rows.
                bt_current_df = bt_current_df.merge(src_f_data,
                                                    left_on=['RowKey'],
                                                    right_on=['rowkey'],
                                                    suffixes=('_new', '_cbt'),
                                                    how='inner')
            yield bt_current_df.compute()
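# --- Illustrative sketch (not part of the pipeline) ---------------------------
# Toy version of the per-file join the generator performs: current BT rows
# inner-joined to the source (F) frame on RowKey / rowkey, then computed.
import pandas as pd
import dask.dataframe as dd

def _demo_rowkey_join():
    bt_part = dd.from_pandas(pd.DataFrame({"RowKey": [1, 2],
                                           "AttributeValue": ["a", "b"]}), npartitions=1)
    f_part = dd.from_pandas(pd.DataFrame({"rowkey": [2, 3],
                                          "F_col": ["y", "z"]}), npartitions=1)
    joined = bt_part.merge(f_part, left_on="RowKey", right_on="rowkey", how="inner")
    return joined.compute()   # only RowKey == 2 survives the inner join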