def get_be_ids(config_db_url, p_be_id=None):
    """Return the distinct be_ids whose attributes carry at least one active rule level.

    config_db_url -- connection URL of the configuration database
    p_be_id       -- optional be_id used to narrow the result to one entity
    """
    # Only attributes referenced by an active rule level are relevant.
    active_levels_sql = "select distinct be_att_id from be_attributes_data_rules_lvls where active = 1"
    active_levels = get_all_data_from_source(config_db_url, None, active_levels_sql)
    att_id_csv = list_to_string(data_to_list(active_levels['be_att_id']), ", ", 1)
    sql = "select distinct be_id from be_attributes where _id in (" + att_id_csv + ")"
    if p_be_id:
        sql += " and be_id = " + str(p_be_id)
    return get_all_data_from_source(config_db_url, None, sql)
def get_be_source_ids(self, be_id):
    """Return the active data-source ids (aliased SourceID) mapped for *be_id*."""
    cfg = self.dnx_config
    # A source qualifies only if it actually appears in the mapping collection.
    mapped_sql = 'select distinct be_data_source_id from ' + cfg.be_data_sources_mapping_collection
    mapped = get_all_data_from_source(cfg.config_db_url, None, mapped_sql)
    mapped_csv = list_to_string(data_to_list(mapped['be_data_source_id']), ", ", 1)
    sources_sql = ('select distinct _id SourceID from ' + cfg.be_data_sources_collection
                   + ' where active = 1 and _id in (' + mapped_csv + ') and be_id = '
                   + single_quotes(be_id))
    return get_all_data_from_source(cfg.config_db_url, None, sources_sql)
def get_be_id_by_be_att_id(self, be_att_id):
    """Look up the be_id that owns attribute *be_att_id* (first matching row)."""
    query = ("select be_id from " + self.dnx_config.be_attributes_collection
             + " where _id = " + str(be_att_id))
    result = get_all_data_from_source(self.dnx_config.config_db_url, None, query)
    # Single-valued lookup: the first row's be_id is the answer.
    return result['be_id'].values[0]
def get_cpu_count_cpu_num_workers(config_db_url, parameters_collection, no_of_subprocess=None):
    """Decide how many subprocesses (cpu_count) and workers per subprocess to run.

    When *no_of_subprocess* is None the value is read from the parameters
    collection. Returns the tuple (cpu_count, cpu_num_workers).
    """
    if no_of_subprocess is None:
        # Pull the configured subprocess count from the parameters table.
        query = "select value from " + parameters_collection + " where _id = 'no_of_subprocess' "
        raw = get_all_data_from_source(config_db_url, None, query)['value'].values
        no_of_subprocess = int(list_to_string(raw))
    available = multiprocessing.cpu_count()
    # Honor the request only when it fits in (0, available]; otherwise use all cores.
    cpu_count = no_of_subprocess if 0 < no_of_subprocess <= available else available
    # Saturating the machine leaves one worker per subprocess; otherwise split
    # the remaining cores evenly among the subprocesses.
    cpu_num_workers = 1 if cpu_count == available else math.floor(available / cpu_count)
    return cpu_count, cpu_num_workers
def get_source_connection_credentials(self, source_id):
    """Resolve the (url, schema, query) triple needed to extract data for *source_id*."""
    cfg = self.dnx_config
    # Step 1: the source row holds its extraction query and org connection id.
    source_sql = ('select query, org_connection_id from ' + cfg.be_data_sources_collection
                  + ' where _id = ' + single_quotes(source_id))
    source_row = get_all_data_from_source(cfg.config_db_url, None, source_sql)
    source_query = list_to_string(source_row['query'].values)
    org_source_id = list_to_string(source_row['org_connection_id'].values)
    # Step 2: the org connection row holds the physical url and schema.
    conn_sql = ('select url, schema from ' + cfg.org_connections_collection
                + ' where _id = ' + single_quotes(org_source_id))
    conn_row = get_all_data_from_source(cfg.config_db_url, None, conn_sql)
    source_url = list_to_string(conn_row['url'].values)
    source_schema = list_to_string(conn_row['schema'].values)
    return source_url, source_schema, source_query
def get_rowkey_column_name(self, source_id, be_id):
    """Return the source query column mapped to the row-key attribute of *be_id*.

    NOTE(review): att_id is compared as the string '0' here, while
    get_source_column_name compares it as the number 0 — confirm the column type.
    """
    cfg = self.dnx_config
    sql = ("select query_column_name "
           " from " + cfg.be_data_sources_mapping_collection +
           " where be_data_source_id = " + single_quotes(source_id) +
           " and be_att_id = (select _id from " + cfg.be_attributes_collection
           + " where be_id = " + single_quotes(be_id) + " and att_id = '0')")
    rows = get_all_data_from_source(cfg.config_db_url, None, sql)
    return list_to_string(rows['query_column_name'].values)
def get_source_category_rules(config_db_url, category_no):
    """Fetch the active data rules of one category (active sources only),
    ordered by data source then attribute."""
    sql = ("select t1._id, t1.be_att_id, t1.be_data_source_id, join_with_f from be_attributes_data_rules t1 "
           + " join be_data_sources t2 on t2._id = t1.be_data_source_id and t2.active = 1"
             " where t1.active = 1"
             " and t1.category_no = " + str(category_no)
           + " order by t1.be_data_source_id, be_att_id")
    return get_all_data_from_source(config_db_url, None, sql)
def get_source_column_name(self, source_id, be_id):
    """Map each mapped query column of (source_id, be_id) to its 'F<be_att_id>' label.

    The row-key attribute (att_id = 0) is excluded from the mapping.
    """
    cfg = self.dnx_config
    sql = ("select query_column_name, 'F'||be_att_id F_be_att_id"
           " from " + cfg.be_data_sources_mapping_collection +
           " where be_data_source_id = " + single_quotes(source_id) +
           " and be_att_id in (select _id from " + cfg.be_attributes_collection
           + " where be_id = " + single_quotes(be_id) + " and att_id != 0)")
    columns_data = get_all_data_from_source(cfg.config_db_url, None, sql)
    # {query_column_name: F_be_att_id} — same pairs the original row loop built.
    return dict(zip(columns_data['query_column_name'], columns_data['F_be_att_id']))
def execute_data_rules(self, category_no, be_att_dr_id, source_id, join_with_f):
    """Run every active rule level of rule *be_att_dr_id* against one source.

    Levels are executed in level_no order; each level delegates the actual
    work to self.execute_lvl_data_rules. g_result is 1 only on the last
    level, marking it as the one that produces the global result.

    Parameters:
        category_no  -- rule category being executed (passed through)
        be_att_dr_id -- id of the attribute data rule whose levels run
        source_id    -- data source whose staged F-data is evaluated
        join_with_f  -- 1 to join the first level with the source F dataset
    """
    # print('execute_data_rules started')
    # All active levels of this rule, in execution order.
    be_data_rule_lvls_query = "select be_att_id, rule_id, next_pass, next_fail, kwargs from " + \
        self.dnx_config.be_attributes_data_rules_lvls_collection + \
        " where active = 1 and be_att_dr_id = " + str(be_att_dr_id) + " order by level_no"
    be_data_rule_lvls = get_all_data_from_source(
        self.dnx_config.config_db_url, None, be_data_rule_lvls_query)
    no_of_lvls = len(be_data_rule_lvls.index)
    for current_lvl_no, data_rule_lvls in enumerate(
            be_data_rule_lvls.iterrows(), start=1):
        # iterrows yields (index, row); keep only the row.
        data_rule_lvls = data_rule_lvls[1]
        be_att_id = data_rule_lvls['be_att_id']
        rule_id = data_rule_lvls['rule_id']
        next_pass = data_rule_lvls['next_pass']
        next_fail = data_rule_lvls['next_fail']
        kwargs = data_rule_lvls['kwargs']
        # Only the final level carries the global result flag.
        g_result = 1 if no_of_lvls == current_lvl_no else 0
        # print('no_of_lvls', be_att_dr_id, g_result, no_of_lvls, current_lvl_no)
        be_id = self.get_be_id_by_be_att_id(str(be_att_id))
        # Resolve the entity's physical collection names:
        # [0] bt_current, [2] source, [3] dq_result (indices per helper contract).
        core_tables = get_be_core_table_names(
            self.dnx_config.config_db_url,
            self.dnx_config.org_business_entities_collection, be_id)
        bt_current_collection = core_tables[0]
        source_collection = core_tables[2]
        dq_result_collection = core_tables[3]
        # print(core_tables)
        base_bt_current_data_set = self.dnx_db_path + bt_current_collection
        # Partitioned layout: one SourceID=<id> directory per source.
        src_f_data_set = self.src_f_db_path + source_collection + "\\SourceID=" + str(
            source_id)
        result_data_set = self.result_db_path + dq_result_collection
        # self.all_result_data_set.append(result_data_set) if result_data_set not in self.all_result_data_set else None
        result_data_set_tmp = result_data_set + "_tmp"
        # The source F data is only needed when the first level joins with it.
        if current_lvl_no == 1 and join_with_f == 1:
            src_f_data = read_all_from_parquet_delayed(src_f_data_set)
        else:
            src_f_data = None
        # src_f_data = src_f_data.set_index('rowkey')
        # Skip silently when the entity has no current BT dataset on disk.
        if is_dir_exists(base_bt_current_data_set):
            self.execute_lvl_data_rules(
                src_f_data, base_bt_current_data_set, result_data_set,
                result_data_set_tmp, source_id, be_att_dr_id, category_no,
                be_att_id, rule_id, g_result, current_lvl_no, next_pass,
                next_fail, join_with_f, kwargs)
def get_source_categories(config_db_url, category_no=None):
    """Return the distinct active rule categories (optionally just *category_no*).

    config_db_url -- connection URL of the configuration database
    category_no   -- optional category to filter on

    Fix: the query projects only t1.category_no (the old source_id column was
    dropped — see the leftover comment), but the ORDER BY still referenced
    t1.be_data_source_id, which is invalid with SELECT DISTINCT on standard
    SQL engines. Sort on the projected column only.
    """
    # t1.be_data_source_id source_id,
    source_categories_query = "select distinct t1.category_no from be_attributes_data_rules t1 " + \
        " join be_data_sources t2 on t2._id = t1.be_data_source_id and t2.active = 1" \
        " where t1.active = 1 "
    if category_no:
        source_categories_query = source_categories_query + " and t1.category_no = " + str(
            category_no)
    # Only category_no is in the select list, so it is the only legal sort key.
    source_categories_query = source_categories_query + " order by t1.category_no"
    # print(source_categories_query)
    source_categories = get_all_data_from_source(config_db_url, None,
                                                 source_categories_query)
    return source_categories
def get_next_be_att_id_category(self, source_id, be_att_id, current_category):
    """Return the next (higher) rule category configured for an attribute on a source.

    Falls back to current_category + 1 when no higher category exists.
    """
    next_category_query = "select min(category_no) next_category_no" \
        " from " + self.dnx_config.be_attributes_data_rules_collection + \
        " where be_att_id = " + str(be_att_id) + \
        " and be_data_source_id = " + single_quotes(str(source_id)) + \
        " and category_no > " + str(current_category)
    # print('next_category_query', next_category_query)
    next_category = get_all_data_from_source(
        self.dnx_config.config_db_url, None,
        next_category_query)['next_category_no'].values[0]
    # min() over zero rows yields SQL NULL, which pandas may surface as None
    # *or* float NaN depending on the column dtype. The old `is None` check
    # missed NaN, letting it poison the later arithmetic; `x != x` is the
    # dependency-free NaN test.
    if next_category is None or next_category != next_category:
        next_category = current_category + 1
    return next_category
def get_att_ids_df(self, be_data_source_id):
    """Return the attribute mapping of one data source as a dataframe.

    Columns: AttributeName (source query column), AttributeID (be_att_id),
    ResetDQStage (the minimum rule category configured for that attribute).
    """
    cfg = self.dnx_config
    sql = ("select query_column_name, be_att_id from "
           + cfg.be_data_sources_mapping_collection
           + " where be_data_source_id = " + single_quotes(be_data_source_id))
    mapping = get_all_data_from_source(cfg.config_db_url, None, sql)
    # Each attribute restarts DQ at its lowest configured rule category.
    mapping['ResetDQStage'] = mapping.apply(
        lambda row: get_minimum_category(
            cfg.config_db_url, "", cfg.be_attributes_data_rules_collection,
            row['be_att_id']),
        axis=1)
    return mapping.rename(index=str,
                          columns={"query_column_name": "AttributeName",
                                   "be_att_id": "AttributeID"})
def get_be_att_ids(config_db_url, category_no, source_id=None, be_att_id=None):
    """List distinct (be_data_source_id, be_att_id) pairs with active rules in *category_no*.

    When both source_id and be_att_id are given, the result is narrowed to that
    single pair; supplying only one of them has no effect (original behavior).
    """
    sql = ("select distinct t1.be_data_source_id, t1.be_att_id from be_attributes_data_rules t1 "
           + " join be_data_sources t2 on t2._id = t1.be_data_source_id and t2.active = 1"
             " where t1.active = 1"
             " and t1.category_no = " + str(category_no))
    if source_id and be_att_id:
        sql += (" and t1.be_data_source_id = " + str(source_id)
                + " and t1.be_att_id = " + str(be_att_id))
    sql += " order by be_data_source_id, t1.be_att_id"
    return get_all_data_from_source(config_db_url, None, sql)
module_path = os.path.dirname(sys.modules['__main__'].__file__) run_time = datetime.datetime.now() result = get_cpu_count_cpu_num_workers(dnx_config.config_db_url, dnx_config.parameters_collection, no_of_subprocess=None) bt_cpu_count, bt_cpu_num_workers = result[0], result[1] result = get_cpu_count_cpu_num_workers(dnx_config.config_db_url, dnx_config.parameters_collection, no_of_subprocess=1) dq_cpu_count, dq_cpu_num_workers = result[0], result[1] run_engine_query = "select RD, BT, DQ from " + dnx_config.run_engine_collection + " where start_time = '' " run_engine_data = get_all_data_from_source(dnx_config.config_db_url, None, run_engine_query) # run_engine_data = config_database[dnx_config.run_engine_collection].find({'start_time': ''}) # print('run_engine_data', run_engine_query, run_engine_data) for i, run_engine_row in run_engine_data.iterrows(): # print(run_engine_row) RD = run_engine_row['RD'] BT = run_engine_row['BT'] DQ = run_engine_row['DQ'] if BT == 1: if RD == 1: load_source_data_time = datetime.datetime.now() to_run = module_path + '/load_source_data/load_source_data.py' inputs = "cpu_count=" + str(bt_cpu_count) print('start loading data from sources ...')