Exemple #1
0
    def get_be_ids(config_db_url, p_be_id=None):
        """Return the distinct ``be_id`` values that own active rule levels.

        Two queries are issued against ``config_db_url``: the first collects
        the distinct ``be_att_id`` values of active rows in
        ``be_attributes_data_rules_lvls``; the second selects the distinct
        ``be_id`` of the ``be_attributes`` rows whose ``_id`` is in that set.

        :param config_db_url: connection URL handed to
            ``get_all_data_from_source``.
        :param p_be_id: optional ``be_id`` to restrict the result to; the
            filter is only appended when the value is truthy.
        :return: whatever ``get_all_data_from_source`` returns for the
            second query.
        """
        active_att_rows = get_all_data_from_source(
            config_db_url, None,
            "select distinct be_att_id from be_attributes_data_rules_lvls where active = 1"
        )

        # The trailing 1 is forwarded to list_to_string unchanged
        # (presumably a quoting flag -- TODO confirm against the helper).
        att_id_csv = list_to_string(
            data_to_list(active_att_rows['be_att_id']), ", ", 1)

        query_parts = [
            "select distinct be_id from be_attributes where _id in (",
            att_id_csv,
            ")",
        ]
        if p_be_id:
            query_parts.append(" and be_id = " + str(p_be_id))

        return get_all_data_from_source(config_db_url, None,
                                        "".join(query_parts))
Exemple #2
0
    def get_be_source_ids(self, be_id):
        """Return the active, mapped data-source ids of a business entity.

        The distinct ``be_data_source_id`` values found in the mapping
        collection are used as an ``in (...)`` filter on the data-sources
        collection, together with ``active = 1`` and the given ``be_id``.

        :param be_id: business-entity id; passed through ``single_quotes``
            before being placed in the query.
        :return: the result of ``get_all_data_from_source`` for the final
            query, whose only selected column is aliased ``SourceID``.
        """
        cfg = self.dnx_config

        mapped_sql = ('select distinct be_data_source_id from ' +
                      cfg.be_data_sources_mapping_collection)
        mapped_rows = get_all_data_from_source(cfg.config_db_url, None,
                                               mapped_sql)
        mapped_ids_csv = list_to_string(
            data_to_list(mapped_rows['be_data_source_id']), ", ", 1)

        sources_sql = ('select distinct _id SourceID from ' +
                       cfg.be_data_sources_collection +
                       ' where active = 1 and _id in (' + mapped_ids_csv +
                       ') and be_id = ' + single_quotes(be_id))
        return get_all_data_from_source(cfg.config_db_url, None, sources_sql)
Exemple #3
0
    def get_be_id_by_be_att_id(self, be_att_id):
        """Look up the ``be_id`` of the attribute row whose ``_id`` matches.

        :param be_att_id: attribute id; converted with ``str`` and inserted
            unquoted into the query.
        :return: the first value of the ``be_id`` column of the query result.
        """
        lookup_sql = ("select be_id from " +
                      self.dnx_config.be_attributes_collection +
                      " where _id = " + str(be_att_id))
        matches = get_all_data_from_source(self.dnx_config.config_db_url,
                                           None, lookup_sql)
        # Only the first row is used; indexing fails if nothing matched.
        return matches['be_id'].values[0]
def get_cpu_count_cpu_num_workers(config_db_url,
                                  parameters_collection,
                                  no_of_subprocess=None):
    """Derive a process count and a per-process worker count.

    :param config_db_url: connection URL used only when
        ``no_of_subprocess`` is ``None``.
    :param parameters_collection: table/collection name holding the
        ``no_of_subprocess`` parameter row; used only in that same case.
    :param no_of_subprocess: requested number of sub-processes. When
        ``None`` the value is read from ``parameters_collection``.
    :return: ``(cpu_count, cpu_num_workers)``. ``cpu_count`` is the
        requested value when it lies in ``1..multiprocessing.cpu_count()``
        and the machine's CPU count otherwise. ``cpu_num_workers`` is 1 when
        ``cpu_count`` equals the machine's CPU count, else the floor of
        machine CPUs divided by ``cpu_count``.
    """
    if no_of_subprocess is None:
        param_sql = ("select value from " + parameters_collection +
                     " where _id = 'no_of_subprocess' ")
        param_rows = get_all_data_from_source(config_db_url, None, param_sql)
        no_of_subprocess = int(list_to_string(param_rows['value'].values))

    available_cores = multiprocessing.cpu_count()

    # An out-of-range request (<= 0 or more than the machine has) falls
    # back to using every core.
    request_in_range = 0 < no_of_subprocess <= available_cores
    cpu_count = no_of_subprocess if request_in_range else available_cores

    if cpu_count == available_cores:
        return cpu_count, 1
    return cpu_count, math.floor(available_cores / cpu_count)
Exemple #5
0
    def get_source_connection_credentials(self, source_id):
        """Fetch the query, connection URL and schema for a data source.

        The data-sources collection is read first to obtain the source's
        ``query`` and ``org_connection_id``; that id is then used to read
        ``url`` and ``schema`` from the org-connections collection.

        :param source_id: ``_id`` of the data source (single-quoted in the
            query).
        :return: ``(source_url, source_schema, source_query)``, each being
            the ``list_to_string`` rendering of the matching column values.
        """
        cfg = self.dnx_config

        source_sql = ('select query, org_connection_id from ' +
                      cfg.be_data_sources_collection + ' where _id = ' +
                      single_quotes(source_id))
        source_rows = get_all_data_from_source(cfg.config_db_url, None,
                                               source_sql)
        source_query = list_to_string(source_rows['query'].values)
        connection_id = list_to_string(source_rows['org_connection_id'].values)

        connection_sql = ('select url, schema from ' +
                          cfg.org_connections_collection + ' where _id = ' +
                          single_quotes(connection_id))
        connection_rows = get_all_data_from_source(cfg.config_db_url, None,
                                                   connection_sql)
        source_url = list_to_string(connection_rows['url'].values)
        source_schema = list_to_string(connection_rows['schema'].values)

        return source_url, source_schema, source_query
Exemple #6
0
    def get_rowkey_column_name(self, source_id, be_id):
        """Return the source query column mapped to the ``att_id = '0'`` attribute.

        The mapping collection is filtered by ``be_data_source_id`` and by
        the attribute row of ``be_id`` whose ``att_id`` is ``'0'``.

        :param source_id: data-source id (single-quoted in the query).
        :param be_id: business-entity id (single-quoted in the query).
        :return: ``list_to_string`` of the matching ``query_column_name``
            values.
        """
        cfg = self.dnx_config

        attribute_subquery = ("(select _id from " +
                              cfg.be_attributes_collection +
                              " where be_id = " + single_quotes(be_id) +
                              " and att_id = '0')")
        rowkey_sql = ("select query_column_name " +
                      " from " + cfg.be_data_sources_mapping_collection +
                      " where be_data_source_id = " +
                      single_quotes(source_id) +
                      " and be_att_id = " + attribute_subquery)

        rowkey_rows = get_all_data_from_source(cfg.config_db_url, None,
                                               rowkey_sql)
        return list_to_string(rowkey_rows['query_column_name'].values)
Exemple #7
0
    def get_source_category_rules(config_db_url, category_no):
        """Return the active data rules of one category for active sources.

        Reads ``be_attributes_data_rules`` joined to active
        ``be_data_sources`` rows, filtered by ``category_no`` and ordered by
        data source and attribute id.

        :param config_db_url: connection URL handed to
            ``get_all_data_from_source``.
        :param category_no: category number; inserted via ``str`` unquoted.
        :return: the query result with columns ``_id``, ``be_att_id``,
            ``be_data_source_id`` and ``join_with_f``.
        """
        select_clause = ("select t1._id, t1.be_att_id, t1.be_data_source_id, "
                         "join_with_f from be_attributes_data_rules t1 ")
        join_clause = (" join be_data_sources t2 on t2._id = "
                       "t1.be_data_source_id and t2.active = 1")
        where_clause = (" where t1.active = 1" +
                        " and t1.category_no = " + str(category_no))
        order_clause = " order by t1.be_data_source_id, be_att_id"

        rules_sql = select_clause + join_clause + where_clause + order_clause
        return get_all_data_from_source(config_db_url, None, rules_sql)
Exemple #8
0
 def get_source_column_name(self, source_id, be_id):
     """Map each source query column to its ``F<be_att_id>`` name.

     Covers the mapping rows of ``source_id`` whose attribute belongs to
     ``be_id`` and has ``att_id != 0``.

     :param source_id: data-source id (single-quoted in the query).
     :param be_id: business-entity id (single-quoted in the query).
     :return: dict of ``query_column_name`` -> ``F_be_att_id``; a later
         row overwrites an earlier one with the same column name.
     """
     cfg = self.dnx_config

     attribute_subquery = ("(select _id from " +
                           cfg.be_attributes_collection +
                           " where be_id = " + single_quotes(be_id) +
                           " and att_id != 0)")
     mapping_sql = ("select query_column_name,  'F'||be_att_id F_be_att_id" +
                    " from " + cfg.be_data_sources_mapping_collection +
                    " where be_data_source_id = " +
                    single_quotes(source_id) +
                    " and be_att_id in " + attribute_subquery)

     mapping_rows = get_all_data_from_source(cfg.config_db_url, None,
                                             mapping_sql)
     return {
         row['query_column_name']: row['F_be_att_id']
         for _, row in mapping_rows.iterrows()
     }
Exemple #9
0
    def execute_data_rules(self, category_no, be_att_dr_id, source_id,
                           join_with_f):
        """Run every active level of one attribute data rule, in level order.

        The active level rows of ``be_att_dr_id`` are read ordered by
        ``level_no``. For each level the core table names of the owning
        business entity are resolved, the data-set paths are assembled, and
        ``execute_lvl_data_rules`` is invoked, but only when the BT-current
        data set directory exists.

        :param category_no: forwarded to ``execute_lvl_data_rules``.
        :param be_att_dr_id: id of the attribute data rule whose levels run.
        :param source_id: data-source id; also used to build the
            ``SourceID=`` partition path of the source data set.
        :param join_with_f: when equal to 1, the source data set is loaded
            for the first level; it is also forwarded unchanged.
        :return: ``None``.
        """
        cfg = self.dnx_config

        levels_sql = (
            "select be_att_id, rule_id, next_pass, next_fail, kwargs from " +
            cfg.be_attributes_data_rules_lvls_collection +
            " where active = 1 and be_att_dr_id = " + str(be_att_dr_id) +
            " order by level_no")
        level_rows = get_all_data_from_source(cfg.config_db_url, None,
                                              levels_sql)
        total_levels = len(level_rows.index)

        for current_lvl_no, (_, level) in enumerate(level_rows.iterrows(),
                                                    start=1):
            be_att_id = level['be_att_id']
            rule_id = level['rule_id']
            next_pass = level['next_pass']
            next_fail = level['next_fail']
            kwargs = level['kwargs']

            # g_result is 1 only on the last level of the rule.
            if total_levels == current_lvl_no:
                g_result = 1
            else:
                g_result = 0

            be_id = self.get_be_id_by_be_att_id(str(be_att_id))
            core_tables = get_be_core_table_names(
                cfg.config_db_url, cfg.org_business_entities_collection,
                be_id)
            # Positions 0, 2 and 3 are the ones consumed here.
            bt_current_collection = core_tables[0]
            source_collection = core_tables[2]
            dq_result_collection = core_tables[3]

            base_bt_current_data_set = self.dnx_db_path + bt_current_collection
            src_f_data_set = (self.src_f_db_path + source_collection +
                              "\\SourceID=" + str(source_id))
            result_data_set = self.result_db_path + dq_result_collection
            result_data_set_tmp = result_data_set + "_tmp"

            # The source data is loaded only for the first level and only
            # when a join was requested; later levels receive None.
            needs_source_data = current_lvl_no == 1 and join_with_f == 1
            if needs_source_data:
                src_f_data = read_all_from_parquet_delayed(src_f_data_set)
            else:
                src_f_data = None

            if not is_dir_exists(base_bt_current_data_set):
                continue

            self.execute_lvl_data_rules(
                src_f_data, base_bt_current_data_set, result_data_set,
                result_data_set_tmp, source_id, be_att_dr_id, category_no,
                be_att_id, rule_id, g_result, current_lvl_no, next_pass,
                next_fail, join_with_f, kwargs)
Exemple #10
0
    def get_source_categories(config_db_url, category_no=None):
        """Return the distinct category numbers of active data rules.

        Only rules joined to an active ``be_data_sources`` row are
        considered.

        :param config_db_url: connection URL handed to
            ``get_all_data_from_source``.
        :param category_no: optional category to restrict to; the filter is
            only appended when the value is truthy.
        :return: the query result, with a single ``category_no`` column.
        """
        sql_parts = [
            "select distinct t1.category_no from be_attributes_data_rules t1 ",
            " join be_data_sources t2 on t2._id = t1.be_data_source_id and t2.active = 1",
            " where t1.active = 1 ",
        ]
        if category_no:
            sql_parts.append(" and t1.category_no = " + str(category_no))
        sql_parts.append(" order by t1.be_data_source_id, t1.category_no")

        return get_all_data_from_source(config_db_url, None,
                                        "".join(sql_parts))
Exemple #11
0
    def get_next_be_att_id_category(self, source_id, be_att_id,
                                    current_category):
        """Return the next category number configured for an attribute.

        Queries the smallest ``category_no`` greater than
        ``current_category`` for the given attribute and data source.

        :param source_id: data-source id (stringified and single-quoted).
        :param be_att_id: attribute id (inserted via ``str`` unquoted).
        :param current_category: category currently being processed.
        :return: the queried minimum, or ``current_category + 1`` when the
            queried value is ``None``.
        """
        lookup_sql = ("select min(category_no) next_category_no" +
                      " from " +
                      self.dnx_config.be_attributes_data_rules_collection +
                      " where be_att_id = " + str(be_att_id) +
                      " and be_data_source_id =  " +
                      single_quotes(str(source_id)) +
                      " and category_no > " + str(current_category))
        result = get_all_data_from_source(self.dnx_config.config_db_url, None,
                                          lookup_sql)
        queried_minimum = result['next_category_no'].values[0]

        if queried_minimum is not None:
            return queried_minimum
        return current_category + 1
Exemple #12
0
    def get_att_ids_df(self, be_data_source_id):
        """Return the attribute mapping of a data source with its reset stage.

        Reads ``query_column_name`` and ``be_att_id`` from the mapping
        collection for the given source, adds a ``ResetDQStage`` column
        computed per row by ``get_minimum_category``, and renames the two
        original columns to ``AttributeName`` and ``AttributeID``.

        :param be_data_source_id: data-source id (single-quoted in the
            query).
        :return: the renamed frame; its index labels are converted with
            ``str``.
        """
        cfg = self.dnx_config

        mapping_sql = ("select query_column_name, be_att_id from " +
                       cfg.be_data_sources_mapping_collection +
                       " where be_data_source_id = " +
                       single_quotes(be_data_source_id))
        mapping_df = get_all_data_from_source(cfg.config_db_url, None,
                                              mapping_sql)

        def _reset_stage(row):
            # One get_minimum_category call per mapping row.
            return get_minimum_category(
                self.dnx_config.config_db_url, "",
                self.dnx_config.be_attributes_data_rules_collection,
                row['be_att_id'])

        mapping_df['ResetDQStage'] = mapping_df.apply(_reset_stage, axis=1)

        renamed_columns = {
            "query_column_name": "AttributeName",
            "be_att_id": "AttributeID",
        }
        return mapping_df.rename(index=str, columns=renamed_columns)
Exemple #13
0
    def get_be_att_ids(config_db_url,
                       category_no,
                       source_id=None,
                       be_att_id=None):
        """Return distinct (data source, attribute) pairs of a rule category.

        Only active rules joined to an active ``be_data_sources`` row are
        considered, ordered by data source and attribute id.

        :param config_db_url: connection URL handed to
            ``get_all_data_from_source``.
        :param category_no: category number; inserted via ``str`` unquoted.
        :param source_id: optional data-source filter.
        :param be_att_id: optional attribute filter.
        :return: the query result with columns ``be_data_source_id`` and
            ``be_att_id``.

        The source/attribute filters are appended only when BOTH values are
        truthy; supplying just one of them leaves the query unfiltered.
        """
        sql_parts = [
            "select distinct t1.be_data_source_id, t1.be_att_id from be_attributes_data_rules t1 ",
            " join be_data_sources t2 on t2._id = t1.be_data_source_id and t2.active = 1",
            " where t1.active = 1",
            " and t1.category_no = " + str(category_no),
        ]
        if source_id and be_att_id:
            sql_parts.append(" and t1.be_data_source_id = " + str(source_id))
            sql_parts.append(" and t1.be_att_id = " + str(be_att_id))
        sql_parts.append(" order by be_data_source_id, t1.be_att_id")

        return get_all_data_from_source(config_db_url, None,
                                        "".join(sql_parts))
    module_path = os.path.dirname(sys.modules['__main__'].__file__)

    run_time = datetime.datetime.now()

    result = get_cpu_count_cpu_num_workers(dnx_config.config_db_url,
                                           dnx_config.parameters_collection,
                                           no_of_subprocess=None)
    bt_cpu_count, bt_cpu_num_workers = result[0], result[1]

    result = get_cpu_count_cpu_num_workers(dnx_config.config_db_url,
                                           dnx_config.parameters_collection,
                                           no_of_subprocess=1)
    dq_cpu_count, dq_cpu_num_workers = result[0], result[1]

    run_engine_query = "select RD, BT, DQ from " + dnx_config.run_engine_collection + " where start_time = '' "
    run_engine_data = get_all_data_from_source(dnx_config.config_db_url, None,
                                               run_engine_query)

    # run_engine_data = config_database[dnx_config.run_engine_collection].find({'start_time': ''})
    # print('run_engine_data', run_engine_query, run_engine_data)
    for i, run_engine_row in run_engine_data.iterrows():
        # print(run_engine_row)
        RD = run_engine_row['RD']
        BT = run_engine_row['BT']
        DQ = run_engine_row['DQ']

        if BT == 1:
            if RD == 1:
                load_source_data_time = datetime.datetime.now()
                to_run = module_path + '/load_source_data/load_source_data.py'
                inputs = "cpu_count=" + str(bt_cpu_count)
                print('start loading data from sources ...')