Example #1
 def target_dir(self):
     """Returns the target directory"""
     _val = '{root_saves}/{domain}/{database}/{table}'
     _val = _val.format(
         root_saves=self.cfg_mgr.root_hdfs_saves,
         domain=self.domain,
         database=Utilities.replace_special_chars(self.database),
         table=Utilities.replace_special_chars(self.table_name))
     return _val
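The method simply fills the '{root_saves}/{domain}/{database}/{table}' pattern from config and instance attributes. A minimal sketch of the same formatting with hypothetical stand-in values (the real values come from cfg_mgr and the it_table entry):

# Hypothetical stand-ins for cfg_mgr.root_hdfs_saves, domain, database and table_name
path = '{root_saves}/{domain}/{database}/{table}'.format(
    root_saves='/user/data/ingest',
    domain='claims',
    database='claims_db',
    table='member_claims')
# path == '/user/data/ingest/claims/claims_db/member_claims'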
Example #2
def parallel_dryrun_workflows(info):
    """Dry run workflows in parallel.
    For sake of multiprocessing.Pool, this needs to be a top level function
    Args:
        info: List[cfg_mgr, workflow.xml]
    """
    cfg_mgr = info[0]
    workflow_name = info[1]
    utils = Utilities(cfg_mgr)
    status = utils.dryrun_workflow(workflow_name)
    return status, workflow_name
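Because the function lives at module level it pickles cleanly and can be handed straight to multiprocessing.Pool. A sketch of a hypothetical dryrun_all helper that fans it out over several workflows (assumes cfg_mgr itself is picklable):

import multiprocessing

def dryrun_all(cfg_mgr, workflow_names, processes=4):
    """Hypothetical helper: dry run each workflow in a worker process."""
    pool = multiprocessing.Pool(processes=processes)
    try:
        # each work item matches the [cfg_mgr, workflow_name] shape expected above
        return pool.map(parallel_dryrun_workflows,
                        [[cfg_mgr, name] for name in workflow_names])
    finally:
        pool.close()
        pool.join()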
Example #3
    def reingest_hql(self, view, src_table, src_db, full_table):
        """Create HQLs for all table to reingest with row data from it_table
        @view : views from ibis it_table provides team name
        @src_table : source_table_name from ibis it_table for table name
        @src_db : source_database_name from ibis it_table for db name
        @full_table : full_table_name from ibis it_table for hdfs table name
        """
        view_full_name = '{view_name}.{database}_{table_name}'
        view_full_name = view_full_name.format(view_name=view,
                                               database=src_db,
                                               table_name=src_table)
        src_view = full_table
        views_hql = ('SET mapred.job.queue.name={queue_name};\n'
                     'SET mapreduce.map.memory.mb=8000;\n'
                     'SET mapreduce.reduce.memory.mb=16000;\n\n'
                     'DROP VIEW IF EXISTS {view_full_name};\n\n'
                     'DROP TABLE IF EXISTS {view_full_name};\n\n'
                     'set hive.exec.dynamic.partition.mode=nonstrict;\n\n'
                     'CREATE DATABASE IF NOT EXISTS {view_name};\n\n'
                     'CREATE TABLE {view_full_name} like {src_view};\n\n'
                     'INSERT OVERWRITE TABLE {view_full_name} '
                     'PARTITION(incr_ingest_timestamp) '
                     'select * from {src_view} ;\n\n')
        views_hql = views_hql.format(queue_name=self.queue,
                                     view_full_name=view_full_name,
                                     view_name=view,
                                     src_view=src_view)

        file_name = view + '_' + getpass.getuser() + '_' + \
            'full_' + Utilities.replace_special_chars(src_table)+'.hql'
        act_file_name = os.path.join(self.cfg_mgr.files, file_name)
        file_h = open(act_file_name, "wb+")
        file_h.write(views_hql)
        file_h.close()
        return act_file_name
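A usage sketch with hypothetical it_table values; builder stands in for an instance of the class that defines reingest_hql, and the returned path points at the generated .hql under cfg_mgr.files:

# 'builder' and the argument values below are hypothetical
hql_path = builder.reingest_hql(view='analytics',
                                src_table='member_claims',
                                src_db='claims_db',
                                full_table='ingest.claims_db_member_claims')
# hql_path -> <cfg_mgr.files>/analytics_<user>_full_member_claims.hql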
Example #4
 def __init__(self, cfg_mgr):
     self.cfg_mgr = cfg_mgr
     self.utilities = Utilities(self.cfg_mgr)
     self.logger = get_logger(self.cfg_mgr)
     self.queue = self.cfg_mgr.queue_name
     self.table = self.cfg_mgr.it_table
     self.freq_ingest_table = self.cfg_mgr.freq_ingest
Example #5
 def parse_file(self, path):
     """parses DSL file and validates it
     Args:
         path: DSL file path
     Returns:
         action_data: list of action names from config file
     """
     with open(path, 'rb') as file_h:
         lines = file_h.readlines()
     lines = Utilities.clean_lines(lines)
     action_data = []
     for index, line in enumerate(lines):
         try:
             result = self.pattern.parseString(line)
             action_data.append(result['action_body']['action_id'])
         except pp.ParseException as parse_ex:
             err_msg = "Line {lineno}, column {err.col}:\n"
             err_msg += "Fix this line: '{err.line}'"
             err_msg = err_msg.format(err=parse_ex, lineno=index+1)
             err_msg = Utilities.print_box_msg(err_msg, border_char='x')
             raise ValueError(err_msg)
     return action_data
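A sketch of calling parse_file on a hypothetical DSL file; a line that fails to parse surfaces as a ValueError carrying the line number and column pulled from the pyparsing exception:

# 'parser' is assumed to be an instance of the class that defines parse_file
try:
    action_names = parser.parse_file('/tmp/request.dsl')  # hypothetical path
except ValueError as err:
    print(err)  # boxed message pointing at the offending line and column
else:
    print(action_names)  # e.g. ['import_prep', 'import', 'avro']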
Example #6
    def views_helper(self, _views):
        """helper"""
        if isinstance(_views, list):
            pass
        elif isinstance(_views, str) and _views:
            reg_lst = r'[^A-Za-z0-9|_]'
            strsrch = re.search(reg_lst, _views)

            if strsrch is None:
                _views = _views.split('|')
            else:
                msg = ("characters other than Alphabets and Pipe and "
                       "underscore is not allowed in view "
                       "value: '{0}'").format(_views)
                raise ValueError(Utilities.print_box_msg(msg, border_char='x'))

        elif not _views:
            _views = []
        else:
            raise ValueError("Views is a pipe separated "
                             "value: '{0}'".format(_views))
        return _views
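The accepted input shapes, shown as a short sketch (obj stands in for an instance of the defining class):

obj.views_helper(['teamA', 'teamB'])   # lists pass through unchanged
obj.views_helper('teamA|teamB')        # -> ['teamA', 'teamB']
obj.views_helper('')                   # -> []
obj.views_helper('teamA|*teamB')       # raises ValueError: only letters, digits, '|' and '_' allowed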
Example #7
    def test_update_views(self):
        """update views"""
        it_obj = ItTable(self.it_table_dict, self.cfg_mgr)
        # test update
        it_obj.views = 'fake_view_open|call'
        self.assertEqual(it_obj.views, 'fake_view_im|fake_view_open|call')
        self.assertTrue(
            set(it_obj.views_list) == set(
                ['fake_view_im', 'fake_view_open', 'call']))
        # test duplicate
        it_obj.views = 'fake_view_open|fake_view_im|call'
        self.assertEqual(it_obj.views, 'fake_view_im|fake_view_open|call')
        self.assertTrue(
            set(it_obj.views_list) == set(
                ['fake_view_im', 'fake_view_open', 'call']))

        strchk = Utilities.print_box_msg(("characters other than alphabets, "
                                          "pipe and underscore are not"
                                          " allowed in view"),
                                         border_char='x')

        with self.assertRaises(ValueError) as context:
            it_obj.views = 'fake_view_open| *call'
            self.assertTrue(strchk in str(context.exception))
Example #8
 def setUpClass(cls):
     cls.utilities = Utilities(ConfigManager(UNIT_TEST_ENV))
     cls.utilities_run = Utilities(ConfigManager('JENKINS', '', True))
     cls.td_tbl_weekly = {
         'load': '100001',
         'mappers': 20,
         'domain': 'fake_view_im',
         'target_dir': 'mdm/fake_view_im/fake_database_3/'
         'fake_services_tablename',
         'password_file': '/user/dev/fake.password.file',
         'source_table_name': 'fake_services_tablename',
         'hold': 0,
         'split_by': 'epi_id',
         'fetch_size': 50000,
         'source_database_name': 'fake_database_3',
         'connection_factories': 'com.cloudera.connector.teradata.'
         'TeradataManagerFactory',
         'full_table_name': 'fake_view_im.'
         'fake_database_3_fake_services_tablename',
         'db_username': '******',
         'jdbcurl': 'jdbc:teradata://fake.teradata/DATABASE='
         'fake_database_3',
         'views': 'analytics'
     }
     cls.oracle_tbl_monthly = {
         'load': '010001',
         'mappers': 20,
         'domain': 'fake_domain',
         'target_dir': 'mdm/fake_domain/fake_database_1/fake_svc_tablename',
         'password_file': '/user/dev/fake.password.file',
         'source_table_name': 'fake_svc_tablename',
         'hold': 0,
         'split_by': '',
         'fetch_size': 50000,
         'source_database_name': 'fake_database_1',
         'connection_factories': 'com.quest.oraoop.OraOopManagerFactory',
         'full_table_name':
         'fake_domain_fake_database_1_fake_svc_tablename',
         'db_username': '******',
         'jdbcurl': 'jdbc:oracle:thin:@//fake.oracle:1521/fake_servicename'
     }
     cls.sql_tbl_quarterly = {
         'full_table_name': 'logs.fake_database_2_fake_tools_tablename',
         'domain': 'logs',
         'target_dir': 'mdm/logs/fake_database_2/fake_tools_tablename',
         'split_by': 'fake_split_by',
         'mappers': 10,
         'jdbcurl': 'jdbc:sqlserver://fake.sqlserver:'
         '1433;database=fake_database_2',
         'connection_factories': 'com.cloudera.sqoop.manager.'
         'DefaultManagerFactory',
         'db_username': '******',
         'password_file': '/user/dev/fake.password.file',
         'load': '001100',
         'fetch_size': 50000,
         'hold': 0,
         'source_database_name': 'fake_database_2',
         'source_table_name': 'fake_tools_tablename'
     }
     cls.db2_tbl_fortnightly = {
         'full_table_name': 'rx.fake_database_4_fake_rx_tablename',
         'domain': 'rx',
         'target_dir': 'mdm/rx/fake_database_4/fake_rx_tablename',
         'split_by': '',
         'mappers': 1,
         'jdbcurl': 'jdbc:db2://fake.db2:50400/fake_servicename',
         'connection_factories': 'com.cloudera.sqoop.manager.'
         'DefaultManagerFactory',
         'db_username': '******',
         'password_file': '/user/dev/fake.password.file',
         'load': '110100',
         'fetch_size': 50000,
         'hold': 0,
         'source_database_name': 'fake_database_4',
         'source_table_name': 'fake_rx_tablename'
     }
     cls.mysql_tbl_fortnightly = {
         'full_table_name': 'dashboard.fake_servicename',
         'domain': 'dashboard',
         'target_dir': 'mdm/dashboard/fake_servicename/',
         'split_by': '',
         'mappers': 1,
         'jdbcurl': 'jdbc:mysql://'
         'fake.mysql:3306/fake_servicename',
         'connection_factories': 'com.cloudera.sqoop.manager.'
         'DefaultManagerFactory',
         'db_username': '******',
         'password_file': '/user/dev/fake.password.file',
         'load': '110100',
         'fetch_size': 50000,
         'hold': 0,
         'source_database_name': 'dashboard',
         'source_table_name': 'fake_servicename'
     }
     cls.appl_ref_id_tbl = {
         'job_name': 'C1_FAKE_CALL_FAKE_DATABASE_DAILY',
         'frequency': 'Daily',
         'time': '6:00',
         'string_date': 'Every Day',
         'ksh_name': 'call_fake_database_daily',
         'domain': 'call',
         'db': 'fake_database',
         'environment': 'DEV'
     }
Example #9
 def test_clean_lines(self):
     """test clean lines"""
     test_input = ['a\n', 'b\r\n', '   ', 'c\n', '\n', '  ']
     expected = ['a', 'b', 'c']
     test_lines = Utilities.clean_lines(test_input)
     self.assertTrue(set(test_lines) == set(expected))
Example #10
def parse_file_by_sections(section_file, header, required_keys,
                           optional_fields):
    """Split a file by a common header.
    checks if it contains complete sections using a list of required keys.
    Returns a list of each section as a dictionary and a message
    """
    # List used to store all sections
    all_sections = []
    msg = ''
    log_msg = ''
    try:
        lines = section_file.readlines()
        lines = Utilities.clean_lines(lines)
        last = len(lines) - 1
        # List used to store a section
        section = {}
        flag = False

        def add_to_dictionary():
            """compare keys"""
            if set(required_keys) <= set(section.keys()):
                # Add complete section to list
                all_sections.append(section)
            else:
                diff_keys = set(required_keys).difference(section.keys())
                missed_keys = ", ".join(sorted(diff_keys))
                err_msg = '\n\n' + '%' * 100
                err_msg += "\n\tMissing mandatory fields in file: {file}"
                err_msg += " \n\tMandatory fields: {mandatory}"
                err_msg += "\n\tMissing fields: {missed_keys}"
                err_msg += "\n\tEntered fields: {given_fields}"
                err_msg += '\n' + '%' * 100
                err_msg = err_msg.format(
                    file=section_file.name,
                    mandatory=', '.join(sorted(required_keys)),
                    missed_keys=missed_keys,
                    given_fields=', '.join(sorted(section.keys())))
                raise IndexError(err_msg)

            for each in section.keys():
                all_keys = required_keys + optional_fields
                if each not in all_keys:
                    extra_fields = sorted(
                        set(section.keys()).difference(all_keys))
                    err_msg = '\n\n' + '%' * 100
                    err_msg += '\n\tAllowed fields: {0}. '
                    err_msg += '\n\tEntered fields: {1}'
                    err_msg += '\n\tRemove these fields: {2}'
                    err_msg += '\n' + '%' * 100
                    err_msg = err_msg.format(', '.join(sorted(all_keys)),
                                             ', '.join(sorted(section.keys())),
                                             ', '.join(extra_fields))
                    raise IndexError(err_msg)

        header_found = False

        for i, line in enumerate(lines):
            # Check for non empty line
            if line:
                if line == header:
                    header_found = True
                    if flag:
                        add_to_dictionary()
                        section = {}
                else:
                    flag = True
                    # Split the line by the first : occurrence to get key/value
                    # and add it to the section dictionary
                    try:
                        r = line.split(':', 1)
                        if header_found:
                            col = r[0].strip()
                            val = r[1].strip()

                            # remove the condition after 2 weeks
                            if col.rstrip().lower() == 'domain':
                                log_msg = "Domain field is deprecated!!! "
                                log_msg += "Default domain database_i "
                                log_msg += "is assumed. Please stop providing"
                                log_msg += " Domain. Process will begin to"
                                log_msg += " fail in next 2 weeks."
                            # Add key-value pair to section dictionary
                            elif col.rstrip() not in section.keys():
                                val = val.rstrip()
                                val = val.replace('"', '\\"')
                                val = val.replace("'", "\\'")
                                section[col.rstrip()] = val
                            else:
                                err_msg = ("Duplicate Key value pair: '{0}':"
                                           " '{1}'\n{2} header missing?")
                                err_msg = err_msg.format(col, val, header)
                                err_msg = Utilities.print_box_msg(err_msg, 'x')
                                raise ValueError(err_msg)
                        else:
                            err_msg = 'Header {0} not found'.format(header)
                            err_msg = Utilities.print_box_msg(err_msg, 'x')
                            raise ValueError(err_msg)

                        if i == last:
                            add_to_dictionary()

                    except IndexError as err:
                        all_sections = None
                        msg = ("\n Verify the key-value pair in the"
                               " request file: {file}")
                        msg = msg.format(file=section_file.name)
                        msg = err.message + msg
                        raise ValueError(msg)
            else:
                # Empty line
                if i == last:
                    add_to_dictionary()
                else:
                    continue
        section_file.close()
    except IOError:
        msg = 'Error: can\'t find file or read data from {file}'
        msg = msg.format(file=section_file.name)
        raise ValueError(msg)
    except IndexError as err:
        all_sections = None
        msg = err.message
        raise ValueError(msg)
    return all_sections, msg, log_msg
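A sketch of the expected call, assuming a hypothetical request file in which each section is introduced by a literal header line (here '[Request]') followed by 'key:value' lines:

# request.txt (hypothetical contents):
#   [Request]
#   source_database_name:claims_db
#   source_table_name:member_claims
#   jdbcurl:jdbc:teradata://fake.teradata/DATABASE=claims_db
required = ['source_database_name', 'source_table_name', 'jdbcurl']
optional = ['views', 'split_by']
with open('request.txt') as req_file:
    sections, msg, log_msg = parse_file_by_sections(
        req_file, '[Request]', required, optional)
# sections -> one dict per header-delimited block, e.g.
# [{'source_database_name': 'claims_db', 'source_table_name': 'member_claims', 'jdbcurl': '...'}]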
Example #11
    def save_perf_hql(self, table):
        """creates perf view hql"""
        all_file = []
        all_views = table.views_list
        domain = self.cfg_mgr.domains_list
        domain = domain.split(",")
        domain = set(domain).intersection(all_views)
        domain = list(domain)

        for target_db in all_views:
            if len(domain) > 0 and target_db.lower() == domain[0]:
                continue
            view_full_name = '{view_name}.{database}_{table_name}'
            view_full_name = view_full_name.format(view_name=target_db,
                                                   database=table.database,
                                                   table_name=table.table_name)
            src_view = '{src_vw_name}.{src_db}_{src_tbl_name}'
            src_view = src_view.format(src_vw_name=table.domain,
                                       src_db=table.database,
                                       src_tbl_name=table.table_name)

            if len(domain) > 0 and domain[0]:
                team_view_name = '{team_nm}.{team_db}_{team_table}'
                team_view_name = team_view_name.format(
                    team_nm=domain[0],
                    team_db=table.database,
                    team_table=table.table_name)
                views_hql = ('SET mapred.job.queue.name={queue_name};\n'
                             'SET mapreduce.map.memory.mb=8000;\n'
                             'SET mapreduce.reduce.memory.mb=16000;\n\n'
                             'DROP VIEW IF EXISTS {view_full_name};\n'
                             'DROP TABLE IF EXISTS {view_full_name};\n\n'
                             'CREATE DATABASE IF NOT EXISTS {view_name};\n'
                             'CREATE DATABASE IF NOT EXISTS {team_name};\n'
                             'DROP VIEW IF EXISTS {team_view_name};\n'
                             'DROP TABLE IF EXISTS {team_view_name};\n'
                             'CREATE VIEW {team_view_name} AS '
                             'SELECT * FROM {src_view}; \n '
                             'CREATE TABLE {view_full_name} like {src_view};\n'
                             'INSERT OVERWRITE TABLE {view_full_name} '
                             'PARTITION(incr_ingest_timestamp) '
                             'select * from {team_view_name} ;\n\n')
                views_hql = views_hql.format(view_full_name=view_full_name,
                                             view_name=target_db,
                                             src_view=src_view,
                                             team_view_name=team_view_name,
                                             team_name=domain[0],
                                             queue_name=self.queue)
            else:
                views_hql = ('SET mapred.job.queue.name={queue_name};\n'
                             'SET mapreduce.map.memory.mb=8000;\n'
                             'SET mapreduce.reduce.memory.mb=16000;\n\n'
                             'DROP VIEW IF EXISTS {view_full_name};\n'
                             'DROP TABLE IF EXISTS {view_full_name};\n\n'
                             'CREATE DATABASE IF NOT EXISTS {view_name};\n'
                             'CREATE TABLE {view_full_name} like {src_view};\n'
                             'INSERT OVERWRITE TABLE {view_full_name} '
                             'PARTITION(incr_ingest_timestamp) '
                             'select * from {src_view} ;\n\n')
                views_hql = views_hql.format(view_full_name=view_full_name,
                                             view_name=target_db,
                                             src_view=src_view,
                                             queue_name=self.queue)

            views_hql += "msck repair table {0};\n\n".format(view_full_name)

            file_name = table.db_env + '_' + getpass.getuser() + '_' + \
                'full_' + target_db + '_' +\
                Utilities.replace_special_chars(table.table_name)\
                + '.hql'
            all_file.append(file_name)
            act_file_name = os.path.join(self.cfg_mgr.files, file_name)
            file_h = open(act_file_name, "wb+")
            file_h.write(views_hql)
            file_h.close()

        return all_file
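A usage sketch; builder and it_table are hypothetical instances (the latter shaped like the ItTable objects used elsewhere), and one .hql is written under cfg_mgr.files per entry in views_list:

hql_files = builder.save_perf_hql(it_table)   # 'builder' and 'it_table' are hypothetical
# hql_files -> ['<db_env>_<user>_full_<view_db>_<table>.hql', ...]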