def target_dir(self):
    """Build the HDFS saves path for this table.

    Joins the configured root saves directory with the domain, then
    database and table name with special characters replaced.
    """
    path_parts = (
        self.cfg_mgr.root_hdfs_saves,
        self.domain,
        Utilities.replace_special_chars(self.database),
        Utilities.replace_special_chars(self.table_name),
    )
    return '{0}/{1}/{2}/{3}'.format(*path_parts)
def parallel_dryrun_workflows(info):
    """Dry run a single workflow.

    Kept at module level so multiprocessing.Pool can pickle it.

    Args:
        info: two-item sequence [cfg_mgr, workflow xml name]
    Returns:
        tuple of (status, workflow_name)
    """
    cfg_mgr, workflow_name = info[0], info[1]
    status = Utilities(cfg_mgr).dryrun_workflow(workflow_name)
    return status, workflow_name
def reingest_hql(self, view, src_table, src_db, full_table):
    """Create the reingest HQL file for one table.

    Generates a script that drops any existing view/table named
    <view>.<src_db>_<src_table>, recreates it like the source table
    and reloads it with dynamic partitioning, then writes the script
    under cfg_mgr.files.

    @view : views from ibis it_table provides team name
    @src_table : source_table_name from ibis it_table for table name
    @src_db : source_database_name from ibis it_table for db name
    @full_table : full_table_name from ibis it_table for hdfs table name

    Returns:
        path of the generated .hql file
    """
    view_full_name = '{view_name}.{database}_{table_name}'
    view_full_name = view_full_name.format(view_name=view,
                                           database=src_db,
                                           table_name=src_table)
    src_view = full_table
    views_hql = ('SET mapred.job.queue.name={queue_name};\n'
                 'SET mapreduce.map.memory.mb=8000;\n'
                 'SET mapreduce.reduce.memory.mb=16000;\n\n'
                 'DROP VIEW IF EXISTS {view_full_name};\n\n'
                 'DROP TABLE IF EXISTS {view_full_name};\n\n'
                 'set hive.exec.dynamic.partition.mode=nonstrict;\n\n'
                 'CREATE DATABASE IF NOT EXISTS {view_name};\n\n'
                 'CREATE TABLE {view_full_name} like {src_view};\n\n'
                 'INSERT OVERWRITE TABLE {view_full_name} '
                 'PARTITION(incr_ingest_timestamp) '
                 'select * from {src_view} ;\n\n')
    views_hql = views_hql.format(queue_name=self.queue,
                                 view_full_name=view_full_name,
                                 view_name=view,
                                 src_view=src_view)
    file_name = view + '_' + getpass.getuser() + '_' + \
        'full_' + Utilities.replace_special_chars(src_table) + '.hql'
    act_file_name = os.path.join(self.cfg_mgr.files, file_name)
    # context manager guarantees the handle is closed even if the
    # write fails (the original leaked it on error)
    with open(act_file_name, "wb+") as file_h:
        file_h.write(views_hql)
    return act_file_name
def __init__(self, cfg_mgr):
    """Store the config manager and cache the settings used most often."""
    self.cfg_mgr = cfg_mgr
    self.utilities = Utilities(cfg_mgr)
    self.logger = get_logger(cfg_mgr)
    # frequently referenced config values, cached as attributes
    self.queue = cfg_mgr.queue_name
    self.table = cfg_mgr.it_table
    self.freq_ingest_table = cfg_mgr.freq_ingest
def parse_file(self, path):
    """Parse and validate a DSL file.

    Args:
        path: DSL file path
    Returns:
        action_data: list of action names from config file
    Raises:
        ValueError: on the first line that fails to parse
    """
    with open(path, 'rb') as dsl_file:
        raw_lines = dsl_file.readlines()
    action_data = []
    for lineno, line in enumerate(Utilities.clean_lines(raw_lines), 1):
        try:
            parsed = self.pattern.parseString(line)
        except pp.ParseException as parse_ex:
            # point the user at the exact line and column that failed
            err_msg = ("Line {lineno}, column {err.col}:\n"
                       "Fix this line: '{err.line}'").format(
                           err=parse_ex, lineno=lineno)
            raise ValueError(
                Utilities.print_box_msg(err_msg, border_char='x'))
        action_data.append(parsed['action_body']['action_id'])
    return action_data
def views_helper(self, _views):
    """Normalize the views value into a list.

    Accepts a list (passed through), a pipe-separated string of view
    names, or any falsy value (normalized to an empty list). Any other
    type, or a string with illegal characters, raises ValueError.
    """
    if isinstance(_views, list):
        return _views
    if isinstance(_views, str) and _views:
        # only letters, digits, underscore and the pipe separator allowed
        if re.search(r'[^A-Za-z0-9|_]', _views) is not None:
            msg = ("characters other than Alphabets and Pipe and "
                   "underscore is not allowed in view "
                   "value: '{0}'").format(_views)
            raise ValueError(Utilities.print_box_msg(msg, border_char='x'))
        return _views.split('|')
    if not _views:
        return []
    raise ValueError("Views is a pipe separated "
                     "value: '{0}'".format(_views))
def test_update_views(self):
    """update views"""
    tbl = ItTable(self.it_table_dict, self.cfg_mgr)

    # assigning new views merges them with the pre-existing ones
    tbl.views = 'fake_view_open|call'
    self.assertEqual(tbl.views, 'fake_view_im|fake_view_open|call')
    self.assertTrue(
        set(tbl.views_list) ==
        set(['fake_view_im', 'fake_view_open', 'call']))

    # re-assigning an already present view must not create duplicates
    tbl.views = 'fake_view_open|fake_view_im|call'
    self.assertEqual(tbl.views, 'fake_view_im|fake_view_open|call')
    self.assertTrue(
        set(tbl.views_list) ==
        set(['fake_view_im', 'fake_view_open', 'call']))

    # illegal characters in a view name must raise ValueError
    expected_box = Utilities.print_box_msg(
        ("characters other than Alphabets "
         "and Pipe and underscore is not"
         " allowed in View"), border_char='x')
    with self.assertRaises(ValueError) as err_ctx:
        tbl.views = 'fake_view_open| *call'
    self.assertTrue(expected_box in str(err_ctx.exception))
def setUpClass(cls):
    """Build fixtures shared by every test in this class.

    Creates two Utilities instances (unit-test env and a JENKINS run
    config) plus one it-table dictionary per supported source RDBMS
    (Teradata, Oracle, SQL Server, DB2, MySQL) and one job-reference
    dictionary.
    """
    cls.utilities = Utilities(ConfigManager(UNIT_TEST_ENV))
    cls.utilities_run = Utilities(ConfigManager('JENKINS', '', True))
    # Teradata source, weekly load
    cls.td_tbl_weekly = {
        'load': '100001',
        'mappers': 20,
        'domain': 'fake_view_im',
        'target_dir': 'mdm/fake_view_im/fake_database_3/'
                      'fake_services_tablename',
        'password_file': '/user/dev/fake.password.file',
        'source_table_name': 'fake_services_tablename',
        'hold': 0,
        'split_by': 'epi_id',
        'fetch_size': 50000,
        'source_database_name': 'fake_database_3',
        'connection_factories': 'com.cloudera.connector.teradata.'
                                'TeradataManagerFactory',
        'full_table_name': 'fake_view_im.'
                           'fake_database_3_fake_services_tablename',
        'db_username': '******',
        'jdbcurl': 'jdbc:teradata://fake.teradata/DATABASE='
                   'fake_database_3',
        'views': 'analytics'
    }
    # Oracle source, monthly load
    cls.oracle_tbl_monthly = {
        'load': '010001',
        'mappers': 20,
        'domain': 'fake_domain',
        'target_dir': 'mdm/fake_domain/fake_database_1/fake_svc_tablename',
        'password_file': '/user/dev/fake.password.file',
        'source_table_name': 'fake_svc_tablename',
        'hold': 0,
        'split_by': '',
        'fetch_size': 50000,
        'source_database_name': 'fake_database_1',
        'connection_factories': 'com.quest.oraoop.OraOopManagerFactory',
        'full_table_name': 'fake_domain_fake_database_1_fake_svc_tablename',
        'db_username': '******',
        'jdbcurl': 'jdbc:oracle:thin:@//fake.oracle:1521/fake_servicename'
    }
    # SQL Server source, quarterly load
    cls.sql_tbl_quarterly = {
        'full_table_name': 'logs.fake_database_2_fake_tools_tablename',
        'domain': 'logs',
        'target_dir': 'mdm/logs/fake_database_2/fake_tools_tablename',
        'split_by': 'fake_split_by',
        'mappers': 10,
        'jdbcurl': 'jdbc:sqlserver://fake.sqlserver:'
                   '1433;database=fake_database_2',
        'connection_factories': 'com.cloudera.sqoop.manager.'
                                'DefaultManagerFactory',
        'db_username': '******',
        'password_file': '/user/dev/fake.password.file',
        'load': '001100',
        'fetch_size': 50000,
        'hold': 0,
        'source_database_name': 'fake_database_2',
        'source_table_name': 'fake_tools_tablename'
    }
    # DB2 source, fortnightly load
    cls.db2_tbl_fortnightly = {
        'full_table_name': 'rx.fake_database_4_fake_rx_tablename',
        'domain': 'rx',
        'target_dir': 'mdm/rx/fake_database_4/fake_rx_tablename',
        'split_by': '',
        'mappers': 1,
        'jdbcurl': 'jdbc:db2://fake.db2:50400/fake_servicename',
        'connection_factories': 'com.cloudera.sqoop.manager.'
                                'DefaultManagerFactory',
        'db_username': '******',
        'password_file': '/user/dev/fake.password.file',
        'load': '110100',
        'fetch_size': 50000,
        'hold': 0,
        'source_database_name': 'fake_database_4',
        'source_table_name': 'fake_rx_tablename'
    }
    # MySQL source, fortnightly load
    cls.mysql_tbl_fortnightly = {
        'full_table_name': 'dashboard.fake_servicename',
        'domain': 'dashboard',
        'target_dir': 'mdm/dashboard/fake_servicename/',
        'split_by': '',
        'mappers': 1,
        # NOTE(review): the backslash line-continuation inside this
        # string literal makes any leading whitespace of the next
        # source line part of the URL -- verify the fixture really
        # means 'jdbc:mysql://fake.mysql:3306/fake_servicename'
        'jdbcurl': 'jdbc:mysql://\
fake.mysql:3306/fake_servicename',
        'connection_factories': 'com.cloudera.sqoop.manager.'
                                'DefaultManagerFactory',
        'db_username': '******',
        'password_file': '/user/dev/fake.password.file',
        'load': '110100',
        'fetch_size': 50000,
        'hold': 0,
        'source_database_name': 'dashboard',
        'source_table_name': 'fake_servicename'
    }
    # scheduler/job reference metadata
    cls.appl_ref_id_tbl = {
        'job_name': 'C1_FAKE_CALL_FAKE_DATABASE_DAILY',
        'frequency': 'Daily',
        'time': '6:00',
        'string_date': 'Every Day',
        'ksh_name': 'call_fake_database_daily',
        'domain': 'call',
        'db': 'fake_database',
        'environment': 'DEV'
    }
def test_clean_lines(self):
    """test clean lines"""
    raw = ['a\n', 'b\r\n', ' ', 'c\n', '\n', ' ']
    cleaned = Utilities.clean_lines(raw)
    # whitespace-only entries are dropped and newlines stripped
    self.assertTrue(set(cleaned) == set(['a', 'b', 'c']))
def parse_file_by_sections(section_file, header, required_keys,
                           optional_fields):
    """Split an open file into sections delimited by ``header``.

    Each section's ``key: value`` lines are collected into a dict and
    validated: every name in ``required_keys`` must be present and no
    key outside ``required_keys + optional_fields`` is allowed.

    Args:
        section_file: open file object; read fully and closed here
        header: exact line text that starts a new section
        required_keys: list of mandatory field names per section
        optional_fields: list of additionally allowed field names
    Returns:
        tuple (all_sections, msg, log_msg): list of section dicts, an
        info message, and a deprecation warning message (if any)
    Raises:
        ValueError: missing/extra/duplicate fields, header not found,
            or the file cannot be read
    """
    # List used to store all sections
    all_sections = []
    msg = ''
    log_msg = ''
    try:
        lines = section_file.readlines()
        lines = Utilities.clean_lines(lines)
        last = len(lines) - 1
        # key/value pairs of the section currently being built
        section = {}
        # becomes True once the first header line has been seen
        flag = False

        def add_to_dictionary():
            """Validate the finished section, then append it."""
            if set(required_keys) <= set(section.keys()):
                # Add complete section to list
                all_sections.append(section)
            else:
                diff_keys = set(required_keys).difference(section.keys())
                missed_keys = ", ".join(sorted(diff_keys))
                err_msg = '\n\n' + '%' * 100
                err_msg += "\n\tMissing mandatory fields in file: {file}"
                err_msg += " \n\tMandatory fields: {mandatory}"
                err_msg += "\n\tMissing fields: {missed_keys}"
                err_msg += "\n\tEntered fields: {given_fields}"
                err_msg += '\n' + '%' * 100
                err_msg = err_msg.format(
                    file=section_file.name,
                    mandatory=', '.join(sorted(required_keys)),
                    missed_keys=missed_keys,
                    given_fields=', '.join(sorted(section.keys())))
                raise IndexError(err_msg)
            for each in section.keys():
                all_keys = required_keys + optional_fields
                if each not in all_keys:
                    extra_fields = sorted(
                        set(section.keys()).difference(all_keys))
                    err_msg = '\n\n' + '%' * 100
                    err_msg += '\n\tAllowed fields: {0}. '
                    err_msg += '\n\tEntered fields: {1}'
                    err_msg += '\n\tRemove these fields: {2}'
                    err_msg += '\n' + '%' * 100
                    err_msg = err_msg.format(
                        ', '.join(sorted(all_keys)),
                        ', '.join(sorted(section.keys())),
                        ', '.join(extra_fields))
                    raise IndexError(err_msg)

        header_found = False
        for i, line in enumerate(lines):
            # Check for non empty line
            if line:
                if line == header:
                    header_found = True
                    if flag:
                        # a previous section is complete - close it out
                        add_to_dictionary()
                        section = {}
                    else:
                        flag = True
                else:
                    # Only non-header lines are parsed as key/value:
                    # split on the first ':' occurrence and store the
                    # pair in the section dictionary
                    try:
                        r = line.split(':', 1)
                        if header_found:
                            col = r[0].strip()
                            val = r[1].strip()
                            # remove the condition after 2 weeks
                            if col.rstrip().lower() == 'domain':
                                log_msg = "Domain field is deprecated!!! "
                                log_msg += "Default domain database_i "
                                log_msg += "is assumed. Please stop providing"
                                log_msg += " Domain. Process will begin to"
                                log_msg += " fail in next 2 weeks."
                            # Add key value pair to section dictionary
                            elif col.rstrip() not in section.keys():
                                val = val.rstrip()
                                # escape quotes for downstream consumers
                                val = val.replace('"', '\\"')
                                val = val.replace("'", "\\'")
                                section[col.rstrip()] = val
                            else:
                                err_msg = ("Duplicate Key value pair: '{0}':"
                                           " '{1}'\n{2} header missing?")
                                err_msg = err_msg.format(col, val, header)
                                err_msg = Utilities.print_box_msg(err_msg, 'x')
                                raise ValueError(err_msg)
                        else:
                            err_msg = 'Header {0} not found'.format(header)
                            err_msg = Utilities.print_box_msg(err_msg, 'x')
                            raise ValueError(err_msg)
                        if i == last:
                            add_to_dictionary()
                    except IndexError as err:
                        # a line without ':' (r[1] missing) lands here
                        all_sections = None
                        msg = ("\n Verify the key-value pair in the"
                               " request file: {file}")
                        msg = msg.format(file=section_file.name)
                        # NOTE: err.message is Python 2 only
                        msg = err.message + msg
                        raise ValueError(msg)
            else:
                # Empty line
                if i == last:
                    add_to_dictionary()
                else:
                    continue
        section_file.close()
    except IOError:
        msg = 'Error can\'t find file or read data from {file}'
        msg = msg.format(file=section_file.name)
        raise ValueError(msg)
    except IndexError as err:
        # validation failures raised by add_to_dictionary
        all_sections = None
        msg = err.message
        raise ValueError(msg)
    return all_sections, msg, log_msg
def save_perf_hql(self, table):
    """Create per-view HQL files that rebuild each perf view/table.

    For every view requested on the table (skipping the team-domain
    view itself), writes an HQL script under cfg_mgr.files that drops
    and recreates <view>.<db>_<table> from the source view, reloads it
    with dynamic partitioning, and repairs its partitions.

    Args:
        table: it-table object providing views_list, database,
            table_name, domain and db_env
    Returns:
        list of generated .hql file names
    """
    all_file = []
    all_views = table.views_list
    # team domains = configured domains that were also requested as views
    domain = self.cfg_mgr.domains_list
    domain = domain.split(",")
    domain = set(domain).intersection(all_views)
    domain = list(domain)
    for target_db in all_views:
        if len(domain) > 0 and target_db.lower() == domain[0]:
            # the team-domain view is materialized via team_view_name below
            continue
        view_full_name = '{view_name}.{database}_{table_name}'
        view_full_name = view_full_name.format(view_name=target_db,
                                               database=table.database,
                                               table_name=table.table_name)
        src_view = '{src_vw_name}.{src_db}_{src_tbl_name}'
        src_view = src_view.format(src_vw_name=table.domain,
                                   src_db=table.database,
                                   src_tbl_name=table.table_name)
        if len(domain) > 0 and domain[0]:
            team_view_name = '{team_nm}.{team_db}_{team_table}'
            team_view_name = team_view_name.format(
                team_nm=domain[0], team_db=table.database,
                team_table=table.table_name)
            # BUG FIX: the CREATE TABLE statement previously had no
            # terminating ';', fusing it with the following INSERT into
            # one invalid HQL statement
            views_hql = ('SET mapred.job.queue.name={queue_name};\n'
                         'SET mapreduce.map.memory.mb=8000;\n'
                         'SET mapreduce.reduce.memory.mb=16000;\n\n'
                         'DROP VIEW IF EXISTS {view_full_name};\n'
                         'DROP TABLE IF EXISTS {view_full_name};\n\n'
                         'CREATE DATABASE IF NOT EXISTS {view_name};\n'
                         'CREATE DATABASE IF NOT EXISTS {team_name};\n'
                         'DROP VIEW IF EXISTS {team_view_name};\n'
                         'DROP TABLE IF EXISTS {team_view_name};\n'
                         'CREATE VIEW {team_view_name} AS '
                         'SELECT * FROM {src_view}; \n '
                         'CREATE TABLE {view_full_name} like {src_view};\n'
                         'INSERT OVERWRITE TABLE {view_full_name} '
                         'PARTITION(incr_ingest_timestamp) '
                         'select * from {team_view_name} ;\n\n')
            views_hql = views_hql.format(view_full_name=view_full_name,
                                         view_name=target_db,
                                         src_view=src_view,
                                         team_view_name=team_view_name,
                                         team_name=domain[0],
                                         queue_name=self.queue)
        else:
            # BUG FIX: same missing ';' after CREATE TABLE as above
            views_hql = ('SET mapred.job.queue.name={queue_name};\n'
                         'SET mapreduce.map.memory.mb=8000;\n'
                         'SET mapreduce.reduce.memory.mb=16000;\n\n'
                         'DROP VIEW IF EXISTS {view_full_name};\n'
                         'DROP TABLE IF EXISTS {view_full_name};\n\n'
                         'CREATE DATABASE IF NOT EXISTS {view_name};\n'
                         'CREATE TABLE {view_full_name} like {src_view};\n'
                         'INSERT OVERWRITE TABLE {view_full_name} '
                         'PARTITION(incr_ingest_timestamp) '
                         'select * from {src_view} ;\n\n')
            views_hql = views_hql.format(view_full_name=view_full_name,
                                         view_name=target_db,
                                         src_view=src_view,
                                         queue_name=self.queue)
        views_hql += "msck repair table {0};\n\n".format(view_full_name)
        file_name = table.db_env + '_' + getpass.getuser() + '_' + \
            'full_' + target_db + '_' + \
            Utilities.replace_special_chars(table.table_name) \
            + '.hql'
        all_file.append(file_name)
        act_file_name = os.path.join(self.cfg_mgr.files, file_name)
        # context manager closes the handle even if the write fails
        with open(act_file_name, "wb+") as file_h:
            file_h.write(views_hql)
    return all_file