def toPyspark(self, cnv_ds):
    """Convert a Teradata CALL statement into a PySpark function call.

    Looks up the called stored procedure in the out-parameter position
    file and emits ``out1,out2 = sp_name(in1,in2)`` style code plus the
    matching ``from sp_name import sp_name`` line on the conversion
    dataset.  Returns a ``(code, log)`` tuple; for a procedure that is
    not listed in the position file, code is empty and the log records
    the skipped statement.
    """
    stmt = self.text + ';'
    stmt = self.simpleParse(stmt)
    cnv_code = ''
    cnv_log = ''
    # read sp out parameter position file
    with open('.\\Teradata\\sp_out_param.pos', 'r') as f:
        pos_txt = f.read()
    pos_txt_sps = re.findall(r'(\w+) \[', pos_txt)
    # get sp name and parameter list
    sp_name, sp_params = re.search(
        r'CALL\s+(?:\w+(?=\.))?\.?\s*(\w+)\s*\((.*?)\)\s*;', stmt,
        re.S | re.I).groups()
    # FIX: re.S was previously passed as the positional 'count' argument of
    # re.sub (re.S == 16), silently limiting the strip to 16 replacements.
    sp_params = re.sub(r'[\s\:]', '', sp_params, flags=re.S)
    sp_param_list = util.newSplit(sp_params, ',')
    # if sp name present in out parameter position file
    if sp_name in pos_txt_sps:
        # get out parameter indexes
        out_param_pos = re.search(r'{} \[(.*?)\]'.format(sp_name), pos_txt,
                                  re.S | re.I).group(1)
        if out_param_pos:
            # make out parameter string
            out_param_idxs = [int(idx) for idx in out_param_pos.split(',')]
            out_param_str = ','.join(sp_param_list[idx]
                                     for idx in out_param_idxs)
            # make in parameter string.
            # FIX: previously built from set(all) - set(out), whose order is
            # not guaranteed; keep the original declaration order instead.
            out_idx_set = set(out_param_idxs)
            in_param_str = ','.join(
                sp_param_list[idx] for idx in range(len(sp_param_list))
                if idx not in out_idx_set)
            # FIX: sp_name was a literal inside the f-string, producing
            # the text "sp_name(...)" instead of the procedure's name.
            cnv_code += f"{out_param_str} = {sp_name}({in_param_str})"
        else:
            # no out parameters: plain call, no assignment
            cnv_code += f"{sp_name}({sp_params})"
        # add import code
        cnv_ds.output.code[0] += f"from {sp_name} import {sp_name}\n"
        return cnv_code, cnv_log
    else:
        # Error: procedure unknown to the position file (typo fixed: Skipped)
        return '', f'Statement Skipped \n\t {self.text}'
def loadStatements(self):
    """Split the Teradata script text on semi-colons and wrap each
    recognised statement in its corresponding TS statement object.

    Returns the list of statement objects; unrecognised statements are
    silently dropped.
    """
    # Ordered dispatch table: the first pattern that matches wins.
    dispatch = (
        (r'^\s*\.[a-z]+\s+', TS.CtrlStmt),
        (r'^SELECT', TS.Select),
        (r'CREATE\s+\w*\s*VOLATILE\s+TABLE\s+', TS.VolTbl),
        (r'^INS(?:ERT)?\s+INTO\s+', TS.Insert),
        (r'^UPD(?:ATE)?\s+.*?\s+SET\s+', TS.Update),
        (r'^DEL(?:ETE)?\s+(?:FROM)?\s*', TS.Delete),
        (r'^MERGE\s+.*?\s+USING\s+', TS.Merge),
        (r'^CALL\s+', TS.Call),
    )
    statement_objs = []
    # split file content on semi-colons
    for raw_stmt in util.newSplit(self.text, ';'):
        candidate = raw_stmt.strip()
        if not candidate:
            continue
        for pattern, stmt_cls in dispatch:
            if re.match(pattern, candidate, re.S | re.I):
                statement_objs.append(stmt_cls(candidate))
                break
        # no match: statement is not supported and is skipped
    return statement_objs
def loadStatements(self):
    """Split the T-SQL script text into individual statement objects.

    The text is first normalised through the ANTLR parser (falling back
    to the raw text when parsing fails), then split on semi-colons; each
    statement inside a stored-procedure definition is wrapped in the
    matching ``sss.*`` class.  Returns the list of statement objects.
    """
    # Tracks whether we are inside a CREATE/ALTER PROCEDURE body;
    # statements outside that span are skipped with a warning.
    is_sp_stmt = False
    statement_objs = []
    self.cntx.logger.add_log(
        'INFO',
        'Starting process to split script content to individual statements.'
    )
    #Antlr Parse
    try:
        # LOAD/BLOCK/PLATFORM clash with grammar keywords; mask them before
        # parsing and unmask in the finally block below.
        script_text = re.sub(
            r'\bLOAD\b', r'__LOAD__',
            re.sub(
                r'\bBLOCK\b', r'__BLOCK__',
                re.sub(r'\bPLATFORM\b', r'__PLATFORM__', self.text,
                       flags=re.S | re.I),
                flags=re.S | re.I),
            flags=re.S | re.I)
        lexer = FullTSqlAntlrLexer(InputStream(script_text))
        lexer.removeErrorListeners()
        lexer.addErrorListener(TSqlErrorListener())
        stream = CommonTokenStream(lexer)
        parser = FullTSqlAntlrParser(stream)
        parser.removeErrorListeners()
        parser.addErrorListener(TSqlErrorListener())
        tree = parser.tsql_file()
        conv = TSqlScriptParse(stream)
        walker = ParseTreeWalker()
        walker.walk(conv, tree)
    except Exception as e:
        # Parse failure: continue with the unparsed (masked) text.
        self.cntx.logger.add_log('ERROR', 'Failed to parse script content.')
        self.cntx.logger.add_log_details('Syntax error: ' + str(e))
        self.cntx.logger.add_log(
            'WARN',
            'Using unparsed script content. Result may be inconsistent.')
    else:
        self.cntx.logger.add_log('INFO', 'Script contect parse completed.')
        script_text = conv.out_script
        self.upd_tbl_alias = conv.upd_tbl_alias
    finally:
        # Unmask the reserved words in whichever text we ended up with.
        script_text = script_text.replace('__LOAD__', 'load').replace(
            '__BLOCK__', 'block').replace('__PLATFORM__', 'platform')
    #split file content
    try:
        self.cntx.logger.add_log(
            'INFO',
            'Splitting script content into individual statements based on semi-colon.'
        )
        script_statements = util.newSplit(script_text, ';')
    except:
        raise Exception(
            'Splitting script content into statement list failed.')
    self.cntx.logger.add_log(
        'INFO', 'Identifying statements in scope of the converter')
    for stmt in script_statements:
        stmt = stmt.strip()
        if len(stmt) > 0:
            #print('['+stmt+']\n')
            if re.search(r'^(?:\bCREATE\b|\bALTER\b)\s+\bPROC(?:EDURE)?\b',
                         stmt, re.S | re.I):
                is_sp_stmt = True
                stmt_obj = sss.SPheader(stmt)
                statement_objs.append(stmt_obj)
            # NOTE(review): indentation reconstructed from mangled source —
            # 'SP_END' is assumed to be a standalone marker statement emitted
            # by TSqlScriptParse that closes the stored-procedure scope;
            # confirm against the parser output.
            if stmt == 'SP_END':
                is_sp_stmt = False
            elif re.search(r'^DECLARE\s+@', stmt, re.S | re.I) and is_sp_stmt:
                stmt_obj = sss.Declare(stmt)
                statement_objs.append(stmt_obj)
            elif re.search(r'^(?:\bSET|\bSELECT)\s+@\w+\s*=', stmt,
                           re.S | re.I) and is_sp_stmt:
                stmt_obj = sss.SetVar(stmt)
                statement_objs.append(stmt_obj)
            elif re.search(
                    r'^DECLARE\s+\w+\s+CURSOR|OPEN\s+\w+|^FETCH\s+\w+\s+FROM|^CLOSE\s+\w+',
                    stmt, re.S | re.I) and is_sp_stmt:
                stmt_obj = sss.Cursor(stmt)
                statement_objs.append(stmt_obj)
            elif re.search(r'^(BEGIN|END)\s+(TRY|CATCH)', stmt,
                           re.S | re.I) and is_sp_stmt:
                stmt_obj = sss.ErrorHadling(stmt)
                statement_objs.append(stmt_obj)
            elif re.search(r'^IF\s*\(?|^ELSE|^END IF', stmt,
                           re.S | re.I) and is_sp_stmt:
                stmt_obj = sss.IfCondition(stmt)
                statement_objs.append(stmt_obj)
            elif re.search(r'^WHILE\s*\(?|^END WHILE', stmt,
                           re.S | re.I) and is_sp_stmt:
                stmt_obj = sss.WhileLoop(stmt)
                statement_objs.append(stmt_obj)
            elif re.search(r'^SELECT\s+|^WITH\s+', stmt,
                           re.S | re.I) and is_sp_stmt:
                stmt_obj = sss.Select(stmt)
                statement_objs.append(stmt_obj)
            elif re.search(r'^INSERT\s+(?:INTO\s+)?', stmt,
                           re.S | re.I) and is_sp_stmt:
                stmt_obj = sss.Insert(stmt)
                statement_objs.append(stmt_obj)
            elif re.search(r'^UPDATE\s+', stmt, re.S | re.I) and is_sp_stmt:
                stmt_obj = sss.Update(stmt)
                statement_objs.append(stmt_obj)
            elif re.search(r'^DELETE\s+|^TRUNCATE\s+TABLE\s', stmt,
                           re.S | re.I) and is_sp_stmt:
                stmt_obj = sss.Delete(stmt)
                statement_objs.append(stmt_obj)
            elif re.search(r'^MERGE\s+', stmt, re.S | re.I) and is_sp_stmt:
                stmt_obj = sss.Merge(stmt)
                statement_objs.append(stmt_obj)
            elif re.search(r'^EXEC(?:UTE)?\s+|^CALL \s+', stmt,
                           re.S | re.I) and is_sp_stmt:
                stmt_obj = sss.Execute(stmt)
                statement_objs.append(stmt_obj)
            else:
                if is_sp_stmt:
                    self.cntx.logger.add_log(
                        'WARN', 'Statement skipped. Not Supported.')
                else:
                    self.cntx.logger.add_log(
                        'WARN',
                        'Statement skipped. Outside stored procedure definition.'
                    )
                self.cntx.logger.add_log_details(stmt)
    return statement_objs
def toPyspark(self, cnv_ds):
    """Convert a Teradata MERGE statement into equivalent PySpark code.

    The merge is decomposed into: (1) a subtract of the rows matching the
    join condition, (2) a union of those rows with updated column values
    (WHEN MATCHED ... UPDATE), and (3) a union of new rows
    (WHEN NOT MATCHED ... INSERT).  Returns a ``(code, log)`` tuple of
    generated PySpark source and conversion log text.
    """
    try:
        lexer = TDantlrLexer(InputStream(self.text))
        stream = CommonTokenStream(lexer)
        parser = TDantlrParser(stream)
        parser.addErrorListener(TDErrorListener())
        tree = parser.start()
        psp = PySparkParse(stream)
        walker = ParseTreeWalker()
        walker.walk(psp, tree)
        stmt = psp.out_sql
    except Exception as e:
        # ANTLR failed: fall back to the simple text-based parse
        stmt = self.simpleParse() + ';'
    #[TEMP_FIX] for db reference in columns
    stmt = re.sub(r'(\w+)\.(\w+)\.(\w+)', r'\2.\3', stmt, flags=re.S | re.I)
    cnv_code = ''
    cnv_log = ''
    # get primary table name and alias
    p_table, p_table_alias = re.search(
        r'\bMERGE\s+INTO\s+([\w\.]+)\s+(?:AS\s+)?(\w*)\s*USING\b', stmt,
        re.S | re.I).groups()
    # get corresponding data frame name for the primary table
    p_table_df = cnv_ds.table_df_map[p_table]
    #[TEMP_FIX] fall back to the (unqualified) table name as alias
    if not p_table_alias:
        if '.' in p_table:
            p_table_alias = p_table.split('.')[-1]
        else:
            p_table_alias = p_table
    # check if secondary table is a subquery
    if re.search(r'\bUSING\s*\(\s*SELECT', stmt, re.S | re.I):
        # get select statement of secondary table
        s_table_sql, s_table_alias = re.search(
            r'\bUSING\s*\((.*?)\)\s*(?:AS\s+)?(\w*)\s+ON\b', stmt,
            re.S | re.I).groups()
        # make dataframe name for secondary table select statement
        if s_table_alias:
            s_table = s_table_alias + '__df'
        else:
            #[TODO] if alias is missing
            s_table = 'temp__df'
        # replace table name with dataframe name
        s_table_sql = util.replaceTableWithDF(s_table_sql, cnv_ds, 1)
        #[TODO] check if modified statement using variable
        # pyspark code to create df for secondary table
        cnv_code += f'{s_table} = spark.sql("""{s_table_sql}""")\n'
        cnv_code += f'{s_table}.createOrReplaceTempView("{s_table}")\n\n'
    else:
        # get secondary table name and alias
        s_table, s_table_alias = re.search(
            r'\bUSING\s+([\w\.]+)\s*(?:AS\s+)?(\w*)\s+ON\b', stmt,
            re.S | re.I).groups()
        #[TEMP_FIX] fall back to the (unqualified) table name as alias
        if not s_table_alias:
            if '.' in s_table:
                s_table_alias = s_table.split('.')[-1]
            else:
                s_table_alias = s_table
    # get merge join condition
    join_cond_str = re.search(r'\bON\s+\(?(.*?)\)?\s+WHEN', stmt,
                              re.S | re.I).group(1).strip()
    # get merge update set section
    upd_set_str_match = re.search(
        r'\bUPD(?:ATE)?\s+SET\s+(.*?)(?:(?:WHEN\b)|;)', stmt, re.S | re.I)
    if upd_set_str_match:
        upd_set_str = upd_set_str_match.group(1).strip()
    else:
        upd_set_str = ''
    if upd_set_str:
        # sql selecting the rows that will be updated (to subtract them)
        subtract_sql = f'select {p_table_alias}.* \n from {p_table} as {p_table_alias} \n inner join {s_table} as {s_table_alias} \n on {join_cond_str}'
        # replace db table names with corresponding data frame name
        subtract_sql = util.replaceTableWithDF(subtract_sql, cnv_ds, 1)
        #[TODO] check if modified statement using variable
        p_table_df_tmp_1 = p_table_df + '_1'
        # pyspark code to load temp df with records to be updated
        cnv_code += '#Load records to be updated temporary dataframe\n'
        cnv_code += f'{p_table_df_tmp_1} = spark.sql("""{subtract_sql}""")\n'
        # python dict literal: update column -> "value as column"
        upd_col_dict = '{'
        # custom split of the SET section into column assignments
        set_fields = util.newSplit(upd_set_str, ',')
        for field in set_fields:
            side = field.split('=')
            side[0] = side[0].strip()
            side[1] = side[1].strip()
            if re.match(r'\w+\.\w+', side[0], re.S | re.I):
                # FIX: a space was missing after 'as', generating "asCOL"
                upd_col_dict += f'"{side[0]}":"{side[1]} as {side[0]}"' + ",\n"
            else:
                # column without table reference: qualify with primary alias
                upd_col_dict += f'"{p_table_alias}.{side[0]}":"{side[1]} as {side[0]}"' + ",\n"
        upd_col_dict = upd_col_dict[:-2] + '}'
        # pyspark code to get column list of update table
        cnv_code += f"#Get column list of db table {p_table}\n"
        cnv_code += f"df_col_list = mod_df['{p_table_df}'].columns\n"
        cnv_code += "#Convert column list to string with db table alias\n"
        cnv_code += f"df_col_list_str = '{p_table_alias}.'+',{p_table_alias}.'.join(df_col_list)\n"
        # pyspark code to get first column of update table
        cnv_code += f"#Get first column of db table {p_table}\n"
        cnv_code += "check_col = df_col_list_str[0:df_col_list_str.index(',')]\n"
        # pyspark code to replace update columns with updating values
        cnv_code += "#Create dictionary with update column and udating value\n"
        cnv_code += f"upd_col_dict = {upd_col_dict}\n"
        cnv_code += "#Substitute update columns in column string with updating value\n"
        cnv_code += "for col in upd_col_dict.keys():\n"
        cnv_code += "\tdf_col_list_str = re.sub(col,upd_col_dict[col],df_col_list_str, flags=re.I)\n"
        # sql selecting the rows with updated column values
        update_sql = f'"select " + df_col_list_str + """\n from {p_table} as {p_table_alias} \n inner join {s_table} as {s_table_alias} \n on {join_cond_str}"""'
        update_sql = util.replaceTableWithDF(update_sql, cnv_ds, 1)
        p_table_df_tmp_2 = p_table_df + '_2'
        cnv_code += "#Load temporary dataframe with updated records\n"
        cnv_code += f"{p_table_df_tmp_2} = spark.sql({update_sql})\n"
    else:
        update_sql = ''
    # get merge insert section
    insert_str_match = re.search(r'\bINS(?:ERT)?\s+(.*?)(?:(?:WHEN\b)|;)',
                                 stmt, re.S | re.I)
    if insert_str_match:
        insert_str = insert_str_match.group(1).strip()
        # get merge insert section columns
        if insert_str.split()[0].upper() == 'VALUES':
            insert_col_str = ''
        else:
            insert_col_str = re.search(r'\(?(.*?)\)?\s+VALUES\b', insert_str,
                                       re.S | re.I).group(1).strip()
        # get merge insert section values
        insert_val_str = re.search(r'\bVALUES\s*\(?(.*?)\)?$', insert_str,
                                   re.S | re.I).group(1).strip()
    else:
        insert_str = ''
        insert_col_str = ''
        insert_val_str = ''
    if insert_str:
        if insert_col_str:
            insert_cols = util.newSplit(insert_col_str, ',')
            insert_vals = util.newSplit(insert_val_str, ',')
            # make "value as column" select list for the merge insert
            select_str = ''
            for i in range(len(insert_cols)):
                col = insert_cols[i].strip()
                val = insert_vals[i].strip()
                select_str += f"{val} as {col},\n"
            select_str = select_str[:-2]
        else:
            select_str = '*'  #[TODO] add logic for select_str
        # sql selecting the rows to be inserted (no match on join condition)
        # NOTE(review): the generated code references check_col, which is only
        # emitted when the merge also has an UPDATE section — a merge with
        # INSERT only would produce a NameError in the generated script.
        insert_sql = f'"""select {select_str} \n from {s_table} as {s_table_alias} \n left outer join {p_table} as {p_table_alias} \n on {join_cond_str} \n where """ + check_col + " is null"'
        # replace table name with dataframe name
        insert_sql = util.replaceTableWithDF(insert_sql, cnv_ds, 1)
        #[TODO] check if modified statement using variable
        p_table_df_tmp_3 = p_table_df + '_3'
        # pyspark code to load temp df with merge insert records
        cnv_code += "#Load temporary dataframe merge insert records\n"
        cnv_code += f"{p_table_df_tmp_3} = spark.sql({insert_sql})\n\n"
    else:
        insert_sql = ''
    # pyspark code: remove old records from the db table dataframe and union
    # in the merge update / insert records.
    # FIX: the trailing newline is now appended unconditionally — previously
    # an update-only merge emitted no newline, gluing the next generated
    # statement onto the same line.
    cnv_code += "#Remove old records from db table dataframe and insert merge update and insert records\n"
    cnv_code += (
        f"mod_df['{p_table_df}'] = mod_df['{p_table_df}']"
        + (f".subtract({p_table_df_tmp_1}).union({p_table_df_tmp_2})"
           if update_sql else '')
        + (f".union({p_table_df_tmp_3})" if insert_sql else '')
        + "\n")
    cnv_code += f"mod_df['{p_table_df}'].createOrReplaceTempView('{p_table_df}')\n"
    return cnv_code, cnv_log
def toPyspark(self, cnv_ds):
    """Convert a Teradata UPDATE statement into equivalent PySpark code.

    The update is emulated by subtracting the affected rows from the
    table's dataframe and unioning in a recomputed version of those rows
    with the SET expressions applied.  Handles both ``UPDATE ... FROM``
    (join form) and the plain single-table form.  Returns a
    ``(code, log)`` tuple.
    """
    try:
        lexer = TDantlrLexer(InputStream(self.text))
        stream = CommonTokenStream(lexer)
        parser = TDantlrParser(stream)
        parser.addErrorListener(TDErrorListener())
        tree = parser.start()
        psp = PySparkParse(stream)
        walker = ParseTreeWalker()
        walker.walk(psp, tree)
        stmt = psp.out_sql
    except Exception as e:
        # ANTLR failed: fall back to the simple text-based parse
        stmt = self.simpleParse() + ';'
    cnv_code = ''
    cnv_log = ''
    # if update statement has FROM keyword (join form)
    if re.search(r'\bUPD(?:ATE)?[^\(]*?\bFROM\b', stmt, re.S | re.I):
        # get table alias which will be updated
        upd_table_alias = re.search(r'\bUPD(?:ATE)?\s+(.*?)\s+(?=FROM)', stmt,
                                    re.S | re.I).group(1).strip()
        # get list of tables involved in update statement
        upd_from_str = re.search(r'(?<=FROM)\s+(.*?)\s+(?=SET)', stmt,
                                 re.S | re.I).group(1).strip()
        # get table name that will be updated (matching the alias)
        upd_table = re.search(
            r'(\w*\.?\w+)\s+(?:AS\s+)?{}'.format(upd_table_alias),
            upd_from_str, re.S | re.I).group(1).strip()
        # get data frame name of the table
        upd_table_df = cnv_ds.table_df_map[upd_table]
        upd_table_df_tmp_1 = upd_table_df + '_1'
        # FIX: was "+ '_1'" (copy-paste), making the subtract and union below
        # operate on the same temporary dataframe name.
        upd_table_df_tmp_2 = upd_table_df + '_2'
        # get the set section of update statement
        upd_set_str = re.search(r'(?<=SET)\s+(.*?)\s+(?=WHERE)', stmt,
                                re.S | re.I).group(1).strip()
        # get the where section of update statement
        upd_where_str = re.search(r'\bSET\b.*?\bWHERE\b(.*?);', stmt,
                                  re.S | re.I).group(1).strip()
        # sql selecting the rows that will be updated (to subtract them)
        subtract_sql = f"select {upd_table_alias}.* \nfrom {upd_from_str} \nwhere {upd_where_str}"
        # replace db table names with corresponding dataframe name
        subtract_sql = util.replaceTableWithDF(subtract_sql, cnv_ds, 1)
        #[TODO] replace variable name
        # enclose modified sql in triple quote
        subtract_sql = '"""' + subtract_sql + '"""'
        # python dict literal: update column -> "value as column"
        upd_col_dict = '{'
        # split set section of update statement to get each column assignment
        set_fields = util.newSplit(upd_set_str, ',')
        for field in set_fields:
            side = field.split('=')
            side[0] = side[0].strip()
            side[1] = side[1].strip()
            if re.match(r'\w+\.\w+', side[0], re.S | re.I):
                # FIX: a space was missing after 'as', generating "asCOL"
                upd_col_dict += f'"{side[0]}":"{side[1]} as {side[0]}"' + ",\n"
            else:
                # column without table reference: qualify with the alias
                upd_col_dict += f'"{upd_table_alias}.{side[0]}":"{side[1]} as {side[0]}"' + ",\n"
        upd_col_dict = upd_col_dict[:-2] + '}'
        # pyspark code for update value select statement
        cnv_code += f"df_col_list = mod_df['{upd_table_df}'].columns\n"
        cnv_code += f"df_col_list_str = '{upd_table_alias}.'+',{upd_table_alias}.'.join(df_col_list)\n"
        cnv_code += f"upd_col_dict = {upd_col_dict}\n\n"
        cnv_code += "for col in upd_col_dict.keys():\n"
        cnv_code += "\tdf_col_list_str = re.sub(col,upd_col_dict[col],df_col_list_str, flags=re.I)\n\n"
        # final update value select statement
        update_sql = f'"select " + df_col_list_str + """\nfrom {upd_from_str} \nwhere {upd_where_str}"""'
    else:
        # get table name which will be updated
        upd_table = re.search(r'\bUPD(?:ATE)?\s+([\.\w]+)', stmt,
                              re.S | re.I).group(1).strip()
        # get corresponding data frame name for the table
        upd_table_df = cnv_ds.table_df_map[upd_table]
        upd_table_df_tmp_1 = upd_table_df + '_1'
        upd_table_df_tmp_2 = upd_table_df + '_2'
        # check if update statement has where condition
        if re.search(r'\bWHERE\b', stmt, re.S | re.I):
            upd_set_str = re.search(r'(?<=SET)\s+(.*?)\s+(?=WHERE)', stmt,
                                    re.S | re.I).group(1).strip()
            upd_where_str = re.search(r'\bSET\b.*?\bWHERE\b(.*)', stmt,
                                      re.S | re.I).group(1).strip()
        else:
            upd_set_str = re.search(r'(?<=SET)\s+(.*?);', stmt,
                                    re.S | re.I).group(1).strip()
            upd_where_str = ''
        # sql selecting the rows that will be updated (to subtract them)
        subtract_sql = f"select * \nfrom {upd_table_df} " + (
            f"\nwhere {upd_where_str}" if upd_where_str else '')
        # remove semi-colon at the end
        if subtract_sql.strip()[-1] == ';':
            subtract_sql = subtract_sql.strip()[:-1]
        # replace db table names with corresponding data frame name
        subtract_sql = util.replaceTableWithDF(subtract_sql, cnv_ds, 1)
        # enclose modified sql in triple quote
        subtract_sql = '"""' + subtract_sql + '"""'
        #[TODO] check if modified statement using variable
        # python dict literal: update column -> "value as column"
        upd_col_dict = '{'
        set_fields = util.newSplit(upd_set_str, ',')
        for field in set_fields:
            side = field.split('=')
            side[0] = side[0].strip()
            side[1] = side[1].strip()
            upd_col_dict += f'"{side[0]}":"{side[1]} as {side[0]}",\n'
        upd_col_dict = upd_col_dict[:-2] + '}'
        # pyspark code for update value select statement
        cnv_code += f"df_col_list = mod_df['{upd_table_df}'].columns\n"
        cnv_code += "df_col_list_str = ','.join(df_col_list)\n"
        cnv_code += f"upd_col_dict = {upd_col_dict}\n\n"
        cnv_code += "for col in upd_col_dict.keys():\n"
        cnv_code += "\tdf_col_list_str = re.sub(col,upd_col_dict[col],df_col_list_str, flags=re.I)\n\n"
        # final update value select statement.
        # FIX: the closing triple quote was previously emitted only when a
        # WHERE clause existed, leaving an unterminated string literal in the
        # generated code for updates without WHERE.
        update_sql = f'"select " + df_col_list_str + """\nfrom {upd_table}' + (
            f'\nwhere {upd_where_str}' if upd_where_str else '') + '"""'
    # remove semi-colon at the end
    if update_sql[-4:] == ';"""':
        update_sql = update_sql[:-4] + '"""'
    # replace db table names with corresponding data frame name
    update_sql = util.replaceTableWithDF(update_sql, cnv_ds, 1)
    #[TODO] check if modified statement using variable
    # pyspark code to update records in dataframe
    cnv_code += f"{upd_table_df_tmp_1} = spark.sql({subtract_sql})\n\n"
    cnv_code += f"{upd_table_df_tmp_2} = spark.sql({update_sql})\n\n"
    cnv_code += f"mod_df['{upd_table_df}'] = mod_df['{upd_table_df}'].subtract({upd_table_df_tmp_1}).union({upd_table_df_tmp_2})\n"
    cnv_code += f"mod_df['{upd_table_df}'].createOrReplaceTempView('{upd_table_df}')\n\n"
    return cnv_code, cnv_log
def toPyspark(self, cnv_ds):
    """Convert a Teradata INSERT statement into equivalent PySpark code.

    Supports both ``INSERT INTO ... SELECT`` and
    ``INSERT INTO ... VALUES`` forms.  For the first insert into a table
    the generated code loads its dataframe directly; subsequent inserts
    (tracked via ``cnv_ds.union_chklist``) are unioned in.  Returns a
    ``(code, log)`` tuple.
    """
    try:
        lexer = TDantlrLexer(InputStream(self.text))
        stream = CommonTokenStream(lexer)
        parser = TDantlrParser(stream)
        parser.addErrorListener(TDErrorListener())
        tree = parser.start()
        psp = PySparkParse(stream)
        walker = ParseTreeWalker()
        walker.walk(psp, tree)
        stmt = psp.out_sql
    except Exception as e:
        # ANTLR failed: fall back to the simple text-based parse
        stmt = self.simpleParse() + ';'
    cnv_code = ''
    cnv_log = ''
    # get table name where records will be inserted
    table = re.match(r'\bINS(?:ERT)?\s+INTO\s+(\w*\.?\w+)', stmt,
                     re.S | re.I).group(1).strip()
    # get table df name
    table_df = cnv_ds.table_df_map[table]
    # check if insert statement with select
    if re.match(r'\bINSERT\s+INTO(.*?)\(?.*?\)?\s+\(?\bSEL(?:ECT)?\b', stmt,
                re.S | re.I):
        # check if select statement has target table column list specified
        if re.search(table + r'\s+\(.*?\)[\s\(]+\bSEL(?:ECT)?\b', stmt,
                     re.S | re.I):
            # check if select is in circular bracket
            if re.search(r'(.)\s*SEL(?:ECT)?\b', stmt,
                         re.S | re.I).group(1) == '(':
                # replace starting bracket with special marker character
                stmt = re.sub(r'\((\s*)(SEL(?:ECT)?\b)', r'‹\1\2', stmt, 1,
                              flags=re.S | re.I)
            # replace FROM keyword with special marker character
            stmt = re.sub(r"\bFROM\b", 'ƒ', stmt, flags=re.S | re.I)
            # custom split insert statement at the (first) FROM
            stmt_part = util.newSplit(stmt, 'ƒ')
            # first part of the split (restore markers)
            stmt_part_1 = re.sub(r'ƒ', 'from',
                                 re.sub(r'‹', r'(', stmt_part[0],
                                        flags=re.S | re.I),
                                 flags=re.S | re.I)
            # second part of the split (restore markers)
            stmt_part_2 = re.sub(r'ƒ', 'from',
                                 re.sub(r'‹', r'(', stmt_part[1],
                                        flags=re.S | re.I),
                                 flags=re.S | re.I)
            # get target table column list
            col_list_str = re.search(
                table + r'\s+\((.*?)\)[\(\s]*\bSEL(?:ECT)?\b', stmt_part_1,
                re.S | re.I).group(1)
            col_list = util.newSplit(col_list_str, ',')
            # get target insert values
            val_list_str = re.search(r'\bSEL(?:ECT)?\b\s+(.*)', stmt_part_1,
                                     re.S | re.I).group(1)
            val_list = util.newSplit(val_list_str, ',')
            # create "value as column" select list
            cnct_str = ''
            for i in range(len(col_list)):
                col = col_list[i].strip()
                val = val_list[i].strip()
                # remove alias from column value
                if re.match(r'^\bCASE\b', val, re.S | re.I):
                    # ignore END keyword as alias for CASE expressions
                    val = re.sub(r'(?<=END)\s*(AS)?[ \t]+\w+$', '', val,
                                 flags=re.S | re.I)
                else:
                    val = re.sub(r'(\s*AS)?[ \t]+[a-zA-Z]\w+$', '', val,
                                 flags=re.S | re.I)
                cnct_str += val + ' as ' + col + ',\n'
            cnct_str = cnct_str[:-2]
            select_str = 'select ' + cnct_str + "\nfrom " + stmt_part_2
        else:
            # target table column list not specified: take the select as-is
            select_str = re.search(r'([\(\s]*\bSEL(?:ECT)?\b.*)', stmt,
                                   re.S | re.I).group(1)
        # replace table with dataframe
        select_str = util.replaceTableWithDF(select_str, cnv_ds, 1)
        # remove semi-colon at the end
        if select_str.strip()[-1] == ';':
            select_str = select_str.strip()[:-1]
        # enclose modified sql in triple quote
        select_str = '"""' + select_str + '"""'
        #[TODO] check if modified statement using variable
    # check if insert statement with values
    if re.search(r'INSERT\s+INTO\s+\w*\.?\w.*?\bVALUES\s*\(', stmt,
                 re.S | re.I):
        # get table name, column and value list
        match_grp = re.match(
            r'INSERT\s+INTO\s+(.*?)\s+\(?(.*?)\)?\s*VALUES\s*\((.*?)\)\s*;$',
            stmt, re.S | re.I)
        # NOTE(review): table is reassigned here but table_df (used below)
        # is not recomputed — confirm both always resolve to the same table.
        table = match_grp.group(1).strip()
        cols = match_grp.group(2).strip()
        vals = match_grp.group(3).strip()
        # if column list present then create python list
        if cols:
            cols = re.sub(r'\s', '', cols, flags=re.S | re.I)
            cols = re.sub(r',', '","', cols, flags=re.S | re.I)
            cols = '"' + cols + '"'
            cnv_code += "ins_cols = [" + cols + "]\n"
        else:
            # pyspark code to get column list from df
            cnv_code += "ins_cols = " + table_df + ".columns\n"
        # remove colon and new line from column value string
        vals = re.sub(r':', '', vals, flags=re.S | re.I)
        vals = re.sub(r'[\r\n]+', '', vals, flags=re.S | re.I)
        # custom split column value string
        val_list = util.newSplit(vals, ',')
        # create python list for column values
        val_str = ''
        for val in val_list:
            val = val.strip()
            # if column value is in single quote
            if re.search(r"^'", val):
                val_str += '"' + val + '",'
            # if column value is function or CASE statement
            elif re.search(r'\w+\s*\(|^CASE\s+', val, re.S | re.I):
                # FIX: the original rewrote the already-consumed 'vals'
                # string here (a no-op); collapse whitespace in the value
                # itself before quoting it.
                val = re.sub(r'\s+', ' ', val, flags=re.S | re.I)
                val_str += '"' + val + '",'
            else:
                val_str += val + ','
        val_str = val_str[:-1]
        # pyspark code to create select string
        cnv_code += "ins_vals = [" + val_str + "]\n"
        cnv_code += "col_val_str = ''\n"
        cnv_code += "for idx in (range(len(ins_cols))):\n"
        cnv_code += "\tcol_val_str = col_val_str + str(ins_vals[idx]) + ' as ' + ins_cols[idx] + ','\n"
        cnv_code += "else:\n"
        cnv_code += "\tcol_val_str = col_val_str[:-1]\n"
        # final select statement
        select_str = "'select ' + col_val_str"
    # if table present in union check list (already loaded once before)
    if table in cnv_ds.union_chklist:
        # create temporary data frame
        table_df_tmp = table_df + '_1'
        # pyspark code to load data in temporary data frame
        cnv_code += f"#Create temporary dataframe with records to be inserted into {table}\n"
        cnv_code += f"{table_df_tmp} = spark.sql({select_str})\n"
        # pyspark code to union the temporary df into the db table dataframe
        cnv_code += f"#Load records into {table} dataframe \n"
        cnv_code += f"mod_df['{table_df}'] = mod_df['{table_df}'].union({table_df_tmp})\n"
        cnv_code += f"mod_df['{table_df}'].createOrReplaceTempView('{table_df}')\n\n"
    else:
        # first insert into this table: load the dataframe directly
        cnv_ds.union_chklist.append(table)
        cnv_code += f"#Load records into {table} dataframe \n"
        cnv_code += f"mod_df['{table_df}'] = spark.sql({select_str})\n"
        cnv_code += f"mod_df['{table_df}'].createOrReplaceTempView('{table_df}')\n\n"
    return cnv_code, cnv_log
def loadStatements(self):
    """Split the T-SQL script text into individual statement objects.

    The text is normalised through the ANTLR parser (parse errors are
    logged and re-raised), split on semi-colons, and each recognised
    statement is wrapped in the matching ``sss.*`` class.  Returns the
    list of statement objects.
    """
    statement_objs = []
    self.cntx.logger.add_log(
        'INFO',
        'Starting process to split script content to individual statements.'
    )
    #Antlr Parse
    try:
        # LOAD/BLOCK clash with grammar keywords; mask them before parsing.
        # FIX: re.S|re.I (== 18) was passed as re.sub's positional 'count'
        # argument, which capped replacements at 18 and dropped the
        # case-insensitive flag entirely.
        script_text = re.sub(r'(\bLOAD\b|\bBLOCK\b)', r'__\1__', self.text,
                             flags=re.S | re.I)
        lexer = FullTSqlAntlrLexer(InputStream(script_text))
        stream = CommonTokenStream(lexer)
        parser = FullTSqlAntlrParser(stream)
        tree = parser.tsql_file()
        conv = TSqlScriptParse(stream)
        walker = ParseTreeWalker()
        walker.walk(conv, tree)
    except Exception as e:
        self.cntx.logger.add_log('ERROR', 'Failed to parse script content')
        self.cntx.logger.add_log_details(str(e))
        # NOTE(review): this warns about continuing with unparsed content but
        # then re-raises, so processing does not actually continue.
        self.cntx.logger.add_log(
            'WARN',
            'Using unparsed script content. Result may be inconsistent.')
        raise
    else:
        self.cntx.logger.add_log('INFO', 'File content parse completed.')
        script_text = conv.out_script
        self.upd_tbl_alias = conv.upd_tbl_alias
    # NOTE(review): the __LOAD__/__BLOCK__ placeholders are never restored
    # here (the other loadStatements variant restores them in a finally
    # block) — confirm TSqlScriptParse undoes the masking.
    #split file content
    self.cntx.logger.add_log(
        'INFO',
        'Splitting script content into individual statements based on semi-colon.'
    )
    script_statements = util.newSplit(script_text, ';')
    for stmt in script_statements:
        stmt = stmt.strip()
        if len(stmt) > 0:
            if re.search(r'^(?:\bCREATE\b|\bALTER\b)\s+\bPROCEDURE\b', stmt,
                         re.S | re.I):
                stmt_obj = sss.SPheader(stmt)
                statement_objs.append(stmt_obj)
            elif re.search(r'^DECLARE\s+@', stmt, re.S | re.I):
                stmt_obj = sss.Declare(stmt)
                statement_objs.append(stmt_obj)
            elif re.search(r'^(?:\bSET|\bSELECT)\s+@\w+\s*=', stmt,
                           re.S | re.I):
                stmt_obj = sss.SetVar(stmt)
                statement_objs.append(stmt_obj)
            elif re.search(
                    r'^DECLARE\s+\w+\s+CURSOR|OPEN\s+\w+|^FETCH\s+\w+\s+FROM|^CLOSE\s+\w+',
                    stmt, re.S | re.I):
                stmt_obj = sss.Cursor(stmt)
                statement_objs.append(stmt_obj)
            elif re.search(r'^(BEGIN|END)\s+(TRY|CATCH)', stmt, re.S | re.I):
                stmt_obj = sss.ErrorHadling(stmt)
                statement_objs.append(stmt_obj)
            elif re.search(r'^IF\s*\(?|^END IF', stmt, re.S | re.I):
                stmt_obj = sss.IfCondition(stmt)
                statement_objs.append(stmt_obj)
            elif re.search(r'^WHILE\s*\(?|^END WHILE', stmt, re.S | re.I):
                stmt_obj = sss.WhileLoop(stmt)
                statement_objs.append(stmt_obj)
            elif re.search(r'^SELECT\s+|^WITH\s+', stmt, re.S | re.I):
                stmt_obj = sss.Select(stmt)
                statement_objs.append(stmt_obj)
            elif re.search(r'^INSERT\s+(?:INTO\s+)?', stmt, re.S | re.I):
                stmt_obj = sss.Insert(stmt)
                statement_objs.append(stmt_obj)
            elif re.search(r'^UPDATE\s+', stmt, re.S | re.I):
                stmt_obj = sss.Update(stmt)
                statement_objs.append(stmt_obj)
            elif re.search(r'^DELETE\s+|^TRUNCATE\s+TABLE\s', stmt,
                           re.S | re.I):
                stmt_obj = sss.Delete(stmt)
                statement_objs.append(stmt_obj)
            elif re.search(r'^MERGE\s+', stmt, re.S | re.I):
                stmt_obj = sss.Merge(stmt)
                statement_objs.append(stmt_obj)
            elif re.search(r'^EXEC(?:UTE)?\s+|^CALL \s+', stmt, re.S | re.I):
                stmt_obj = sss.Execute(stmt)
                statement_objs.append(stmt_obj)
            else:
                self.cntx.logger.add_log('WARN', 'Statement Not Supported.')
                self.cntx.logger.add_log_details(stmt)
    return statement_objs