Beispiel #1
0
    def toPyspark(self, cnv_ds):
        """Convert a Teradata CALL statement to equivalent PySpark code.

        Looks the called stored procedure up in the out-parameter position
        file and emits a Python call that assigns the OUT parameters from
        the procedure's return value. Also appends the matching import line
        to cnv_ds.output.code[0].

        Returns a (cnv_code, cnv_log) tuple; cnv_code is empty and cnv_log
        carries a skip message when the procedure is not in the position
        file.
        """
        stmt = self.text + ';'
        stmt = self.simpleParse(stmt)
        cnv_code = ''
        cnv_log = ''

        #read sp out parameter position file
        with open('.\\Teradata\\sp_out_param.pos', 'r') as f:
            pos_txt = f.read()
        pos_txt_sps = re.findall(r'(\w+) \[', pos_txt)

        #get sp name and parameter list
        sp_name, sp_params = re.search(
            r'CALL\s+(?:\w+(?=\.))?\.?\s*(\w+)\s*\((.*?)\)\s*;', stmt,
            re.S | re.I).groups()
        #[FIX] re.S was previously passed as the positional 'count' argument
        #of re.sub (re.S == 16), silently limiting it to 16 replacements;
        #pass it via flags= instead
        sp_params = re.sub(r'[\s\:]', '', sp_params, flags=re.S)
        sp_param_list = util.newSplit(sp_params, ',')

        #if sp name present in out parameter position file
        if sp_name in pos_txt_sps:
            #get out parameter indexes
            out_param_pos = re.search(r'{} \[(.*?)\]'.format(sp_name), pos_txt,
                                      re.S | re.I).group(1)

            if out_param_pos:
                #make out parameter string
                out_param_idxs = [int(idx) for idx in out_param_pos.split(',')]
                out_param_str = ''
                for idx in out_param_idxs:
                    out_param_str += sp_param_list[idx] + ','
                out_param_str = out_param_str[:-1]

                #make in parameter string
                param_idxs = [idx for idx in range(len(sp_param_list))]
                in_param_idxs = list(set(param_idxs) - set(out_param_idxs))
                in_param_str = ''
                for idx in in_param_idxs:
                    in_param_str += sp_param_list[idx] + ','
                in_param_str = in_param_str[:-1]

                #[FIX] interpolate the procedure name; previously the literal
                #text 'sp_name' was emitted into the generated code
                cnv_code += f"{out_param_str} = {sp_name}({in_param_str})"
            else:
                cnv_code += f"{sp_name}({sp_params})"

            #add import code
            cnv_ds.output.code[0] += f"from {sp_name} import {sp_name}\n"

            return cnv_code, cnv_log

        else:
            #Error
            return '', f'Statement Skipped \n\t {self.text}'
Beispiel #2
0
    def loadStatements(self):
        """Split the script text on semi-colons and wrap each recognised
        statement in its matching TS.* statement class.

        Unrecognised statements are silently ignored. Returns the list of
        statement objects in script order.
        """
        #statement-start patterns mapped to their classes, in priority order
        dispatch = [
            (r'^\s*\.[a-z]+\s+', TS.CtrlStmt),
            (r'^SELECT', TS.Select),
            (r'CREATE\s+\w*\s*VOLATILE\s+TABLE\s+', TS.VolTbl),
            (r'^INS(?:ERT)?\s+INTO\s+', TS.Insert),
            (r'^UPD(?:ATE)?\s+.*?\s+SET\s+', TS.Update),
            (r'^DEL(?:ETE)?\s+(?:FROM)?\s*', TS.Delete),
            (r'^MERGE\s+.*?\s+USING\s+', TS.Merge),
            (r'^CALL\s+', TS.Call),
        ]

        statement_objs = []
        #custom split of the file content on semi-colons
        for raw_stmt in util.newSplit(self.text, ';'):
            stmt = raw_stmt.strip()
            if not stmt:
                continue
            for pattern, stmt_cls in dispatch:
                if re.match(pattern, stmt, re.S | re.I):
                    statement_objs.append(stmt_cls(stmt))
                    break
            #no match: statement is not supported, skip it

        return statement_objs
Beispiel #3
0
    def loadStatements(self):
        """Split the T-SQL script into statement objects.

        The script is first normalised through the ANTLR parser (falling
        back to the raw text when parsing fails), then split on semi-colons;
        each statement is wrapped in the matching sss.* statement class.
        Statements outside a stored procedure body are logged and skipped.

        Returns the list of statement objects in script order.
        """
        is_sp_stmt = False  #True while inside a CREATE/ALTER PROCEDURE body
        statement_objs = []

        self.cntx.logger.add_log(
            'INFO',
            'Starting process to split script content to individual statements.'
        )
        #Antlr Parse
        try:
            #mask keywords that clash with the grammar before parsing;
            #they are restored (lower-cased) in the finally block below
            script_text = re.sub(r'\bLOAD\b', r'__LOAD__', \
                          re.sub(r'\bBLOCK\b', r'__BLOCK__', \
                          re.sub(r'\bPLATFORM\b', r'__PLATFORM__', self.text, flags=re.S|re.I), \
                          flags=re.S|re.I), \
                          flags=re.S|re.I)
            lexer = FullTSqlAntlrLexer(InputStream(script_text))
            lexer.removeErrorListeners()
            lexer.addErrorListener(TSqlErrorListener())
            stream = CommonTokenStream(lexer)
            parser = FullTSqlAntlrParser(stream)
            parser.removeErrorListeners()
            parser.addErrorListener(TSqlErrorListener())
            tree = parser.tsql_file()
            conv = TSqlScriptParse(stream)
            walker = ParseTreeWalker()
            walker.walk(conv, tree)
        except Exception as e:
            self.cntx.logger.add_log('ERROR',
                                     'Failed to parse script content.')
            self.cntx.logger.add_log_details('Syntax error: ' + str(e))
            self.cntx.logger.add_log(
                'WARN',
                'Using unparsed script content. Result may be inconsistent.')
        else:
            self.cntx.logger.add_log('INFO', 'Script content parse completed.')
            script_text = conv.out_script
            self.upd_tbl_alias = conv.upd_tbl_alias
        finally:
            script_text = script_text.replace('__LOAD__', 'load').replace(
                '__BLOCK__', 'block').replace('__PLATFORM__', 'platform')

        #split file content
        try:
            self.cntx.logger.add_log(
                'INFO',
                'Splitting script content into individual statements based on semi-colon.'
            )
            script_statements = util.newSplit(script_text, ';')
        except Exception as e:
            #[FIX] was a bare except, which swallowed the original cause
            raise Exception(
                'Splitting script content into statement list failed.') from e

        self.cntx.logger.add_log(
            'INFO', 'Identifying statements in scope of the converter')
        for stmt in script_statements:
            stmt = stmt.strip()
            if len(stmt) > 0:

                if re.search(r'^(?:\bCREATE\b|\bALTER\b)\s+\bPROC(?:EDURE)?\b',
                             stmt, re.S | re.I):
                    is_sp_stmt = True
                    stmt_obj = sss.SPheader(stmt)
                    statement_objs.append(stmt_obj)

                #[FIX] chained as elif: previously this was a separate 'if',
                #so a CREATE/ALTER PROCEDURE statement also fell through the
                #elif chain into the final else and was wrongly logged as
                #skipped even though it had been appended
                elif stmt == 'SP_END':
                    is_sp_stmt = False

                elif re.search(r'^DECLARE\s+@', stmt,
                               re.S | re.I) and is_sp_stmt:
                    stmt_obj = sss.Declare(stmt)
                    statement_objs.append(stmt_obj)

                elif re.search(r'^(?:\bSET|\bSELECT)\s+@\w+\s*=', stmt,
                               re.S | re.I) and is_sp_stmt:
                    stmt_obj = sss.SetVar(stmt)
                    statement_objs.append(stmt_obj)

                elif re.search(
                        r'^DECLARE\s+\w+\s+CURSOR|OPEN\s+\w+|^FETCH\s+\w+\s+FROM|^CLOSE\s+\w+',
                        stmt, re.S | re.I) and is_sp_stmt:
                    stmt_obj = sss.Cursor(stmt)
                    statement_objs.append(stmt_obj)

                elif re.search(r'^(BEGIN|END)\s+(TRY|CATCH)', stmt,
                               re.S | re.I) and is_sp_stmt:
                    stmt_obj = sss.ErrorHadling(stmt)
                    statement_objs.append(stmt_obj)

                elif re.search(r'^IF\s*\(?|^ELSE|^END IF', stmt,
                               re.S | re.I) and is_sp_stmt:
                    stmt_obj = sss.IfCondition(stmt)
                    statement_objs.append(stmt_obj)

                elif re.search(r'^WHILE\s*\(?|^END WHILE', stmt,
                               re.S | re.I) and is_sp_stmt:
                    stmt_obj = sss.WhileLoop(stmt)
                    statement_objs.append(stmt_obj)

                elif re.search(r'^SELECT\s+|^WITH\s+', stmt,
                               re.S | re.I) and is_sp_stmt:
                    stmt_obj = sss.Select(stmt)
                    statement_objs.append(stmt_obj)

                elif re.search(r'^INSERT\s+(?:INTO\s+)?', stmt,
                               re.S | re.I) and is_sp_stmt:
                    stmt_obj = sss.Insert(stmt)
                    statement_objs.append(stmt_obj)

                elif re.search(r'^UPDATE\s+', stmt,
                               re.S | re.I) and is_sp_stmt:
                    stmt_obj = sss.Update(stmt)
                    statement_objs.append(stmt_obj)

                elif re.search(r'^DELETE\s+|^TRUNCATE\s+TABLE\s', stmt,
                               re.S | re.I) and is_sp_stmt:
                    stmt_obj = sss.Delete(stmt)
                    statement_objs.append(stmt_obj)

                elif re.search(r'^MERGE\s+', stmt, re.S | re.I) and is_sp_stmt:
                    stmt_obj = sss.Merge(stmt)
                    statement_objs.append(stmt_obj)

                elif re.search(r'^EXEC(?:UTE)?\s+|^CALL \s+', stmt,
                               re.S | re.I) and is_sp_stmt:
                    stmt_obj = sss.Execute(stmt)
                    statement_objs.append(stmt_obj)

                else:
                    if is_sp_stmt:
                        self.cntx.logger.add_log(
                            'WARN', 'Statement skipped. Not Supported.')
                    else:
                        self.cntx.logger.add_log(
                            'WARN',
                            'Statement skipped. Outside stored procedure definition.'
                        )

                    self.cntx.logger.add_log_details(stmt)

        return statement_objs
Beispiel #4
0
    def toPyspark(self, cnv_ds):
        """Convert a Teradata MERGE statement to equivalent PySpark code.

        The merge is rebuilt as dataframe operations: records matched by
        the merge join are subtracted from the target dataframe, then the
        updated records (WHEN MATCHED ... UPDATE SET) and/or newly inserted
        records (WHEN NOT MATCHED ... INSERT) are unioned back in.

        Returns a (cnv_code, cnv_log) tuple of generated PySpark code and
        conversion log text.
        """
        try:
            lexer = TDantlrLexer(InputStream(self.text))
            stream = CommonTokenStream(lexer)
            parser = TDantlrParser(stream)
            parser.addErrorListener(TDErrorListener())
            tree = parser.start()
            psp = PySparkParse(stream)
            walker = ParseTreeWalker()
            walker.walk(psp, tree)
            stmt = psp.out_sql
        except Exception:
            #fall back to the simple text-based parse when ANTLR fails
            stmt = self.simpleParse() + ';'

        #[TEMP_FIX] for db reference in columns
        stmt = re.sub(r'(\w+)\.(\w+)\.(\w+)',
                      r'\2.\3',
                      stmt,
                      flags=re.S | re.I)

        cnv_code = ''
        cnv_log = ''

        #get primary table name and alias
        p_table, p_table_alias = re.search(
            r'\bMERGE\s+INTO\s+([\w\.]+)\s+(?:AS\s+)?(\w*)\s*USING\b', stmt,
            re.S | re.I).groups()

        #get corresponding data frame name for the primary table
        p_table_df = cnv_ds.table_df_map[p_table]

        #[TODO] if p_table_alias is missing
        #[TEMP_FIX]
        if not (bool(p_table_alias)):
            if '.' in p_table:
                p_table_alias = p_table.split('.')[-1]
            else:
                p_table_alias = p_table

        #check if secondary table is a subquery
        if re.search(r'\bUSING\s*\(\s*SELECT', stmt, re.S | re.I):
            #get select statement of secondary table
            s_table_sql, s_table_alias = re.search(
                r'\bUSING\s*\((.*?)\)\s*(?:AS\s+)?(\w*)\s+ON\b', stmt,
                re.S | re.I).groups()

            #make dataframe name for secondary table select statement
            if s_table_alias:
                s_table = s_table_alias + '__df'
            else:
                #[TODO] if p_table_alias is missing
                s_table = 'temp__df'

            #replace table name with dataframe name
            s_table_sql = util.replaceTableWithDF(s_table_sql, cnv_ds, 1)
            #[TODO] check if modified statement using variable
            #pyspark code to create df for secondary table
            cnv_code += f'{s_table} = spark.sql("""{s_table_sql}""")\n'
            cnv_code += f'{s_table}.createOrReplaceTempView("{s_table}")\n\n'
        else:
            #get secondary table name and alias
            s_table, s_table_alias = re.search(
                r'\bUSING\s+([\w\.]+)\s*(?:AS\s+)?(\w*)\s+ON\b', stmt,
                re.S | re.I).groups()

            #[TODO] if s_table_alias is missing
            #[TEMP_FIX]
            if not (bool(s_table_alias)):
                if '.' in s_table:
                    s_table_alias = s_table.split('.')[-1]
                else:
                    s_table_alias = s_table

        #get merge join condition
        join_cond_str = re.search(r'\bON\s+\(?(.*?)\)?\s+WHEN', stmt,
                                  re.S | re.I).group(1).strip()

        #get merge update set section
        upd_set_str_match = re.search(
            r'\bUPD(?:ATE)?\s+SET\s+(.*?)(?:(?:WHEN\b)|;)', stmt, re.S | re.I)
        if upd_set_str_match:
            upd_set_str = upd_set_str_match.group(1).strip()
        else:
            upd_set_str = ''

        if upd_set_str:
            #make sql to select records that will be updated
            subtract_sql = f'select {p_table_alias}.* \n from {p_table} as {p_table_alias} \n inner join {s_table} as {s_table_alias} \n on {join_cond_str}'
            #replace db table names with corresponding data frame name
            subtract_sql = util.replaceTableWithDF(subtract_sql, cnv_ds, 1)
            #[TODO] check if modified statement using variable

            p_table_df_tmp_1 = p_table_df + '_1'
            #pyspark code to load temp df with records to be updated
            cnv_code += '#Load records to be updated temporary dataframe\n'
            cnv_code += f'{p_table_df_tmp_1} = spark.sql("""{subtract_sql}""")\n'

            #create python dictionary with key as column to be updated and value as update value
            upd_col_dict = '{'
            #custom split set section of UPDATE statement to get each column assignment
            set_fields = util.newSplit(upd_set_str, ',')
            #for each column assignment get update column and updating value
            for field in set_fields:
                side = field.split('=')
                side[0] = side[0].strip()
                side[1] = side[1].strip()
                #if update column has table reference
                if re.match(r'\w+\.\w+', side[0], re.S | re.I):
                    #[FIX] added the missing space after 'as'; the generated
                    #SQL alias was previously fused to the value (e.g. 'v asT.c')
                    upd_col_dict += f'"{side[0]}":"{side[1]} as {side[0]}"' + ",\n"
                else:
                    upd_col_dict += f'"{p_table_alias}.{side[0]}":"{side[1]} as {side[0]}"' + ",\n"

            upd_col_dict = upd_col_dict[:-2] + '}'

            #pyspark code to get column list of update table
            cnv_code += f"#Get column list of db table {p_table}\n"
            cnv_code += f"df_col_list = mod_df['{p_table_df}'].columns\n"
            cnv_code += "#Convert column list to string with db table alias\n"
            cnv_code += f"df_col_list_str = '{p_table_alias}.'+',{p_table_alias}.'.join(df_col_list)\n"
            #pyspark code to get first column of update table
            cnv_code += f"#Get first column of db table {p_table}\n"
            cnv_code += "check_col = df_col_list_str[0:df_col_list_str.index(',')]\n"
            #pyspark code to replace update columns with updating values
            cnv_code += "#Create dictionary with update column and udating value\n"
            cnv_code += f"upd_col_dict = {upd_col_dict}\n"
            cnv_code += "#Substitute update columns in column string with updating value\n"
            cnv_code += "for col in upd_col_dict.keys():\n"
            cnv_code += "\tdf_col_list_str = re.sub(col,upd_col_dict[col],df_col_list_str, flags=re.I)\n"

            #make sql to select records with updated column value
            update_sql = f'"select " + df_col_list_str + """\n from {p_table} as {p_table_alias} \n inner join {s_table} as {s_table_alias} \n on {join_cond_str}"""'
            #replace db table names with corresponding data frame name
            update_sql = util.replaceTableWithDF(update_sql, cnv_ds, 1)

            p_table_df_tmp_2 = p_table_df + '_2'
            #pyspark code to load temp df with updated records
            cnv_code += "#Load temporary dataframe with updated records\n"
            cnv_code += f"{p_table_df_tmp_2} = spark.sql({update_sql})\n"
        else:
            update_sql = ''

        #get merge insert section
        insert_str_match = re.search(r'\bINS(?:ERT)?\s+(.*?)(?:(?:WHEN\b)|;)',
                                     stmt, re.S | re.I)
        if insert_str_match:
            insert_str = insert_str_match.group(1).strip()

            #get merge insert section columns
            if insert_str.split()[0].upper() == 'VALUES':
                insert_col_str = ''
            else:
                insert_col_str = re.search(r'\(?(.*?)\)?\s+VALUES\b',
                                           insert_str,
                                           re.S | re.I).group(1).strip()

            #get merge insert section values
            insert_val_str = re.search(r'\bVALUES\s*\(?(.*?)\)?$', insert_str,
                                       re.S | re.I).group(1).strip()
        else:
            insert_str = ''
            insert_col_str = ''
            insert_val_str = ''

        if insert_str:
            if insert_col_str:
                insert_cols = util.newSplit(insert_col_str, ',')
                insert_vals = util.newSplit(insert_val_str, ',')
                #make select column string for merge insert
                select_str = ''
                for i in range(len(insert_cols)):
                    col = insert_cols[i].strip()
                    val = insert_vals[i].strip()
                    select_str += f"{val} as {col},\n"
                select_str = select_str[:-2]
            else:
                select_str = '*'  #[TODO] add logic for select_str

            #make sql to select records to be inserted by merge insert
            insert_sql = f'"""select {select_str} \n from {s_table} as {s_table_alias} \n left outer join {p_table} as {p_table_alias} \n on {join_cond_str} \n where """ + check_col + " is null"'
            #replace table name with dataframe name
            insert_sql = util.replaceTableWithDF(insert_sql, cnv_ds, 1)
            #[TODO] check if modified statement using variable

            p_table_df_tmp_3 = p_table_df + '_3'
            #pyspark code to load temp df with merge insert records
            cnv_code += "#Load temporary dataframe merge insert records\n"
            cnv_code += f"{p_table_df_tmp_3} = spark.sql({insert_sql})\n\n"

        else:
            insert_sql = ''

        #pyspark code to remove old records from db table dataframe and insert merge update and insert records
        cnv_code += "#Remove old records from db table dataframe and insert merge update and insert records\n"

        cnv_code += (f"mod_df['{p_table_df}'] = mod_df['{p_table_df}']") + (
            f".subtract({p_table_df_tmp_1}).union({p_table_df_tmp_2})"
            if update_sql else '') + (f".union({p_table_df_tmp_3})\n"
                                      if insert_sql else '')
        cnv_code += f"mod_df['{p_table_df}'].createOrReplaceTempView('{p_table_df}')\n"

        return cnv_code, cnv_log
Beispiel #5
0
    def toPyspark(self, cnv_ds):
        """Convert a Teradata UPDATE statement to equivalent PySpark code.

        Handles both the UPDATE ... FROM join form and the plain
        UPDATE table SET ... form. The update is rebuilt as dataframe
        operations: the affected records are subtracted from the table's
        dataframe and the updated versions are unioned back in.

        Returns a (cnv_code, cnv_log) tuple of generated PySpark code and
        conversion log text.
        """
        try:
            lexer = TDantlrLexer(InputStream(self.text))
            stream = CommonTokenStream(lexer)
            parser = TDantlrParser(stream)
            parser.addErrorListener(TDErrorListener())
            tree = parser.start()
            psp = PySparkParse(stream)
            walker = ParseTreeWalker()
            walker.walk(psp, tree)
            stmt = psp.out_sql
        except Exception:
            #fall back to the simple text-based parse when ANTLR fails
            stmt = self.simpleParse() + ';'

        cnv_code = ''
        cnv_log = ''

        #if update statement has FROM keyword
        if re.search(r'\bUPD(?:ATE)?[^\(]*?\bFROM\b', stmt, re.S | re.I):
            #get table alias which will be updated
            upd_table_alias = re.search(r'\bUPD(?:ATE)?\s+(.*?)\s+(?=FROM)',
                                        stmt, re.S | re.I).group(1).strip()
            #get list of tables involved in update statement
            upd_from_str = re.search(r'(?<=FROM)\s+(.*?)\s+(?=SET)', stmt,
                                     re.S | re.I).group(1).strip()
            #get table name that will be updated
            upd_table = re.search(
                r'(\w*\.?\w+)\s+(?:AS\s+)?{}'.format(upd_table_alias),
                upd_from_str, re.S | re.I).group(1).strip()

            #get data frame name of the table
            upd_table_df = cnv_ds.table_df_map[upd_table]
            upd_table_df_tmp_1 = upd_table_df + '_1'
            #[FIX] second temp name was '_1' as well, colliding with the
            #first temp dataframe in the generated subtract/union code
            upd_table_df_tmp_2 = upd_table_df + '_2'

            #get the set section of update statement
            upd_set_str = re.search(r'(?<=SET)\s+(.*?)\s+(?=WHERE)', stmt,
                                    re.S | re.I).group(1).strip()
            #get the where section of update statement
            upd_where_str = re.search(r'\bSET\b.*?\bWHERE\b(.*?);', stmt,
                                      re.S | re.I).group(1).strip()

            #make sql to select records that will be updated
            subtract_sql = f"select {upd_table_alias}.* \nfrom {upd_from_str} \nwhere {upd_where_str}"

            #replace db table names with corresponding dataframe name
            subtract_sql = util.replaceTableWithDF(subtract_sql, cnv_ds, 1)
            #[TODO] replace variable name

            #enclose modified sql in triple quote
            subtract_sql = '"""' + subtract_sql + '"""'

            #create python dictionary with key as column to be updated and value as update value
            upd_col_dict = '{'
            #split set section of update statement to get each column assignment
            set_fields = util.newSplit(upd_set_str, ',')
            #for each column assignment get update column and updating value
            for field in set_fields:
                side = field.split('=')
                side[0] = side[0].strip()
                side[1] = side[1].strip()
                #if update column has table reference
                if re.match(r'\w+\.\w+', side[0], re.S | re.I):
                    #[FIX] added the missing space after 'as'; the generated
                    #SQL alias was previously fused to the value (e.g. 'v asT.c')
                    upd_col_dict += f'"{side[0]}":"{side[1]} as {side[0]}"' + ",\n"
                else:
                    upd_col_dict += f'"{upd_table_alias}.{side[0]}":"{side[1]} as {side[0]}"' + ",\n"

            upd_col_dict = upd_col_dict[:-2] + '}'

            #pyspark code for update value select statement
            cnv_code += f"df_col_list = mod_df['{upd_table_df}'].columns\n"
            cnv_code += f"df_col_list_str = '{upd_table_alias}.'+',{upd_table_alias}.'.join(df_col_list)\n"
            cnv_code += f"upd_col_dict = {upd_col_dict}\n\n"
            cnv_code += "for col in upd_col_dict.keys():\n"
            cnv_code += "\tdf_col_list_str = re.sub(col,upd_col_dict[col],df_col_list_str, flags=re.I)\n\n"

            #final update value select statement
            update_sql = f'"select " + df_col_list_str + """\nfrom {upd_from_str} \nwhere {upd_where_str}"""'

        else:
            #get table name which will be updated
            upd_table = re.search(r'\bUPD(?:ATE)?\s+([\.\w]+)', stmt,
                                  re.S | re.I).group(1).strip()
            #get corresponding data frame name for the table
            upd_table_df = cnv_ds.table_df_map[upd_table]
            upd_table_df_tmp_1 = upd_table_df + '_1'
            upd_table_df_tmp_2 = upd_table_df + '_2'

            #check if update statement has where condition
            if re.search(r'\bWHERE\b', stmt, re.S | re.I):
                #get the set section of update
                upd_set_str = re.search(r'(?<=SET)\s+(.*?)\s+(?=WHERE)', stmt,
                                        re.S | re.I).group(1).strip()
                #get the where section of update statement
                upd_where_str = re.search(r'\bSET\b.*?\bWHERE\b(.*)', stmt,
                                          re.S | re.I).group(1).strip()
            else:
                #get the set section of update
                upd_set_str = re.search(r'(?<=SET)\s+(.*?);', stmt,
                                        re.S | re.I).group(1).strip()
                upd_where_str = ''

            #make sql to select records that will be updated
            subtract_sql = f"select * \nfrom {upd_table_df} " + (
                f"\nwhere {upd_where_str}" if upd_where_str else '')
            #remove semi-colon at the end
            if subtract_sql.strip()[-1] == ';':
                subtract_sql = subtract_sql.strip()[:-1]
            #replace db table names with corresponding data frame name
            subtract_sql = util.replaceTableWithDF(subtract_sql, cnv_ds, 1)
            #enclose modified sql in triple quote
            subtract_sql = '"""' + subtract_sql + '"""'
            #[TODO]check if modified statement using variable

            #create python dictionary with key as column to be updated and value as update value
            upd_col_dict = '{'
            #split set section of update statement to get each column assignment
            set_fields = util.newSplit(upd_set_str, ',')
            #for each column assignment get update column and updating value
            for field in set_fields:
                side = field.split('=')
                side[0] = side[0].strip()
                side[1] = side[1].strip()
                upd_col_dict += f'"{side[0]}":"{side[1]} as {side[0]}",\n'
            upd_col_dict = upd_col_dict[:-2] + '}'

            #pyspark code for update value select statement
            cnv_code += f"df_col_list = mod_df['{upd_table_df}'].columns\n"
            cnv_code += "df_col_list_str = ','.join(df_col_list)\n"
            cnv_code += f"upd_col_dict = {upd_col_dict}\n\n"
            cnv_code += "for col in upd_col_dict.keys():\n"
            cnv_code += "\tdf_col_list_str = re.sub(col,upd_col_dict[col],df_col_list_str, flags=re.I)\n\n"
            #final update value select statement
            update_sql = f'"select " + df_col_list_str + """\nfrom {upd_table}' + (
                f'\nwhere {upd_where_str}"""' if upd_where_str else '')

            #remove semi-colon at the end
            if update_sql[-4:] == ';"""':
                update_sql = update_sql[:-4] + '"""'

        #replace db table names with corresponding data frame name
        update_sql = util.replaceTableWithDF(update_sql, cnv_ds, 1)
        #[TODO] check if modified statement using variable

        #pyspark code to update records in dataframe
        cnv_code += f"{upd_table_df_tmp_1} = spark.sql({subtract_sql})\n\n"
        cnv_code += f"{upd_table_df_tmp_2} = spark.sql({update_sql})\n\n"
        cnv_code += f"mod_df[\'{upd_table_df}\'] = mod_df[\'{upd_table_df}\'].subtract({upd_table_df_tmp_1}).union({upd_table_df_tmp_2})\n"
        cnv_code += f"mod_df[\'{upd_table_df}\'].createOrReplaceTempView('{upd_table_df}')\n\n"

        return cnv_code, cnv_log
Beispiel #6
0
    def toPyspark(self, cnv_ds):
        try:
            lexer = TDantlrLexer(InputStream(self.text))
            stream = CommonTokenStream(lexer)
            parser = TDantlrParser(stream)
            parser.addErrorListener(TDErrorListener())
            tree = parser.start()
            psp = PySparkParse(stream)
            walker = ParseTreeWalker()
            walker.walk(psp, tree)
            stmt = psp.out_sql
        except Exception as e:
            stmt = self.simpleParse() + ';'

        cnv_code = ''
        cnv_log = ''

        #get table name where records will be inserted
        table = re.match(r'\bINS(?:ERT)?\s+INTO\s+(\w*\.?\w+)', stmt,
                         re.S | re.I).group(1).strip()
        #get table df name
        table_df = cnv_ds.table_df_map[table]

        #check if insert statement with select
        if re.match(r'\bINSERT\s+INTO(.*?)\(?.*?\)?\s+\(?\bSEL(?:ECT)?\b',
                    stmt, re.S | re.I):

            #check if select statement has target table column list specified
            if re.search(table + r'\s+\(.*?\)[\s\(]+\bSEL(?:ECT)?\b', stmt,
                         re.S | re.I):

                #check if select is in circular bracket
                if re.search(r'(.)\s*SEL(?:ECT)?\b', stmt,
                             re.S | re.I).group(1) == '(':
                    #replace starting bracker with spaecial string
                    stmt = re.sub(r'\((\s*)(SEL(?:ECT)?\b)',
                                  r'‹\1\2',
                                  stmt,
                                  1,
                                  flags=re.S | re.I)

                #replace FROM keyword with spaecial character
                stmt = re.sub(r"\bFROM\b", 'ƒ', stmt, flags=re.S | re.I)
                #custom split insert statement
                stmt_part = util.newSplit(stmt, 'ƒ')

                #first part of the split
                stmt_part_1 = re.sub(r'ƒ',
                                     'from',
                                     re.sub(r'‹',
                                            r'(',
                                            stmt_part[0],
                                            flags=re.S | re.I),
                                     flags=re.S | re.I)
                #second part of the split
                stmt_part_2 = re.sub(r'ƒ',
                                     'from',
                                     re.sub(r'‹',
                                            r'(',
                                            stmt_part[1],
                                            flags=re.S | re.I),
                                     flags=re.S | re.I)

                #get target table column list
                col_list_str = re.search(
                    table + r'\s+\((.*?)\)[\(\s]*\bSEL(?:ECT)?\b', stmt_part_1,
                    re.S | re.I).group(1)
                col_list = util.newSplit(col_list_str, ',')

                #get target insert values
                val_list_str = re.search(r'\bSEL(?:ECT)?\b\s+(.*)',
                                         stmt_part_1, re.S | re.I).group(1)
                val_list = util.newSplit(val_list_str, ',')

                #create select column list
                cnct_str = ''
                for i in range(len(col_list)):
                    col = col_list[i].strip()
                    val = val_list[i].strip()

                    #remove alias from column value
                    if re.match(r'^\bCASE\b', val, re.S | re.I):
                        #ignore END keyword as alias for column value with CASE statement
                        val = re.sub(r'(?<=END)\s*(AS)?[ \t]+\w+$',
                                     '',
                                     val,
                                     flags=re.S | re.I)
                    else:
                        val = re.sub(r'(\s*AS)?[ \t]+[a-zA-Z]\w+$',
                                     '',
                                     val,
                                     flags=re.S | re.I)

                    cnct_str += val + ' as ' + col + ',\n'

                cnct_str = cnct_str[:-2]
                select_str = 'select ' + cnct_str + "\nfrom " + stmt_part_2

            else:
                #get select statement in insert into in case of target table column list not specified
                select_str = re.search(r'([\(\s]*\bSEL(?:ECT)?\b.*)', stmt,
                                       re.S | re.I).group(1)

            #replace table with dataframe
            select_str = util.replaceTableWithDF(select_str, cnv_ds, 1)

            #remove semi-colon at the end
            if select_str.strip()[-1] == ';':
                select_str = select_str.strip()[:-1]

                #enclose modified sql in triple quote
            select_str = '"""' + select_str + '"""'

            #[TODO] check if modified statement using variable

        #check if insert statement with vlaues
        if re.search(r'INSERT\s+INTO\s+\w*\.?\w.*?\bVALUES\s*\(', stmt,
                     re.S | re.I):
            #get tablename, column and value list
            match_grp = re.match(
                r'INSERT\s+INTO\s+(.*?)\s+\(?(.*?)\)?\s*VALUES\s*\((.*?)\)\s*;$',
                stmt, re.S | re.I)
            table = match_grp.group(1).strip()
            cols = match_grp.group(2).strip()
            vals = match_grp.group(3).strip()

            #if column list present then create python list
            if cols:
                cols = re.sub(r'\s', '', cols, flags=re.S | re.I)
                cols = re.sub(r',', '","', cols, flags=re.S | re.I)
                cols = '"' + cols + '"'
                cnv_code += "ins_cols = [" + cols + "]\n"

            else:
                #pyspark code to get column list from df
                cnv_code += "ins_cols = " + table_df + ".columns\n"

            #remove colon and new line from column value string
            vals = re.sub(r':', '', vals, flags=re.S | re.I)
            vals = re.sub(r'[\r\n]+', '', vals, flags=re.S | re.I)

            #custom split column value string
            val_list = util.newSplit(vals, ',')

            #create python list for column values
            val_str = ''
            for val in val_list:
                val = val.strip()
                #if column value is in single quote
                if re.search(r"^'", val):
                    val_str += '"' + val + '",'
                #if column value is function or CASE statement
                elif re.search(r'\w+\s*\(|^CASE\s+', val, re.S | re.I):
                    vals = re.sub(r'\s+', ' ', vals, flags=re.S | re.I)
                    val_str += '"' + val + '",'
                else:
                    val_str += val + ','
            val_str = val_str[:-1]

            #pyspark code to create select string
            cnv_code += "ins_vals = [" + val_str + "]\n"
            cnv_code += "col_val_str = ''\n"
            cnv_code += "for idx in (range(len(ins_cols))):\n"
            cnv_code += "\tcol_val_str = col_val_str + str(ins_vals[idx]) + ' as ' + ins_cols[idx] + ','\n"
            cnv_code += "else:\n"
            cnv_code += "\tcol_val_str = col_val_str[:-1]\n"
            #final select statement
            select_str = "'select ' + col_val_str"

        #if table present in union check list
        if table in cnv_ds.union_chklist:
            #create temporary data frame
            table_df_tmp = table_df + '_1'

            #pyspark code to load data in temporary data frame
            cnv_code += f"#Create temporary dataframe with records to be inserted into {table}\n"
            cnv_code += f"{table_df_tmp} = spark.sql({select_str})\n"

            #pyspark code to insert data into db table dataframe from temorary data frame
            cnv_code += f"#Load records into {table} dataframe \n"
            cnv_code += f"mod_df[\'{table_df}\'] = mod_df[\'{table_df}\'].union({table_df_tmp})\n"
            cnv_code += f"mod_df[\'{table_df}\'].createOrReplaceTempView('{table_df}')\n\n"
        else:
            #add table in union check list
            cnv_ds.union_chklist.append(table)
            #pyspark code to load data into db table dataframe
            cnv_code += f"#Load records into {table} dataframe \n"
            cnv_code += f"mod_df[\'{table_df}\'] = spark.sql({select_str})\n"
            cnv_code += f"mod_df[\'{table_df}\'].createOrReplaceTempView('{table_df}')\n\n"

        return cnv_code, cnv_log
Beispiel #7
0
    def loadStatements(self):
        """Split the script content into individual statement objects.

        Pre-processes ``self.text`` (escaping the LOAD/BLOCK keywords),
        runs it through the ANTLR T-SQL parser, splits the parsed output
        on semi-colons, and wraps each statement in the matching
        ``sss.*`` statement class.

        Returns:
            list: statement objects (sss.SPheader, sss.Declare, ...) in
            script order; unsupported statements are logged and skipped.

        Raises:
            Re-raises any exception from the ANTLR parse after logging it.
        """
        statement_objs = []

        self.cntx.logger.add_log(
            'INFO',
            'Starting process to split script content to individual statements.'
        )

        #Antlr Parse
        try:
            # BUGFIX: re.sub's 4th positional argument is *count*, not
            # flags — the original passed re.S | re.I positionally, which
            # capped substitutions at 18 and dropped case-insensitivity.
            script_text = re.sub(r'(\bLOAD\b|\bBLOCK\b)',
                                 r'__\1__',
                                 self.text,
                                 flags=re.S | re.I)
            lexer = FullTSqlAntlrLexer(InputStream(script_text))
            stream = CommonTokenStream(lexer)
            parser = FullTSqlAntlrParser(stream)
            tree = parser.tsql_file()
            conv = TSqlScriptParse(stream)
            walker = ParseTreeWalker()
            walker.walk(conv, tree)
        except Exception as e:
            self.cntx.logger.add_log('ERROR', 'Failed to parse script content')
            self.cntx.logger.add_log_details(e.__str__())
            self.cntx.logger.add_log(
                'WARN',
                'Using unparsed script content. Result may be inconsistent.')
            raise
        else:
            self.cntx.logger.add_log('INFO', 'File content parse completed.')
            script_text = conv.out_script
            self.upd_tbl_alias = conv.upd_tbl_alias

        #split file content
        self.cntx.logger.add_log(
            'INFO',
            'Splitting script content into individual statements based on semi-colon.'
        )
        script_statements = util.newSplit(script_text, ';')

        #ordered (pattern, statement class) dispatch table — first match wins,
        #so more specific patterns (e.g. SET @var) must precede generic ones.
        dispatch = [
            (r'^(?:\bCREATE\b|\bALTER\b)\s+\bPROCEDURE\b', sss.SPheader),
            (r'^DECLARE\s+@', sss.Declare),
            (r'^(?:\bSET|\bSELECT)\s+@\w+\s*=', sss.SetVar),
            (r'^DECLARE\s+\w+\s+CURSOR|OPEN\s+\w+|^FETCH\s+\w+\s+FROM|^CLOSE\s+\w+',
             sss.Cursor),
            (r'^(BEGIN|END)\s+(TRY|CATCH)', sss.ErrorHadling),
            (r'^IF\s*\(?|^END IF', sss.IfCondition),
            (r'^WHILE\s*\(?|^END WHILE', sss.WhileLoop),
            (r'^SELECT\s+|^WITH\s+', sss.Select),
            (r'^INSERT\s+(?:INTO\s+)?', sss.Insert),
            (r'^UPDATE\s+', sss.Update),
            (r'^DELETE\s+|^TRUNCATE\s+TABLE\s', sss.Delete),
            (r'^MERGE\s+', sss.Merge),
            #BUGFIX: original pattern '^CALL \s+' had a stray literal space,
            #so 'CALL proc' with a single space never matched.
            (r'^EXEC(?:UTE)?\s+|^CALL\s+', sss.Execute),
        ]

        for stmt in script_statements:
            stmt = stmt.strip()

            if not stmt:
                continue

            for pattern, stmt_cls in dispatch:
                if re.search(pattern, stmt, re.S | re.I):
                    statement_objs.append(stmt_cls(stmt))
                    break
            else:
                #no pattern matched — record the unsupported statement
                self.cntx.logger.add_log('WARN',
                                         'Statement Not Supported.')
                self.cntx.logger.add_log_details(stmt)

        return statement_objs