def flatten_subquery(self, final_list, sub_queries, level_num): for q in sub_queries: for alias, query in q.items(): formatter = column_parser.Parser(query) formatted_query = formatter.format_query(query) unbundled = unbundle.Unbundle(formatted_query) query_dict = {} if unbundled.has_child(query): if alias == 'no alias' or alias == '' or alias == 'query': query_dict, sub_queries = unbundled.restructure_subquery( query_dict, 'level_{}_main'.format(level_num), formatted_query) else: query_dict, sub_queries = unbundled.restructure_subquery( query_dict, alias, formatted_query) if query_dict != {}: final_list.append(query_dict) for subq in sub_queries: for _, sub_query in subq.items(): if not unbundled.has_child(sub_query): final_list.append(subq) sub_queries.remove(subq) return final_list, sub_queries
def delevel(self, query): formatter = column_parser.Parser(query) formatted_query = formatter.format_query(query) query_list = formatted_query.split('\n') main_query, copy_query_list = self.get_sub_query(query_list) main_query, sub_queries = self.separator(copy_query_list, main_query) return main_query, sub_queries
def parse_query(self, raw_query): formatter = column_parser.Parser(raw_query) formatted_query = formatter.format_query(raw_query) if 'WITH' in formatted_query: if formatted_query.startswith('WITH'): cte_dict = formatter.parse_cte(formatted_query) unbundled = unbundle.Unbundle(formatted_query) final_list = self.flatten_cte_nested(unbundled, cte_dict) else: sub_query_list = self.flatten_pure_nested(formatted_query) final_list = [] for q in sub_query_list: for alias, query in q.items(): if 'WITH' in query: formatter = column_parser.Parser(query) formatted_query = formatter.format_query(query) cte_dict = formatter.parse_cte(formatted_query) unbundled = unbundle.Unbundle(formatted_query) cte_list = self.flatten_cte_nested( unbundled, cte_dict) final_list.append({alias: cte_list}) else: final_list.append(q) else: final_list = self.flatten_pure_nested(raw_query) return final_list
def formatter(sample_query): formatter = column_parser.Parser(sample_query) return formatter
def extract_subquery_fields(query, db_fields): formatter = column_parser.Parser(query) formatted = formatter.format_query(query) fields = formatter.match_queried_fields(formatted, db_fields) return fields
def main(query): formatter = column_parser.Parser(query) formatted_query = formatter.format_query(query) query_list_0 = formatted_query.split('\n') query_dict = {} sub_query = delevel(query_list_0) query_dict = sub_query for alias, query in sub_query.items(): formatter = column_parser.Parser(query) formatted_query = formatter.format_query(query) query_list = formatted_query.split('\n') if has_child(formatted_query) and alias != 'main': sub_query_dict = delevel(query_list) query_dict[alias] = sub_query_dict query_dict = clean_dict(query_dict) # for alias2, query2 in sub_query_dict.items(): # formatter2 = column_parser.Parser(query2) # formatted_query2 = formatter2.format_query(query2) # query_list2 = formatted_query2.split('\n') # if has_child(formatted_query2) and alias2 != 'main': # sub_query_dict2 = delevel(query_list2) # sub_query_dict[alias2] = sub_query_dict2 # for alias3, query3 in sub_query_dict2.items(): # formatter3 = column_parser.Parser(query3) # formatted_query3 = formatter3.format_query(query3) # query_list3 = formatted_query3.split('\n') # try: # if has_child(formatted_query3) and alias3 != 'main': # sub_query_dict3 = delevel(query_list3) # sub_query_dict2[alias3] = sub_query_dict3 # for alias4, query4 in sub_query_dict3.items(): # formatter4 = column_parser.Parser(query4) # formatted_query4 = formatter4.format_query(query4) # query_list4 = formatted_query4.split('\n') # # try: # # if has_child(formatted_query4) and alias4 != 'main': # # sub_query_dict4 = delevel(query_list4) # # sub_query_dict3[alias4] = sub_query_dict4 # # else: # # pass # # except: # # pass # query_dict[alias][alias2][alias3] = sub_query_dict3 # query_dict = clean_dict(query_dict) # else: # pass # except: # pass # query_dict[alias][alias2] = sub_query_dict2 # query_dict = clean_dict(query_dict) # else: # pass # query_dict[alias] = sub_query_dict # query_dict = clean_dict(query_dict) else: pass return query_dict
# else: # l_path[k + '_' + k1 + '_' + k2] = v2 # else: # l_path[k + '_' + k1] = v1 # else: # l_path[k] = v # return l_path if __name__ == '__main__': # query = open('query.sql').read() # query = open('long_query.sql').read() #### BUG: nested was not detected #### query = open('test_query.sql').read() formatter = column_parser.Parser(query) formatted_query = formatter.format_query(query) query_list = formatted_query.split('\n') if is_cte(formatted_query): cte_dict = formatter.parse_cte(formatted_query) final_dict = {} for alias, query in cte_dict.items(): formatter = column_parser.Parser(query) formatted_query = formatter.format_query(query) try: final_dict[alias] = main(formatted_query) except: final_dict[alias] = formatted_query with open('data.json', 'w') as outfile:
if isinstance(sub_sub_query, str): sub_list.append(sub_sub_query) preprocess_list.extend(sub_list) return preprocess_list if __name__ == '__main__': t0 = time.perf_counter() raw_query = open('queries/{}.sql'.format(sys.argv[1])).read() # formatting and analyze the query structure formatter = column_parser.Parser(raw_query) formatted = formatter.format_query(raw_query) analyzer = query_analyzer.Analyzer(formatted) query_dict = analyzer.parse_query(formatted) # preprocess query breakdowns preprocess_list = unnest_query_list(query_dict) # getting metastore db_fields_1 = pd.DataFrame({ 'db_table': 'sfdc.opportunity_product', 'all_columns': [ 'actual_quantity_c', 'annual_list_price_value_c', 'annual_product_value_c', 'annual_recurring_revenue_c', 'contract_is_12_months_or_more_c', 'created_by_id', 'created_date',