def build_column_dict(data_frame, sql):
    """
    Build a dictionary with key column position and value the list of
    related items (predicate and objects).

    Columns are declared in the query through annotation lines of the form
    ``--JOIN <table>.<column>``; the words may be separated (and the line
    indented) by any run of whitespace. A line that starts with ``--JOIN``
    but carries no ``<table>.<column>`` token (for instance an ordinary SQL
    comment) is ignored instead of raising ``IndexError``.

    :param data_frame: The Pandas dataframe.
    :param sql: The sql query.
    :return: The column dictionary. Only positions for which at least one
             item was found appear as keys.
    """
    declare_token = '--JOIN'

    # First pass: remember which (table, column) was declared for each
    # position reported by column_position_in_dataframe.
    declared = dict()
    for line in sql.splitlines():
        words = line.split()
        if len(words) < 2 or words[0] != declare_token:
            continue
        parts = words[1].split('.')
        if len(parts) < 2:
            # Not a "<table>.<column>" reference: nothing to look up.
            continue
        table_name = parts[0]
        column_name = parts[1]
        column_description = get_column_description(table_name, column_name)
        index = column_position_in_dataframe(column_description, data_frame)
        declared[index] = {'table': table_name, 'column': column_name}

    # Second pass: walk the positions in data frame order so the result
    # keys follow the column order.
    col_dict = dict()
    for position, _ in enumerate(data_frame.columns):
        if position not in declared:
            continue
        table_name = declared[position]['table']
        column_name = declared[position]['column']
        for row in get_metadata_on_column(table_name, column_name):
            t_predicate = "%s" % row[3]
            t_object = "%s" % row[4]
            if '#' not in t_predicate or '#' not in t_object:
                # Only values carrying a '#' fragment are kept.
                continue
            # Keep the part that follows the first '#'.
            item = Item(t_predicate.split('#')[1].strip(),
                        t_object.split('#')[1].strip())
            col_dict.setdefault(position, []).append(item)
    return col_dict
def reconciles_data_frame(df, sql):
    """
    Replace the descriptions stored in the data frame with the matching urls.

    NOTE: for now this function works only on un-pivoted, plain data frames.
    The data frame is updated in place and the same object is returned.

    :param df: Data frame.
    :param sql: The query sql code.
    :return: Reconciled Data frame.
    """
    special = detect_special_columns(sql)
    foreign_keys_by_table = dict()
    url_by_code_for = dict()
    code_by_desc_for = dict()

    # Collect, per column description, the mappings needed for the rewrite.
    for key in special.cols:
        info = special.cols[key]
        column = info['column']
        table = info['table']
        description = get_column_description(table, column)
        if description not in df.columns:
            # It is not used in the query.
            continue

        # Foreign keys are built once per table and reused afterwards.
        if table not in foreign_keys_by_table:
            foreign_keys_by_table[table] = build_foreign_keys(table)
        foreign_keys = foreign_keys_by_table[table]
        if column not in foreign_keys:
            continue

        foreign_key = foreign_keys[column]
        url_by_code = build_code_to_url_mapping(foreign_key)
        if len(url_by_code) == 0:
            # It contains no reconciliation rows.
            continue
        url_by_code_for[description] = url_by_code
        code_by_desc_for[description] = build_desc_to_code_mapping(foreign_key)

    # Rewrite every cell whose description maps to a code.
    for col_name in df.columns:
        if col_name is None or col_name not in url_by_code_for:
            continue
        url_by_code = url_by_code_for[col_name]
        code_by_desc = code_by_desc_for[col_name]
        col_position = df.columns.get_loc(col_name)
        for row_position, cell in enumerate(df[col_name]):
            if cell not in code_by_desc:
                continue
            df.iloc[row_position, col_position] = url_by_code[code_by_desc[cell]]
    return df