Beispiel #1
0
def build_column_dict(data_frame, sql):
    """
    Build a dictionary with key column position and value the list of
    related items (predicate and objects).

    The query is scanned for comment lines of the form
    ``--JOIN table.column``. Each declared column is located in the data
    frame and its metadata rows are turned into ``Item`` instances.
    Comment lines that start with ``--JOIN`` but do not carry a
    ``table.column`` argument are ignored.

    :param data_frame: The Pandas dataframe.
    :param sql: The sql query.
    :return: The column dictionary, mapping a column position to a list of
             ``Item``; positions without any usable metadata row are absent.
    """
    declare_token = '--JOIN'

    # First pass: collect the table/column declared for each position.
    declared = dict()
    for line in sql.splitlines():
        # Whitespace-agnostic split: tolerates tabs and repeated spaces
        # around the token and its argument.
        words = line.split()
        if len(words) < 2 or words[0] != declare_token:
            continue
        qualified = words[1].split('.')
        if len(qualified) < 2:
            # Not in ``table.column`` form, so it cannot be resolved.
            continue
        table_name, column_name = qualified[0], qualified[1]
        column_description = get_column_description(table_name,
                                                    column_name)
        index = column_position_in_dataframe(column_description,
                                             data_frame)
        declared[index] = {'table': table_name, 'column': column_name}

    # Second pass: walk the actual column positions in order, so that only
    # declarations resolved to an existing position are considered.
    col_dict = dict()
    for position in range(len(data_frame.columns)):
        if position not in declared:
            continue
        target = declared[position]
        rows = get_metadata_on_column(target['table'], target['column'])
        for row in rows:
            t_predicate = "%s" % row[3]
            t_object = "%s" % row[4]
            # Both values must contain a '#'; the part right after the
            # first '#' is the one that is kept.
            if '#' not in t_predicate or '#' not in t_object:
                continue
            t_predicate = t_predicate.split('#')[1]
            t_object = t_object.split('#')[1]

            item = Item(t_predicate.strip(), t_object.strip())
            col_dict.setdefault(position, []).append(item)

    return col_dict
Beispiel #2
0
def reconciles_data_frame(df, sql):
    """
    Reconcile a data frame using URLs instead of descriptions.

    For every special column detected in the query that is present in the
    data frame and has a foreign key with reconciliation rows, each cell
    whose description maps to a code with a known URL is replaced by that
    URL. Cells without a matching description or URL are left untouched.

    NOTE: this function works only on un-pivoted, plain data frames.

    :param df: Data frame; it is modified in place.
    :param sql: The query sql code.
    :return: Reconciled Data frame (the same object received as ``df``).
    """
    st = detect_special_columns(sql)
    fks_by_table = dict()  # Foreign keys are built once per table.
    code_to_url_col = dict()
    desc_to_code_col = dict()

    for key in st.cols:
        value = st.cols[key]
        column = value['column']
        table = value['table']
        column_desc = get_column_description(table, column)
        if column_desc not in df.columns:
            # It is not used in the query.
            continue
        if table not in fks_by_table:
            fks_by_table[table] = build_foreign_keys(table)
        fks = fks_by_table[table]
        if column not in fks:
            continue
        fk = fks[column]
        code_to_url = build_code_to_url_mapping(fk)
        if len(code_to_url) == 0:
            # No reconciliation rows for this foreign key.
            continue
        code_to_url_col[column_desc] = code_to_url
        desc_to_code_col[column_desc] = build_desc_to_code_mapping(fk)

    for position, col_name in enumerate(df.columns):
        if col_name is None or col_name not in code_to_url_col:
            continue

        code_to_url = code_to_url_col[col_name]
        desc_to_code = desc_to_code_col[col_name]

        # Address the column by position rather than by label, so that
        # duplicated column labels are each handled on their own. Take a
        # snapshot of the values because the column is rewritten below.
        column_values = df.iloc[:, position].tolist()
        for row, value in enumerate(column_values):
            if value not in desc_to_code:
                continue
            code = desc_to_code[value]
            if code not in code_to_url:
                # Description known but no URL for its code: keep the cell.
                continue
            df.iloc[row, position] = code_to_url[code]

    return df