Esempio n. 1
0
def split_table_by_col_distinct(db, tbl, col):
    """
    Create a new table for each value in one column
    """
    
    from gasp.sql.fm import q_to_obj
    from gasp.sql.i  import cols_type
    from gasp.sql.to import q_to_ntbl
    
    fields_types = cols_type(db, tbl)
    
    # Get unique values
    VALUES = q_to_obj(db,
        "SELECT {col} FROM {t} GROUP BY {col}".format(
            col=col, t=tbl
        ), db_api='psql'
    )[col].tolist()
    
    whr = '{}=\'{}\'' if fields_types[col] == str else '{}={}'
    
    for row in VALUES:
        q_to_ntbl(
            db, '{}_{}'.format(tbl, str(row[0])),
            "SELECT * FROM {} WHERE {}".format(
                tbl, whr.format(col, str(row[0]))
        ), api='psql')
Esempio n. 2
0
def split_table_entity_number(db, table, entity_field, entity_number):
    """
    Split tables in several using as reference a number of entities per table
    
    If a table has 1 000 000 entities and the entity_number is 250 000,
    this method will create four tables, each one with 250 000 entities.
    250 000 entities, not rows. Don't forget that the main table may have
    more than one reference to the same entity.
    """
    
    import pandas
    from gasp.sql.fm import q_to_obj
    from gasp.sql.i  import cols_type
    from gasp.sql.to import q_to_ntbl
    
    # Select entities in table
    entities = q_to_obj(db, "SELECT {c} FROM {t} GROUP BY {c}".format(
        c=entity_field, t=table
    ), db_api='psql')
    
    # Split entities into groups acoording entity_number
    entityGroup = []
    
    lower = 0
    high = entity_number
    while lower <= len(entities.index):
        if high > len(entities.index):
            high = len(entities.index)
        
        entityGroup.append(entities.iloc[lower : high])
        
        lower += entity_number
        high  += entity_number
    
    # For each dataframe, create a new table
    COLS_TYPE = cols_type(db, table)
    
    c = 0
    for df in entityGroup:
        if COLS_TYPE[entity_field] != str:
            df[entity_field] = '{}='.format(entity_field) + df[entity_field].astype(str)
        else:
            df[entity_field] = '{}=\''.format(entity_field) + df[entity_field].astype(str) + '\''
        
        whr = ' OR '.join(df[entity_field])
        
        q_to_ntbl(db, '{}_{}'.format(table, str(c)), (
            "SELECT * FROM {} WHERE {}"
        ).format(table, whr), api='psql')
        
        c += 1
Esempio n. 3
0
def txt_cols_to_col(db, inTable, columns, strSep, newCol, outTable=None):
    """
    Several text columns to a single column
    """

    from gasp.pyt import obj_to_lst
    from gasp.sql.i import cols_type

    mergeCols = obj_to_lst(columns)

    tblCols = cols_type(db, inTable, sanitizeColName=None, pyType=False)

    for col in mergeCols:
        if tblCols[col] != 'text' and tblCols[col] != 'varchar':
            raise ValueError('{} should be of type text'.format(col))

    coalesce = ""
    for i in range(len(mergeCols)):
        if not i:
            coalesce += "COALESCE({}, '')".format(mergeCols[i])

        else:
            coalesce += " || '{}' || COALESCE({}, '')".format(
                strSep, mergeCols[i])

    if outTable:
        # Write new table
        colsToSelect = [_c for _c in tblCols if _c not in mergeCols]

        if not colsToSelect:
            sel = coalesce + " AS {}".format(newCol)
        else:
            sel = "{}, {}".format(", ".join(colsToSelect),
                                  coalesce + " AS {}".format(newCol))

        q_to_ntbl(db,
                  outTable,
                  "SELECT {} FROM {}".format(sel, inTable),
                  api='psql')

        return outTable

    else:
        # Add column to inTable
        from gasp.sql.tbl import update_table

        add_field(db, inTable, {newCol: 'text'})

        update_table(db, inTable, {newCol: coalesce})

        return inTable
Esempio n. 4
0
def trim_char_in_col(db,
                     pgtable,
                     cols,
                     trim_str,
                     outTable,
                     onlyTrailing=None,
                     onlyLeading=None):
    """
    Python implementation of the TRIM PSQL Function
    
    The PostgreSQL trim function is used to remove spaces or set of
    characters from the leading or trailing or both side from a string.
    """

    from gasp.pyt import obj_to_lst
    from gasp.sql.i import cols_type

    cols = obj_to_lst(cols)

    colsTypes = cols_type(db, pgtable, sanitizeColName=None, pyType=False)

    for col in cols:
        if colsTypes[col] != 'text' and colsTypes[col] != 'varchar':
            raise ValueError('{} should be of type text'.format(col))

    colsToSelect = [_c for _c in colsTypes if _c not in cols]

    tail_lead_str = "" if not onlyTrailing and not onlyLeading else \
        "TRAILING " if onlyTrailing and not onlyLeading else \
        "LEADING " if not onlyTrailing and onlyLeading else ""

    trimCols = [
        "TRIM({tol}{char} FROM {c}) AS {c}".format(c=col,
                                                   tol=tail_lead_str,
                                                   char=trim_str)
        for col in cols
    ]

    if not colsToSelect:
        cols_to_select = "{}".format(", ".join(trimCols))
    else:
        cols_to_select = "{}, {}".format(", ".join(colsToSelect),
                                         ", ".join(trimCols))

    q_to_ntbl(db,
              outTable,
              "SELECT {} FROM {}".format(colsToSelect, pgtable),
              api='psql')
Esempio n. 5
0
def replace_char_in_col(db, pgtable, cols, match_str, replace_str, outTable):
    """
    Replace char in all columns in cols for the value of replace_str
    
    Python implementation of the REPLACE PSQL Function
    """

    from gasp.pyt import obj_to_lst
    from gasp.sql.i import cols_type

    cols = obj_to_lst(cols)

    colsTypes = cols_type(db, pgtable, sanitizeColName=None, pyType=False)

    for col in cols:
        if colsTypes[col] != 'text' and colsTypes[col] != 'varchar':
            raise ValueError('{} should be of type text'.format(col))

    colsToSelect = [_c for _c in colsTypes if _c not in cols]

    colsReplace = [
        "REPLACE({c}, '{char}', '{nchar}') AS {c}".format(c=col,
                                                          char=match_str,
                                                          nchar=replace_str)
        for col in cols
    ]

    if not colsToSelect:
        cols_to_select = "{}".format(", ".join(colsReplace))
    else:
        cols_to_select = "{}, {}".format(", ".join(colsToSelect),
                                         ", ".join(colsReplace))

    q_to_ntbl(db,
              outTable,
              "SELECT {cols} FROM {tbl}".format(cols=cols_to_select,
                                                tbl=pgtable),
              api='psql')

    return outTable
Esempio n. 6
0
File: ref.py Progetto: jasp382/gasp
def select_using_excel_refs(db_name,
                            excel_file,
                            sheet_name,
                            pgtable,
                            ref_fields,
                            tableInRef,
                            tableOutRef=None):
    """
    Split PGTABLE using references in excel table
    
    Create two tables:
    * One with similar rows - columns combination are in excel table;
    * One with rows not in excel table.
    
    TODO: Check if it's works. 
    """

    from gasp.fm import tbl_to_obj
    from gasp.sql.i import cols_type
    from gasp.sql.to import q_to_ntbl

    def to_and(row, cols, ctype):
        def get_equal(_type):
            return '{}=\'{}\'' if _type == str else '{}={}'

        row['AND_E'] = ' AND '.join(
            get_equal(ctype[col]).format(col, row[col]) for col in cols)

        row['AND_E'] = '(' + row['AND_E'] + ')'

        return row

    # Get excel data
    table = tbl_to_obj(excel_file, sheet=sheet_name)

    # Get reference fields type
    TYPE_COLS = cols_type(db_name, pgtable)

    table = table.apply(lambda x: to_and(x, ref_fields, TYPE_COLS))

    whr_equal = ' OR '.join(table['AND_E'])

    q_to_ntbl(db_name,
              tableInRef,
              "SELECT * FROM {} WHERE {}".format(pgtable, whr_equal),
              api='psql')

    if tableOutRef:
        COLS_RELATION = " AND ".join([
            "{ft}.{f} = {st}.{f}".format(ft=pgtable, f=col, st=tableInRef)
            for col in TYPE_COLS
        ])

        q_to_ntbl(db_name,
                  tableOutRef,
                  ("SELECT {ft}.* FROM {ft} LEFT JOIN {st} ON "
                   "{rel} WHERE {st}.{c} IS NULL").format(
                       ft=pgtable,
                       st=tableInRef,
                       rel=COLS_RELATION,
                       c=TYPE_COLS.keys()[0]),
                  api='psql')
Esempio n. 7
0
File: time.py Progetto: jasp382/gasp
def ID_rows_with_temporal_proximity_by_entities(db, table, entity_field,
                                                day_field, hour_field,
                                                hour_decimal_field,
                                                time_tolerance, outXlsPath):
    """
    Retrieve rows from one pgtable with some temporal proximity
    
    Table structure should be
    entity |     day    | hour | hour_decimal
      0    | 2018-01-02 |  5   |   5,10
      0    | 2018-01-03 |  4   |   4,15
      0    | 2018-01-02 |  5   |   5,12
      0    | 2018-01-02 |  5   |   5,8
      1    | 2018-01-02 |  4   |   4,10
      1    | 2018-01-02 |  5   |   5,12
      1    | 2018-01-02 |  4   |   4,20
      1    | 2018-01-02 |  4   |   4,12
      1    | 2018-01-02 |  4   |   4,6
    
    For a time_tolerance of 5 minutes, the output table will have
    the rows with a temporal difference inside/bellow that time tolerance
    
    entity_field could be more than one field
    
    This method only identifies if one entity, for one day, has rows 
    very close of each others, in terms of time.
    
    Not a good strategy for large tables. For large tables, SQL based methods
    are needed
    """

    import pandas
    from gasp.pyt import obj_to_lst
    from gasp.sql.fm import q_to_obj
    from gasp.sql.i import cols_type
    from gasp.to import obj_to_tbl

    entity_field = obj_to_lst(entity_field)
    COLS = entity_field + [day_field, hour_field]
    COLS_TYPE = cols_type(db, table)

    # TIME TOLERANCE IN HOURS
    TIME_TOLERANCE = time_tolerance / 60.0

    def thereIsRowsSameTimeInt(row):
        whr = []
        for c in COLS:
            if COLS_TYPE[c] == str:
                whr.append("{}='{}'".format(c, row[c]))
            else:
                whr.append("{}={}".format(c, row[c]))

        hourRows = q_to_obj(db,
                            "SELECT {} FROM {} WHERE {}".format(
                                hour_decimal_field, table, " AND ".join(whr)),
                            db_api='psql')[hour_decimal_field].tolist()

        for i in range(len(hourRows)):
            for e in range(i + 1, len(hourRows)):
                dif = abs(hourRows[i][0] - hourRows[e][0])

                if dif < TIME_TOLERANCE:
                    break

            if dif < TIME_TOLERANCE:
                break

        if dif < TIME_TOLERANCE:
            row['time_difference'] = 1
        else:
            row['time_difference'] = 0

        return row

    # Count entity occourrences for one day and hour
    countsByEntityTime = q_to_obj(
        db,
        ("SELECT {scols}, conta FROM "
         "(SELECT {scols}, COUNT({ent}) AS conta FROM {tbl} "
         "GROUP BY {scols}) AS foo WHERE conta > 1").format(
             scols=', '.join(COLS), ent=entity_field[0], tbl=table),
        db_api='psql')

    # For each row in the last count, When count is > 1
    # Check time difference between rows for one day and hour
    countsByEntityTime = countsByEntityTime.apply(
        lambda x: thereIsRowsSameTimeInt(x), axis=1)

    obj_to_tbl(countsByEntityTime, outXlsPath)

    return outXlsPath
Esempio n. 8
0
def run_query_for_values_in_col(db, query,
                               table_interest_col, interest_col,
                               outworkspace):
    """
    Execute a query for each value in one column
    In each iteration, the values may participate in the query.
    
    Export the several tables to excel
    
    Example:
    ID_PERCURSO | PARAGEM |    DIA     | GEOM
        0       |   255   |'2018-01-01 | xxxx
        0       |   255   |'2018-01-01 | xxxx
        0       |   254   |'2018-01-01 | xxxx
        0       |   254   |'2018-01-01 | xxxx
        0       |   255   |'2018-01-02 | xxxx
        0       |   255   |'2018-01-02 | xxxx
        0       |   254   |'2018-01-02 | xxxx
        0       |   254   |'2018-01-02 | xxxx
    
    For a query as:
    SELECT ID_PERCURSO, PARAGEM, GEOM, DIA, COUNT(PARAGEM) AS conta FROM
    table WHERE DIA={} GROUP BY PARAGEM, GEOM, DIA;
    
    This method will generate two tables:
    First table:
    ID_PERCURSO | PARAGEM |    DIA     | GEOM | conta
         0     |   255   |'2018-01-01 | xxxx |   2
         0     |   254   |'2018-01-01 | xxxx |   2
    
    Second table:
    ID_PERCURSO | PARAGEM |    DIA     | GEOM | conta
          0     |   255   |'2018-01-02 | xxxx |   2
          0     |   254   |'2018-01-02 | xxxx |   2
    
    {} will be replaced for every value in the interest_column that will
    be iterated one by one
    """
    
    import os
    from gasp.sql.fm import q_to_obj
    from gasp.sql.i  import cols_type
    from gasp.to     import obj_to_tbl
    
    fields_types = cols_type(db, table_interest_col)
    
    # Get  unique values
    VALUES = q_to_obj(db,
        "SELECT {col} FROM {t} GROUP BY {col}".format(
            col=interest_col, t=table_interest_col
        ), db_api='psql'
    )[interest_col].tolist()
    
    # Aplly query for every value in VALUES
    # Write data in excel
    for value in VALUES:
        data = q_to_obj(db, query.format(
            str(value[0]) if fields_types[interest_col] != str else \
            "'{}'".format(str(value[0]))
        ), db_api='psql')
        
        obj_to_tbl(data, os.path.join(outworkspace, '{}_{}.xlsx'.format(
            table_interest_col, str(value[0])
        )))