def query_and_decrypt_data(
    db: sqlalchemy.engine.base.Engine,
    env_aead: tink.aead.KmsEnvelopeAead,
    table_name: str,
) -> None:
    with db.connect() as conn:
        # Execute the query and fetch all results
        recent_votes = conn.execute(
            f"SELECT team, time_cast, voter_email FROM {table_name} "
            "ORDER BY time_cast DESC LIMIT 5").fetchall()

        print("Team\tEmail\tTime Cast")

        for row in recent_votes:
            team = row[0]

            # Postgres pads CHAR fields with spaces. These will need to be removed before
            # decrypting.
            aad = team.rstrip()

            # Use the envelope AEAD primitive to decrypt the email, using the team name as
            # associated data. Encryption with associated data ensures authenticity
            # (who the sender is) and integrity (the data has not been tampered with) of that
            # data, but not its secrecy. (see RFC 5116 for more info)
            email = env_aead.decrypt(row[2], aad.encode()).decode()
            time_cast = row[1]

            # Print recent votes
            print(f"{team}\t{email}\t{time_cast}")
Example 2
def usage_file_is_processed(file_name: str,
                            db_engine: sqlalchemy.engine.base.Engine) -> bool:
    """
    Verifies if the current file has been previously processed

    Parameters
    ----------
    file_name: str
        Name of the file
    db_engine: sqlalchemy.engine.base.Engine
        an Engine instance for creating database connections

    Returns
    -------
    bool
        True if the file has been processed, False if it has not.
    """

    _create_usage_table_if_it_does_not_exist(db_engine)

    query = f"select exists(select 1 from {USAGE_TABLE_NAME} where FILE_NAME='{file_name}') as 'exists'"

    db_item = {"exists": False}
    with db_engine.connect() as con:
        result: Union[ResultProxy, None] = con.execute(query)
        db_item = _map_single_result_to_dict(result)

    return db_item["exists"] == 1
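# _map_single_result_to_dict is not shown in this example; a hypothetical
# implementation, purely for illustration, could look like:
def _map_single_result_to_dict(result):
    # Take the first row of the ResultProxy (if any) and expose it as a dict
    row = result.fetchone() if result is not None else None
    return dict(row) if row is not None else {"exists": False}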
Example 3
def encrypt_and_insert_data(
    db: sqlalchemy.engine.base.Engine,
    env_aead: tink.aead.KmsEnvelopeAead,
    table_name: str,
    team: str,
    email: str,
) -> None:
    # Verify that the team is one of the allowed options before doing the
    # (comparatively expensive) KMS-backed encryption
    if team != "TABS" and team != "SPACES":
        logger.error(f"Invalid team specified: {team}")
        return

    time_cast = datetime.datetime.now(tz=datetime.timezone.utc)
    # Use the envelope AEAD primitive to encrypt the email, using the team name as
    # associated data. Encryption with associated data ensures authenticity
    # (who the sender is) and integrity (the data has not been tampered with) of that
    # data, but not its secrecy. (see RFC 5116 for more info)
    encrypted_email = env_aead.encrypt(email.encode(), team.encode())

    # Preparing a statement beforehand can help protect against injections.
    stmt = sqlalchemy.text(
        f"INSERT INTO {table_name} (time_cast, team, voter_email)"
        " VALUES (:time_cast, :team, :voter_email)"
    )

    # Using a with statement ensures that the connection is always released
    # back into the pool at the end of the statement (even if an error occurs)
    with db.connect() as conn:
        conn.execute(
            stmt,
            time_cast=time_cast,
            team=team,
            voter_email=encrypted_email)
    print(f"Vote successfully cast for '{team}' at time {time_cast}!")
Example 4
def _table_to_csv(engine: sqlalchemy.engine.base.Engine, table_name: str, file_path: str):
    # newline='' prevents the csv module from writing blank lines on Windows
    with open(file_path, 'w', newline='') as fh:
        outcsv = csv.writer(fh)

        with engine.connect() as con:
            records = con.execute(f"SELECT * FROM {table_name}")
            outcsv.writerow(records.keys())
            outcsv.writerows(records)
Example 5
def getUserById(id: int, engine: sa.engine.base.Engine) -> Optional[User]:
    # Bind id as a parameter (injection-safe) and use a context manager so
    # the connection is returned to the pool
    with engine.connect() as conn:
        rows = list(conn.execute(
            sa.text(f"select * from {User.__tablename__} where id = :id"),
            id=id))
    if len(rows) == 0:
        return None
    return User.from_row_to_obj(rows[0])
Example 6
def getUserByEmail(email: str, engine: sa.engine.base.Engine) -> Optional[User]:
    ## should validate the email format before querying
    with engine.connect() as conn:
        rows = list(conn.execute(
            sa.text(f"select * from {User.__tablename__} where email = :email"),
            email=email))
    if len(rows) == 0:
        return None
    return User.from_row_to_obj(rows[0])
Example 7
def create_tables(config: list, engine: sqlalchemy.engine.base.Engine):
    # Use a context manager so the connection is released even if a DDL fails
    with engine.connect() as con:
        for table in config:
            name = table.get('name')
            schema = table.get('schema')
            con.execute(f"DROP TABLE IF EXISTS {name}")
            con.execute(f"CREATE TABLE {name} ({schema})")
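# The function expects config to be a list of dicts with 'name' and 'schema'
# keys; a hypothetical example (table names and columns are assumptions):
example_config = [
    {"name": "users", "schema": "id INTEGER PRIMARY KEY, email TEXT"},
    {"name": "votes", "schema": "id INTEGER PRIMARY KEY, team TEXT"},
]
create_tables(example_config, engine)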
Example 8
def last_sync_date(
        sync_db: sqlalchemy.engine.base.Engine) -> Optional[datetime]:
    with sync_db.connect() as con:
        try:
            usage_df = read_sql("SELECT asOfDate FROM Usage", con)
            if usage_df["asOfDate"].count() == 0:
                return None
            return date_parse(usage_df["asOfDate"].max())  # type: ignore
        except OperationalError:
            logger.debug("No Usage table yet")
            return None
Example 9
    def build_from_engine(self, engine: sqlalchemy.engine.base.Engine) -> dict:

        self.setup(engine)

        # Create a connection to the database
        conn = engine.connect()

        # Record the time spent
        duration = {
            'querying': {},
        }

        # Sample each relation
        self.relations = {}
        sampling_method = {
            True: 'SYSTEM',
            False: 'BERNOULLI'
        }[self.block_sampling]
        for rel_name in self.rel_names:

            # Sample the relation if the number of rows is high enough
            query = 'SELECT * FROM {}'.format(rel_name)
            # Add a sampling statement if the sampling ratio is lower than 1
            sampling_ratio = max(self.sampling_ratio,
                                 self.min_rows / self.rel_cards[rel_name])
            if sampling_ratio < 1:
                # Make sure there won't be fewer samples than the minimum number of allowed rows
                query += ' TABLESAMPLE {} ({}) REPEATABLE ({})'.format(
                    sampling_method, sampling_ratio * 100, self.seed)
            date_atts = [
                att for att, typ in self.att_types[rel_name].items()
                if typ == 'date'
            ]
            tic = time.time()
            rel = pd.read_sql_query(sql=query, con=conn, parse_dates=date_atts)
            duration['querying'][rel_name] = time.time() - tic

            # Convert the datetimes to ISO formatted strings
            for att in date_atts:
                rel[att] = rel[att].map(lambda x: x.isoformat())

            # Strip the whitespace from the string columns
            for att in rel.columns:
                if rel[att].dtype == 'object':
                    rel[att] = rel[att].str.rstrip()

            # Store the relation
            self.relations[rel_name] = rel

        # Close the connection to the database
        conn.close()

        return duration
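# For reference, the sampling branch above emits standard PostgreSQL
# TABLESAMPLE syntax; with assumed illustrative values it renders as:
rel_name, method, ratio, seed = 'orders', 'BERNOULLI', 0.05, 42
print('SELECT * FROM {} TABLESAMPLE {} ({}) REPEATABLE ({})'.format(
    rel_name, method, ratio * 100, seed))
# -> SELECT * FROM orders TABLESAMPLE BERNOULLI (5.0) REPEATABLE (42)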
Example 10
def delete_table(engine: sa.engine.base.Engine) -> bool:
    # Use a context manager so the connection is always released
    with engine.connect() as conn:
        try:
            conn.execute(f"drop table {User.__tablename__}")
            return True
        except sa.exc.OperationalError as err:
            if f"no such table: {User.__tablename__}" in err._message():
                if engine.echo:
                    print(f"[!] Table '{User.__tablename__}' does not exist!")
                return False
            raise
Example 11
def temp_ids_con(engine: sa.engine.base.Engine, ids: set):
    """Create database connection that makes temp.ids available as single column temp table
    """

    with engine.connect() as con:
        rows = ", ".join([f"({sql_clause_format(id)})" for id in ids])
        queries = [
            "DROP TABLE IF EXISTS temp.ids",
            "CREATE TEMP TABLE temp.ids(id INTEGER)",
            f"INSERT INTO temp.ids(id) VALUES {rows}",
        ]
        for query in queries:
            con.execute(query)
        yield con
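# The yield suggests this function is meant to be used as a context manager;
# a hedged usage sketch (the temp.ids schema implies SQLite). Ordinarily the
# decorator would be applied at the definition site instead:
from contextlib import contextmanager

with contextmanager(temp_ids_con)(engine, {1, 2, 3}) as con:
    matched = con.execute("SELECT id FROM temp.ids").fetchall()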
Example 12
def cleanup_after_sync(resource_name: str, sync_db: sqlalchemy.engine.base.Engine):
    """
    Delete sync temporary tables if they exist

    Parameters
    ----------
    resource_name: str
        the name of the API resource, e.g. "Courses", to be used in SQL
    sync_db: sqlalchemy.engine.base.Engine
        an Engine instance for creating database connections
    """
    with sync_db.connect() as con:
        con.execute(f"DROP TABLE IF EXISTS Sync_{resource_name}")
        con.execute(f"DROP TABLE IF EXISTS Unmatched_{resource_name}")
Example 13
def _create_sync_table_from_resource_df(
    resource_df: DataFrame,
    identity_columns: List[str],
    resource_name: str,
    sync_db: sqlalchemy.engine.base.Engine,
):
    """
    Take fetched data and push to a new temporary sync table.  Includes
    hash and tentative extractor CreateDate/LastModifiedDates.

    Parameters
    ----------
    resource_df: DataFrame
        a DataFrame with current fetched data.
    identity_columns: List[str]
        a List of the identity columns for the resource dataframe.
    resource_name: str
        the name of the API resource, e.g. "Courses", to be used in SQL
    sync_db: sqlalchemy.engine.base.Engine
        an Engine instance for creating database connections
    """
    with sync_db.connect() as con:
        # ensure sync table exists, need column ordering to be identical to regular table
        con.execute(f"DROP TABLE IF EXISTS Sync_{resource_name}")
        con.execute(
            f"""
            CREATE TABLE IF NOT EXISTS Sync_{resource_name} (
                {SYNC_COLUMNS_SQL}
            )
            """
        )

    sync_df: DataFrame = resource_df.copy()
    sync_df = add_hash_and_json_to(sync_df)

    # add (possibly composite) primary key, sorting for consistent ordering
    add_sourceid_to(sync_df, identity_columns)

    now: datetime = datetime.now()
    sync_df["CreateDate"] = now
    sync_df["LastModifiedDate"] = now
    sync_df["SyncNeeded"] = 1

    sync_df = sync_df[SYNC_COLUMNS]
    sync_df.set_index("SourceId", inplace=True)
    # push to temporary sync table
    sync_df.to_sql(
        f"Sync_{resource_name}", sync_db, if_exists="append", index=True, chunksize=1000
    )
Example 14
    def setup(self, engine: sqlalchemy.engine.base.Engine):

        # Retrieve the metadata to know what tables and joins are available
        metadata = tools.get_metadata(engine)
        self.rel_names = tuple(metadata.tables.keys())

        # Create a connection to the database
        conn = engine.connect()

        # Retrieve relation cardinalities
        self.rel_cards = {}
        query = '''
        SELECT relname, reltuples
        FROM pg_class
        WHERE relname IN :rel_names
        '''
        rows = conn.execute(sqlalchemy.text(query), rel_names=self.rel_names)
        for (rel_name, card) in rows:
            self.rel_cards[rel_name] = card

        # Retrieve attribute cardinalities and number of nulls
        self.att_cards = defaultdict(dict)
        self.null_fracs = defaultdict(dict)
        query = '''
        SELECT tablename, attname, n_distinct, null_frac
        FROM pg_stats
        WHERE tablename IN :rel_names
        '''
        rows = conn.execute(sqlalchemy.text(query), rel_names=self.rel_names)
        for (rel_name, att_name, card, null_frac) in rows:
            # pg_stats reports a negative n_distinct as a fraction of the row count
            self.att_cards[rel_name][att_name] = (
                -card * self.rel_cards[rel_name] if card < 0 else card)
            self.null_fracs[rel_name][att_name] = null_frac

        # Retrieve the type of each attribute
        self.att_types = defaultdict(dict)
        query = '''
        SELECT table_name, column_name, data_type
        FROM information_schema.columns
        WHERE table_name IN :rel_names
        '''
        rows = conn.execute(sqlalchemy.text(query), rel_names=self.rel_names)
        for (rel_name, att_name, att_type) in rows:
            self.att_types[rel_name][att_name] = att_type

        # Close the connection to the database
        conn.close()
Example 15
def sync_to_db_without_cleanup(
    resource_df: DataFrame,
    identity_columns: List[str],
    resource_name: str,
    sync_db: sqlalchemy.engine.base.Engine,
) -> DataFrame:
    """
    Take fetched data and sync with the database. Creates tables when necessary;
    it is fine if temporary tables already exist. Does not delete temporary tables when finished.

    Parameters
    ----------
    resource_df: DataFrame
        a DataFrame with current fetched data
    identity_columns: List[str]
        a List of the identity columns for the resource dataframe.
    resource_name: str
        the name of the API resource, e.g. "Courses", to be used in SQL
    sync_db: sqlalchemy.engine.base.Engine
        an Engine instance for creating database connections

    Returns
    -------
    DataFrame
        a DataFrame with current fetched data and reconciled CreateDate/LastModifiedDate
    """
    assert (Series(identity_columns).isin(
        resource_df.columns).all()), "Identity columns missing from dataframe"

    # In certain cases we can end up with duplicate records, for example
    # in Canvas when a course belongs to a sub-account. De-duplicate the
    # DataFrame based on the identity_columns
    resource_df.drop_duplicates(subset=identity_columns, inplace=True)

    _create_sync_table_from_resource_df(resource_df, identity_columns,
                                        resource_name, sync_db)

    with sync_db.connect() as con:
        _ensure_main_table_exists(resource_name, con)
        _create_unmatched_records_temp_table(resource_name, con)
        _get_true_create_dates_for_unmatched_records(resource_name, con)
        _update_resource_table_with_changes(resource_name, con)
        result_df: DataFrame = _update_dataframe_with_true_dates(
            resource_df, identity_columns, resource_name, con)

    return result_df
Example 16
def _table_exist(
    table_name: str,
    db_engine: sqlalchemy.engine.base.Engine,
) -> bool:
    with db_engine.connect() as con:
        result: Union[ResultProxy, None] = con.execute(
            """
            SELECT name
            FROM sqlite_master
            WHERE type='table' AND name=?;
        """,
            (table_name, ),
        )

        if result is None:
            return False
        if result.first() is None:
            return False
        return True
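# The query reads sqlite_master, so this helper is SQLite-specific; a
# hypothetical call reusing the Usage table seen in other examples:
if not _table_exist("Usage", db_engine):
    print("Usage table has not been created yet")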
Example 17
def load_features(engine: sqlalchemy.engine.base.Engine, processed_data,
                  processed_labels):
    processed_data.to_sql('features',
                          con=engine,
                          index_label='user_id',
                          if_exists='replace',
                          method=psql_insert_copy)

    processed_labels = pd.DataFrame({'is_fraudster': processed_labels},
                                    index=processed_labels.index)

    processed_labels.to_sql('labels',
                            con=engine,
                            index_label='user_id',
                            if_exists='replace',
                            method=psql_insert_copy)

    with engine.connect() as con:
        con.execute('ALTER TABLE features ADD PRIMARY KEY (user_id);')
        con.execute('ALTER TABLE labels ADD PRIMARY KEY (user_id);')
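# psql_insert_copy is not defined in this snippet; the pandas documentation's
# COPY-based insertion recipe for PostgreSQL is a close fit (the
# (table, conn, keys, data_iter) signature is what to_sql passes to method=):
import csv
from io import StringIO

def psql_insert_copy(table, conn, keys, data_iter):
    dbapi_conn = conn.connection  # raw DBAPI (psycopg2) connection
    with dbapi_conn.cursor() as cur:
        buf = StringIO()
        csv.writer(buf).writerows(data_iter)
        buf.seek(0)
        columns = ', '.join('"{}"'.format(k) for k in keys)
        table_name = ('{}.{}'.format(table.schema, table.name)
                      if table.schema else table.name)
        # Stream the rows through COPY, which is much faster than INSERTs
        cur.copy_expert('COPY {} ({}) FROM STDIN WITH CSV'.format(
            table_name, columns), buf)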
Example 18
def _drop_everything(engine: sqlalchemy.engine.base.Engine):
    """(On a live db) drops all foreign key constraints before dropping all
    tables. Workaround for SQLAlchemy not doing DROP TABLE ... CASCADE for drop_all()
    (https://github.com/pallets/flask-sqlalchemy/issues/722)
    """

    con = engine.connect()
    trans = con.begin()
    inspector = Inspector.from_engine(engine)

    # We need to re-create a minimal metadata with only the required things to
    # successfully emit drop constraints and tables commands for
    # postgres (based on the actual schema of the running instance)
    meta = MetaData()
    tables = []
    all_fkeys = []

    for view_name in inspector.get_view_names():
        con.execute(
            "DROP MATERIALIZED VIEW IF EXISTS {} CASCADE".format(view_name))

    for table_name in inspector.get_table_names():
        fkeys = []

        for fkey in inspector.get_foreign_keys(table_name):
            if not fkey["name"]:
                continue

            fkeys.append(ForeignKeyConstraint((), (), name=fkey["name"]))

        tables.append(Table(table_name, meta, *fkeys))
        all_fkeys.extend(fkeys)

    for fkey in all_fkeys:
        con.execute(DropConstraint(fkey))

    for table in tables:
        con.execute(DropTable(table))

    trans.commit()
    Base.metadata.drop_all(engine)
Example 19
def request_all_usage_as_df(
    resource: Optional[Resource],
    sync_db: sqlalchemy.engine.base.Engine,
    env_start_date: str,
    env_end_date: str,
) -> DataFrame:
    usage_df: DataFrame = request_latest_usage_as_df(
        resource, start_date(sync_db, env_start_date), end_date(env_end_date))
    if usage_df.empty:
        return usage_df

    usage_df.to_sql("Usage",
                    sync_db,
                    if_exists="append",
                    index=False,
                    chunksize=500)
    # remove duplicates - leave only the most recent
    with sync_db.connect() as con:
        con.execute("DELETE from Usage "
                    "WHERE rowid not in (select max(rowid) "
                    "FROM Usage "
                    "GROUP BY email, asOfDate)")

    return usage_df
Example 20
def create_user(info: dict, engine: sa.engine.base.Engine) -> bool:
    user_table = User.get_table_obj(engine)
    ins = user_table.insert()
    ## TODO: check required fields and validate email/username/password formats
    try:
        # Use a context manager so the connection is returned to the pool
        with engine.connect() as conn:
            conn.execute(
                ins.values(
                    name=info['name'],
                    username=info['username'],
                    email=info['email'],
                    # Store a bcrypt hash of the password, never the plaintext
                    password=bcrypt.hashpw(info['password'].encode(),
                                           bcrypt.gensalt()).decode(),
                    role=info['role'],
                ))
        return True
    except sa.exc.IntegrityError as err:
        if engine.echo:
            print(f"[!] {err._message()}")
        return False
Example 21
    def build_from_engine(self, engine: sqlalchemy.engine.base.Engine) -> dict:

        self.setup(engine)

        # Create a connection to the database
        conn = engine.connect()

        # Record the time spent
        duration = {'querying': {}, 'parameters': {}}

        # Create histograms per attribute
        self.histograms = {}
        self.n_in_bin = {}
        sampling_method = {
            True: 'SYSTEM',
            False: 'BERNOULLI'
        }[self.block_sampling]

        for rel_name in self.rel_names:

            self.histograms[rel_name] = {}
            self.n_in_bin[rel_name] = {}

            rel_card = self.rel_cards[rel_name]

            # Sample the relation if the number of rows is high enough
            query = 'SELECT * FROM {}'.format(rel_name)
            # Add a sampling statement if the sampling ratio is lower than 1
            sampling_ratio = max(self.sampling_ratio, self.min_rows / rel_card)
            if sampling_ratio < 1:
                # Make sure there won't be fewer samples than the minimum number of allowed rows
                query += ' TABLESAMPLE {} ({}) REPEATABLE ({})'.format(
                    sampling_method, sampling_ratio * 100, self.seed)
            date_atts = [
                att for att, typ in self.att_types[rel_name].items()
                if typ == 'date'
            ]
            tic = time.time()
            rel = pd.read_sql_query(sql=query, con=conn, parse_dates=date_atts)
            duration['querying'][rel_name] = time.time() - tic

            # Convert the datetimes to ISO formatted strings
            for att in date_atts:
                rel[att] = rel[att].map(lambda x: x.isoformat())

            # Strip the whitespace from the string columns
            for att in rel.columns:
                if rel[att].dtype == 'object':
                    rel[att] = rel[att].str.rstrip()

            # Blacklist the ID columns
            blacklist = [
                att for att in rel.columns
                if '_id' in att or 'id_' in att or att == 'id' or '_sk' in att
                or self.att_types[rel_name][att] == 'character varying'
                or round(rel_card * self.null_fracs[rel_name][att] +
                         self.att_cards[rel_name][att]) == rel_card
            ]

            # Create one histogram per attribute
            tic = time.time()
            for att in set(rel.columns) - set(blacklist):
                rel[att], self.n_in_bin[rel_name][att] = tools.discretize_series(
                    rel[att], n_mcv=self.n_mcv, n_bins=self.n_bins)
                self.histograms[rel_name][att] = distribution.Distribution(
                    on=att, by=None)
                self.histograms[rel_name][att].build_from_df(
                    rel, types=self.att_types[rel_name])

            duration['parameters'][rel_name] = time.time() - tic

        # Close the connection to the database
        conn.close()

        return duration
Example 22
def update_on_table(df: pd.DataFrame, keys: update_key_type,
                    values: update_key_type, table_name: str,
                    engine: sa.engine.base.Engine, schema: str) -> int:
    """

    :param df: a dataframe with data tha needs to be updated. Must have columns to be used as key and some for values
    :param keys: the set of columns to use as key, i.e. update when matched
    :param values: the set of columns to update, i.e. set when matched
    :param table_name: a table name as in util_function
    :param engine: the sqlalchemy engine for the database
    :param schema: a schema of interest - None if default schema of database is ok
    :return: the number of records updated
    """

    # get table
    tbl = util_function(table_name, engine, schema)

    # change NaN to None, and suffix the column names so the bindparam names
    # cannot clash with the real column names
    df_ = df.copy()
    df_.columns = [f"{el.lower()}_updt" for el in df_.columns]
    groups = toolz.partition_all(
        CHUNK_SIZE,
        df_.where(pd.notnull(df_), None).to_dict(orient='records'))

    if not isinstance(keys, tuple) and not isinstance(keys, dict):
        raise BadArgumentType(
            "keys and values must either be both tuples or both dicts", None)

    # create where clause, and update statement
    update_statement: dml.Update
    if isinstance(keys, tuple):
        if not isinstance(values, tuple):
            raise BadArgumentType(
                "keys and values must either be both tuples or both dicts",
                None)

        where = [
            tbl.c[el] == sa.bindparam(f"{el.lower()}_updt") for el in keys
        ]
        update_statement = tbl.update().where(sa.and_(*where)).values(
            dict((a, sa.bindparam(f"{a.lower()}_updt")) for a in values))

    if isinstance(keys, dict):
        if not isinstance(values, dict):
            raise BadArgumentType(
                "keys and values must either be both tuples or both dicts",
                None)
        where = [
            tbl.c[k] == sa.bindparam(f"{v.lower()}_updt")
            for k, v in keys.items()
        ]
        update_statement = tbl.update().where(sa.and_(*where)).values(
            dict((k, sa.bindparam(f"{v.lower()}_updt"))
                 for k, v in values.items()))

    # update
    count, last_successful_update = 0, None
    with engine.connect() as connection:
        for group in groups:
            try:
                result = connection.execute(update_statement, group)
                last_successful_update = group[-1]
                count += result.rowcount
            except exc.OperationalError as _:
                # try again
                time.sleep(2)

                try:
                    result = connection.execute(update_statement, group)
                    last_successful_update = group[-1]
                    count += result.rowcount
                except exc.OperationalError as e:
                    raise OperationalError(
                        f"Failed to update records. Last successful update: {last_successful_update}",
                        e)

    return count
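# A hedged usage sketch: match on "id" and set "status"; the table and column
# names here are assumptions, not from the original code:
updated = update_on_table(df, keys=("id",), values=("status",),
                          table_name="orders", engine=engine, schema=None)
print(f"{updated} records updated")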