def __init__(self, source_table, destination_table, post_process_file):
    self.source_schema, self.source_table = source_table.split(".")
    self.destination_schema, self.destination_table = destination_table.split(".")
    self.sf_engine = snowflake_engine_factory(
        os.environ, "TRANSFORMER", self.source_schema
    )
    self.post_process_file = post_process_file
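The enclosing class is not shown in this snippet; assuming a hypothetical class name such as TableProcessor, the initializer above would be called with fully qualified schema.table names:

# Hypothetical usage; TableProcessor is an assumed name for the class
# whose __init__ is shown above.
processor = TableProcessor(
    source_table="raw.events",
    destination_table="analytics.events",
    post_process_file="post_process.sql",
)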
Example #2
def extract_from_stage(import_table, stage, schema, path, pattern, env):
    """Copy files matching `pattern` from a Snowflake stage into raw.<schema>.<table>."""
    engine = snowflake_engine_factory(env, 'LOADER', schema)

    query = f"""
        COPY INTO raw.{schema}.{import_table}
        FROM @{stage}/{path}
        PATTERN = '{pattern}'
        ON_ERROR = 'CONTINUE';
    """
    execute_query(engine, query)
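A hypothetical invocation, with illustrative stage, path, and pattern values (the real names live in the calling pipeline):

# Illustrative values only.
extract_from_stage(
    import_table="log_entries",
    stage="releases_stage",
    schema="releases",
    path="2021/01",
    pattern=".*[.]csv",
    env=os.environ,
)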
def update_chronological_sequence():
    """Recompute each user's event ordering and seconds-between-events for the
    most recently updated rows in analytics.events.user_events_by_date."""
    engine = snowflake_engine_factory(os.environ, "TRANSFORMER", "util")

    query = '''
    UPDATE ANALYTICS.EVENTS.USER_EVENTS_BY_DATE
    SET chronological_sequence = a.chronological_sequence,
        seconds_after_prev_event = a.seconds_after_prev_event
    FROM (
        SELECT id,
            updated_at,
            ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY min_timestamp) as chronological_sequence,
            datediff(second, lag(min_timestamp) over (partition by user_id order by min_timestamp), min_timestamp) as seconds_after_prev_event
        FROM ANALYTICS.EVENTS.USER_EVENTS_BY_DATE
        WHERE length(user_id) < 36
        AND user_id IS NOT NULL
    ) a
    WHERE user_events_by_date.updated_at::timestamp = (SELECT MAX(UPDATED_AT)::timestamp FROM analytics.events.user_events_by_date)
    AND a.id = user_events_by_date.id;
    '''

    execute_query(engine, query)
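For intuition, the two window functions in that UPDATE can be sketched in pandas (an illustrative analogy, not part of the pipeline; column names mirror the SQL above):

import pandas as pd

events = pd.DataFrame({
    "user_id": ["u1", "u1", "u2"],
    "min_timestamp": pd.to_datetime(
        ["2021-01-01 00:00:00", "2021-01-01 00:05:00", "2021-01-01 01:00:00"]),
}).sort_values(["user_id", "min_timestamp"])

# ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY min_timestamp)
events["chronological_sequence"] = events.groupby("user_id").cumcount() + 1

# DATEDIFF(second, LAG(min_timestamp) OVER (...), min_timestamp)
events["seconds_after_prev_event"] = (
    events.groupby("user_id")["min_timestamp"].diff().dt.total_seconds())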
def __init__(self, processing_table="analytics.util.pg_imports"):
    self.processing_table = processing_table
    self.sf_engine = snowflake_engine_factory(os.environ, "TRANSFORMER", "util")
import clearbit
import psycopg2
import os
import snowflake.connector
import sys
from datetime import datetime
from sqlalchemy import create_engine
import pandas as pd
from extract.utils import snowflake_engine_factory, execute_query, execute_dataframe

# Create database connection and cursor
engine = snowflake_engine_factory(os.environ, "TRANSFORMER", "util")
connection = engine.connect()

# SET CLEARBIT API KEY
clearbit.key = os.getenv('CLEARBIT_KEY')

# RETRIEVE COLUMN NAMES IF TABLE ALREADY EXISTS
try:
    col_q = f'''
    SELECT COLUMN_NAME 
    FROM ANALYTICS.INFORMATION_SCHEMA.COLUMNS
    WHERE TABLE_NAME = 'CLOUD_CLEARBIT'
    AND TABLE_SCHEMA = 'MATTERMOST'
    ORDER BY ORDINAL_POSITION
    '''
    col_df = execute_dataframe(engine, query=col_q)
    clearbit_cols = []
    if len(col_df) > 0:
        for index, row in col_df.iterrows():
            clearbit_cols.append(row['COLUMN_NAME'].lower())
except Exception as e:
    # Fall back to an empty column list if the lookup fails
    # (e.g. the table does not exist yet).
    print(f"Unable to retrieve existing CLOUD_CLEARBIT columns: {e}")
    clearbit_cols = []
Example #6
from user_agents import parse  # assumed source of parse(); matches the attribute access below


def parse_user_agent():
    """Search for and parse all available user agents received via telemetry
    data that are not currently in the analytics.web.user_agent_registry table."""
    engine = snowflake_engine_factory(os.environ, "TRANSFORMER", "util")

    # CREATE USER_AGENT_REGISTRY IF IT DOES NOT ALREADY EXIST.
    query = f"""
    CREATE TABLE IF NOT EXISTS analytics.WEB.user_agent_registry
(
    context_useragent VARCHAR,
    browser           VARCHAR,
    browser_version   VARCHAR,
    operating_system  VARCHAR,
    os_version        VARCHAR,
    device_type       VARCHAR,
    device_brand      VARCHAR,
    device_model      VARCHAR
);"""

    execute_query(engine, query)

    # UNION ALL SOURCES OF CONTEXT_USERAGENT DATA THAT ARE NOT CURRENTLY IN THE USER_AGENT_REGISTRY TABLE.
    query = f"""
    SELECT *
    FROM (
        SELECT COALESCE(t.CONTEXT_USER_AGENT, t.CONTEXT_USERAGENT) AS CONTEXT_USERAGENT
        FROM ANALYTICS.EVENTS.USER_EVENTS_TELEMETRY t
        LEFT JOIN (SELECT CONTEXT_USERAGENT as JOIN_KEY FROM analytics.WEB.user_agent_registry GROUP BY 1) a
            ON COALESCE(t.CONTEXT_USER_AGENT, t.CONTEXT_USERAGENT) = a.JOIN_KEY
        WHERE COALESCE(t.CONTEXT_USER_AGENT, t.CONTEXT_USERAGENT) IS NOT NULL
            AND a.JOIN_KEY IS NULL
        GROUP BY 1
        UNION ALL
        SELECT USERAGENT AS CONTEXT_USERAGENT
        FROM RAW.RELEASES.LOG_ENTRIES
        LEFT JOIN (SELECT CONTEXT_USERAGENT as JOIN_KEY FROM analytics.WEB.user_agent_registry GROUP BY 1) a
            ON RAW.RELEASES.LOG_ENTRIES.USERAGENT = a.JOIN_KEY
        WHERE USERAGENT IS NOT NULL
            AND a.JOIN_KEY IS NULL
        GROUP BY 1
        UNION ALL
        SELECT USERAGENT AS CONTEXT_USERAGENT
        FROM RAW.DIAGNOSTICS.LOG_ENTRIES
        WHERE USERAGENT IS NOT NULL
            AND USERAGENT NOT IN (SELECT CONTEXT_USERAGENT FROM analytics.WEB.user_agent_registry GROUP BY 1)
            AND LOGDATE::date >= CURRENT_DATE - INTERVAL '1 DAY'
        GROUP BY 1
        UNION ALL
        SELECT CONTEXT_USERAGENT
        FROM ANALYTICS.WEB.DAILY_WEBSITE_TRAFFIC
        LEFT JOIN (SELECT CONTEXT_USERAGENT as JOIN_KEY FROM analytics.WEB.user_agent_registry GROUP BY 1) a
            ON ANALYTICS.WEB.DAILY_WEBSITE_TRAFFIC.CONTEXT_USERAGENT = a.JOIN_KEY
        WHERE CONTEXT_USERAGENT IS NOT NULL
            AND a.JOIN_KEY IS NULL
        GROUP BY 1
        )
    GROUP BY 1;
    """

    df = execute_dataframe(engine, query)

    if len(df) == 0:
        # CHECKS TO SEE IF THERE ARE ANY NEW CONTEXT_USERAGENTS TO INSERT INTO THE TABLE.
        print("Nothing to do.")
    else:
        # PARSES USERAGENT COMPONENTS AND APPENDS EACH COMPONENT AS A COLUMN TO THE EXISTING DATAFRAME.
        browser = []
        browser_family = []
        browser_version = []
        browser_version_string = []
        operating_system = []
        os_family = []
        os_version = []
        os_version_string = []
        device = []
        device_family = []
        device_brand = []
        device_model = []

        for index, row in df.iterrows():
            ua_string = row["CONTEXT_USERAGENT"]
            user_agent = parse(ua_string)

            browser.append(user_agent.browser)
            browser_family.append(user_agent.browser.family)
            browser_version.append(user_agent.browser.version)
            browser_version_string.append(user_agent.browser.version_string)

            # Accessing user agent's operating system properties
            operating_system.append(user_agent.os)
            os_family.append(user_agent.os.family)
            os_version.append(user_agent.os.version)
            os_version_string.append(user_agent.os.version_string)

            # Accessing user agent's device properties
            device.append(user_agent.device)
            device_family.append(user_agent.device.family)
            device_brand.append(user_agent.device.brand)
            device_model.append(user_agent.device.model)

        browser = pd.Series(browser_family, name="browser")
        browser_version = pd.Series(browser_version_string,
                                    name="browser_version")
        op_sys = pd.Series(os_family, name="operating_system")
        os_version = pd.Series(os_version_string, name="os_version")
        device_type = pd.Series(device_family, name="device_type")
        device_brand = pd.Series(device_brand, name="device_brand")
        device_model = pd.Series(device_model, name="device_model")

        agent_lists = [
            browser,
            browser_version,
            op_sys,
            os_version,
            device_type,
            device_brand,
            device_model,
        ]
        for item in agent_lists:
            df = df.join(item)

        connection = engine.connect()

        # 16,384 is the Snowflake insert statement row limit. To ensure the job
        # executes successfully, the dataframe is inserted in slices of at most
        # 16,384 rows.
        chunk_size = 16384
        for start in range(0, len(df), chunk_size):
            df[start:start + chunk_size].to_sql(
                "user_agent_registry",
                con=connection,
                index=False,
                schema="WEB",
                if_exists="append",
            )
        print(
            f"Successfully uploaded {len(df)} records to analytics.web.user_agent_registry!"
        )
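The parse() call above is assumed to come from the user-agents package, whose objects expose exactly the browser/os/device attributes accessed in the loop. A quick standalone check of that API:

from user_agents import parse

ua = parse(
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
print(ua.browser.family, ua.browser.version_string)  # Chrome 91.0.4472
print(ua.os.family, ua.os.version_string)            # Windows 10
print(ua.device.family)                              # Other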
Example #7
from snowflake.connector.pandas_tools import pd_writer  # required by df.to_sql(method=pd_writer) below


def contributors():
    org = "mattermost"

    repo = []
    records = []
    has_next = True
    cursor = ""
    while has_next:
        try:
            result = graphql_query(gen_repo_query(org, cursor))
        except Exception as e:
            print(e)
            return

        repo_results = result["data"]["organization"]["repositories"]["nodes"]
        has_next = result["data"]["organization"]["repositories"]["pageInfo"][
            "hasNextPage"]
        cursor = result["data"]["organization"]["repositories"]["pageInfo"][
            "endCursor"]

        for i in repo_results:
            repo.append(i["name"])

    for one_repo in repo:
        has_next = True
        cursor = ""
        while has_next:
            query = gen_query(org, one_repo, cursor)

            try:
                result = graphql_query(query)
            except Exception as e:
                print(e)
                return

            pull_requests = result["data"]["repository"]["pullRequests"]
            has_next = pull_requests["pageInfo"]["hasNextPage"]
            cursor = pull_requests["pageInfo"]["endCursor"]

            for node in pull_requests["nodes"]:
                if node and node["author"]:
                    records.append({
                        "PR_NUMBER": node["number"],
                        "MERGED_AT": node["mergedAt"],
                        "AUTHOR": node["author"]["login"],
                        "REPO": one_repo,
                    })

    df = pd.DataFrame.from_records(records)

    try:
        engine = snowflake_engine_factory(os.environ, "TRANSFORMER", "util")
        connection = engine.connect()
        connection.execute("DELETE FROM staging.github_contributions_all")

        print(
            f"Preparing to load results to Snowflake. Records: {len(records)}")

        df.to_sql(
            "github_contributions_all",
            con=connection,
            index=False,
            schema="STAGING",
            if_exists="append",
            method=pd_writer,
        )
    except Exception as e:
        print(e)
        connection.close()
        return
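gen_repo_query, gen_query, and graphql_query are helpers defined elsewhere in this project. For orientation, a hedged sketch of what a repository-pagination builder like gen_repo_query might produce, with the field shape inferred from how the result is unpacked above (not taken from the real source):

def gen_repo_query(org, cursor=""):
    # Sketch only: pages through an organization's repositories, mirroring
    # the data.organization.repositories.nodes / pageInfo access above.
    after = f', after: "{cursor}"' if cursor else ""
    return f"""
    {{
      organization(login: "{org}") {{
        repositories(first: 100{after}) {{
          nodes {{ name }}
          pageInfo {{ hasNextPage endCursor }}
        }}
      }}
    }}
    """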
Example #8
import tweepy
from tweepy import OAuthHandler  # tweepy 3.x, matching wait_on_rate_limit_notify below


def get_twitter_mentions():
    # Twitter credentials: obtain them from your Twitter developer account.
    # NOTE: this or another Twitter Developer API key needs to be added to
    # SysVars; a personal API key is currently in use.
    consumer_key = os.getenv("TWITTER_CONSUMER_KEY")
    consumer_secret = os.getenv("TWITTER_CONSUMER_SECRET")
    access_key = os.getenv("TWITTER_ACCESS_KEY")
    access_secret = os.getenv("TWITTER_ACCESS_SECRET")

    # Pass your twitter credentials to tweepy via its OAuthHandler
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth,
                     wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)

    # Create database connection and cursor
    engine = snowflake_engine_factory(os.environ, "TRANSFORMER", "util")
    connection = engine.connect()

    # Create empty dataframe with required columns
    db_tweets = pd.DataFrame(columns=[
        "username",
        "text",
        "full_name",
        "user_url",
        "url",
        "retweet_text",
        "original_tweet_date",
        "retweeted_status",
        "retweet_count",
        "created_at",
        "location",
        "followers",
        "user_id",
        "favorite_count",
        "lang",
        "verified",
        "hashtags",
        "following_count",
        "is_tweet_reply",
        "id",
        "longitude_latitude",
    ])

    # Fetch latest data from existing ANALYTICS.SOCIAL_MENTIONS.TWITTER relation
    query = f"""
    SELECT MAX(CREATED_AT - interval '1 day')::date::varchar AS DATE, 
    MAX(CREATED_AT)::VARCHAR AS TIMESTAMP 
    FROM analytics.social_mentions.twitter
    """

    try:
        results = execute_query(engine, query)
    except Exception as e:
        print(f"Oh no! There was an error executing your query: {e}")
        raise  # results is required below, so fail here rather than with a NameError

    # Retrieve all tweets >= Max Created At in ANALYTICS.SOCIAL_MENTIONS.TWITTER relation
    tweets = tweepy.Cursor(api.search,
                           q="mattermost",
                           since=f"{results[0][0]}").items(5000)

    # Loop through new tweets and extract relevant fields to populate dataframe.
    for tweet in tweets:
        is_tweet_reply = tweet.in_reply_to_screen_name is not None
        username = tweet.user.screen_name
        full_name = tweet.user.name
        user_url = tweet.user.url
        url = f"https://twitter.com/{tweet.user.screen_name}/status/{tweet.id}"
        retweet_count = tweet.retweet_count
        verified = tweet.user.verified
        user_id = tweet.user.id
        favorite_count = tweet.favorite_count
        acctdesc = tweet.user.description
        location = tweet.user.location
        following = tweet.user.friends_count
        followers = tweet.user.followers_count
        totaltweets = tweet.user.statuses_count
        usercreatedts = tweet.user.created_at
        created_at = tweet.created_at.strftime("%Y-%m-%d %H:%M:%S")
        lang = tweet.lang
        hashtags = str(tweet.entities["hashtags"])
        longitude_latitude = tweet.coordinates
        tweet_id = tweet.id

        try:
            text = tweet.text
            retweet_text = tweet.retweeted_status.text
            original_tweet_date = tweet.retweeted_status.created_at.strftime(
                "%Y-%m-%d %H:%M:%S")
            is_retweet = tweet.retweeted_status.text is not None
        except AttributeError:  # Not a Retweet
            text = tweet.text
            original_tweet_date = None
            retweet_text = None
            is_retweet = False

        # Add variables to tweet list to be inserted into dataframe:
        ith_tweet = [
            username,
            text,
            full_name,
            user_url,
            url,
            retweet_text,
            original_tweet_date,
            is_retweet,
            retweet_count,
            created_at,
            location,
            followers,
            user_id,
            favorite_count,
            lang,
            verified,
            hashtags,
            following,
            is_tweet_reply,
            tweet_id,
            longitude_latitude,
        ]
        # Append to dataframe - db_tweets
        db_tweets.loc[len(db_tweets)] = ith_tweet

    # Append dataframe to ANALYTICS.SOCIAL_MENTIONS.TWITTER relation
    db_tweets[db_tweets["created_at"] > results[0][1]].to_sql(
        "twitter",
        con=connection,
        index=False,
        schema="SOCIAL_MENTIONS",
        if_exists="append",
    )
#!/usr/bin/python

import argparse
import os
import sys

from extract.utils import snowflake_engine_factory, execute_query

parser = argparse.ArgumentParser()
parser.add_argument("sql_file", help="The SQL file to run on Snowflake")
parser.add_argument("role",
                    default="TRANSFORMER",
                    help="The role to run the queries for")
parser.add_argument("schema", help="Default schema to use for queries")

if __name__ == "__main__":
    args = parser.parse_args()

    engine = snowflake_engine_factory(os.environ.copy(), args.role,
                                      args.schema)

    with open(f"transform/sql/snowflake/{args.sql_file}.sql") as f:
        content = f.read()
        # Protect escaped semicolons (\;) before splitting into statements.
        content = content.replace(r"\;", "SEMICOLONTEMPFIX")
        queries = content.split(";")
        with engine.begin() as conn:
            for query in queries:
                conn.execute(query.replace("SEMICOLONTEMPFIX", ";"))
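Assuming the script above is saved as run_sql.py (a hypothetical filename), it is invoked with the SQL file's base name, a role, and a default schema:

python run_sql.py my_transforms TRANSFORMER util

This runs transform/sql/snowflake/my_transforms.sql statement by statement within one engine.begin() context.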