def transform_data():
    import os
    import pyspark
    from pyspark.sql import SQLContext

    # destination: the SQLite connection configured in Airflow as 'sqlite2'
    dest = SqliteHook(sqlite_conn_id='sqlite2')
    dest_conn = dest.get_conn()
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))

    sc = pyspark.SparkContext()
    sqlContext = SQLContext(sc)  # required so RDD.toDF() is available

    # read the raw text file, strip brackets/quotes and split each line into
    # the seven Currency columns
    rdd = sc.textFile(os.path.join(BASE_DIR, "InputFile.txt"))
    sparkDF = rdd.map(lambda x: str(x).translate({ord(c): None
                                                  for c in '][""'})
                      ).map(lambda w: w.split(',')).toDF()
    pdDF = sparkDF.toPandas()

    sqlUpdate = ('INSERT OR REPLACE INTO Currency'
                 '(USD, JPY, CAD, GBP, NZD, INR, Date_of_rate) '
                 'VALUES (?, ?, ?, ?, ?, ?, ?)')
    data = pdDF.values
    dest_conn.executemany(sqlUpdate, data)
    dest_conn.commit()
    sc.stop()
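For reference, the cleanup lambda above strips brackets and quotes and splits on commas, which implies input lines shaped roughly like the following (a made-up sample; the seven fields map positionally onto the Currency columns):

# Hypothetical line from InputFile.txt; after the translate/split steps it
# yields (USD, JPY, CAD, GBP, NZD, INR, Date_of_rate) values.
sample_line = '["1.00","110.20","1.25","0.72","1.40","74.50","2021-01-01"]'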
Example #2
def process_weather(station, field_to_process):
    db_hook = SqliteHook(sqlite_conn_id='sqlite_default')

    weather_select = (f'select record_date, {field_to_process} '
                      f'from weather where station_id={station} '
                      f'order by record_date;')

    data = db_hook.get_pandas_df(weather_select)
    average = data[[field_to_process]].rolling(
        3, center=True).mean().rename(columns={field_to_process: 'average'})
    data = data.merge(average, left_index=True, right_index=True)

    del average

    weather_update = """
    update weather
    set average = ?
    where station_id=? and record_date=?;
    """

    # DataFrame.apply below is used as a lightweight way to run the update for
    # every row without reaching for itertuples(), iteritems() or iterrows(),
    # which can become a bottleneck once the row count grows into the thousands

    data.apply(lambda row: db_hook.run(weather_update,
                                       parameters=(row['average'], station,
                                                   row['record_date'])),
               axis=1)
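If the per-row db_hook.run() calls above become the bottleneck the comment warns about, one alternative is a single executemany() over the hook's raw connection; a sketch only, assuming the same weather table and the DataFrame built in process_weather():

def bulk_update_average(db_hook, station, data):
    # one executemany() call instead of one db_hook.run() per row
    rows = list(zip(data['average'], [station] * len(data), data['record_date']))
    conn = db_hook.get_conn()
    conn.executemany(
        'update weather set average = ? where station_id = ? and record_date = ?;',
        rows)
    conn.commit()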
def save_data_into_db():
    sqlite_hook = SqliteHook(sqlite_conn_id='playgroundDB')
    with open('data.json') as f:
        data = json.load(f)
    insert = """
            INSERT INTO Covid19Report (
                confirmed,
                recovered,
                hospitalized,
                deaths,
                new_confirmed,
                new_recovered,
                new_hospitalized,
                new_deaths,
                update_date,
                source,
                dev_by,
                sever_by)
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?);
             """
    sqlite_hook.run(
        insert,
        parameters=(data['Confirmed'], data['Recovered'], data['Hospitalized'],
                    data['Deaths'], data['NewConfirmed'], data['NewRecovered'],
                    data['NewHospitalized'], data['NewDeaths'],
                    datetime.strptime(data['UpdateDate'], '%d/%m/%Y %H:%M'),
                    data['Source'], data['DevBy'], data['SeverBy']))
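For reference, the insert above expects data.json to carry the keys below; the values here are placeholders, and only UpdateDate must match the '%d/%m/%Y %H:%M' format used by strptime():

# Illustrative payload shape for save_data_into_db(); all values are made up.
sample_payload = {
    'Confirmed': 0, 'Recovered': 0, 'Hospitalized': 0, 'Deaths': 0,
    'NewConfirmed': 0, 'NewRecovered': 0, 'NewHospitalized': 0, 'NewDeaths': 0,
    'UpdateDate': '01/01/2021 00:00',
    'Source': '...', 'DevBy': '...', 'SeverBy': '...',
}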
Example #4
def get_weather(url_params):
    db_hook = SqliteHook(sqlite_conn_id='sqlite_default')
    api_hook = HttpHook(http_conn_id='http_default', method='GET')

    url = add_or_replace_parameters('v1/history/daily', url_params)

    resp = api_hook.run(url)
    data = resp.json()['data']

    # usually not worth worrying about, but freeing the response object early
    # can help when the payload is large
    del resp

    weather_insert = """
    insert or ignore into weather 
    (station_id, record_date, temperature, 
    temperature_min, temperature_max, winddirection, windspeed, sunshine, pressure) 
    values (?, ?, ?, ?, ?, ?, ?, ?, ?);
    """

    for day in data:
        db_hook.run(weather_insert,
                    parameters=(url_params['station'], day['date'],
                                day['temperature'], day['temperature_min'],
                                day['temperature_max'], day['winddirection'],
                                day['windspeed'], day['sunshine'],
                                day['pressure']))
Example #5
def getdf():
    import os.path
    # conn_id and BASE_DIR are assumed to be defined at module level
    h = SqliteHook(conn_id)
    df = h.get_pandas_df(
        "SELECT A.DocumentNo, A.FullName, A.Device, A.Country, "
        "B.OrderId, B.DocumentNo, B.OrderDate, B.CatalogId, "
        "C.CatalogId, C.ProductId, C.CUID, "
        "D.ProductId, D.ProductName, D.CUID "
        "FROM CUSTOMERS as A "
        "inner join ORDERS as B on A.DocumentNo = B.DocumentNo "
        "inner join CATALOG as C on C.CatalogId = B.CatalogId "
        "inner join PRODUCTS AS D ON C.CUID = D.CUID"
    )
    print(df)
    df.to_csv(os.path.join(BASE_DIR, "SourceData.txt"), index=False)
def createdb():
    try:
        dest = SqliteHook(sqlite_conn_id='sqlite2')
        dest_conn = dest.get_conn()
        dest_conn.execute(
            '''CREATE TABLE if not exists Currency(USD text, JPY text, CAD text, GBP text, NZD text, INR text, Date_of_rate date)'''
        )
    except Exception as exc:
        print("SQLite connection failed: {}".format(exc))
Example #7
    def test_env_var_priority(self):
        c = SqliteHook.get_connection(conn_id='airflow_db')
        self.assertNotEqual('ec2.compute.com', c.host)

        with mock.patch.dict('os.environ', {
            'AIRFLOW_CONN_AIRFLOW_DB': 'postgres://username:password@ec2.compute.com:5432/the_database',
        }):
            c = SqliteHook.get_connection(conn_id='airflow_db')
            self.assertEqual('ec2.compute.com', c.host)
            self.assertEqual('the_database', c.schema)
            self.assertEqual('username', c.login)
            self.assertEqual('password', c.password)
            self.assertEqual(5432, c.port)
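For context, the AIRFLOW_CONN_<CONN_ID> environment variable holds a URI that Airflow parses into a Connection; a minimal sketch of that parsing, reusing the same URI the test patches in above:

from airflow.models import Connection

# The same URI the test places into AIRFLOW_CONN_AIRFLOW_DB above.
c = Connection(uri='postgres://username:password@ec2.compute.com:5432/the_database')
print(c.conn_type, c.host, c.schema, c.port)  # postgres ec2.compute.com the_database 5432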
Example #8
def get_num_active_dagruns(dag_id, conn_id='sqlite_default'):
    # if you've opted for a different backend for airflow, you will need to
    # refactor the two lines below. For a Postgres example, please refer to
    # https://github.com/Nextdoor/airflow_examples/blob/master/dags/util.py#L8
    airflow_db = SqliteHook(sqlite_conn_id=conn_id)
    conn = airflow_db.get_conn()
    cursor = conn.cursor()
    # bind dag_id as a query parameter rather than formatting it into the SQL
    sql = """
         select count(*)
         from dag_run
         where dag_id = ?
         and state in ('running', 'queued', 'up_for_retry')
          """
    cursor.execute(sql, (dag_id,))
    num_active_dagruns = cursor.fetchone()[0]
    return num_active_dagruns
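The comment above points at a Postgres version of the same check; a hedged sketch of that refactor, assuming a postgres_default connection aimed at the Airflow metadata database:

from airflow.hooks.postgres_hook import PostgresHook

def get_num_active_dagruns_pg(dag_id, conn_id='postgres_default'):
    # same query as above, but through PostgresHook with %s-style parameters
    hook = PostgresHook(postgres_conn_id=conn_id)
    row = hook.get_first(
        "select count(*) from dag_run "
        "where dag_id = %s and state in ('running', 'queued', 'up_for_retry')",
        parameters=(dag_id,))
    return row[0]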
Example #9
 def test_using_env_var(self):
     c = SqliteHook.get_connection(conn_id='test_uri')
     self.assertEqual('ec2.compute.com', c.host)
     self.assertEqual('the_database', c.schema)
     self.assertEqual('username', c.login)
     self.assertEqual('password', c.password)
     self.assertEqual(5432, c.port)
Example #10
 def test_using_unix_socket_env_var(self):
     c = SqliteHook.get_connection(conn_id='test_uri_no_creds')
     self.assertEqual('ec2.compute.com', c.host)
     self.assertEqual('the_database', c.schema)
     self.assertIsNone(c.login)
     self.assertIsNone(c.password)
     self.assertIsNone(c.port)
Example #11
def load_csv_to_sqlite(csv_path, sqlite_table, **context):
    """
    Loads CSV data (i.e. research_papers and mct_talks) in the default Sqlite DB.
    """
    df = pd.read_csv(csv_path)
    with SqliteHook().get_conn() as conn:
        df.to_sql(sqlite_table, con=conn, if_exists='replace')
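One possible way to wire the callable above into a DAG, assuming the Airflow 1.10-style imports used elsewhere on this page; the DAG id, schedule, file path and task id are illustrative:

from datetime import datetime
from airflow import DAG
from airflow.operators.python_operator import PythonOperator

with DAG('load_csv_example', start_date=datetime(2021, 1, 1),
         schedule_interval=None) as dag:
    load_papers = PythonOperator(
        task_id='load_research_papers',
        python_callable=load_csv_to_sqlite,
        op_kwargs={'csv_path': 'research_papers.csv',
                   'sqlite_table': 'research_papers'},
        provide_context=True)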
Example #12
 def test_get_connections_env_var(self):
     conns = SqliteHook.get_connections(conn_id='test_uri')
     assert len(conns) == 1
     assert conns[0].host == 'ec2.compute.com'
     assert conns[0].schema == 'the_database'
     assert conns[0].login == 'username'
     assert conns[0].password == 'password'
     assert conns[0].port == 5432
def create_covid19_report_table():
    sqlite_hook = SqliteHook(sqlite_conn_id='playgroundDB')
    create = """
             CREATE TABLE "Covid19Report" (
                 "confirmed"        INTEGER,
                 "recovered"        INTEGER,
                 "hospitalized"     INTEGER,
                 "deaths"           INTEGER,
                 "new_confirmed"    INTEGER,
                 "new_recovered"    INTEGER,
                 "new_hospitalized" INTEGER,
                 "new_deaths"       INTEGER,
                 "update_date"      TEXT,
                 "source"           TEXT,
                 "dev_by"           TEXT,
                 "sever_by"         TEXT);
             """
    sqlite_hook.run(create)
Example #14
def predict(classifier, **context):
    """
    Makes predictions with the model created by the given classifier and
    stores the results in the mct_talks table.
    """
    # Load model
    model = _load_model(classifier.__name__)

    # Load data
    db = SqliteHook()
    df = db.get_pandas_df('select * from mct_talks')

    # Make predictions
    df['Conference'] = model.predict(df['Title'].tolist())

    # Save predictions
    with db.get_conn() as conn:
        df.to_sql('mct_talks', con=conn, if_exists='replace')
Example #15
def split_data(**context):
    """
    Splits the sample data (i.e. research_papers) into training and test sets
    and stores them in the Sqlite DB.
    """
    # Load full dataset
    db = SqliteHook()
    df = db.get_pandas_df('select * from research_papers')

    # Create train/test split
    train, _ = train_test_split(df.index,
                                test_size=0.33,
                                stratify=df['Conference'],
                                random_state=42)

    # Save training and test data in separate tables
    with db.get_conn() as conn:
        df.iloc[train].to_sql('training_data', con=conn, if_exists='replace')
        df.drop(train).to_sql('test_data', con=conn, if_exists='replace')
Example #16
def createdb():
    dest = SqliteHook(sqlite_conn_id='sqlite2')
    dest_conn = dest.get_conn()
    dest_conn.execute(
        '''CREATE TABLE if not exists ORDERS(OrderId text, DocumentNo text, OrderDate text, CatalogId text)'''
    )
    dest_conn.execute(
        '''CREATE TABLE if not exists CUSTOMERS(DocumentNo text, FullName text, Device text, Country text)'''
    )
    dest_conn.execute(
        '''CREATE TABLE if not exists CATALOG(CatalogId text, ProductId text, CUID text)'''
    )
    dest_conn.execute(
        '''CREATE TABLE if not exists PRODUCTS(ProductId text, ProductName text, CUID text)'''
    )
    dest_conn.execute(
        '''CREATE TABLE if not exists LEADS(Id text, CustomerId text)''')
    dest_conn.execute(
        '''CREATE TABLE if not exists LOGS(ClientIp text, UserName text, Time text)'''
    )

def etl_data(**kwargs):
    # Extract, transform and load the CSV file into the DB
    task_instance = kwargs['ti']
    filename = task_instance.xcom_pull(task_ids='get_file')

    conn_host = SqliteHook(sqlite_conn_id='sqlite_db',
                           supports_autocommit=True).get_conn()

    # ETLoader, downloads_path_dir and revenue_table come from the surrounding module
    loader = ETLoader(filename, downloads_path_dir, conn_host, 'revenues',
                      revenue_table)
    loader.orchestrate_etl()

    return filename
Example #18
 def get_hook(self):
     try:
         if self.conn_type == 'mysql':
             from airflow.hooks.mysql_hook import MySqlHook
             return MySqlHook(mysql_conn_id=self.conn_id)
         elif self.conn_type == 'google_cloud_platform':
             from airflow.contrib.hooks.bigquery_hook import BigQueryHook
             return BigQueryHook(bigquery_conn_id=self.conn_id)
         elif self.conn_type == 'postgres':
             from airflow.hooks.postgres_hook import PostgresHook
             return PostgresHook(postgres_conn_id=self.conn_id)
         elif self.conn_type == 'hive_cli':
             from airflow.hooks.hive_hooks import HiveCliHook
             return HiveCliHook(hive_cli_conn_id=self.conn_id)
         elif self.conn_type == 'presto':
             from airflow.hooks.presto_hook import PrestoHook
             return PrestoHook(presto_conn_id=self.conn_id)
         elif self.conn_type == 'hiveserver2':
             from airflow.hooks.hive_hooks import HiveServer2Hook
             return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
         elif self.conn_type == 'sqlite':
             from airflow.hooks.sqlite_hook import SqliteHook
             return SqliteHook(sqlite_conn_id=self.conn_id)
         elif self.conn_type == 'jdbc':
             from airflow.hooks.jdbc_hook import JdbcHook
             return JdbcHook(jdbc_conn_id=self.conn_id)
         elif self.conn_type == 'mssql':
             from airflow.hooks.mssql_hook import MsSqlHook
             return MsSqlHook(mssql_conn_id=self.conn_id)
         elif self.conn_type == 'oracle':
             from airflow.hooks.oracle_hook import OracleHook
             return OracleHook(oracle_conn_id=self.conn_id)
         elif self.conn_type == 'vertica':
             from airflow.contrib.hooks.vertica_hook import VerticaHook
             return VerticaHook(vertica_conn_id=self.conn_id)
         elif self.conn_type == 'cloudant':
             from airflow.contrib.hooks.cloudant_hook import CloudantHook
             return CloudantHook(cloudant_conn_id=self.conn_id)
         elif self.conn_type == 'jira':
             from airflow.contrib.hooks.jira_hook import JiraHook
             return JiraHook(jira_conn_id=self.conn_id)
         elif self.conn_type == 'redis':
             from airflow.contrib.hooks.redis_hook import RedisHook
             return RedisHook(redis_conn_id=self.conn_id)
         elif self.conn_type == 'wasb':
             from airflow.contrib.hooks.wasb_hook import WasbHook
             return WasbHook(wasb_conn_id=self.conn_id)
         elif self.conn_type == 'docker':
             from airflow.hooks.docker_hook import DockerHook
             return DockerHook(docker_conn_id=self.conn_id)
     except:
         pass
Example #19
def store_recording(**context):
    """Stores a random integer from 1 to 10 with timestamp into data table"""

    conn = SqliteHook.get_connection(
        context['params']['conn_id']).get_hook().get_conn()
    time = context['execution_date']

    values = (time, random.randint(1, 10))
    logging.info('Inserting: {0:%Y-%m-%d %H:%M:%S}, {1}'.format(*values))

    with conn:
        cur = conn.cursor()
        cur.execute("INSERT INTO data VALUES (?, ?)", values)
def fill_devices_table(**kwargs):
    ti = kwargs['ti']
    devices = ti.xcom_pull(task_ids='get_devices_dates')
    conn_host = SqliteHook(sqlite_conn_id='sqlite_devices').get_conn()

    for device in devices:
        # device names are assumed to be strings, so quote them in the VALUES clause
        sql_insert = f"""INSERT OR REPLACE INTO {DEVICES_TABLE}
                     (device, last_reading, latest_postprocessing)
                     VALUES ('{device}',
                             '{devices[device]["last_reading"]}',
                             '{devices[device]["latest_postprocessing"]}'
                            )
                     ;"""

        conn_host.execute(sql_insert)
        conn_host.commit()
    conn_host.close()
Example #21
def train_model(classifier, **context):
    """
    Trains a model using the given classifier and stores it at MODEL_PATH.
    """
    # Load data
    df = SqliteHook().get_pandas_df('select * from training_data')

    # Create pipeline
    pipeline = Pipeline([('vectorizer', CountVectorizer(stop_words='english')),
                         ('classifier', classifier())])

    # Train model
    pipeline.fit(df['Title'].tolist(), df['Conference'].tolist())

    # Save model
    _save_model(pipeline, classifier.__name__)
Example #22
def save_prediction(**kwargs):
    # Tasks can pass parameters to downstream tasks through the XCom space.
    # In this example the current task `save_prediction` takes the output of
    # the previous task `run_prediction` "pulling" it from the XCom space
    ti = kwargs['ti']
    prediction_dict = ti.xcom_pull(task_ids='run_prediction')
    # INSERT OR REPLACE guarantees a single row per date_to_predict: it inserts
    # a new prediction if none exists, or replaces the existing one thanks to
    # the unique index idx_date_to_predict on date_to_predict
    sql_insert = f"""INSERT OR REPLACE INTO {PREDICTION_TABLE} 
                     (date_to_predict, run_date, yhat, yhat_upper, yhat_lower)
                     VALUES ('{prediction_dict["date_to_predict"]}', 
                             '{prediction_dict["run_date"]}',
                             {prediction_dict["yhat"]}, 
                             {prediction_dict["yhat_upper"]},
                             {prediction_dict["yhat_lower"]}
                            )
                     ;"""
    # Hooks are interfaces to external platforms (e.g. Amazon S3)
    # and databases (e.g. SQLite, PostgreSQL)
    conn_host = SqliteHook(sqlite_conn_id='sqlite_ml').get_conn()
    conn_host.execute(sql_insert)
    conn_host.commit()
    conn_host.close()
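The INSERT OR REPLACE comment above relies on a unique index; a sketch of the DDL that would back it, reusing the PREDICTION_TABLE constant from this snippet and mirroring the inserted columns (the column types are illustrative):

# Schema assumed by save_prediction(); column types are illustrative.
create_prediction_table = f"""
    CREATE TABLE IF NOT EXISTS {PREDICTION_TABLE} (
        date_to_predict TEXT,
        run_date        TEXT,
        yhat            REAL,
        yhat_upper      REAL,
        yhat_lower      REAL);
"""
create_unique_index = f"""
    CREATE UNIQUE INDEX IF NOT EXISTS idx_date_to_predict
    ON {PREDICTION_TABLE} (date_to_predict);
"""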
Example #23
def calc_hourly_totals(**context):
    """Calculates sum of values from data table for the previous hour and puts it into totals table"""

    conn = SqliteHook.get_connection(
        context['params']['conn_id']).get_hook().get_conn()
    hour = context['execution_date']
    prev_hour = hour - timedelta(hours=1)

    with conn:
        cur = conn.cursor()
        cur.execute(
            "SELECT count(*) FROM data "
            "WHERE strftime('%Y-%m-%d %H', time) = strftime('%Y-%m-%d %H', ?)",
            (prev_hour, ))

        data_count = cur.fetchone()[0]
        logging.info('Found {} values for {}'.format(data_count, hour))

        if data_count != 4:
            logging.warning('Need all 4 values to calculate hourly value.')
            raise AirflowException(
                "Need all 4 values to calculate hourly value.")

        cur.execute(
            "SELECT count(*) FROM hourly "
            "WHERE hour = strftime('%Y-%m-%d %H:00:00', ?)", (hour, ))

        hour_count = cur.fetchone()[0]
        if hour_count:
            logging.warning(
                'There is an hourly value for {} already.'.format(hour))
            return

        cur.execute(
            "INSERT INTO hourly SELECT strftime('%Y-%m-%d %H:00:00', ?), sum(value) FROM data "
            "WHERE strftime('%Y-%m-%d %H', time) = strftime('%Y-%m-%d %H', ?)",
            (hour, prev_hour))
        logging.info('Inserted hourly value for {}.'.format(hour))
Example #24
def test_model(classifier, **context):
    """
    Tests a model created by the given classifier and returns its accuracy score
    as an XCom.
    """

    # Simulate a task failure
    # raise RuntimeError('BOOM!')

    # Load model
    model = _load_model(classifier.__name__)

    # Load data
    df = SqliteHook().get_pandas_df('select * from test_data')

    # Make predictions
    X, y = df['Title'].tolist(), df['Conference'].tolist()
    y_pred = model.predict(X)

    # Log classification results
    print(classification_report(y, y_pred))

    # Push Accuracy as XCom
    return classifier.__name__, f1_score(y, y_pred, average='weighted')
Example #25
def save_model_accuracy(**kwargs):
    # Tasks can pass parameters to downstream tasks through the XCom space.
    # XCom (cross-communication) allows communication between task instances.
    # In this example the current task `save_model_accuracy` takes the output
    # of the previous task `measure_accuracy` "pulling" it from the XCom space
    ti = kwargs['ti']
    accuracy = ti.xcom_pull(task_ids='measure_accuracy')

    sql_insert = f"""INSERT INTO {TRAINING_TABLE} 
                            (mape_test, rmse_test, days_in_test)
                     VALUES({accuracy['mape_test']}, 
                            {accuracy['rmse_test']},
                            {accuracy['days_in_test']})
                    ;
                  """
    # Hooks are interfaces to external platforms (e.g. Amazon S3)
    # and databases (e.g. SQLite, PostgreSQL)
    conn_host = SqliteHook(sqlite_conn_id='sqlite_ml').get_conn()
    conn_host.execute(sql_insert)
    conn_host.commit()
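For reference, whatever the upstream measure_accuracy task returns is what xcom_pull retrieves above; a hypothetical upstream callable only needs to return the metrics dict:

# Hypothetical upstream task; the real metric computation is elided and the
# numbers are placeholders. Its return value is pushed to XCom automatically.
def measure_accuracy(**kwargs):
    return {'mape_test': 0.12, 'rmse_test': 3.4, 'days_in_test': 30}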
Example #26
 def make_hook() -> Optional[BaseHook]:
     """ Makes a SqliteHook to test with. """
     return SqliteHook()
Example #27
def create_tables(**context):
    hook = SqliteHook.get_connection(context['params']['conn_id']).get_hook()
    run_sql(hook,
            "CREATE TABLE IF NOT EXISTS data (time TIMESTAMP, value INT);")
    run_sql(hook,
            "CREATE TABLE IF NOT EXISTS hourly (hour TIMESTAMP, total INT);")
 def execute(self, context):
     _log.info('Executing: ' + self.sql)
     hook = SqliteHook(sqlite_conn_id=self.sqlite_conn_id)
     hook.run(self.sql, parameters=self.parameters)
Example #29
 def get_hook(self):
     if self.conn_type == 'mysql':
         from airflow.hooks.mysql_hook import MySqlHook
         return MySqlHook(mysql_conn_id=self.conn_id)
     elif self.conn_type == 'google_cloud_platform':
         from airflow.gcp.hooks.bigquery import BigQueryHook
         return BigQueryHook(bigquery_conn_id=self.conn_id)
     elif self.conn_type == 'postgres':
         from airflow.hooks.postgres_hook import PostgresHook
         return PostgresHook(postgres_conn_id=self.conn_id)
     elif self.conn_type == 'pig_cli':
         from airflow.hooks.pig_hook import PigCliHook
         return PigCliHook(pig_cli_conn_id=self.conn_id)
     elif self.conn_type == 'hive_cli':
         from airflow.hooks.hive_hooks import HiveCliHook
         return HiveCliHook(hive_cli_conn_id=self.conn_id)
     elif self.conn_type == 'presto':
         from airflow.hooks.presto_hook import PrestoHook
         return PrestoHook(presto_conn_id=self.conn_id)
     elif self.conn_type == 'hiveserver2':
         from airflow.hooks.hive_hooks import HiveServer2Hook
         return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
     elif self.conn_type == 'sqlite':
         from airflow.hooks.sqlite_hook import SqliteHook
         return SqliteHook(sqlite_conn_id=self.conn_id)
     elif self.conn_type == 'jdbc':
         from airflow.hooks.jdbc_hook import JdbcHook
         return JdbcHook(jdbc_conn_id=self.conn_id)
     elif self.conn_type == 'mssql':
         from airflow.hooks.mssql_hook import MsSqlHook
         return MsSqlHook(mssql_conn_id=self.conn_id)
     elif self.conn_type == 'oracle':
         from airflow.hooks.oracle_hook import OracleHook
         return OracleHook(oracle_conn_id=self.conn_id)
     elif self.conn_type == 'vertica':
         from airflow.contrib.hooks.vertica_hook import VerticaHook
         return VerticaHook(vertica_conn_id=self.conn_id)
     elif self.conn_type == 'cloudant':
         from airflow.contrib.hooks.cloudant_hook import CloudantHook
         return CloudantHook(cloudant_conn_id=self.conn_id)
     elif self.conn_type == 'jira':
         from airflow.contrib.hooks.jira_hook import JiraHook
         return JiraHook(jira_conn_id=self.conn_id)
     elif self.conn_type == 'redis':
         from airflow.contrib.hooks.redis_hook import RedisHook
         return RedisHook(redis_conn_id=self.conn_id)
     elif self.conn_type == 'wasb':
         from airflow.contrib.hooks.wasb_hook import WasbHook
         return WasbHook(wasb_conn_id=self.conn_id)
     elif self.conn_type == 'docker':
         from airflow.hooks.docker_hook import DockerHook
         return DockerHook(docker_conn_id=self.conn_id)
     elif self.conn_type == 'azure_data_lake':
         from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
         return AzureDataLakeHook(azure_data_lake_conn_id=self.conn_id)
     elif self.conn_type == 'azure_cosmos':
         from airflow.contrib.hooks.azure_cosmos_hook import AzureCosmosDBHook
         return AzureCosmosDBHook(azure_cosmos_conn_id=self.conn_id)
     elif self.conn_type == 'cassandra':
         from airflow.contrib.hooks.cassandra_hook import CassandraHook
         return CassandraHook(cassandra_conn_id=self.conn_id)
     elif self.conn_type == 'mongo':
         from airflow.contrib.hooks.mongo_hook import MongoHook
         return MongoHook(conn_id=self.conn_id)
     elif self.conn_type == 'gcpcloudsql':
         from airflow.gcp.hooks.cloud_sql import CloudSqlDatabaseHook
         return CloudSqlDatabaseHook(gcp_cloudsql_conn_id=self.conn_id)
     elif self.conn_type == 'grpc':
         from airflow.contrib.hooks.grpc_hook import GrpcHook
         return GrpcHook(grpc_conn_id=self.conn_id)
     raise AirflowException("Unknown hook type {}".format(self.conn_type))
Example #30
 def execute(self, context):
     self.log.info('Executing: %s', self.sql)
     hook = SqliteHook(sqlite_conn_id=self.sqlite_conn_id)
     return hook.get_records(self.sql, parameters=self.parameters)
 def execute(self, context):
     logging.info('Executing: ' + self.sql)
     hook = SqliteHook(sqlite_conn_id=self.sqlite_conn_id)
     hook.run(self.sql, parameters=self.parameters)