Python BaseHook Exemples, airflow.hooks.base_hook.BaseHook Python Exemples

Exemple #1

1

Afficher le fichier

Fichier : gcp_container_operator.py Projet : apache/incubator-airflow

    def execute(self, context):
        """Fetch cluster credentials with ``gcloud`` and run the parent operator.

        The Kubernetes config produced by ``gcloud`` is written to a temporary
        file whose path is exported through ``KUBE_CONFIG_ENV_VAR`` and passed
        to the parent class via ``self.config_file``.

        :param context: task context, forwarded unchanged to the parent ``execute``
        :return: the value returned by the parent ``execute``
        :raises subprocess.CalledProcessError: if the ``gcloud`` call exits non-zero
        """
        # Specifying a service account file allows the user to use non-default
        # authentication for creating a Kubernetes Pod. This is done by setting the
        # environment variable `GOOGLE_APPLICATION_CREDENTIALS` that gcloud looks at.
        key_file = None

        # If gcp_conn_id is not specified gcloud will use the default
        # service account credentials.
        if self.gcp_conn_id:
            from airflow.hooks.base_hook import BaseHook
            # extras is a deserialized json object
            extras = BaseHook.get_connection(self.gcp_conn_id).extra_dejson
            # key_file only gets set if a json file is created from a JSON string in
            # the web ui, else none
            key_file = self._set_env_from_extras(extras=extras)

        # Write config to a temp file and set the environment variable to point to it.
        # This is to avoid race conditions of reading/writing a single file
        with tempfile.NamedTemporaryFile() as conf_file:
            os.environ[KUBE_CONFIG_ENV_VAR] = conf_file.name
            try:
                # Attempt to get/update credentials
                # We call gcloud directly instead of using google-cloud-python api
                # because there is no way to write kubernetes config to a file, which is
                # required by KubernetesPodOperator.
                # The gcloud command looks at the env variable `KUBECONFIG` for where to
                # save the kubernetes config file.
                subprocess.check_call(
                    ["gcloud", "container", "clusters", "get-credentials",
                     self.cluster_name,
                     "--zone", self.location,
                     "--project", self.project_id])
            finally:
                # Closing the key file deletes it from the file system. Doing it in
                # `finally` ensures the credentials file is also removed when gcloud
                # fails, instead of lingering until the object is garbage collected.
                if key_file:
                    key_file.close()

            # Tell `KubernetesPodOperator` where the config file is located
            self.config_file = os.environ[KUBE_CONFIG_ENV_VAR]
            return super().execute(context)

Exemple #2

0

Afficher le fichier

Fichier : generic_transfer.py Projet : sstm2/incubator-airflow

    def execute(self, context):
        """Run ``self.sql`` on the source connection and insert the rows at the destination.

        If ``self.preoperator`` is set it is run on the destination hook before
        the rows are inserted into ``self.destination_table``.

        :param context: task context (not used here)
        """
        log = self.log

        reader = BaseHook.get_hook(self.source_conn_id)
        log.info("Extracting data from %s", self.source_conn_id)
        log.info("Executing: \n %s", self.sql)
        rows = reader.get_records(self.sql)

        writer = BaseHook.get_hook(self.destination_conn_id)
        preoperator = self.preoperator
        if preoperator:
            log.info("Running preoperator")
            log.info(preoperator)
            writer.run(preoperator)

        log.info("Inserting rows into %s", self.destination_conn_id)
        writer.insert_rows(table=self.destination_table, rows=rows)

Exemple #3

0

Afficher le fichier

Fichier : generic_transfer.py Projet : 16522855/airflow

    def execute(self, context):
        """Run ``self.sql`` on the source connection and insert the rows at the destination.

        If ``self.preoperator`` is set it is run on the destination hook before
        the rows are inserted into ``self.destination_table``.

        :param context: task context (not used here)
        """
        source_hook = BaseHook.get_hook(self.source_conn_id)

        # Pass values as logging arguments instead of pre-building the message:
        # formatting is deferred to the logging framework and a non-string
        # `self.sql` no longer raises TypeError on concatenation.
        logging.info("Extracting data from %s", self.source_conn_id)
        logging.info("Executing: \n%s", self.sql)
        results = source_hook.get_records(self.sql)

        destination_hook = BaseHook.get_hook(self.destination_conn_id)
        if self.preoperator:
            logging.info("Running preoperator")
            logging.info(self.preoperator)
            destination_hook.run(self.preoperator)

        logging.info("Inserting rows into %s", self.destination_conn_id)
        destination_hook.insert_rows(table=self.destination_table, rows=results)

Exemple #4

0

Afficher le fichier

Fichier : gcp_container_operator.py Projet : mozilla/telemetry-airflow

    def execute(self, context):
        """Fetch cluster credentials with ``gcloud`` and run the parent operator.

        :param context: task context, forwarded unchanged to the parent ``execute``
        """
        # Without a gcp_conn_id, gcloud falls back to the default service
        # account credentials.
        conn_id = self.gcp_conn_id
        if conn_id:
            from airflow.hooks.base_hook import BaseHook
            connection = BaseHook.get_connection(conn_id)
            # extra_dejson is the connection's deserialized JSON "extra" field
            self._set_env_from_extras(extras=connection.extra_dejson)

        # A per-run temp file holds the kube config, which avoids races between
        # tasks reading/writing one shared file.
        with tempfile.NamedTemporaryFile() as kube_config:
            os.environ[KUBE_CONFIG_ENV_VAR] = kube_config.name

            # gcloud is invoked directly (rather than the google-cloud-python
            # api) because the kubernetes config has to end up in a file for
            # KubernetesPodOperator; gcloud reads the `KUBECONFIG` env variable
            # to decide where to save it.
            get_credentials_cmd = [
                "gcloud", "container", "clusters", "get-credentials",
                self.cluster_name,
                "--zone", self.location,
                "--project", self.project_id,
            ]
            subprocess.check_call(get_credentials_cmd)

            # Point `KubernetesPodOperator` at the generated config file
            self.config_file = os.environ[KUBE_CONFIG_ENV_VAR]
            super(GKEPodOperator, self).execute(context)

Exemple #5

0

Afficher le fichier

Fichier : sql_sensor.py Projet : MiguelPeralvo/incubator-airflow

    def poke(self, context):
        """Run ``self.sql`` and report whether the first cell is non-empty and non-zero.

        :param context: task context (not used here)
        :return: ``False`` when no rows come back or the first cell of the
            first row stringifies to ``'0'`` or ``''``; ``True`` otherwise
        """
        connection = BaseHook.get_connection(self.conn_id)
        db_hook = connection.get_hook()

        self.log.info('Poking: %s', self.sql)
        rows = db_hook.get_records(self.sql)
        if rows:
            first_cell = str(rows[0][0])
            return not (first_cell in ('0', ''))
        return False

Exemple #6

0

Afficher le fichier

Fichier : training.py Projet : GoogleCloudPlatform/training-data-analyst

def _get_project_id():
  """Get project ID from default GCP connection.

  Returns:
    The value stored under 'extra__google_cloud_platform__project' in the
    extras of the 'google_cloud_default' connection.

  Raises:
    ValueError: if the connection extras do not contain the project key.
  """

  extras = BaseHook.get_connection('google_cloud_default').extra_dejson
  key = 'extra__google_cloud_platform__project'
  if key not in extras:
    # Raising a bare string is itself a TypeError ("exceptions must derive
    # from BaseException") and hides the message; raise a real exception.
    raise ValueError('Must configure project_id in google_cloud_default '
                     'connection from Airflow Console')
  return extras[key]

Exemple #7

0

Afficher le fichier

Fichier : sensors.py Projet : Zen-Slug/incubator-airflow

    def poke(self, context):
        """Run ``self.sql`` and report whether the first cell is non-empty and non-zero.

        :param context: task context (not used here)
        :return: ``False`` when no rows come back or the first cell of the
            first row stringifies to ``'0'`` or ``''``; ``True`` otherwise
        """
        hook = BaseHook.get_connection(self.conn_id).get_hook()

        # Lazy logging arguments: no string concatenation up front, and a
        # non-string `self.sql` cannot raise TypeError here.
        logging.info('Poking: %s', self.sql)
        records = hook.get_records(self.sql)
        if not records:
            return False
        # The original trailing `print(records[0][0])` sat after both return
        # branches and could never run; it has been removed.
        return str(records[0][0]) not in ('0', '')

Exemple #8

0

Afficher le fichier

Fichier : qubole_sensor.py Projet : caseybrown89/airflow

    def poke(self, context):
        """Configure the Qubole client and ask ``self.sensor_class`` to check ``self.data``.

        :param context: task context (not used here)
        :return: the result of ``self.sensor_class.check(self.data)``, or
            ``False`` if that call raises
        """
        conn = BaseHook.get_connection(self.qubole_conn_id)
        Qubole.configure(api_token=conn.password, api_url=conn.host)

        # `this` is not a Python name; the original `this.log` raised NameError
        # on every poke. The logger lives on the instance.
        self.log.info('Poking: %s', self.data)

        try:
            status = self.sensor_class.check(self.data)
        except Exception as e:
            # Deliberately best-effort: a failing check is logged with its
            # traceback and reported as "not ready" rather than failing the task.
            self.log.exception(e)
            status = False

        self.log.info('Status of this Poke: %s', status)

        return status

Exemple #9

0

Afficher le fichier

Fichier : sql_sensor.py Projet : Fokko/incubator-airflow

    def poke(self, context):
        """Validate the connection type, run ``self.sql`` and evaluate the first cell.

        :param context: task context (not used here)
        :return: ``False`` when no rows come back or the first cell of the
            first row stringifies to ``'0'`` or ``''``; ``True`` otherwise
        :raises AirflowException: if the connection type is not in the
            supported set
        """
        conn = BaseHook.get_connection(self.conn_id)

        allowed_conn_type = {'google_cloud_platform', 'jdbc', 'mssql',
                             'mysql', 'oracle', 'postgres',
                             'presto', 'sqlite', 'vertica'}
        supported = conn.conn_type in allowed_conn_type
        if not supported:
            message = ("The connection type is not supported by SqlSensor. " +
                       "Supported connection types: {}".format(list(allowed_conn_type)))
            raise AirflowException(message)

        # The hook is only built once the connection type has been accepted.
        db_hook = conn.get_hook()

        self.log.info('Poking: %s (with parameters %s)', self.sql, self.parameters)
        rows = db_hook.get_records(self.sql, self.parameters)
        if rows:
            return str(rows[0][0]) not in ('0', '')
        return False

Exemple #10

0

Afficher le fichier

Fichier : qubole_hook.py Projet : apache/incubator-airflow

    def get_extra_links(self, operator, dttm):
        """
        Build the URL of the Qubole command result page for a task instance.

        The command id is read from the ``qbol_cmd_id`` XCom of the task; when
        there is none, an empty string is returned.

        :param operator: operator
        :param dttm: datetime
        :return: url link
        """
        conn = BaseHook.get_connection(operator.kwargs['qubole_conn_id'])
        has_custom_host = conn and conn.host
        if has_custom_host:
            # Swap a trailing "api" in the configured host for the analyze path
            base_url = re.sub(r'api$', 'v2/analyze?command_id=', conn.host)
        else:
            base_url = 'https://api.qubole.com/v2/analyze?command_id='

        task_instance = TaskInstance(task=operator, execution_date=dttm)
        command_id = task_instance.xcom_pull(task_ids=operator.task_id,
                                             key='qbol_cmd_id')
        if not command_id:
            return ''
        return base_url + str(command_id)

Exemple #11

0

Afficher le fichier

Fichier : transfertoteradata_operator.py Projet : flolas/docker-airflow

    def execute(self, context):
        """Run ``self.sql`` on the source connection and load the rows through a ``TeradataHook``.

        If ``self.preoperator`` is set it is run on the destination first. Rows
        are written with ``bulk_insert_rows`` (committing every
        ``self.batch_size`` rows) when ``self.batch`` is truthy, otherwise with
        ``insert_rows`` committing every 1000 rows.

        :param context: task context (not used here)
        """
        reader = BaseHook.get_hook(self.source_conn_id)

        logging.info("Extracting data from {}".format(self.source_conn_id))
        logging.info("Executing: \n" + self.sql)
        results = reader.get_records(self.sql)

        writer = TeradataHook(teradata_conn_id=self.destination_conn_id)
        if self.preoperator:
            logging.info("Running preoperator")
            logging.info(self.preoperator)
            writer.run(self.preoperator)

        if not self.batch:
            logging.info("Inserting {} rows into {}".format(len(results), self.destination_conn_id))
            writer.insert_rows(
                table=self.destination_table,
                rows=iter(results),
                commit_every=1000,
                unicode_source=self.unicode_source,
            )
            return

        logging.info("Inserting {} rows into {} with a batch size of {} rows".format(len(results), self.destination_conn_id, self.batch_size))
        writer.bulk_insert_rows(
            table=self.destination_table,
            rows=iter(results),
            commit_every=self.batch_size,
            unicode_source=self.unicode_source,
        )

Exemple #12

0

Afficher le fichier

Fichier : gcp_sql_operator.py Projet : Fokko/incubator-airflow

 def __init__(self,
              sql,
              autocommit=False,
              parameters=None,
              gcp_conn_id='google_cloud_default',
              gcp_cloudsql_conn_id='google_cloud_sql_default',
              *args, **kwargs):
     """Store the query settings and build the Cloud SQL database hook.

     The GCP connection is looked up here, and its
     ``extra__google_cloud_platform__project`` extra (if any) is passed to
     ``CloudSqlDatabaseHook`` as the default project id.

     :param sql: stored as ``self.sql``
     :param autocommit: stored as ``self.autocommit``
     :param parameters: stored as ``self.parameters``
     :param gcp_conn_id: id of the GCP connection to look up
     :param gcp_cloudsql_conn_id: connection id handed to ``CloudSqlDatabaseHook``
     """
     super(CloudSqlQueryOperator, self).__init__(*args, **kwargs)
     self.sql, self.autocommit, self.parameters = sql, autocommit, parameters
     self.gcp_conn_id = gcp_conn_id
     self.gcp_cloudsql_conn_id = gcp_cloudsql_conn_id

     self.gcp_connection = BaseHook.get_connection(self.gcp_conn_id)
     default_project_id = self.gcp_connection.extra_dejson.get(
         'extra__google_cloud_platform__project')
     self.cloudsql_db_hook = CloudSqlDatabaseHook(
         gcp_cloudsql_conn_id=gcp_cloudsql_conn_id,
         default_gcp_project_id=default_project_id)

     # Both start out unset; nothing in this constructor assigns them.
     self.cloud_sql_proxy_runner = None
     self.database_hook = None

Exemple #13

0

Afficher le fichier

Fichier : Enviroment_Setup_Create_CDC_Field_Reference_for_Customer_Database.py Projet : dalehanson/Airflow_ingestion

# Settings read from Airflow Variables.
Airflow_snowflake_connection_name = Variable.get('Airflow_snowflake_connection_name')
orchestration_country = Variable.get('orchestration_country')
max_task_time = int(Variable.get('set_task_max_time_minutes')) # max runtime for a task (minutes, per the variable name)
max_task_retries_on_error = int(Variable.get('max_task_retries_on_error'))

database_include_patterns = ['trans*', 'gateway'] # only include the staging, transaction, and gateway databases; for multiple patterns, format as a comma-separated list




##################################################################
# Collecting connection attributes from the Airflow connections repo
##################################################################

sf_con_parm = BaseHook.get_connection(Airflow_snowflake_connection_name)
snowflake_username = sf_con_parm.login
snowflake_password = sf_con_parm.password
snowflake_account = sf_con_parm.host
snowflake_schema = 'A_UTILITY'
snowflake_warehouse = "MYSQL_TO_RAW_MIGRATION_XSMALL_1"
# Pick the raw database from the orchestration country (case-insensitive).
# NOTE(review): if the country matches none of the lists below,
# `snowflake_database` is never assigned and its first use raises NameError.
if orchestration_country.lower() in ['us', 'usa','united states','u.s.','u.s.a']:
    snowflake_database = "US_RAW"
if orchestration_country.lower() in ['ca', 'canada','c.a.']:
    snowflake_database = "CA_RAW"
if orchestration_country.lower() in ['uk', 'u.k.','united kingdom']:
    snowflake_database = "UK_RAW"

########################################################################
#Defining Utility functions
########################################################################

Exemple #14

0

Afficher le fichier

Fichier : neo4j_con_dag.py Projet : MarcusDoubleYou/airflow-kube-dags-import

    'depends_on_past': False,
    'start_date': datetime(2019, 6, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=0)
}

# DAG named 'neo4j-con-1', scheduled hourly with catchup disabled.
dag = DAG(
    'neo4j-con-1',
    default_args=default_args,
    description='testing generic cypher',
    # schedule_interval=timedelta(days=1)
    schedule_interval='@hourly',
    catchup=False)

t1 = BashOperator(task_id='print_date', bash_command='date', dag=dag)

# NOTE(review): this lookup runs at module level, i.e. whenever the file is
# imported, not only when a task executes.
connection = BaseHook.get_connection("neo4j_default")
uri = connection.host
pw = connection.password

cypher_1 = Neo4jOperator(task_id='node_count',
                         cql="MATCH (n) RETURN count(n)",
                         uri=uri,
                         pw=pw,
                         dag=dag)

# cypher_1 is upstream of t1, so the node count runs before print_date.
t1.set_upstream(cypher_1)

Exemple #15

0

Afficher le fichier

    :return:
    """

    ti = kwargs['ti']

    emr_dns = ti.xcom_pull(task_ids='iac_create_emr_cluster')

    ssh_conn = BaseHook.get_connection('ssh_default')
    ssh_conn.host = emr_dns

    session = settings.Session()  # get the session
    session.add(ssh_conn)
    session.commit()


# Look the 'ssh_default' connection up once and reuse it, instead of issuing
# three identical lookups for host, key file and login.
_ssh_default_conn = BaseHook.get_connection('ssh_default')
ssh_emr_host = _ssh_default_conn.host
ssh_emr_key = _ssh_default_conn.extra_dejson.get('key_file')
ssh_emr_user = _ssh_default_conn.login
files_to_upload = '{constants.py,create_integration_layer.py,create_landing_zone.py,create_presentation_layer.py,' \
                  'helper_functions,load_integration_layer,load_landing_zone,load_presentation_layer,sql_queries,quality_checks}'

spark_master = 'yarn'

default_args = {
    'owner': 'flights_dl',
    'depends_on_past': False,
    'retries': 0,
    'catchup': False,
    'email_on_retry': False,
    'concurrency': 3

Exemple #16

0

Afficher le fichier

Fichier : facebook_operator.py Projet : sdaltmann/ewah

    def __init__(
        self,
        account_ids,
        insight_fields,
        level,
        data_from=None,
        data_until=None,
        time_increment=1,
        breakdowns=None,
        execution_waittime_seconds=15, # wait for a while before execution
        #   between account_ids to avoid hitting rate limits during backfill
        pagination_limit=1000,
        async_job_read_frequency_seconds=5,
        reload_data_from=None,
    *args, **kwargs):
        """Validate the arguments, initialise the parent and load credentials.

        :param account_ids: iterable of account ids (strings or integers)
        :param insight_fields: insight field names; each must be a
            non-routine attribute value of ``AdsInsights.Field``. The caller's
            list is not modified.
        :param level: reporting level; only ``self.levels.ad`` is supported
        :param data_from: stored as ``self.data_from``
        :param data_until: stored as ``self.data_until``
        :param time_increment: an integer from 1 to 90, or ``'monthly'`` /
            ``'all_days'``
        :param breakdowns: optional list, appended to ``update_on_columns``
        :param execution_waittime_seconds: stored on the instance
        :param pagination_limit: stored on the instance
        :param async_job_read_frequency_seconds: stored on the instance
        :param reload_data_from: stored on the instance
        :raises Exception: on any invalid argument or incomplete connection extra
        """

        if kwargs.get('update_on_columns'):
            raise Exception('update_on_columns is set by operator!')

        # `account_ids.__iter__` raised AttributeError for non-iterables, so
        # the intended error below was never reached; test for the attribute.
        if not hasattr(account_ids, '__iter__'):
            raise Exception('account_ids must be an iterable, such as a list,' \
                + ' of strings or integers!')

        if level == self.levels.ad:
            kwargs['update_on_columns'] = [
                'ad_id',
                'date_start',
                'date_stop',
            ] + (breakdowns or [])
            # Build a new de-duplicated list; `+=` would have extended the
            # caller's list in place.
            insight_fields = list(set(insight_fields) | {'ad_id', 'ad_name'})
        else:
            raise Exception('Specified level not supported!')

        is_named_increment = (
            isinstance(time_increment, str)
            and time_increment in ['monthly', 'all_days']
        )
        # The upper bound was `<= 1`, which rejected everything except 1 and
        # contradicted the error message; the documented range is 1 to 90.
        # bool is excluded explicitly because it is a subclass of int.
        is_day_increment = (
            isinstance(time_increment, int)
            and not isinstance(time_increment, bool)
            and 1 <= time_increment <= 90
        )
        if not (is_named_increment or is_day_increment):
            raise Exception('time_increment must either be an integer ' \
                + 'between 1 and 90, or a string of either "monthly" '\
                + 'or "all_days". Recommended and default is the integer 1.')

        # Values of all non-routine, non-dunder members of AdsInsights.Field
        allowed_insight_fields = [
            value for name, value in inspect.getmembers(
                AdsInsights.Field,
                lambda a: not inspect.isroutine(a),
            )
            if not (name.startswith('__') and name.endswith('__'))
        ]
        for i_f in insight_fields:
            if i_f not in allowed_insight_fields:
                raise Exception((
                    'Field {0} is not an accepted value for insight_fields! ' \
                    + 'Accepted field values:\n\t{1}\n'
                ).format(
                    i_f,
                    '\n\t'.join(allowed_insight_fields)
                ))

        self.data_from = data_from
        self.data_until = data_until

        super().__init__(*args, **kwargs)

        # self.source_conn_id is read only after the parent initialiser has run.
        credentials = BaseHook.get_connection(self.source_conn_id)
        extra = credentials.extra_dejson

        # The access token may live in the extra or in the connection password.
        access_token = extra.get('access_token', credentials.password)

        # Note: app_secret is not always required!
        if not extra.get('app_id'):
            raise Exception('Connection extra must contain an "app_id"!')
        if not access_token:
            raise Exception('Connection extra must contain an "access_token" ' \
                + 'if it is not saved as the connection password!')

        self.credentials = {
            'app_id': extra.get('app_id'),
            'app_secret': extra.get('app_secret'),
            'access_token': access_token,
        }

        self.account_ids = account_ids
        self.insight_fields = insight_fields
        self.level = level
        self.time_increment = time_increment
        self.breakdowns = breakdowns
        self.execution_waittime_seconds = execution_waittime_seconds
        self.pagination_limit = pagination_limit
        self.async_job_read_frequency_seconds = async_job_read_frequency_seconds
        self.reload_data_from = reload_data_from

Exemple #17

0

Afficher le fichier

import smtplib
from datetime import timedelta
from email import encoders
from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

import pandas as pd
import plotly.graph_objects as go
from airflow.hooks.base_hook import BaseHook
from pendulum import Pendulum
from plotly.subplots import make_subplots

from utils import db

email_connection = BaseHook.get_connection('sender_email')


def send_email(**kwargs):
    """Build and send the price report when the execution date's weekday is 6.

    On any other weekday the function does nothing.

    :param kwargs: must contain ``execution_date``
    """
    execution_date: Pendulum = kwargs['execution_date']

    # 6 is presumably Sunday (datetime-style weekday numbering) - TODO confirm
    if execution_date.weekday() != 6:
        return

    price_table = analyse_prices(execution_date=execution_date)
    html_report = create_html_report(table=price_table)
    __send_email(execution_date=execution_date, report_text=html_report)


def create_html_report(table: pd.DataFrame) -> str:
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(go.Scatter(x=table.index,
                             y=table.average_price,

Exemple #18

0

Afficher le fichier

Fichier : check_operator.py Projet : RogerThomas/incubator-airflow

 def get_db_hook(self):
     """Return the hook that ``BaseHook.get_hook`` resolves for ``self.conn_id``."""
     db_hook = BaseHook.get_hook(conn_id=self.conn_id)
     return db_hook

Exemple #19

0

Afficher le fichier

        # hive create table
        hive_hook = HiveCliHook()
        sql = ODS_CREATE_TABLE_SQL.format(db_name=HIVE_DB,
                                          table_name=hive_table_name,
                                          columns=",\n".join(rows),
                                          ufile_path=UFILE_PATH %
                                          (db_name, table_name))
        logging.info('Executing: %s', sql)
        hive_hook.run_cli(sql)
    return


conn_conf_dict = {}
for db_name, table_name, conn_id, prefix_name, priority_weight_nm in table_list:
    if conn_id not in conn_conf_dict:
        conn_conf_dict[conn_id] = BaseHook.get_connection(conn_id)

    hive_table_name = HIVE_TABLE % (prefix_name, table_name)
    if table_name in ['data_opay_transaction']:
        m = 1
    else:
        m = 12
    # sqoop import
    import_table = BashOperator(
        task_id='import_table_{}'.format(hive_table_name),
        priority_weight=priority_weight_nm,
        bash_command='''
            #!/usr/bin/env bash
            sqoop import "-Dorg.apache.sqoop.splitter.allow_text_splitter=true" \
            -D mapred.job.queue.name=root.collects \
            --connect "jdbc:mysql://{host}:{port}/{schema}?tinyInt1isBit=false&useUnicode=true&characterEncoding=utf8" \

Exemple #20

0

Afficher le fichier

Fichier : Migrate_all_databases_mysql_to_snowflake_for_prod_added_overlow_audit_table.py Projet : dalehanson/Airflow_ingestion

database_include_patterns = [
    'prefix*'
]  # only include the staging, transaction, and gateway databases; for multiple patterns, format as a comma-separated list

excluded_tables = ['table1', 'table2']  # list of tables we don't want to migrate

max_task_time = int(
    Variable.get('set_task_max_time_minutes'))  # max runtime for a task (minutes, per the variable name)
max_task_retries_on_error = int(Variable.get('max_task_retries_on_error'))

##################################################################
# Collecting connection attributes from the Airflow connections repo
##################################################################

# Snowflake settings: login, password and host come from the 'snowflake_1'
# connection; schema and database are hard-coded.
sf_con_parm = BaseHook.get_connection(
    'snowflake_1')  # Airflow_snowflake_connection_name
snowflake_username = sf_con_parm.login
snowflake_password = sf_con_parm.password
snowflake_account = sf_con_parm.host
snowflake_stage_schema = 'A_UTILITY'
#snowflake_warehouse = "XSMALL"
snowflake_database = "US_RAW"

# MySQL settings, all read from the 'mysql_celltrak_1' connection.
mysql_con = BaseHook.get_connection(
    'mysql_celltrak_1')  # Airflow_mysql_connection_name
mysql_username = mysql_con.login
mysql_password = mysql_con.password
mysql_hostname = mysql_con.host
mysql_port = mysql_con.port

########################################################################

Exemple #21

0

Afficher le fichier

Fichier : check_operator.py Projet : apache/incubator-airflow

 def get_db_hook(self):
     """Return the hook that ``BaseHook.get_hook`` resolves for ``self.conn_id``."""
     db_hook = BaseHook.get_hook(conn_id=self.conn_id)
     return db_hook

Exemple #22

0

Afficher le fichier

Fichier : google_ads_operator.py Projet : sdaltmann/ewah

    def ewah_execute(self, context):
        """Query Google Ads for the configured date range and upload the rows.

        Resolves ``self.data_from`` / ``self.data_until``, reads the required
        credential keys from the connection's extra JSON, builds and runs the
        query, flattens every result row into a dict and hands the list to
        ``self.upload_data``.

        :param context: task context; ``next_execution_date`` and
            ``execution_date`` serve as fallbacks for the date range
        :raises Exception: if a key from ``self._REQUIRED_KEYS`` is missing in
            the connection extra
        """
        # Task execution happens here
        def get_data_from_ads_output(fields_dict, values, prefix=None):
            """Flatten ``values`` into a dict keyed by underscore-joined attribute paths.

            ``fields_dict`` maps an attribute name to a list whose items are
            either leaf attribute names or nested dicts of the same shape.
            """
            if not prefix:
                # Covers both None and '' (indexing '' with [-1] would fail)
                prefix = ''
            elif not prefix.endswith('_'):
                prefix += '_'
                # e.g. 2b prefix = 'ad_group_criterion_'
            data = {}
            for key, value in fields_dict.items():
                # e.g. 1 key = 'metrics', value = ['impressions', 'clicks']
                # e.g. 2a key = 'ad_group_criterion', value = [{'keyword': ['text', 'match_type']}]
                # e.g. 2b key = 'keyword', value = ['text', 'match_type']
                node = getattr(values, key)
                # e.g. 1 node = row.metrics
                # e.g. 2a node = row.ad_group_criterion
                # e.g. 2b node = row.ad_group_criterion.keyword
                for item in value:
                    # e.g. 1 item = 'clicks'
                    # e.g. 2a item = {'keyword': ['text', 'match_type']}
                    # e.g. 2b item = 'text'
                    if isinstance(item, dict):
                        data.update(
                            get_data_from_ads_output(
                                fields_dict=item,
                                values=node,
                                # e.g. 2a '' + 'ad_group_criterion'
                                prefix=prefix + key,
                            ))
                    else:
                        # e.g. 1: {'' + 'metrics' + '_' + 'clicks': row.metrics.clicks.value}
                        # e.g. 2b: {'ad_group_criterion_' + 'keyword' + '_' + 'text': row.ad_group_criterion.keyword.text.value}
                        leaf = getattr(node, item)
                        # some node ends don't respond to .value but are
                        #   already the value
                        data[prefix + key + '_' + item] = (
                            leaf.value if hasattr(leaf, 'value') else leaf
                        )
            return data

        self.data_until = airflow_datetime_adjustments(self.data_until)
        self.data_until = self.data_until or context['next_execution_date']
        if isinstance(self.data_from, timedelta):
            # A timedelta is interpreted relative to the end of the range
            self.data_from = self.data_until - self.data_from
        else:
            self.data_from = airflow_datetime_adjustments(self.data_from)
            self.data_from = self.data_from or context['execution_date']

        conn = BaseHook.get_connection(self.source_conn_id).extra_dejson
        credentials = {}
        for key in self._REQUIRED_KEYS:
            if key not in conn:
                raise Exception(
                    '{0} must be in connection extra json!'.format(key))
            credentials[key] = conn[key]

        # build the query
        query = 'SELECT {0} FROM {1} WHERE segments.date {2} {3}'.format(
            ', '.join(self.fields_list),
            self.resource,
            "BETWEEN '{0}' AND '{1}'".format(
                self.data_from.strftime('%Y-%m-%d'),
                self.data_until.strftime('%Y-%m-%d'),
            ),
            # The space after AND is required: without it the first condition
            # was glued to the keyword ("ANDfoo = 1"), producing an invalid query.
            ('AND ' + ' AND '.join(self.conditions)) if self.conditions else '',
        )

        self.log.info('executing this google ads query:\n{0}'.format(query))
        cli = GoogleAdsClient.load_from_dict(credentials)
        service = cli.get_service("GoogleAdsService", version="v3")
        search = service.search(
            self.client_id.replace('-', ''),
            query=query,
        )

        # get into uploadable format
        # Convert rows as they are iterated: this avoids holding a second full
        # list of raw rows and the quadratic cost of repeated `list.pop(0)`.
        upload_data = [
            get_data_from_ads_output(
                deepcopy(self.fields_dict),
                row,
            )
            for row in search
        ]

        self.upload_data(upload_data)

Exemple #23

0

Afficher le fichier

def create_connection():
    """Return the Airflow connection registered under the id 'mssql_pi'."""
    return BaseHook.get_connection('mssql_pi')

Exemple #24

0

Afficher le fichier

Fichier : censo_dag.py Projet : jasondavindev/data-eng-censo-escolar

def hdfs_conn(conn_id='hdfs'):
    """Return an ``hdfs://host:port`` URL built from the given Airflow connection.

    :param conn_id: id of the connection to read host and port from
    """
    hdfs = BaseHook.get_connection(conn_id)
    url = f"hdfs://{hdfs.host}:{hdfs.port}"
    return url

Exemple #25

0

Afficher le fichier

 def test_dbapi_get_sqlalchemy_engine(self):
     """The hook of the 'test_uri' connection yields a SQLAlchemy engine with the expected URL."""
     expected_url = 'postgres://*****:*****@ec2.compute.com:5432/the_database'

     hook = BaseHook.get_connection(conn_id='test_uri').get_hook()
     engine = hook.get_sqlalchemy_engine()

     self.assertIsInstance(engine, sqlalchemy.engine.Engine)
     self.assertEqual(expected_url, str(engine.url))

Exemple #26

0

Afficher le fichier

 def from_conn_id(cls, conn_id: str) -> 'CustomBaseHook':
     """Build an instance of ``cls`` from the Airflow connection registered as ``conn_id``."""
     return cls(BaseHook.get_connection(conn_id))

Exemple #27

0

Afficher le fichier

user = logging.getLogger(__name__)

default_args = {
    'owner': 'airflow',
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

# Snowflake information
# Information must be stored in connections
# It can be done with Airflow UI -
# Admin -> Connections -> Create
database_name = 'TEST_DB'
table_name = 'customer'
schema_name = 'public'
# Look the connection up once and reuse it, instead of issuing three
# identical lookups for login, password and host.
_snowflake_conn = BaseHook.get_connection('snowflake')  # Snowflake conn id=snowflake
snowflake_username = _snowflake_conn.login
snowflake_password = _snowflake_conn.password
snowflake_account = _snowflake_conn.host

dag = DAG(dag_id="finally_done",
          default_args=default_args,
          start_date=datetime(2020, 3, 31),
          schedule_interval='*/12 * * * *',
          catchup=False)

#Loading Data


def load_data(**context):
    con = snowflake.connector.connect(user = snowflake_username, \

Exemple #28

0

Afficher le fichier

def send_slack_message(text):
    """POST ``text`` as a JSON body to the host of the 'slack' Airflow connection.

    :param text: message text, sent under the ``"text"`` key
    """
    webhook = BaseHook.get_connection("slack")
    body = json.dumps({"text": text})
    requests.post(
        webhook.host,
        data=body,
        headers={"Content-type": "application/json"},
    )

Exemple #29

0

Afficher le fichier

import datetime as dt

# Third Party
from airflow import DAG
from airflow.utils.helpers import chain
from airflow.hooks.base_hook import BaseHook
from airflow.operators.python_operator import PythonOperator
from airflow.operators.postgres_operator import PostgresOperator

# Custom
sys.path.append("/usr/local/airflow/dags/efs")
import redb.scripts.transfer_to_s3 as toS3
import redb.scripts.mdb_to_postgres as mdbToREDB

# Credentials for S3 Bucket, read from the 'redb-workbucket' connection.
# NOTE(review): BUCKET_NAME is taken from the connection id itself, so the
# bucket is presumably named like the connection - confirm.
BUCKET_CONN = BaseHook.get_connection('redb-workbucket')
BUCKET_NAME = BUCKET_CONN.conn_id
AWS_ACCESS_KEY_ID = BUCKET_CONN.login
AWS_SECRET_ACCESS_KEY = BUCKET_CONN.password

# Credentials for Database, read from the 'redb_postgres' connection.
# The connection's `schema` field is used as the database name.
DATABASE_CONN = BaseHook.get_connection('redb_postgres')
DATABASE_NAME = DATABASE_CONN.schema
DATABASE_HOST = DATABASE_CONN.host
DATABASE_USER = DATABASE_CONN.login
DATABASE_PORT = DATABASE_CONN.port
DATABASE_PASSWORD = DATABASE_CONN.password

default_args = {
    'owner': 'redb',
    'start_date': dt.datetime(2020, 7, 23),

Exemple #30

0

Afficher le fichier

Fichier : invoices_dag.py Projet : yasserhu/productionalizing-data-pipelines-airflow

    BranchPythonOperator

default_args = {
    "owner": "airflow",
    "start_date": datetime(2020, 11, 1),
    "depends_on_past": False,
    "email_on_failure": False,
    "email_on_retry": False,
    "email": "*****@*****.**",
    "retries": 1,
    "retry_delay": timedelta(minutes=5)
}

# Input CSV: "<path>/data.csv", where <path> is the "path" key of the
# 'data_path' connection's JSON extra.
data_path = f'{json.loads(BaseHook.get_connection("data_path").get_extra()).get("path")}/data.csv'
# Output CSV: same location and base name, with a "-transformed" suffix.
transformed_path = f'{os.path.splitext(data_path)[0]}-transformed.csv'
# The Slack token is stored as the password of the 'slack_conn' connection.
slack_token = BaseHook.get_connection("slack_conn").password


def transform_data(*args, **kwargs):
    """Read selected columns of the invoices CSV and write them to the transformed path.

    Reads ``data_path`` and writes ``transformed_path`` (both module-level).
    ``InvoiceDate`` is parsed as a date and column position 0 is used as the
    index. Positional and keyword arguments are accepted but ignored.
    """
    wanted_columns = [
        'StockCode', 'Quantity', 'InvoiceDate',
        'UnitPrice', 'CustomerID', 'Country'
    ]
    frame = pd.read_csv(
        filepath_or_buffer=data_path,
        sep=',',
        header=0,
        usecols=wanted_columns,
        parse_dates=['InvoiceDate'],
        index_col=0,
    )
    frame.to_csv(path_or_buf=transformed_path)

Exemple #31

0

Afficher le fichier

from airflow.utils.dates import days_ago
from datetime import datetime
import sqlalchemy
import pymysql
import papermill as pm
import airflow.hooks.S3_hook
from airflow.hooks.base_hook import BaseHook

# Storage locations used by the pipeline.
bucket_name = "spotify-billboard-airflow-project"
reports_storage_path = "pass"
data_storage_path = "/Users/jkocher/Documents/airflow_home/data/"
jupyter_notebook_storage = "pass"
billboard_location = "data/raw_data/billboard_pickle"
audio_features_location = "data/raw_data/audio_feature_pickle"
hook = airflow.hooks.S3_hook.S3Hook('my_conn_S3')
# Postgres engine built from the 'postgres_conn' connection; port and database
# name are hard-coded.
# NOTE(review): login and password are concatenated into the URL unescaped, so
# credentials containing characters such as '@', ':' or '/' would yield a
# malformed URL - consider URL-quoting them.
c = BaseHook.get_connection('postgres_conn')
engine = sqlalchemy.create_engine('postgresql+psycopg2://' + str(c.login) +
                                  ':' + str(c.password) + '@' + str(c.host) +
                                  ':5432/music_db')
#engine = sqlalchemy.create_engine('postgresql+psycopg2://airflow:[email protected]:5432/music_db')

# NOTE(review): `start_date` is evaluated with datetime.now() each time this
# module is loaded, so it is not a fixed date - confirm this is intended.
default_args = {
    'owner': 'James Kocher',
    'depends_on_past': False,
    'start_date': datetime.now(),
    'retries': 0
}

dag = DAG(
    "data_world_music_pipeline",
    default_args=default_args,

Exemple #32

0

Afficher le fichier

def load_data(**context):
    """Download recent daily quotes from Yahoo Finance and insert them into ClickHouse.

    The ticker list is read through the ``admin_postgres`` connection (rows of
    ``tickers`` flagged ``fetch_from_yahoo_finance``). Quotes are requested
    from the parsed ``yesterday_ds`` minus seven days up to ``tomorrow_ds``,
    reshaped into one frame with a fixed column set, and posted as
    JSONEachRow to the host of the ``rocket_clickhouse`` connection.

    :param context: Airflow task context; ``yesterday_ds`` and ``tomorrow_ds``
        are the keys read here.
    :raises ValueError: if none of the tickers is present in the downloaded
        data, so there is nothing to insert.
    :raises requests.HTTPError: if the insert request returns a non-OK status.
    """
    postgres_hook = PostgresHook('admin_postgres')
    tickers = postgres_hook.get_records(
        'select yf_code from tickers where fetch_from_yahoo_finance')
    logging.info('Loaded %d tickers from db.', len(tickers))
    # Each record is a row; the ticker code is its first field.
    tickers = [x[0] for x in tickers]
    frequency = '1d'
    start_dt = parse_execution_date(
        context['yesterday_ds']) - timedelta(days=7)
    data = yf.download(tickers=tickers,
                       start=start_dt,
                       end=context['tomorrow_ds'],
                       interval=frequency,
                       auto_adjust=True,
                       group_by='ticker',
                       progress=False,
                       threads=True)

    # Yahoo Finance column name -> target column name.
    columns_mapping = {
        'Date': 'ts',
        'Open': 'open',
        'High': 'high',
        'Low': 'low',
        'Close': 'close',
        'Volume': 'volume',
        'Adj Close': 'adj_close'
    }
    ch_columns = ['ticker', 'frequency', 'source', 'type'] + list(
        columns_mapping.values())

    # Collect the per-ticker frames and concatenate once at the end; growing a
    # frame with pd.concat inside the loop copies all earlier rows every time.
    frames = []
    for ticker in tickers:
        try:
            _df = data[ticker].copy()
        except KeyError:
            logging.error('Ticker %s not found in data', ticker)
            continue
        _df = _df.reset_index()
        _df['ticker'] = ticker
        _df['frequency'] = frequency
        _df['source'] = 'yfinance'
        _df['type'] = 'history'
        _df = _df.rename(columns=columns_mapping)
        if 'adj_close' not in _df.columns:
            # The download may not contain an adjusted close column.
            _df['adj_close'] = np.nan
        _df = _df[ch_columns]
        # Rows without a close price are dropped.
        _df = _df[~_df.close.isna()]
        frames.append(_df)

    if not frames:
        # Previously this path crashed with an AttributeError on None.shape.
        raise ValueError(
            'None of the %d requested tickers was found in the downloaded data'
            % len(tickers))

    df = pd.concat(frames, ignore_index=True)
    logging.info('Prepared df with shape (%s, %s)', *df.shape)

    # get_connection is a classmethod, so no BaseHook instance is needed.
    ch_conn = BaseHook.get_connection('rocket_clickhouse')

    # One JSON document per line, each terminated by a newline.
    data_json_each = ''.join(
        df.loc[i].to_json(date_format='iso') + '\n' for i in df.index)

    result = requests.post(
        url=ch_conn.host,
        data=data_json_each,
        params=dict(
            query='insert into rocket.events format JSONEachRow',
            user=ch_conn.login,
            password=ch_conn.password,
            date_time_input_format='best_effort',
        ))
    if result.ok:
        logging.info('Insert ok.')
    else:
        raise requests.HTTPError('Request response code: %d. Message: %s' %
                                 (result.status_code, result.text))

Exemple #33

0

Afficher le fichier

# custom operators
from operators.s3toredshift_operator import S3ToRedshiftOperator
from operators.dimension_operator import DimensionOperator

from operators.data_quality_count_operator import DataQualityCountOperator
from operators.data_quality_dimension_operator import DataQualityDimensionOperator

# helpers
from helpers import sql_queries_staging, sql_queries_presentation

# OS variables
# Timestamp taken when this module is imported (datetime.now() without a
# timezone argument), plus a day/month/year rendering of it.
start_time = datetime.now()
start_time_str = start_time.strftime("%d/%m/%Y %H:%M:%S")

# Configuration variables
# AWS credentials come from the login/password fields of the
# "aws_credentials" Airflow connection, looked up at import time.
aws_connection = BaseHook.get_connection("aws_credentials")
aws_username: str = aws_connection.login
aws_password: str = aws_connection.password
'''

    MANIFOLD DAG CONFIGURATION

'''

########################
# Airflow Dag Configs  #
########################
DEFAULT_ARGS = {
    'owner': 'Guilherme Banhudo',
    'depends_on_past': False,
    'email': ['*****@*****.**'],

Exemple #34

0

Afficher le fichier

Fichier : nba_box_scores.py Projet : ShrayBans/docker-airflow

# Defaults shared by the tasks of this DAG.
default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": datetime(2018, 10, 14),
    "email": ["*****@*****.**"],
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

# The database password is read from the "sixthman_prod" connection when this
# module is imported.
SIXTHMAN_PROD = BaseHook.get_connection("sixthman_prod")
SIXTHMAN_CONN_PASSWORD = SIXTHMAN_PROD.password

# Runs every 20 minutes; catchup is disabled.
dag = DAG("nba_box_scores",
          default_args=default_args,
          schedule_interval=timedelta(minutes=20),
          catchup=False)

# Single task: run the Node.js box-score scraper with the database URL passed
# through the DATABASE_API_CONNECTION environment variable prefix.
# The task-level retries=3 is set in addition to default_args["retries"].
# NOTE(review): the password is interpolated into the command string itself,
# so it is presumably visible wherever the rendered command is displayed or
# logged -- confirm and consider supplying it another way.
t1 = BashOperator(
    task_id="nba_box_scores_task",
    pool="nba_box_scores",
    bash_command=
    f"DATABASE_API_CONNECTION=postgres://sixthman:{SIXTHMAN_CONN_PASSWORD}@sixthman-prod.cbdmxavtswxu.us-west-1.rds.amazonaws.com:5432/sixthman node /usr/local/airflow/build/ingestJobs/scrapeNbaBoxscore.js",
    retries=3,
    execution_timeout=timedelta(minutes=3),
    dag=dag)

Exemple #35

0

Afficher le fichier

import gc


##################################################################
#Setting variable definitions and connections
##################################################################

#database = Variable.get('Create_customer_database_tables_var__database_name')
# The names of the Airflow connections to use are themselves stored in
# Airflow Variables.
Airflow_snowflake_connection_name = Variable.get('Airflow_snowflake_connection_name')
Airflow_mysql_connection_name = Variable.get('Airflow_mysql_connection_name')

database_list = ['database']

parent_dag_name = 'Collect_Mysql_Table_Counts_Load_to_Snowflake_Muliple_dbs'

# Snowflake settings: user, password and account come from the connection
# (the account is read from its host field); warehouse and database are
# hard-coded.
sf_con = BaseHook.get_connection(Airflow_snowflake_connection_name)
snowflake_username = sf_con.login 
snowflake_password = sf_con.password 
snowflake_account = sf_con.host 
snowflake_warehouse = "XSMALL" 
snowflake_database = "sf_db"

# MySQL settings, all taken from the connection.
mysql_con = BaseHook.get_connection(Airflow_mysql_connection_name)
mysql_username = mysql_con.login 
mysql_password = mysql_con.password 
mysql_hostname = mysql_con.host
mysql_port = mysql_con.port

########################################################################
#Defining Utility functions
########################################################################

Exemple #36

0

Afficher le fichier

Fichier : mine_boliga_postal.py Projet : Proteusiq/advance_scraping

input of single zipcode

Contains two major code smells: threads and unnecessary data copies. Bug: pd.DataFrame is not thread safe. Flaw: pd.append is used in a loop.
"""

from airflow import DAG
from airflow.models import Variable
from airflow.hooks.base_hook import BaseHook
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import datetime
import sqlalchemy
import pandas as pd
from pipelines.boligax import BoligaRecent


# Database URI of the "bolig_db" Airflow connection, resolved at import time.
CONNECTION_URI = BaseHook.get_connection("bolig_db").get_uri()
# Table name is suffixed with the "postal" Airflow Variable; 2650 is passed as
# the second argument to Variable.get (its fallback value).
TABLE_NAME = f'recent_bolig_{Variable.get("postal",2650)}'

# NOTE(review): "catchup_by_default" looks like an Airflow configuration
# option rather than a DAG/task argument -- confirm it has the intended
# effect where this dict is used.
args = {
    "owner": "Prayson",
    "catchup_by_default": False,
}


def get_bolig(
    postal: int, engine: sqlalchemy.types.TypeEngine = None, **kwargs
) -> None:
    """get bolig[estate] from a given postal code

    Arguments:
        postal {int} -- Danish postal code: e.g. 2560

Exemple #37

0

Afficher le fichier

# Task defaults; START_DATE is defined outside this excerpt.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': START_DATE,
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    # Up to 23 retries, 20 minutes apart.
    'retries': 23,
    'retry_delay': timedelta(minutes=20),
}

# The DAG is scheduled to run once.
dag = DAG('DAG_NAME', default_args=default_args, schedule_interval="@once")

# Run settings live in a JSON Airflow Variable; each key read below must be
# present or the import fails with KeyError.
dag_config = Variable.get('VARIABLES_NAME', deserialize_json=True)
aws_conn = BaseHook.get_connection("aws_conn")
s3_bucket = dag_config['s3_bucket']
datasource_type = dag_config['datasource_type']
date_preset = dag_config['date_preset']
account_id = dag_config['account_id']
access_token = dag_config['access_token']
api_version = dag_config['api_version']
insight_fields = dag_config['insight_fields']
action_attribution_windows = dag_config['action_attribution_windows']
file_path = dag_config['file_path']
time_increment = dag_config['time_increment']
backlogdays = dag_config['backlog_days']
# 29 days plus the configured backlog.
days = 29 + int(dag_config['backlog_days'])
# today = datetime.today().strftime('%Y-%m-%d')
# Both dates are ISO strings computed when the module is imported; start_date
# is 29 days before today and does not include the backlog.
today = date.today().isoformat()
start_date = (date.today() - timedelta(days=29)).isoformat()

Exemple #38

0

Afficher le fichier

Fichier : yas-dev-sii_DAG_AB_HOM_CUE_MOV_01.py Projet : sruriel/dataEngineeringGCP

def subdag(parent_dag_name, child_dag_name, args, json_gs):
    """Build the child DAG holding one Dataflow template task per dated folder.

    The run settings are loaded from the JSON file ``json_gs`` on GCS, using
    the service-account key stored on the ``google_cloud_yas_sa_sii_de``
    connection. Every entry listed under the configured ``url_trn`` whose
    fourth path segment is at least 10 characters long gets its own
    ``DataflowTemplateOperator``.

    :param parent_dag_name: name of the parent DAG (prefix of the child dag_id)
    :param child_dag_name: name of this sub-DAG (suffix of the child dag_id)
    :param args: default arguments passed to the DAG and to each operator
    :param json_gs: GCS path of the JSON settings file
    :return: the populated child DAG
    """
    gcp_conn_id = 'google_cloud_yas_sa_sii_de'
    gcp_project = 'yas-dev-sii-pid'

    dag_subdag = DAG(
        dag_id=f'{parent_dag_name}.{child_dag_name}',
        default_args=args,
        start_date=datetime.datetime(2021, 8, 5, 20, 0),
        schedule_interval='0 13,14,15,16,17,18,19,20,21,22,23,0,1 * * *',
    )

    # The key file is kept as a Python-literal string in the connection extras.
    keyfile_literal = BaseHook.get_connection(gcp_conn_id).extra_dejson[
        "extra__google_cloud_platform__keyfile_dict"]
    sa_token = ast.literal_eval(keyfile_literal)

    settings_fs = gcsfs.GCSFileSystem(project=gcp_project, token=sa_token)
    with settings_fs.open(json_gs) as settings_file:
        jd = json.load(settings_file)

    # Every key is looked up up front, in this order, so a missing one raises
    # KeyError before anything is listed or scheduled (some are not used below).
    expected_keys = (
        'url_trn',
        'job_name_hom',
        'url_hom',
        'file_name_hom',
        'template_location_hom',
        'temp_location',
        'project',
        'region',
        'subnetwork',
        'service_account_email',
        'machine_type',
        'max_num_workers',
        'num_workers',
    )
    cfg = {key: jd[key] for key in expected_keys}

    listing_fs = gcsfs.GCSFileSystem(project=gcp_project, token=sa_token)
    for folder in listing_fs.ls(cfg['url_trn']):
        date_folder = folder.split('/')[3]
        if len(date_folder) < 10:
            # Segment too short to be handled; skip this entry.
            continue

        url_source = 'gs://' + folder
        url_dest = cfg['url_hom'] + date_folder + '/' + cfg['file_name_hom']

        parent_dag_name_for_id = parent_dag_name.lower()

        print('url_source: ' + url_source)
        print('url_dest: ' + url_dest)

        # Job name and task id share the same identifier.
        job_identifier = f'{parent_dag_name_for_id}-{child_dag_name}-{date_folder}'
        dataflow_options = {
            'project': cfg['project'],
            'zone': 'us-east1-c',
            'tempLocation': cfg['temp_location'],
            'machineType': cfg['machine_type'],
            'serviceAccountEmail': cfg['service_account_email'],
            'subnetwork': cfg['subnetwork'],
        }
        DataflowTemplateOperator(
            template=cfg['template_location_hom'],
            job_name=job_identifier,
            task_id=job_identifier,
            location=cfg['region'],
            parameters={
                'url_trn': url_source,
                'url_hom': url_dest,
            },
            default_args=args,
            dataflow_default_options=dataflow_options,
            gcp_conn_id=gcp_conn_id,
            dag=dag_subdag,
        )

    return dag_subdag

Exemple #39

0

Afficher le fichier

def check_if_tweet_is_avalaible(twitter_account_id=None,
                                since_id=None,
                                find_param=None,
                                **kwargs):
    """
    Look for today's tweet from the configured account that contains the search string.

    Settings come from the JSON Airflow Variable ``config``: its
    ``twitter_account_id`` and ``find_param`` always replace the arguments of
    the same name, and its ``since_id`` replaces the argument when present.
    On a match, the tweet id, its date and the URL of the third attached image
    are stored as JSON in the ``bulliten_tweet`` Variable.

    :param twitter_account_id: replaced by ``config['twitter_account_id']``
    :param since_id: used only when ``config`` has no ``since_id`` entry
    :param find_param: replaced by the lower-cased ``config['find_param']``
    :return: True if a matching tweet with a detail image was stored, else False
    :raises ConfigVariableNotFoundException: AirflowException while reading ``config``
    :raises AirflowException: ``twitter_account_id`` or ``find_param`` missing in ``config``
    :raises TwitterConnectionNotFoundException: AirflowException while loading
        the ``twitter_default`` connection
    """
    # The bulletin's detail image is the third one attached to the tweet.
    detail_image_index = 2

    log = LoggingMixin().log
    try:
        # Load Configuration Data
        config = json.loads(Variable.get("config"))
        log.info("Config found")
    except AirflowException:
        log.error("Config missing")
        raise ConfigVariableNotFoundException()

    try:
        twitter_account_id = config['twitter_account_id']
    except KeyError:
        raise AirflowException('Missing Twitter Account Id in config variable')

    try:
        since_id = config['since_id']
    except KeyError:
        # Optional: fall back to the since_id argument.
        log.warning("Since id missing")

    try:
        find_param = config['find_param'].lower()
    except KeyError:
        raise AirflowException('Missing Find Param in config variable')

    try:
        twitter_credentials = BaseHook.get_connection("twitter_default")
        twitter_credentials = json.loads(twitter_credentials.extra)
        consumer_key = twitter_credentials['consumer_key']
        consumer_secret = twitter_credentials['consumer_secret']
        access_token = twitter_credentials['access_token']
        access_token_secret = twitter_credentials['access_token_secret']
    except AirflowException:
        raise TwitterConnectionNotFoundException()

    twitter_hook = TwitterHook(consumer_key=consumer_key,
                               consumer_secret=consumer_secret,
                               access_token=access_token,
                               access_token_secret=access_token_secret)

    tweepy_api = twitter_hook.get_tweepy_api()
    # NOTE(review): date.today() is the local date, while the timezone of
    # tweet.created_at is not visible here -- confirm both use the same zone.
    curr_date = date.today().strftime("%d-%m-%Y")

    tweets = tweepy_api.user_timeline(id=twitter_account_id,
                                      since_id=since_id,
                                      count=1000,
                                      exclude_replies=True,
                                      include_rts=False,
                                      tweet_mode="extended")
    if len(tweets) == 0:
        log.info("No tweets found!")
        return False

    log.info("Found : {}  tweets".format(len(tweets)))

    # The first tweet posted today whose text contains find_param decides the
    # outcome; later tweets are not inspected.
    for tweet in tweets:
        tweet_date = tweet.created_at.strftime("%d-%m-%Y")
        text = tweet.full_text.lower()
        if find_param not in text or tweet_date != curr_date:
            continue

        bulletin_tweet_id = tweet.id
        print('Tweet found')
        log.info("Tweet ID: {}  TEXT : {} ".format(
            bulletin_tweet_id, tweet.full_text))

        if 'media' not in tweet.entities:
            log.info("No media found")
            # skip the processing and end dag
            return False

        image_urls = [
            media['media_url'] for media in tweet.extended_entities['media']
        ]
        if len(image_urls) <= detail_image_index:
            # Fewer images than expected: nothing usable, so skip like the
            # no-media case instead of failing with IndexError.
            log.info("Detail image missing: tweet has only {} image(s)".format(
                len(image_urls)))
            return False

        detail_image_url = image_urls[detail_image_index]
        log.info("Tweet Image Url: {} ".format(detail_image_url))

        # Persist the match so the next task/run can pick it up.
        data = {
            "tweet_id": bulletin_tweet_id,
            "tweet_date": tweet_date,
            "media_url": detail_image_url
        }
        Variable.set("bulliten_tweet", json.dumps(data))
        return True

    log.info("No tweets related to {} found".format(find_param))
    return False

Exemple #40

0

Afficher le fichier

Fichier : useless_hook.py Projet : biellls/airflow-plus

 def from_conn_id(conn_id: str):
     """Build a UselessHookImplicitProtocol from the Airflow connection named ``conn_id``."""
     connection = BaseHook.get_connection(conn_id)
     return UselessHookImplicitProtocol(connection)

Exemple #41

0

Afficher le fichier

import logging
from datetime import datetime, timedelta

import pandas as pd
import pendulum
import sqlalchemy
from airflow import DAG
from airflow.hooks.base_hook import BaseHook
from airflow.operators.python_operator import PythonOperator
from airflow.utils.email import send_email

# Connection record for "postgres_default", looked up when this module is imported.
pg_conn = BaseHook.get_connection("postgres_default")

# Timezone used for the DAG's start_date in default_args.
local_tz = pendulum.timezone("America/Los_Angeles")
default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": datetime(2018, 10, 30, tzinfo=local_tz),
    "email": [
        "*****@*****.**",
        "*****@*****.**",
        "*****@*****.**",
    ],
    "email_on_failure": True,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=15)
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),

Exemple #42

0

Afficher le fichier

Fichier : sql.py Projet : beingbisht/airflow-tests

 def get_db_hook(self):
     """
     Look up the hook for this object's connection.

     :return: the result of ``BaseHook.get_hook`` for ``self.conn_id``
     """
     db_hook = BaseHook.get_hook(conn_id=self.conn_id)
     return db_hook

Exemple #43

0

Afficher le fichier

from airflow.hooks.base_hook import BaseHook
from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists
import sqlalchemy

# Turn the 'postgres_datalake' Airflow connection into a SQLAlchemy engine.
datalake_conn_string = BaseHook.get_connection('postgres_datalake').get_uri()
engine = create_engine(datalake_conn_string)

# Create the database if it is missing, then grant the connecting user all
# privileges on it.
if not database_exists(engine.url):
    create_database(engine.url)
    _grant_database_sql = "GRANT ALL PRIVILEGES ON DATABASE {db} TO {user};".format(
        user=engine.url.username, db=engine.url.database)
    engine.execute(_grant_database_sql)

# Create the 'views' schema if it is missing, then grant the connecting user
# privileges on the schema, its existing tables and its future tables.
if not engine.dialect.has_schema(engine, 'views'):
    engine.execute(sqlalchemy.schema.CreateSchema('views'))
    for _grant_template in (
            "GRANT ALL PRIVILEGES ON SCHEMA views TO {user};",
            "GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA views TO {user};",
            "ALTER DEFAULT PRIVILEGES IN SCHEMA views GRANT ALL PRIVILEGES ON TABLES TO {user};",
    ):
        engine.execute(_grant_template.format(user=engine.url.username))