"owner": "Robin",
        "start_date": days_ago(0),
        "retries": 3,
        "retry_delay": timedelta(minutes=30),
        "email": ["*****@*****.**"]
    }

    dag = DAG(
        dag_id="platform_kpi_scraper",
        default_args=args,
        schedule_interval='0 * * * *',
    )
    # https://github.com/apache/incubator-airflow/blob/5a3f39913739998ca2e9a17d0f1d10fccb840d36/airflow/contrib/operators/kubernetes_pod_operator.py#L129
    surveys_to_s3 = KubernetesPodOperator(
        namespace="airflow",
        image=SCRAPER_IMAGE,
        image_pull_policy='Always',
        cmds=["bash", "-c"],
        arguments=["python main.py"],
        labels={"foo": "bar"},
        name="airflow-test-pod",
        in_cluster=True,
        task_id="get_kpis",
        get_logs=True,
        dag=dag,
        annotations={"iam.amazonaws.com/role": SCRAPER_IAM_ROLE},
    )

except ImportError as e:
    log.warn("Could not import KubernetesPodOperator: " + str(e))
    # pip install apache-airflow[kubernetes]
    from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator

    args = {
        'owner': 'airflow',
        'start_date': days_ago(2)
    }

    dag = DAG(
        dag_id='example_kubernetes_operator',
        default_args=args,
        schedule_interval=None)

    k = KubernetesPodOperator(
        namespace='default',
        image="ubuntu:16.04",
        cmds=["bash", "-cx"],
        arguments=["echo", "10"],
        labels={"foo": "bar"},
        name="airflow-test-pod",
        in_cluster=False,
        task_id="task",
        get_logs=True,
        dag=dag,
        is_delete_operator_pod=False)

except ImportError as e:
    log.warn("Could not import KubernetesPodOperator: " + str(e))
    log.warn("Install kubernetes dependencies with: "
             "    pip install apache-airflow[kubernetes]")
# Example #3
0
from airflow.models import DAG

# Module-level logger, used below to report a missing optional dependency.
log = LoggingMixin().log

try:
    # Kubernetes is optional, so not available in vanilla Airflow
    # pip install 'apache-airflow[kubernetes]'
    from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator

    args = {'owner': 'airflow', 'start_date': days_ago(2)}

    # schedule_interval=None: the DAG has no schedule of its own.
    dag = DAG(dag_id='example_kubernetes_operator',
              default_args=args,
              schedule_interval=None)

    # Single task that starts an ubuntu:16.04 pod in the "default" namespace.
    k = KubernetesPodOperator(namespace='default',
                              image="ubuntu:16.04",
                              cmds=["bash", "-cx"],
                              arguments=["echo", "10"],
                              labels={"foo": "bar"},
                              name="airflow-test-pod",
                              in_cluster=False,
                              task_id="task",
                              get_logs=True,
                              dag=dag)

except ImportError as e:
    # Logger.warn is a deprecated alias (removed in Python 3.13); use warning.
    log.warning("Could not import KubernetesPodOperator: " + str(e))
    # The distribution is named "apache-airflow"; quote the extra so the
    # shell does not try to glob the square brackets.
    log.warning("Install kubernetes dependencies with: "
                "    pip install 'apache-airflow[kubernetes]'")
# Example #4
0
def check_if_tweet_is_avalaible(twitter_account_id=None,
                                since_id=None,
                                find_param=None,
                                **kwargs):
    """
    Check whether the configured Twitter account posted a matching tweet today.

    Settings are read from the Airflow Variable ``config`` (a JSON object) and
    take precedence over the arguments. Twitter credentials are read from the
    ``extra`` JSON of the ``twitter_default`` connection.

    The first tweet in the fetched timeline whose lower-cased text contains
    ``find_param`` and whose creation date equals today's date is selected.
    Its id, its date and the URL of its third media item are stored as JSON
    in the Airflow Variable ``bulliten_tweet``.

    :param twitter_account_id: always replaced by ``config['twitter_account_id']``
    :param since_id: replaced by ``config['since_id']`` when that key exists;
        otherwise the passed value is used as the timeline lower bound
    :param find_param: always replaced by ``config['find_param']`` (lower-cased)
    :param kwargs: accepted and ignored
    :return: True when a matching tweet with enough media was found and
        stored, False otherwise
    :raises ConfigVariableNotFoundException: when the ``config`` Variable
        cannot be loaded
    :raises AirflowException: when ``twitter_account_id`` or ``find_param``
        is missing from the config
    :raises TwitterConnectionNotFoundException: when the ``twitter_default``
        connection cannot be loaded
    """
    log = LoggingMixin().log
    try:
        # Load Configuration Data
        config = json.loads(Variable.get("config"))
    except (AirflowException, KeyError):
        # Variable.get reports a missing variable with KeyError;
        # AirflowException is still caught for backward compatibility.
        log.error("Config missing")
        raise ConfigVariableNotFoundException()
    log.info("Config found")

    try:
        twitter_account_id = config['twitter_account_id']
    except KeyError:
        raise AirflowException('Missing Twitter Account Id in config variable')

    try:
        since_id = config['since_id']
    except KeyError:
        # Optional setting: fall back to the since_id argument.
        log.warning("Since id missing")

    try:
        find_param = config['find_param'].lower()
    except KeyError:
        raise AirflowException('Missing Find Param in config variable')

    try:
        twitter_credentials = BaseHook.get_connection("twitter_default")
        twitter_credentials = json.loads(twitter_credentials.extra)
        consumer_key = twitter_credentials['consumer_key']
        consumer_secret = twitter_credentials['consumer_secret']
        access_token = twitter_credentials['access_token']
        access_token_secret = twitter_credentials['access_token_secret']
    except AirflowException:
        raise TwitterConnectionNotFoundException()

    twitter_hook = TwitterHook(consumer_key=consumer_key,
                               consumer_secret=consumer_secret,
                               access_token=access_token,
                               access_token_secret=access_token_secret)

    tweepy_api = twitter_hook.get_tweepy_api()
    # NOTE(review): date.today() is local time while tweet.created_at is
    # presumably UTC - confirm the comparison below is timezone-safe.
    curr_date = date.today().strftime("%d-%m-%Y")

    tweets = tweepy_api.user_timeline(id=twitter_account_id,
                                      since_id=since_id,
                                      count=1000,
                                      exclude_replies=True,
                                      include_rts=False,
                                      tweet_mode="extended")
    if not tweets:
        log.info("No tweets found!")
        return False

    log.info("Found : {}  tweets".format(len(tweets)))

    # The third attached media item (index 2) is the one that gets stored.
    detail_image_index = 2

    for tweet in tweets:
        tweet_date = tweet.created_at.strftime("%d-%m-%Y")
        if find_param not in tweet.full_text.lower() or tweet_date != curr_date:
            continue

        bulletin_tweet_id = tweet.id
        log.info('Tweet found')
        log.info("Tweet ID: {}  TEXT : {} ".format(
            bulletin_tweet_id, tweet.full_text))

        if 'media' not in tweet.entities:
            log.info("No media found")
            # skip the processing and end dag
            return False

        image_urls = [media['media_url']
                      for media in tweet.extended_entities['media']]
        if len(image_urls) <= detail_image_index:
            # Treat "too few images" like "no media" instead of crashing
            # with an IndexError.
            log.warning("Tweet has only {} media item(s), expected at least {}".format(
                len(image_urls), detail_image_index + 1))
            return False

        detail_image_url = image_urls[detail_image_index]
        log.info("Tweet Image Url: {} ".format(detail_image_url))

        # Persist the result in an Airflow Variable named "bulliten_tweet".
        data = {
            "tweet_id": bulletin_tweet_id,
            "tweet_date": tweet_date,
            "media_url": detail_image_url
        }
        Variable.set("bulliten_tweet", json.dumps(data))
        return True

    log.info("No tweets related to {} found".format(find_param))
    return False
try:
    # Kubernetes is optional, so not available in vanilla Airflow
    # pip install 'apache-airflow[kubernetes]'
    from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator

    args = {'owner': 'airflow', 'start_date': days_ago(2)}

    # schedule_interval=None: the DAG has no schedule of its own.
    dag = DAG(dag_id='example_kubernetes_operator',
              default_args=args,
              schedule_interval=None)

    # Passed to the operator's ``tolerations`` argument below.
    tolerations = [{'key': "key", 'operator': 'Equal', 'value': 'value'}]

    # Single task that starts an ubuntu:16.04 pod in the "default" namespace.
    k = KubernetesPodOperator(namespace='default',
                              image="ubuntu:16.04",
                              cmds=["bash", "-cx"],
                              arguments=["echo", "10"],
                              labels={"foo": "bar"},
                              name="airflow-test-pod",
                              in_cluster=False,
                              task_id="task",
                              get_logs=True,
                              dag=dag,
                              is_delete_operator_pod=False,
                              tolerations=tolerations)

except ImportError as e:
    # Logger.warn is a deprecated alias (removed in Python 3.13); use warning.
    log.warning("Could not import KubernetesPodOperator: " + str(e))
    log.warning("Install kubernetes dependencies with: "
                "    pip install 'apache-airflow[kubernetes]'")
from flask_login import current_user, logout_user, login_required, login_user
from flask_oauthlib.client import OAuth

from airflow import models, configuration
from airflow.utils.db import provide_session
from airflow.utils.log.logging_mixin import LoggingMixin
import ssl

# Module-level logger for this module.
log = LoggingMixin().log

# When [http_client] insecure is enabled in the Airflow configuration (and
# this Python build exposes ssl._create_unverified_context), make the
# unverified context the default for HTTPS connections.
if (configuration.conf.has_section("http_client")
        and configuration.conf.has_option("http_client", "insecure")
        and configuration.conf.getboolean("http_client", "insecure")
        and getattr(ssl, '_create_unverified_context', None)):
    ssl._create_default_https_context = ssl._create_unverified_context
    # Logger.warn is a deprecated alias (removed in Python 3.13); use warning.
    log.warning(
        "Airflow is using an insecure HTTP client. ROOT CA check is disabled.")


def get_config_param(param):
    """Return option *param* of the ``oauth`` config section, converted with ``str``."""
    raw_value = configuration.conf.get('oauth', param)
    return str(raw_value)


def has_config_param(param):
    """Report whether the ``oauth`` config section defines option *param*."""
    section = 'oauth'
    return configuration.conf.has_option(section, param)


class OAuthUser(models.User):
    def __init__(self, user):
        self.user = user

    @property