Example #1
from datetime import datetime
import json

from airflow.hooks.http_hook import HttpHook
from airflow.hooks.postgres_hook import PostgresHook


def get_rates(ds, **kwargs):
    pg_hook = PostgresHook(postgres_conn_id='rates')
    api_hook = HttpHook(http_conn_id='openexchangerates', method='GET')

    # If either of these raises an exception then we'll be notified via
    # Airflow
    resp = api_hook.run('')
    resp = json.loads(resp.content)

    # These are the only valid pairs the DB supports at the moment. Anything
    # else that turns up will be ignored.
    valid_pairs = (
        'AED', 'AFN', 'ALL', 'AMD', 'ANG', 'AOA', 'ARS',
        'AUD', 'AWG', 'AZN', 'BAM', 'BBD', 'BDT', 'BGN',
        'BHD', 'BIF', 'BMD', 'BND', 'BOB', 'BRL', 'BSD',
        'BTC', 'BTN', 'BWP', 'BYN', 'BYR', 'BZD', 'CAD',
        'CDF', 'CHF', 'CLF', 'CLP', 'CNY', 'COP', 'CRC',
        'CUC', 'CUP', 'CVE', 'CZK', 'DJF', 'DKK', 'DOP',
        'DZD', 'EEK', 'EGP', 'ERN', 'ETB', 'EUR', 'FJD',
        'FKP', 'GBP', 'GEL', 'GGP', 'GHS', 'GIP', 'GMD',
        'GNF', 'GTQ', 'GYD', 'HKD', 'HNL', 'HRK', 'HTG',
        'HUF', 'IDR', 'ILS', 'IMP', 'INR', 'IQD', 'IRR',
        'ISK', 'JEP', 'JMD', 'JOD', 'JPY', 'KES', 'KGS',
        'KHR', 'KMF', 'KPW', 'KRW', 'KWD', 'KYD', 'KZT',
        'LAK', 'LBP', 'LKR', 'LRD', 'LSL', 'LTL', 'LVL',
        'LYD', 'MAD', 'MDL', 'MGA', 'MKD', 'MMK', 'MNT',
        'MOP', 'MRO', 'MTL', 'MUR', 'MVR', 'MWK', 'MXN',
        'MYR', 'MZN', 'NAD', 'NGN', 'NIO', 'NOK', 'NPR',
        'NZD', 'OMR', 'PAB', 'PEN', 'PGK', 'PHP', 'PKR',
        'PLN', 'PYG', 'QAR', 'RON', 'RSD', 'RUB', 'RWF',
        'SAR', 'SBD', 'SCR', 'SDG', 'SEK', 'SGD', 'SHP',
        'SLL', 'SOS', 'SRD', 'STD', 'SVC', 'SYP', 'SZL',
        'THB', 'TJS', 'TMT', 'TND', 'TOP', 'TRY', 'TTD',
        'TWD', 'TZS', 'UAH', 'UGX', 'USD', 'UYU', 'UZS',
        'VEF', 'VND', 'VUV', 'WST', 'XAF', 'XAG', 'XAU',
        'XCD', 'XDR', 'XOF', 'XPD', 'XPF', 'XPT', 'YER',
        'ZAR', 'ZMK', 'ZMW', 'ZWL')

    rates_insert = """INSERT INTO rates (pair, valid_until, rate)
                      VALUES (%s, %s, %s);"""

    # If this raises an exception then we'll be notified via Airflow
    valid_until = datetime.fromtimestamp(resp['timestamp'])

    for iso2, rate in resp['rates'].items():
        # If converting the rate to a float fails for whatever reason then
        # just move on.
        try:
            rate = float(rate)
        except (TypeError, ValueError):
            continue

        iso2 = iso2.upper().strip()

        if iso2 not in valid_pairs or rate < 0:
            continue

        pg_hook.run(rates_insert, parameters=(iso2,
                                              valid_until,
                                              rate))
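The function above is only the task callable; a minimal sketch of wiring it into a DAG with a PythonOperator follows. The DAG id, schedule, and start date are assumptions, not part of the original example.

from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

dag = DAG('exchange_rates',
          start_date=datetime(2017, 1, 1),
          schedule_interval=timedelta(hours=1))  # placeholder schedule

get_rates_task = PythonOperator(
    task_id='get_rates',
    python_callable=get_rates,
    provide_context=True,  # passes ds and the other context kwargs
    dag=dag)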
Example #2
    def execute(self, context):
        http = HttpHook(self.method, http_conn_id=self.http_conn_id)
        logging.info("Calling HTTP method")
        response = http.run(self.endpoint, self.data, self.headers,
                            self.extra_options)
        if self.response_check:
            if not self.response_check(response):
                raise AirflowException("Response check returned False.")
Example #3
    def execute(self, context):
        http = HttpHook(self.method, http_conn_id=self.http_conn_id)
        logging.info("Calling HTTP method")
        response = http.run(self.endpoint,
                            self.data,
                            self.headers,
                            self.extra_options)
        if self.response_check:
            if not self.response_check(response):
                raise AirflowException("Response check returned False.")
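Both execute() methods above match the shape of Airflow's SimpleHttpOperator, which fails the task when response_check returns False. A hedged sketch of supplying that callable; the task id, endpoint, and DAG are assumptions.

from datetime import datetime

from airflow import DAG
from airflow.operators.http_operator import SimpleHttpOperator

dag = DAG('http_check_example', start_date=datetime(2017, 1, 1))  # placeholder

check_api = SimpleHttpOperator(
    task_id='check_api',
    http_conn_id='http_default',
    method='GET',
    endpoint='health',  # hypothetical endpoint
    response_check=lambda response: response.status_code == 200,
    dag=dag)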
Example #4
def download_api_data(**kwargs):

    tfl_hook = HttpHook(http_conn_id='tfl_parking_conn', method='GET')
    resp = tfl_hook.run('', extra_options={'check_response': True})

    filepostfix = str(time.time()) + '.json'
    base_dir = Variable.get("tfl_park_base_dir")
    filename = join(base_dir, filepostfix)

    with open(filename, 'w') as f:
        f.write(resp.text)
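download_api_data reads its output directory from the tfl_park_base_dir Airflow Variable; a sketch of seeding that Variable from Python (normally done via the UI or CLI; the path is a placeholder):

from airflow.models import Variable

# One-time setup; '/tmp/tfl_parking' is a placeholder path.
Variable.set("tfl_park_base_dir", "/tmp/tfl_parking")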
Example #5
def get_people(**kwargs):
    api_hook = HttpHook(http_conn_id='swapi_people', method='GET')

    for i in range(9):
        response = api_hook.run('', data={'page': i + 1})
        response_json = json.loads(response.content)
        print(response_json)
        store_people(response_json['results'])
        time.sleep(3)

    return True
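store_people is not defined in this snippet; a minimal sketch of what it might look like, assuming a Postgres connection id, table, and columns that are not in the original:

from airflow.hooks.postgres_hook import PostgresHook

def store_people(results):
    # Hypothetical helper: persist each SWAPI person record.
    pg_hook = PostgresHook(postgres_conn_id='swapi_db')  # assumed conn id
    people_insert = "INSERT INTO people (name, birth_year) VALUES (%s, %s);"
    for person in results:
        pg_hook.run(people_insert,
                    parameters=(person['name'], person['birth_year']))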
Example #6
    def __init__(
            self,
            spark_script,
            session_kind="spark",  # spark, pyspark, or sparkr
            http_conn_id='http_default',
            poll_interval=30,
            *args,
            **kwargs):
        super(LivySparkOperator, self).__init__(*args, **kwargs)

        self.spark_script = spark_script
        self.session_kind = session_kind
        self.http_conn_id = http_conn_id
        self.poll_interval = poll_interval

        self.http = HttpHook("GET", http_conn_id=self.http_conn_id)
Example #7
def get_rates(ds, **kwargs):
    # connection: an Airflow connection
    pg_hook = PostgresHook(postgres_conn_id='crypto')
    api_hook = HttpHook(http_conn_id='cryptocoincharts_eth', method='GET')

    resp = api_hook.run('')
    resp = json.loads(resp.content)

    rates_insert = """INSERT INTO rates (id, price, last_price, volume)
                      VALUES (%s, %s, %s, %s);"""
    markets_insert = """INSERT INTO markets (market, pair, price, volume_btc, volume)
                      VALUES (%s, %s, %s, %s, %s);"""

    pg_hook.run(rates_insert, parameters=(resp['id'], resp['price'],
                                          resp['price_before_24h'],
                                          resp['volume_second']))

    for market in resp['markets']:
        # PAIR is assumed to be a module-level constant naming the traded
        # pair (e.g. 'ETH_BTC'); it is not defined in this snippet.
        pg_hook.run(markets_insert, parameters=(market['market'], PAIR,
                                                market['price'],
                                                market['volume_btc'],
                                                market['volume']))
Example #8
class KairosDBOperator(BaseOperator):
    """
    Operator to facilitate interacting with KairosDB, a time-series database
    that is queried via a REST API.

    :param query: the KairosDB query to execute
    :type query: dict
    """
    @apply_defaults
    def __init__(self, query, http_conn_id='http_kairosdb', *args, **kwargs):
        super(KairosDBOperator, self).__init__(*args, **kwargs)
        self.query = query
        self.http_conn_id = http_conn_id
        self.acceptable_response_codes = [200, 201]
        self.http = HttpHook("GET", http_conn_id=self.http_conn_id)

    def execute(self, context):
        headers = {'Content-Type': 'application/json'}

        # POST the query to KairosDB's datapoints query endpoint
        response = self._http_rest_call("POST",
                                        "/api/v1/datapoints/query",
                                        data=json.dumps(self.query),
                                        headers=headers)
        logging.debug("Status code: %d" % response.status_code)
        if response.status_code not in self.acceptable_response_codes:
            return None
        r = response.json()
        logging.debug("JSON response: %s" % r)

        if r:
            return r.get("queries")
        else:
            return None

    def _http_rest_call(self,
                        method,
                        endpoint,
                        data=None,
                        headers=None,
                        extra_options=None):
        if not extra_options:
            extra_options = {}
        logging.info("Performing HTTP REST call... (method: " + str(method) +
                     ", endpoint: " + str(endpoint) + ", data: " + str(data) +
                     ", headers: " + str(headers) + ")")
        self.http.method = method

        response = self.http.run(endpoint,
                                 data,
                                 headers,
                                 extra_options=extra_options)

        logging.debug("status_code: " + str(response.status_code))
        logging.debug("response_as_json: " + str(response.json()))

        return response
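A hedged sketch of invoking this operator: the query below follows KairosDB's /api/v1/datapoints/query request format, but the metric name, time range, and DAG are assumptions, not part of the original example.

from datetime import datetime
from airflow import DAG

dag = DAG('kairosdb_example', start_date=datetime(2017, 1, 1))  # placeholder

# Hypothetical query: the last hour of a made-up metric.
query = {
    "start_relative": {"value": 1, "unit": "hours"},
    "metrics": [{"name": "sys.cpu.user"}],
}

kairos_task = KairosDBOperator(
    task_id='query_kairosdb',
    query=query,
    dag=dag)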
Example #9
def get_stocks(ds, **context):
    symbol = context['params']['symbol']

    pg_hook = PostgresHook(postgres_conn_id='stocks')
    api_hook = HttpHook(http_conn_id='alphavantage', method='GET')

    # If either of these raises an exception then we'll be notified via
    # Airflow
    resp = api_hook.run(f'query?function=TIME_SERIES_DAILY_ADJUSTED&symbol={symbol}&apikey=537201H9R203WT4C')
    resp = json.loads(resp.content)

    # These are the only valid stocks the DB supports at the moment. Anything
    # else that turns up will be ignored.
    valid_stocks = ('AAPL', 'GOOG', 'MSFT')  # placeholder; list your DB's symbols

    stocks_insert = """INSERT INTO stocks (symbol, valid_until, price)
                       VALUES (%s, %s, %s);"""

    # If this raises an exception then we'll be notified via Airflow
    valid_until = datetime.fromtimestamp(resp['timestamp'])

    for sym, price in resp['stocks'].items():
        # If converting the price to a float fails for whatever reason then
        # just move on.
        try:
            price = float(price)
        except (TypeError, ValueError):
            continue

        sym = sym.upper().strip()

        if sym not in valid_stocks or price < 0:
            continue

        pg_hook.run(stocks_insert, parameters=(sym,
                                               valid_until,
                                               price))
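Note that the parsing above assumes 'timestamp' and 'stocks' keys that Alpha Vantage's TIME_SERIES_DAILY_ADJUSTED response does not actually contain; the documented JSON nests daily bars under 'Time Series (Daily)'. A sketch of the loop rewritten for that layout, reusing resp, symbol, pg_hook, and stocks_insert from the example above:

    # Parse Alpha Vantage's documented JSON layout instead.
    series = resp.get('Time Series (Daily)', {})
    for day, bar in series.items():
        price = float(bar['5. adjusted close'])
        pg_hook.run(stocks_insert, parameters=(symbol, day, price))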
Example #10
class LivySparkOperator(BaseOperator):
    """
   Operator to facilitate interacting with the Livy Server which executes Apache Spark code via a REST API.

   :param spark_script: Scala, Python or R code to submit to the Livy Server (templated)
   :type spark_script: string
   :param session_kind: Type of session to setup with Livy. This will determine which type of code will be accepted. Possible values include "spark" (executes Scala code), "pyspark" (executes Python code) or "sparkr" (executes R code).
   :type session_kind: string
   :param http_conn_id: The http connection to run the operator against
   :type http_conn_id: string
   :param poll_interval: The polling interval to use when checking if the code in spark_script has finished executing. In seconds. (default: 30 seconds)
   :type poll_interval: integer
   """

    template_fields = ['spark_script']  # todo : make sure this works
    template_ext = ['.py', '.R', '.r']
    ui_color = '#34a8dd'  # Cloudera's main color: blue

    acceptable_response_codes = [200, 201]

    @apply_defaults
    def __init__(
            self,
            spark_script,
            session_kind="spark",  # spark, pyspark, or sparkr
            http_conn_id='http_default',
            poll_interval=30,
            *args,
            **kwargs):
        super(LivySparkOperator, self).__init__(*args, **kwargs)

        self.spark_script = spark_script
        self.session_kind = session_kind
        self.http_conn_id = http_conn_id
        self.poll_interval = poll_interval

        self.http = HttpHook("GET", http_conn_id=self.http_conn_id)

    def execute(self, context):
        logging.info("Executing LivySparkOperator.execute(context)")

        logging.info("Validating arguments...")
        self._validate_arguments()
        logging.info("Finished validating arguments")

        logging.info("Creating a Livy Session...")
        session_id = self._create_session()
        logging.info("Finished creating a Livy Session. (session_id: " +
                     str(session_id) + ")")

        logging.info("Submitting spark script...")
        statement_id, overall_statements_state = self._submit_spark_script(
            session_id=session_id)
        logging.info("Finished submitting spark script. (statement_id: " +
                     str(statement_id) + ", overall_statements_state: " +
                     str(overall_statements_state) + ")")

        poll_for_completion = (overall_statements_state == "running")

        if poll_for_completion:
            logging.info(
                "Spark job did not complete immediately. Starting to Poll for completion..."
            )

        while overall_statements_state == "running":  # todo: test execution_timeout
            logging.info("Sleeping for " + str(self.poll_interval) +
                         " seconds...")
            time.sleep(self.poll_interval)
            logging.info(
                "Finished sleeping. Checking if Spark job has completed...")
            statements = self._get_session_statements(session_id=session_id)

            is_all_complete = True
            for statement in statements:
                if statement["state"] == "running":
                    is_all_complete = False

            if is_all_complete:
                overall_statements_state = "available"

            logging.info(
                "Finished checking if Spark job has completed. (overall_statements_state: "
                + str(overall_statements_state) + ")")

        if poll_for_completion:
            logging.info("Finished Polling for completion.")

        logging.info("Session Logs:\n" +
                     str(self._get_session_logs(session_id=session_id)))

        for statement in self._get_session_statements(session_id):
            logging.info("Statement '" + str(statement["id"]) + "' Output:\n" +
                         str(statement["output"]))

        logging.info("Closing session...")
        response = self._close_session(session_id=session_id)
        logging.info("Finished closing session. (response: " + str(response) +
                     ")")

        logging.info("Finished executing LivySparkOperator.execute(context)")

    def _validate_arguments(self):
        if self.session_kind is None or self.session_kind == "":
            raise AirflowException(
                "session_kind argument is invalid. It is empty or None. (value: '"
                + str(self.session_kind) + "')")
        elif self.session_kind not in ["spark", "pyspark", "sparkr"]:
            raise AirflowException(
                "session_kind argument is invalid. It should be set to 'spark', 'pyspark', or 'sparkr'. (value: '"
                + str(self.session_kind) + "')")

    def _get_sessions(self):
        method = "GET"
        endpoint = "sessions"
        response = self._http_rest_call(method=method, endpoint=endpoint)

        if response.status_code in self.acceptable_response_codes:
            return response.json()["sessions"]
        else:
            raise AirflowException("Call to get sessions didn't return " +
                                   str(self.acceptable_response_codes) +
                                   ". Returned '" + str(response.status_code) +
                                   "'.")

    def _get_session(self, session_id):
        sessions = self._get_sessions()
        for session in sessions:
            if session["id"] == session_id:
                return session

    def _get_session_logs(self, session_id):
        method = "GET"
        endpoint = "sessions/" + str(session_id) + "/log"
        response = self._http_rest_call(method=method, endpoint=endpoint)
        return response.json()

    def _create_session(self):
        method = "POST"
        endpoint = "sessions"

        data = {"kind": self.session_kind}

        response = self._http_rest_call(method=method,
                                        endpoint=endpoint,
                                        data=data)

        if response.status_code in self.acceptable_response_codes:
            response_json = response.json()
            session_id = response_json["id"]
            session_state = response_json["state"]

            if session_state == "starting":
                logging.info(
                    "Session is starting. Polling to see if it is ready...")

            session_state_polling_interval = 10
            while session_state == "starting":
                logging.info("Sleeping for " +
                             str(session_state_polling_interval) + " seconds")
                time.sleep(session_state_polling_interval)
                session_state_check_response = self._get_session(
                    session_id=session_id)
                session_state = session_state_check_response["state"]
                logging.info("Got latest session state as '" + session_state +
                             "'")

            return session_id
        else:
            raise AirflowException(
                "Call to create a new session didn't return " +
                str(self.acceptable_response_codes) + ". Returned '" +
                str(response.status_code) + "'.")

    def _submit_spark_script(self, session_id):
        method = "POST"
        endpoint = "sessions/" + str(session_id) + "/statements"

        logging.info("Executing Spark Script: \n" + str(self.spark_script))

        data = {'code': textwrap.dedent(self.spark_script)}

        response = self._http_rest_call(method=method,
                                        endpoint=endpoint,
                                        data=data)

        if response.status_code in self.acceptable_response_codes:
            response_json = response.json()
            return response_json["id"], response_json["state"]
        else:
            raise AirflowException(
                "Call to create a new statement didn't return " +
                str(self.acceptable_response_codes) + ". Returned '" +
                str(response.status_code) + "'.")

    def _get_session_statements(self, session_id):
        method = "GET"
        endpoint = "sessions/" + str(session_id) + "/statements"
        response = self._http_rest_call(method=method, endpoint=endpoint)

        if response.status_code in self.acceptable_response_codes:
            response_json = response.json()
            statements = response_json["statements"]
            return statements
        else:
            raise AirflowException(
                "Call to get the session statement response didn't return " +
                str(self.acceptable_response_codes) + ". Returned '" +
                str(response.status_code) + "'.")

    def _close_session(self, session_id):
        method = "DELETE"
        endpoint = "sessions/" + str(session_id)
        return self._http_rest_call(method=method, endpoint=endpoint)

    def _http_rest_call(self,
                        method,
                        endpoint,
                        data=None,
                        headers=None,
                        extra_options=None):
        if not extra_options:
            extra_options = {}
        logging.debug("Performing HTTP REST call... (method: " + str(method) +
                      ", endpoint: " + str(endpoint) + ", data: " + str(data) +
                      ", headers: " + str(headers) + ")")
        self.http.method = method
        response = self.http.run(endpoint,
                                 json.dumps(data),
                                 headers,
                                 extra_options=extra_options)

        logging.debug("status_code: " + str(response.status_code))
        logging.debug("response_as_json: " + str(response.json()))

        return response
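A hedged sketch of using this operator in a DAG. Because spark_script is a templated field with .py/.R/.r extensions, a relative path is rendered to the file's contents before submission; the connection id, script path, and DAG here are assumptions.

from datetime import datetime
from airflow import DAG

dag = DAG('livy_example', start_date=datetime(2017, 1, 1))  # placeholder

spark_task = LivySparkOperator(
    task_id='run_spark_script',
    spark_script='scripts/job.py',   # hypothetical script path (templated)
    session_kind='pyspark',
    http_conn_id='livy_http_conn',   # assumed connection pointing at Livy
    poll_interval=60,
    dag=dag)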
Example #11
    def __init__(self, query, http_conn_id='http_kairosdb', *args, **kwargs):
        super(KairosDBOperator, self).__init__(*args, **kwargs)
        self.query = query
        self.http_conn_id = http_conn_id
        self.acceptable_response_codes = [200, 201]
        self.http = HttpHook("GET", http_conn_id=self.http_conn_id)
Example #12
def httpcall(**kwargs):
    api_hook = HttpHook(http_conn_id='http_default', method='GET')
    resp = api_hook.run('')
    resp = json.loads(resp.text)
    print(resp)