def sql_task(execution_date, input_file, **kwargs):
    if os.path.exists(os.path.join(reports_folder, input_file)):
        pg_hook = PostgresHook("postgres")
        pg_cursor = pg_hook.get_cursor()

        with open(os.path.join(reports_folder, input_file)) as sql_file:
            sql = sql_file.read()

        pg_cursor.execute(sql)
        result = pg_cursor.fetchall()

        with TemporaryDirectory() as tempdir:
            with open(os.path.join(tempdir, input_file + "-report.csv"),
                      "w") as out_file:
                wr = csv.writer(out_file, quoting=csv.QUOTE_ALL)
                wr.writerows(result)

            copy_to_report_path(
                os.path.join(tempdir, input_file + "-report.csv"),
                report_path(execution_date),
            )

    else:
        raise FileNotFoundError(input_file)
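A minimal sketch of how a callable like sql_task might be wired into a DAG with PythonOperator (Airflow 1.x-style imports, matching these examples); the DAG id, schedule, and op_kwargs values are illustrative assumptions, not part of the original example.
# Hypothetical wiring for sql_task (DAG id, schedule and file name are assumptions).
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

with DAG("sql_report_example", start_date=datetime(2021, 1, 1),
         schedule_interval="@daily") as dag:
    run_report = PythonOperator(
        task_id="run_report",
        python_callable=sql_task,
        op_kwargs={"input_file": "monthly_report.sql"},
        provide_context=True,  # Airflow 1.x: pass execution_date etc. into the callable
    )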
    def execute(self, context):
        # Get a hook so the cross-tables to drop can be looked up
        hook = PostgresHook(
            postgres_conn_id=self.postgres_conn_id, schema=self.database
        )

        # Start a list with the tables to drop (or truncate)
        table_drops = [self.table_name]

        # Find the cross-tables for n-m relations; we assume their names
        # start with f"{table_name}_"

        with hook.get_cursor() as cursor:

            cursor.execute(
                """
                    SELECT tablename FROM pg_tables
                    WHERE schemaname = 'public' AND tablename like %s
                """,
                (f"{self.table_name}_%",),
            )

            cross_tables = [row["tablename"] for row in cursor.fetchall()]

        # Define the SQL to execute by the super class.
        # This supports executing multiple statements in a single transaction:
        init_operation = "DROP TABLE IF EXISTS" if self.drop_table else "TRUNCATE TABLE"
        self.sql = [
            f"{init_operation} {table_name} CASCADE"
            for table_name in table_drops + cross_tables
        ]

        return super().execute(context)
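The execute() above assumes a PostgresOperator subclass whose constructor stores table_name and drop_table and defers the SQL until runtime; a minimal sketch of such a class under that assumption (the class name and defaults are hypothetical).
# Assumed shape of the surrounding operator (a sketch, not the original class).
from airflow.operators.postgres_operator import PostgresOperator
from airflow.utils.decorators import apply_defaults


class PostgresTableDropOperator(PostgresOperator):  # hypothetical name
    @apply_defaults
    def __init__(self, table_name, drop_table=False,
                 postgres_conn_id="postgres_default", database=None, **kwargs):
        # self.sql is filled in by execute(), so pass a placeholder here;
        # PostgresOperator stores postgres_conn_id and database on self.
        super().__init__(sql="", postgres_conn_id=postgres_conn_id,
                         database=database, **kwargs)
        self.table_name = table_name
        self.drop_table = drop_table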
    def execute(self, context=None):
        """look up the geometry where no geometry is present"""

        # get location data without geometry
        rows = self.get_non_geometry_records()

        # get BAG verblijfsobject ID from typeahead
        for record in rows:

            typeahead_result = self.prepare_typeahead_call(record)
            record_key = None
            bag_url = None
            for key, value in typeahead_result.items():
                record_key = key
                bag_url = value

            # extract the BAG id from the url, which is the last
            # series of numbers before the last forward-slash
            try:
                get_uri = urlparse(bag_url)
                if not isinstance(get_uri, ParseResult):
                    self.log.info(f"No BAG id found for {record}")
                    continue
                else:
                    bag_id = get_uri.path.rsplit("/")[-2]
                    self.log.info(f"BAG id found for {record_key}: {bag_id}")

            except AttributeError:
                self.log.error(
                    f"No BAG id found for {record_key} ({bag_url}), empty result..."
                )
                continue

            pg_hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)

            with pg_hook.get_cursor() as cursor:

                # update record with found geometry
                cursor.execute(
                    f"""
                        WITH BAG_VBO_GEOM AS (
                        SELECT geometrie
                        FROM public.bag_verblijfsobjecten
                        WHERE 1=1
                        AND identificatie = %s
                        )
                        UPDATE {self.source_table}
                        SET {self.geometry_column} = BAG_VBO_GEOM.geometrie
                        FROM BAG_VBO_GEOM
                        WHERE 1=1
                        AND {self.source_key_column} = %s;
                        COMMIT;
                        """,
                    (
                        bag_id,
                        record_key,
                    ),
                )
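A small standalone illustration of the rsplit("/")[-2] extraction used above; the URL is a made-up example that follows the trailing-slash shape the code assumes.
# Standalone illustration of the BAG id extraction (the URL is a made-up example).
from urllib.parse import urlparse

bag_url = "https://api.example.org/bag/verblijfsobjecten/0363010000543292/"
path = urlparse(bag_url).path   # "/bag/verblijfsobjecten/0363010000543292/"
bag_id = path.rsplit("/")[-2]   # last segment before the trailing slash
print(bag_id)                   # 0363010000543292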
    def execute(self, context):
        hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                            schema=self.database)

        # Start a list to hold copy information
        table_copies = [(
            self.source_table_name,
            self.target_table_name,
        )]

        # Find the cross-tables for n-m relations; we assume their names
        # start with f"{source_table_name}_"

        with hook.get_cursor() as cursor:
            # the underscore must be escaped because of its special meaning in a LIKE;
            # the exclamation mark is used as the escape character because
            # a backslash was not interpreted as an escape
            cursor.execute(
                """
                    SELECT tablename AS name FROM pg_tables
                    WHERE schemaname = 'public' AND tablename like %s ESCAPE '!'
                """,
                (f"{self.source_table_name}!_%", ),
            )

            cross_tables = cursor.fetchall()

        copies = []
        for row in cross_tables:
            source_table_name = row["name"]
            target_table_name = source_table_name.replace("_new", "")
            copies.append((source_table_name, target_table_name))

        # Define the SQL to execute by the super class.
        # This supports executing multiple statements in a single transaction
        self.sql = []

        for source_table_name, target_table_name in table_copies + copies:
            lookup = dict(
                source_table_name=source_table_name,
                target_table_name=target_table_name,
            )
            for sql in (
                    "CREATE TABLE IF NOT EXISTS {target_table_name} (LIKE {source_table_name} "
                    "INCLUDING CONSTRAINTS INCLUDING INDEXES)",
                    "TRUNCATE TABLE {target_table_name} CASCADE",
                    "INSERT INTO {target_table_name} SELECT * FROM {source_table_name}",
                    "DROP TABLE IF EXISTS {source_table_name} CASCADE"
                    if self.ind_drop else None,
            ):
                # The DROP statement slot is None when ind_drop is not set; skip it.
                if sql is None:
                    continue

                self.sql.append(sql.format(**lookup))

        return super().execute(context)
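For a concrete sense of what the loop above builds, this is roughly the statement list produced for a hypothetical source table benches_new copied to benches with ind_drop enabled (the table names are illustrative only).
# Rough contents of self.sql for a hypothetical benches_new -> benches copy (ind_drop=True):
[
    "CREATE TABLE IF NOT EXISTS benches (LIKE benches_new INCLUDING CONSTRAINTS INCLUDING INDEXES)",
    "TRUNCATE TABLE benches CASCADE",
    "INSERT INTO benches SELECT * FROM benches_new",
    "DROP TABLE IF EXISTS benches_new CASCADE",
]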
Example #5
def _load_from_database(**context):
    params = context['params']
    postgres_conn_id = params['postgres_conn_id']
    pg_hook = PostgresHook(postgres_conn_id=postgres_conn_id)
    cur = pg_hook.get_cursor()
    table_name = 'repositories'
    constraint_col0 = 'processed'
    constraint_col1 = 'contains_logging'
    raw_query = f"""SELECT * from {table_name} WHERE {constraint_col0} = %s AND {constraint_col1} = %s  LIMIT 20"""
    query = cur.mogrify(raw_query)
    repos = pg_hook.get_pandas_df(query, parameters=[True, True])
    task_instance = context['task_instance']
    task_instance.xcom_push('target_repositories', repos)
    return True
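A downstream task can retrieve the pushed DataFrame with xcom_pull; a minimal sketch, assuming the task above runs under the task id load_from_database (an assumption).
# Hedged sketch of a downstream consumer (the task id is an assumption).
def _process_repositories(**context):
    task_instance = context['task_instance']
    repos = task_instance.xcom_pull(task_ids='load_from_database',
                                    key='target_repositories')
    for _, repo in repos.iterrows():
        print(repo)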
Example #6
    def execute(self, context):
        try:
            # Executing the Copy command
            redshift_hook = PostgresHook(self.redshift_conn_id)
            cur = redshift_hook.get_cursor()
            cur.execute(self.sql)
            int_row_count = cur.rowcount
            cur.execute("END TRANSACTION;")
            self.logger.info(
                '{} records have been merged.'.format(int_row_count))

        except Exception as e:
            self.logger.error('Failed to load data, {}'.format(e))
            raise AirflowException('Failed to load data, {}'.format(e))
    def get_non_geometry_records(self):
        """get location values from table (for record with no geometry)"""

        pg_hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)

        with pg_hook.get_cursor() as cursor:

            cursor.execute(f"""
                    SELECT
                        {self.source_location_column}
                    ,   {self.source_key_column}
                    FROM {self.source_table}
                    WHERE 1=1
                    AND {self.geometry_column} is NULL
                """)

            rows = cursor.fetchall()

        return rows
def index_jobs(**context):

    pgsql = PostgresHook(postgres_conn_id="pgsql")
    cur = pgsql.get_cursor()

    algolia_conn = BaseHook.get_connection('algolia')
    client = SearchClient.create(algolia_conn.login, algolia_conn.password)
    index = client.init_index('jobs')

    jobs_sql_query = """
      SELECT 
        j.id AS objectID,
        j.provider_id AS provider_id,
        j.remote_id_on_provider AS remote_id_on_provider,
        j.remote_url AS remote_url,
        j.location AS location,
        j.currency_code AS currency_code,
        j.company_id AS company_id,
        j.company_name AS company_name,
        j.title AS title,
        j.description AS description,
        j.tags AS tags,
        j.salary AS salary,
        j.salary_max AS salary_max,
        j.salary_frequency AS salary_frequency,
        j.has_relocation_package AS has_relocation_package,
        j.expires_at AS expires_at,
        j.published_at AS published_at,
        c.id AS child_company_id,
        c.name AS child_company_name,
        c.remote_url AS child_company_remote_url
      FROM job_vacancies j
        LEFT JOIN companies c ON (c.id = j.company_id)
      WHERE
        CAST(j.published_at AS DATE) = '{}'::DATE
    """.format(context['execution_date'])

    cur.execute(jobs_sql_query)
    rows = cur.fetchall()
    index.save_objects(rows)
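Algolia's save_objects expects dict-like records carrying an objectID, while a default psycopg2 cursor returns plain tuples, so the AS objectID aliases above only survive if rows come back as mappings. A sketch of one way to get dict rows through the hook, using psycopg2's RealDictCursor (whether the deployment already configures a dict cursor is an assumption to verify).
# Sketch: fetch rows as dicts so the objectID alias reaches Algolia intact.
from psycopg2.extras import RealDictCursor

conn = pgsql.get_conn()
cur = conn.cursor(cursor_factory=RealDictCursor)
cur.execute(jobs_sql_query)
rows = [dict(row) for row in cur.fetchall()]
index.save_objects(rows)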
Example #9
    def execute(self, context):
        postgres_hook = PostgresHook(postgres_conn_id=self._postgres_conn_id)
        s3_hook = S3Hook(aws_conn_id=self._s3_conn_id)

        with postgres_hook.get_cursor() as cursor:
            cursor.execute(self._query)
            results = cursor.fetchall()
            headers = [_[0] for _ in cursor.description]

        data_buffer = io.StringIO()
        csv_writer = csv.writer(data_buffer,
                                quoting=csv.QUOTE_ALL,
                                lineterminator=os.linesep)
        csv_writer.writerow(headers)
        csv_writer.writerows(results)
        data_buffer_binary = io.BytesIO(data_buffer.getvalue().encode())

        s3_hook.load_file_obj(
            file_obj=data_buffer_binary,
            bucket_name=self._s3_bucket,
            key=self._s3_key,
            replace=True,
        )
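The private attributes used above (self._postgres_conn_id, self._s3_conn_id, self._query, self._s3_bucket, self._s3_key) imply a constructor roughly like the sketch below; the class name and signature are assumptions, not the original code.
# Assumed constructor for the operator above (a sketch only).
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class PostgresToS3Operator(BaseOperator):  # hypothetical name
    @apply_defaults
    def __init__(self, postgres_conn_id, s3_conn_id, query, s3_bucket, s3_key,
                 **kwargs):
        super().__init__(**kwargs)
        self._postgres_conn_id = postgres_conn_id
        self._s3_conn_id = s3_conn_id
        self._query = query
        self._s3_bucket = s3_bucket
        self._s3_key = s3_key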
    def execute(self, context):
        # First get all index names, so it's known which indices to rename
        hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                            schema=self.database)
        with hook.get_cursor() as cursor:
            cursor.execute(
                "SELECT indexname FROM pg_indexes"
                " WHERE schemaname = 'public' AND indexname like %s"
                " ORDER BY indexname;",
                (f"%{self.old_table_name}%", ),
            )
            indexes = list(cursor.fetchall())

        index_renames = [(
            row["indexname"],
            re.sub(
                pattern=_get_complete_word_pattern(self.old_table_name),
                repl=self.new_table_name,
                string=row["indexname"],
                count=1,
            ),
        ) for row in indexes]

        backup_table = f"{self.new_table_name}_old"

        # Define the SQL to execute by the super class.
        # This supports executing multiple statements in a single transaction:
        self.sql = [
            f"ALTER TABLE IF EXISTS {self.new_table_name} RENAME TO {backup_table}",
            f"ALTER TABLE {self.old_table_name} RENAME TO {self.new_table_name}",
            f"DROP TABLE IF EXISTS {backup_table}",
        ] + [
            f"ALTER INDEX {old_index} RENAME TO {new_index}"
            for old_index, new_index in index_renames
        ]

        return super().execute(context)
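_get_complete_word_pattern is referenced but not shown; one plausible implementation (an assumption, not the original helper) builds a regex that matches the table name only when it is not glued to further letters or digits.
# Plausible sketch of the helper referenced above (an assumption, not the original code).
import re


def _get_complete_word_pattern(word):
    # Match `word` only when it is not preceded or followed by another
    # letter/digit, so "mytable" inside "mytable_pkey" matches but
    # "mytable" inside "mytable2" does not.
    return r"(?<![A-Za-z0-9]){}(?![A-Za-z0-9])".format(re.escape(word))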
Example #11
def load_data(**kwargs):
    params = kwargs['params']
    stock_index_name = params['stock_index_name']
    from_s3 = params['from_s3']

    ti = kwargs['ti']

    filename = ti.xcom_pull(key='return_value',
                            task_ids="clean_and_merge_industries")

    scraper = StockIndexScraper(stock_index_name,
                                from_s3=from_s3,
                                load_all=False)
    scraper.df = pd.read_csv(filename, index_col='Symbol')
    scraper.data = scraper.create_data()

    pghook = PostgresHook('postgres_db')
    cur = pghook.get_cursor()

    # delete old data
    delete_stmt = ("DELETE FROM visuals.index_component_stocks "
                   "WHERE stock_index_name = %s")
    cur.execute(delete_stmt, (stock_index_name, ))

    # insert new data
    row_count = 0
    for row in scraper.data_to_tuples():
        insert_stmt = ("INSERT INTO visuals.index_component_stocks "
                       "VALUES"
                       "(%s,%s,%s,%s,%s,%s,%s)")
        cur.execute(insert_stmt, row)
        row_count += 1

    pghook.conn.commit()

    return {'row_count': row_count}
    def execute(self, context):
        # First get all index names, so it's known which indices to rename
        hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                            schema=self.database)

        # Start a list to hold rename information
        table_renames = [(
            self.old_table_name,
            self.new_table_name,
            f"{self.new_table_name}_old",
        )]

        # Find the cross-tables for n-m relations; we assume their names
        # start with f"{old_table_name}_"

        with hook.get_cursor() as cursor:
            # the underscore must be escaped because of its special meaning in a LIKE;
            # the exclamation mark is used as the escape character because a backslash
            # was not interpreted as an escape
            cursor.execute(
                """
                    SELECT tablename AS name FROM pg_tables
                    WHERE schemaname = 'public' AND tablename like %s ESCAPE '!'
                """,
                (f"{self.old_table_name}!_%", ),
            )

            cross_tables = cursor.fetchall()
            cursor.execute(
                """
                    SELECT indexname AS name FROM pg_indexes
                    WHERE schemaname = 'public' AND indexname LIKE %s ESCAPE '!'
                    ORDER BY indexname
                """,
                (f"%{self.old_table_name}!_%", ),
            )
            indexes = cursor.fetchall()
        renames = []
        for row in cross_tables:
            old_table_name = row["name"]
            new_table_name = old_table_name.replace("_new", "")
            backup_table_name = f"{new_table_name}_old"
            renames.append((old_table_name, new_table_name, backup_table_name))

        idx_renames = [(
            row["name"],
            row["name"].replace(self.old_table_name, self.new_table_name),
        ) for row in indexes]
        # Define the SQL to execute by the super class.
        # This supports executing multiple statements in a single transaction:
        self.sql = []

        for sql in (
                "ALTER TABLE IF EXISTS {new_table_name} RENAME TO {backup_table_name}",
                "ALTER TABLE IF EXISTS {old_table_name} RENAME TO {new_table_name}",
                "DROP TABLE IF EXISTS {backup_table_name}",
        ):

            for old_table_name, new_table_name, backup_table_name in (
                    table_renames + renames):
                lookup = dict(
                    old_table_name=old_table_name,
                    new_table_name=new_table_name,
                    backup_table_name=backup_table_name,
                )
                self.sql.append(sql.format(**lookup))

        for old_name, new_name in idx_renames:
            self.sql.append(
                f"ALTER INDEX IF EXISTS {old_name} RENAME TO {new_name}")

        return super().execute(context)
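Because the outer loop runs over statement templates and the inner loop over tables, the resulting list is grouped by statement type. For a hypothetical old table mytable_new with one cross-table mytable_new_tags, self.sql ends up roughly as follows (names are illustrative only).
# Rough shape of self.sql for a hypothetical mytable_new -> mytable rename
# with one cross-table mytable_new_tags (illustrative names only):
[
    "ALTER TABLE IF EXISTS mytable RENAME TO mytable_old",
    "ALTER TABLE IF EXISTS mytable_tags RENAME TO mytable_tags_old",
    "ALTER TABLE IF EXISTS mytable_new RENAME TO mytable",
    "ALTER TABLE IF EXISTS mytable_new_tags RENAME TO mytable_tags",
    "DROP TABLE IF EXISTS mytable_old",
    "DROP TABLE IF EXISTS mytable_tags_old",
    # ...followed by "ALTER INDEX IF EXISTS ... RENAME TO ..." for each matched index
]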
    def execute(self, context):
        '''
        :param context: This is an Airflow standard; it contains metadata
        about the job and task. I use it here to generate descriptive filenames.
        '''

        # Decide where we'll put the results.
        # The file is temporarily constructed within /tmp and copied
        # at the end to a unique path in Google Cloud Storage based
        # on the job and task names as well as the job date.
        # We need a temp file since we can't append directly
        # to GCS files.
        target_csv = task_output_file_path(self.output_basedir, context)
        temp_csv = task_temp_file(context)
        log.info('Temp output: {}'.format(temp_csv))
        log.info('Final output: {}'.format(target_csv))

        with open(temp_csv, 'w', newline='') as csvfile:
            csvwriter = csv.writer(csvfile,
                                   delimiter=',',
                                   quotechar='"',
                                   quoting=csv.QUOTE_MINIMAL)

            # We store the database credentials as a cluster configuration
            # rather than in code, similar to the Variables feature already
            # described in onethree_dag.py. In this case, you can view the
            # connection 'pg_onethree_demo' in the Airflow interface by
            # clicking on Admin->Connections and searching for that name.
            #
            # PostgresHook is a helper class that takes care of getting
            # those credentials and giving us the corresponding DB cursor.
            #
            # The other benefit of doing it this way is that connection
            # settings are shared across tasks, which helps avoid overloading the DB.
            pg_hook = PostgresHook('pg_onethree_demo',
                                   supports_autocommit=True)

            # Pull the list of drugbank IDs from the database for this batch.
            #
            # A comment on database structure: I'm using a surrogate key (id)
            # and treating drugbank_provided_id just as a property.
            # The table where targets are stored (drugbank_targets) has
            # a foreign key from drugbank_drug to this id column. Among other
            # benefits this makes indexes smaller and joins faster since an integer
            # is a compact data type compared to the character ID.
            cursor = pg_hook.get_cursor()
            cursor.execute(
                'SELECT id, drugbank_provided_id FROM public.drugbank_drugs '
                'WHERE id % {} = {}'.format(self.num_batches, self.batch_num))

            drugbank_id = cursor.fetchone()
            while drugbank_id:

                # Get the contents of the specific page
                url = 'https://www.drugbank.ca/drugs/' + drugbank_id[1]
                r = requests.get(url)

                # Parse the HTML of the page using BeautifulSoup with html5lib, which
                # works the same as a browser.
                soup = BeautifulSoup(r.text, 'html5lib')

                # Isolate our search to just the 'targets' div (i.e. what you'd see
                # visually when browsing to the TARGETS section of the page).
                #
                # This uses a CSS selector to get the div with both the
                # 'bond-list-container' and 'targets' CSS classes. "bond-list-container"
                # basically specifies "a section" and "targets" indicates which one.
                #
                # NOTE: HTML parsing is a bit brittle. I try to address that by harnessing
                # the logic in their design (for example they specifically identify the target
                # section with a class) and making as few other assumptions about the
                # structure as I can. However, an additional and far more robust
                # safeguard that I'd use in production would be size checks: Defining
                # the range of data quantities that we expect relative to past observations,
                # and alerting me when the real quantity is outside that range.
                # In my experience this catches 90% of issues.

                targets_div = soup.select('div.bond-list-container.targets')

                # If the page is a stub or otherwise lacks targets, we are unable
                # to pull any so continue.
                if len(targets_div) == 0:
                    drugbank_id = cursor.fetchone()
                    continue
                targets_div = targets_div[0]

                # Find the <dt> tags which contain the words 'Gene Name'.
                # Handle trivial changes to letter casing or surrounding whitespace.
                def ci_match_gn(string):
                    return string and 'GENE NAME' in string.upper()

                gene_strings = targets_div.find_all('dt', string=ci_match_gn)

                for t in gene_strings:
                    # Our target should be the next sibling of <dt>, i.e. <dd>.
                    # If it isn't, they might have introduced a line break which
                    # would become a sibling in between the two elements, so handle
                    # that case.

                    target = t.next_sibling
                    if target.name != 'dd':
                        target = target.next_sibling
                    if target.name != 'dd':
                        continue

                    drug_target = target.string

                    # Targets will be stored in a separate table, with a foreign
                    # key back to the drug table's id column.
                    csvwriter.writerow([drugbank_id[0], drug_target])

                drugbank_id = cursor.fetchone()

        # Move the CSV to its final location.
        #
        # Since the same code is run locally during development and in
        # production, we use an Airflow variable 'environment' that we have
        # set to distinguish environments. The helper function uses it to
        # determine whether we are copying to a local filesystem location or a
        # Cloud Storage location, since we will need different commands.
        #
        # The 'environment' variable has many other uses, for example
        # deciding whether to page the engineering team for unrecoverable
        # errors (yes in 'production', usually no in 'dev').
        env = Variable.get('environment')
        move_to_output_loc(env, temp_csv, target_csv)

        # Pass the name of the output CSV file to whatever
        # downstream task might need it. It's pulled similarly
        # using xcom_pull (shown later).
        #
        # XCOM is another Airflow feature, this time allowing message
        # passing (of small data like this) between tasks.
        context['task_instance'].xcom_push('csv_path', target_csv)
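A downstream task can pick up the pushed path with xcom_pull; a minimal sketch, assuming the scraping task above runs under the task id scrape_targets (an assumption).
# Hedged sketch of a downstream consumer of the 'csv_path' XCom
# (the task id 'scrape_targets' is an assumption).
def load_targets_to_db(**context):
    csv_path = context['task_instance'].xcom_pull(task_ids='scrape_targets',
                                                  key='csv_path')
    log.info('Loading CSV of drug targets from {}'.format(csv_path))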