Example #1
import json

# Imports this example relies on (Airflow 2 provider paths assumed)
from airflow.exceptions import AirflowException
from airflow.providers.http.hooks.http import HttpHook
from airflow.providers.mongo.hooks.mongo import MongoHook


def extract(
    batch_id, method="GET", http_conn_id="default_api", mongo_conn_id="default_mongo"
):

    http = HttpHook(method, http_conn_id=http_conn_id)

    mongo_conn = MongoHook(mongo_conn_id)
    ids_to_update_coll = mongo_conn.get_collection("ids_to_update", "courts")
    results_to_transform_coll = mongo_conn.get_collection(
        "results_to_transform", "courts"
    )

    # Note/TODO: because we add endpoints back that we couldn't handle, we may
    # get stuck in an infinite loop. Another solution is exiting whenever an
    # exception occurs, but this isn't ideal either
    while ids_to_update_coll.find_one({"batch_id": str(batch_id)}) is not None:

        # find a job to work on
        result = ids_to_update_coll.find_one_and_delete({"batch_id": str(batch_id)})
        api_id = result["api_id"]
        try:

            # transform to get a valid link
            # TODO: this needs to be generalized to any website
            endpoint = f"opinions/{api_id}"

            # pull data in
            response = http.run(endpoint)

            if response.status_code == 200:
                # parse the payload only after a successful status, so a
                # non-JSON error body cannot pre-empt the status check below
                result_data = response.json()

                # store our result into mongo
                results_to_transform_coll.insert_one(
                    {"batch_id": str(batch_id), "data": result_data}
                )

            else:
                # TODO: throw a more specific exception
                raise AirflowException(
                    f"Received {response.status_code} code from {endpoint}."
                )

        except json.JSONDecodeError as j_error:
            print("Failed to decode response with {j_error}:\n{response.body}")
            mongo_conn.insert_one(
                "ids_to_update",
                {"api_id": str(api_id), "batch_id": str(batch_id)},
                mongo_db="courts",
            )
        except Exception as error:
            # something went wrong. Log it and return this endpoint to mongoDB so we can try again
            print(f"An exception occured while processing batch {batch_id}:\n{error}")
            mongo_conn.insert_one(
                "ids_to_update",
                {"api_id": str(api_id), "batch_id": str(batch_id)},
                mongo_db="courts",
            )
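
A minimal sketch of how this extract callable might be wired into a DAG with PythonOperator; the dag_id, schedule, and batch_id below are illustrative assumptions, not part of the original example.

from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

# Hypothetical DAG wiring for the extract() task above; dag_id, schedule,
# and batch_id are assumptions for illustration only.
with DAG(
    dag_id="courts_etl",
    start_date=datetime(2021, 1, 1),
    schedule_interval=None,
    catchup=False,
) as dag:
    extract_task = PythonOperator(
        task_id="extract",
        python_callable=extract,
        op_kwargs={"batch_id": 1},
    )
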
Example #2
    def execute(self, context):
        mongoHook = MongoHook(conn_id=self.mongo_conn_id)
        self.mongo_db = mongoHook.connection.schema
        log.info('postgres_conn_id: %s', self.postgres_conn_id)
        log.info('mongo_conn_id: %s', self.mongo_conn_id)
        log.info('postgres_sql: %s', self.postgres_sql)
        # log.info('prev_exec_date: %s', self.prev_exec_date)
        log.info('mongo_db: %s', self.mongo_db)
        log.info('mongo_collection: %s', self.mongo_collection)

        well_data = self.get_data()
        most_recent_date = Variable.get("most_recent_date")
        log.info('most_recent_date: %s', most_recent_date)
        filter_query = None
        for index, well in well_data.iterrows():
            if well is not None and well['is_newly_added']:
                log.info('newly added')
                filter_query = {"Name": {"$eq": well['well_name']}}
            else:
                log.info('old well')
                filter_query = {
                    "$and": [{
                        "Name": {
                            "$eq": well['well_name']
                        }
                    }, {
                        "Date": {
                            "$gt": most_recent_date
                        }
                    }]
                }
                # filter_query = { "Date" : { "$gt" : most_recent_date } }

            log.info('mongo filter query: %s', filter_query)
            mongo_well_list = self.transform(
                mongoHook.get_collection(
                    self.mongo_collection).find(filter_query))
            log.info('matching mongo documents: %s', len(mongo_well_list))
            if mongo_well_list:
                for doc in mongo_well_list:
                    doc["water_cut_calc"] = utils.calc_watercut(
                        doc['OIL_bopd'], doc['WATER_bwpd'])
                    doc["gor_calc"] = utils.calc_gor(doc['OIL_bopd'],
                                                     doc['GAS_mscfd'])

                self.update_records(mongoHook, filter_query, mongo_well_list)
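
The utils.calc_watercut and utils.calc_gor helpers used above are not shown; a plausible sketch under the standard definitions (water cut = water rate over total liquid rate, GOR = gas rate over oil rate), with units taken from the document keys (bopd, bwpd, mscfd). These are assumptions, not the original utils module.

def calc_watercut(oil_bopd, water_bwpd):
    """Water cut: water fraction of total produced liquid (assumed definition)."""
    total_liquid = oil_bopd + water_bwpd
    if total_liquid == 0:
        return None  # avoid division by zero for a shut-in well
    return water_bwpd / total_liquid


def calc_gor(oil_bopd, gas_mscfd):
    """Gas-oil ratio in scf/bbl; gas arrives in mscf/day (assumed units)."""
    if oil_bopd == 0:
        return None
    return (gas_mscfd * 1000.0) / oil_bopd
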
Example #3
    def test_transform_load_operator(
        self, mocker, postgresql, ports_collection, test_dag
    ):
        """Test if transform_load_operator upserts data into master db."""
        # Create mocks
        mocker.patch.object(
            PostgresHook,
            "get_conn",
            return_value=postgresql
        )
        mocker.patch.object(
            MongoHook,
            "get_collection",
            return_value=ports_collection
        )

        # Check if the source collection has an item in it
        mongo_hook = MongoHook()
        collection = mongo_hook.get_collection()
        assert collection.count_documents({}) > 0

        # Check if the sink table is initially empty
        cursor = postgresql.cursor()
        cursor.execute("SELECT COUNT(*) FROM ports;")
        initial_result = cursor.fetchone()[0]
        assert initial_result == 0

        # Setup task
        mongo_staging_config = MongoConfig('mongo_default', 'ports')
        postgres_master_config = PostgresConfig('postgres_default')
        task = TransformAndLoadOperator(
            mongo_config=mongo_staging_config,
            postgres_config=postgres_master_config,
            task_id='test',
            processor=PortsItemProcessor(),
            query=SqlQueries.ports_table_insert,
            query_params={"updated_at": datetime.datetime.utcnow()},
            dag=test_dag
        )

        # Execute task and check if it inserted the data successfully
        task.execute(context={}, testing=True)
        cursor.execute("SELECT COUNT(*) FROM ports;")
        after_result = cursor.fetchone()[0]
        assert after_result > 0
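
These tests lean on fixtures the snippet does not define: mocker comes from pytest-mock, postgresql typically from the pytest-postgresql plugin, while ports_collection and test_dag are project-specific. A minimal sketch of what a test_dag fixture could look like; the fixture body is an assumption.

import datetime

import pytest
from airflow import DAG


@pytest.fixture
def test_dag():
    # A throwaway DAG so operators can be instantiated outside a scheduler;
    # dag_id and dates are assumptions for illustration.
    return DAG(
        dag_id="test_dag",
        start_date=datetime.datetime(2021, 1, 1),
        schedule_interval=None,
    )
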
Example #4
    def test_transform_load_operator_database_error(
        self, mocker, postgresql, ports_collection, test_dag
    ):
        """Test if transform_load_operator handles DB errors."""
        # Create mocks
        mocker.patch.object(
            PostgresHook,
            "get_conn",
            return_value=postgresql
        )
        mocker.patch.object(
            MongoHook,
            "get_collection",
            return_value=ports_collection
        )

        # Check if the source collection has an item in it
        mongo_hook = MongoHook()
        collection = mongo_hook.get_collection()
        assert collection.count_documents({}) > 0

        # Setup task, intentionally give an unknown table
        mongo_staging_config = MongoConfig('mongo_default', 'ports')
        postgres_master_config = PostgresConfig('postgres_default')
        task = TransformAndLoadOperator(
            mongo_config=mongo_staging_config,
            postgres_config=postgres_master_config,
            task_id='test',
            processor=PortsItemProcessor(),
            query=SqlQueries.ports_table_insert.replace(
                'ports', 'ports_wrong'
            ),
            query_params={"updated_at": datetime.datetime.utcnow()},
            dag=test_dag
        )

        # Execute the task and check that it raises an UndefinedTable error
        # (the operator re-raises it as a plain Exception, hence the tuple)
        with raises((UndefinedTable, Exception, OperationalError)):
            # Set testing to False to implicitly close the database;
            # this call raises, so nothing after it in the block would run
            task.execute(context={}, testing=False)
Example #5
    def execute(self, context, testing=False):
        """
        Read all data from the Mongo staging DB, process it,
        and write it to the PostgreSQL master DB.

        Uses an UPSERT SQL query to write the data.
        """
        self.log.info('LoadToMasterdbOperator Starting...')
        self.log.info("Initializing Mongo Staging DB Connection...")
        mongo_hook = MongoHook(conn_id=self._mongo_conn_id)
        ports_collection = mongo_hook.get_collection(self._mongo_collection)
        self.log.info("Initializing Postgres Master DB Connection...")
        psql_hook = PostgresHook(postgres_conn_id=self._postgres_conn_id)
        psql_conn = psql_hook.get_conn()
        psql_cursor = psql_conn.cursor()
        self.log.info("Loading Staging data to Master Database...")
        idx = -1  # keeps the summary log below valid for an empty collection
        try:
            for idx, document in enumerate(ports_collection.find({})):
                document = self._processor.process_item(document)
                # move Mongo's ObjectId into a string staging_id field
                document['staging_id'] = str(document.pop('_id'))
                psql_cursor.execute(self._sql_query, document)
            psql_conn.commit()
        except (OperationalError, UndefinedTable, OperationFailure):
            self.log.error("Writting to database FAILED.")
            self.log.error(traceback.format_exc())
            raise Exception("LoadToMasterdbOperator FAILED.")
        except Exception:
            self.log.error(traceback.format_exc())
            raise Exception("LoadToMasterdbOperator FAILED.")
        finally:
            if not testing:
                self.log.info('Closing database connections...')
                psql_conn.close()
                mongo_hook.close_conn()
        self.log.info(f'UPSERTED {idx+1} records into Postgres Database.')
        self.log.info('LoadToMasterdbOperator SUCCESS!')
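
The docstring mentions an UPSERT query, and the cursor is handed the whole document dict, which implies psycopg2's named %(field)s placeholder style. SqlQueries.ports_table_insert itself is not shown; a sketch of what it could look like for the ports table, with the column list taken from the expected JSON output in the tests below and the conflict target assumed (the query_params handling is not modeled here).

# Hypothetical upsert for the ports table; columns mirror the test's
# expected output, and the ON CONFLICT target is an assumption.
ports_table_insert = """
    INSERT INTO ports (staging_id, countryName, portName, unlocode, coordinates)
    VALUES (%(staging_id)s, %(countryName)s, %(portName)s, %(unlocode)s, %(coordinates)s)
    ON CONFLICT (staging_id) DO UPDATE SET
        countryName = EXCLUDED.countryName,
        portName    = EXCLUDED.portName,
        unlocode    = EXCLUDED.unlocode,
        coordinates = EXCLUDED.coordinates;
"""
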
Example #6
    def execute(self, context):

        mongoHook = MongoHook(conn_id=self.mongo_conn_id)

        log.info('odbc_conn_id: %s', self.odbc_conn_id)
        log.info('postgres_conn_id: %s', self.postgres_conn_id)
        log.info('mongo_conn_id: %s', self.mongo_conn_id)
        log.info('mongo_db: %s', mongoHook.connection.schema)
        log.info('mongo_collection: %s', self.mongo_collection)
        log.info('odbc_sql: %s', self.odbc_sql)
        log.info('postgres_sql: %s', self.postgres_sql)
        log.info('postgres_insert_sql: %s', self.postgres_insert_sql)

        mongo_well_list = mongoHook.get_collection(
            self.mongo_collection).distinct("Name")
        log.info('mongo well list: %s', mongo_well_list)
        odbc_well_list = self.get_data()
        log.info('odbc well list: %s', odbc_well_list)
        final_well_list = []
        if not mongo_well_list:
            final_well_list = self.prepare_well_list(odbc_well_list, True)
        else:
            mongo_filtered_well_list = self.prepare_well_list(
                mongo_well_list, False)
            new_well_list = list(set(odbc_well_list) - set(mongo_well_list))
            log.info('new well list: %s', new_well_list)
            new_well_list = self.prepare_well_list(new_well_list, True)
            postgres_well_list = self.get_well_data()
            if postgres_well_list.empty:
                for item in new_well_list:
                    final_well_list.append(item)
            else:
                final_well_list = new_well_list

        log.info('final well list for insert: %s', final_well_list)
        if final_well_list:
            self.insert_data(final_well_list)
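
prepare_well_list is another helper that is not shown. Judging from its call sites (a list of well names plus an is_newly_added flag) and the keys read in the execute() method of Example #2 (well_name, is_newly_added), a plausible sketch:

# Plausible shape of the missing helper, inferred from its call sites;
# this is an assumption, not the original implementation.
def prepare_well_list(self, well_names, is_newly_added):
    return [
        {"well_name": name, "is_newly_added": is_newly_added}
        for name in well_names
    ]
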
Example #7
    def test_transform_load_operator_exception_error(
        self, mocker, postgresql, ports_collection, test_dag
    ):
        """Test if transform_load_operator handles Exception thrown."""
        # Create mocks
        mocker.patch.object(
            PostgresHook,
            "get_conn",
            return_value=postgresql
        )
        mocker.patch.object(
            MongoHook,
            "get_collection",
            return_value=ports_collection
        )

        # Check if the source collection has an item in it
        mongo_hook = MongoHook()
        collection = mongo_hook.get_collection()
        assert collection.count_documents({}) > 0

        # Setup task
        mongo_staging_config = MongoConfig('mongo_default', 'ports')
        postgres_master_config = PostgresConfig('postgres_default')
        task = TransformAndLoadOperator(
            mongo_config=mongo_staging_config,
            postgres_config=postgres_master_config,
            task_id='test',
            processor=PortsItemProcessor(),
            query='Wrong SQL query',
            dag=test_dag
        )

        # Execute the task and check that it raises an Exception
        with raises(Exception):
            task.execute(context={}, testing=True)
Example #8
    def test_save_to_json_operator(
        self, mocker, postgresql, ports_collection, test_dag,
        tmp_path: Path
    ):
        """Test if save_to_json_operator saves the file on a specified path"""
        # Create mocks
        mocker.patch.object(
            PostgresHook,
            "get_conn",
            return_value=postgresql
        )
        mocker.patch.object(
            MongoHook,
            "get_collection",
            return_value=ports_collection
        )

        # Check if the source collection has an item in it
        mongo_hook = MongoHook()
        collection = mongo_hook.get_collection()
        assert collection.count_documents({}) > 0

        # Setup some data, transfer staging data to master
        mongo_staging_config = MongoConfig('mongo_default', 'ports')
        postgres_master_config = PostgresConfig('postgres_default')
        transform_load = TransformAndLoadOperator(
            mongo_config=mongo_staging_config,
            postgres_config=postgres_master_config,
            task_id='test',
            processor=PortsItemProcessor(),
            query=SqlQueries.ports_table_insert,
            query_params={"updated_at": datetime.datetime.utcnow()},
            dag=test_dag
        )

        # Execute task and check if it inserted the data successfully
        transform_load.execute(context={}, testing=True)
        pg_hook = PostgresHook()
        cursor = pg_hook.get_conn().cursor()
        cursor.execute("SELECT COUNT(*) FROM ports;")
        after_result = cursor.fetchone()[0]
        assert after_result > 0

        # Alter tmp_path to force creation of a not-yet-existing path
        tmp_path = tmp_path / 'unknown-path'

        # Execute save_to_json to save the data into json file on tmp_path
        save_to_json = LoadToJsonOperator(
            task_id='export_to_json',
            postgres_config=postgres_master_config,
            query=SqlQueries.select_all_query_to_json,
            path=tmp_path,
            tables=['ports'],
            dag=test_dag
        )
        save_to_json.execute(
            {'execution_date': datetime.datetime(2021, 1, 1)}
        )

        output_path = tmp_path / 'ports_20210101T000000.json'

        expected_data = {
            'ports': [{
                'id': 1,
                'countryName': 'Philippines',
                'portName': 'Aleran/Ozamis',
                'unlocode': 'PH ALE',
                'coordinates': '4234N 00135E'
            }]
        }

        # Read result
        with open(output_path, "r") as f:
            result = json.load(f)

        # Assert
        assert 'ports' in result
        assert result == expected_data
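
The asserted output path ports_20210101T000000.json suggests LoadToJsonOperator names files as <table>_<execution_date formatted %Y%m%dT%H%M%S>.json. A sketch of that presumed convention; this is an inference from the test, not a confirmed API.

import datetime
from pathlib import Path


def output_path_for(base: Path, table: str, execution_date: datetime.datetime) -> Path:
    # Presumed naming convention, inferred from the asserted path above
    return base / f"{table}_{execution_date:%Y%m%dT%H%M%S}.json"

# output_path_for(tmp_path, 'ports', datetime.datetime(2021, 1, 1))
#   -> tmp_path / 'ports_20210101T000000.json'
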
Example #9
    def test_save_to_json_operator_database_error(
        self, mocker, postgresql, ports_collection, test_dag,
        tmp_path: Path
    ):
        """Test if save_to_json_operator can handle errors related to db."""
        # Create mocks
        mocker.patch.object(
            PostgresHook,
            "get_conn",
            return_value=postgresql
        )
        mocker.patch.object(
            MongoHook,
            "get_collection",
            return_value=ports_collection
        )

        # Check if the source collection has an item in it
        mongo_hook = MongoHook()
        collection = mongo_hook.get_collection()
        assert collection.count_documents({}) > 0

        # Setup some data, transfer staging data to master
        mongo_staging_config = MongoConfig('mongo_default', 'ports')
        postgres_master_config = PostgresConfig('postgres_default')
        transform_load = TransformAndLoadOperator(
            mongo_config=mongo_staging_config,
            postgres_config=postgres_master_config,
            task_id='test',
            processor=PortsItemProcessor(),
            query=SqlQueries.ports_table_insert,
            query_params={"updated_at": datetime.datetime.utcnow()},
            dag=test_dag
        )

        # Execute task and check if it inserted the data successfully
        transform_load.execute(context={}, testing=True)
        pg_hook = PostgresHook()
        cursor = pg_hook.get_conn().cursor()
        cursor.execute("SELECT COUNT(*) FROM ports;")
        after_result = cursor.fetchone()[0]
        assert after_result > 0

        # Execute save_to_json to save the data into json file on tmp_path
        save_to_json = LoadToJsonOperator(
            task_id='test2',
            postgres_config=postgres_master_config,
            query=SqlQueries.select_all_query_to_json,
            path=tmp_path,
            tables=['foo'],
            dag=test_dag
        )
        with raises((UndefinedTable, OperationalError, Exception)):
            # Set testing=False to implicitly close the database connection;
            # this call raises, so a second call here would never execute
            save_to_json.execute(
                {'execution_date': datetime.datetime(2021, 1, 1)},
                testing=False
            )