Example #1
    def execute(self, context: Dict[str, str]):
        hook = CassandraHook(cassandra_conn_id=self.cassandra_conn_id)

        query_extra = {}
        if self.query_timeout is not NOT_SET:
            query_extra['timeout'] = self.query_timeout

        cursor = hook.get_conn().execute(self.cql, **query_extra)

        files_to_upload = self._write_local_data_files(cursor)

        # If a schema is set, create a BQ schema JSON file.
        if self.schema_filename:
            files_to_upload.update(self._write_local_schema_file(cursor))

        # Flush all files before uploading
        for file_handle in files_to_upload.values():
            file_handle.flush()

        self._upload_to_gcs(files_to_upload)

        # Close all temp file handles.
        for file_handle in files_to_upload.values():
            file_handle.close()

        # Close all sessions and connections associated with this Cassandra cluster
        hook.shutdown_cluster()
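
The query_timeout check above relies on a sentinel object. A minimal sketch of the pattern (names here are illustrative, not the provider's module): a unique object lets the caller tell "no timeout supplied" apart from an explicit timeout=None, which the Cassandra driver treats as a real value.

NOT_SET = object()  # unique sentinel: compares unequal to every real value, including None

def run_query(session, cql, query_timeout=NOT_SET):
    # Forward the timeout kwarg only when the caller actually supplied one,
    # so an explicit timeout=None still reaches the driver unchanged.
    query_extra = {}
    if query_timeout is not NOT_SET:
        query_extra['timeout'] = query_timeout
    return session.execute(cql, **query_extra)
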
Example #2
    def setUp(self):
        db.merge_conn(
            Connection(conn_id='cassandra_test',
                       conn_type='cassandra',
                       host='host-1,host-2',
                       port='9042',
                       schema='test_keyspace',
                       extra='{"load_balancing_policy":"TokenAwarePolicy"}'))
        db.merge_conn(
            Connection(conn_id='cassandra_default_with_schema',
                       conn_type='cassandra',
                       host='cassandra',
                       port='9042',
                       schema='s'))

        hook = CassandraHook("cassandra_default")
        session = hook.get_conn()
        cqls = [
            "DROP SCHEMA IF EXISTS s",
            """
                CREATE SCHEMA s WITH REPLICATION =
                    { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }
            """,
        ]
        for cql in cqls:
            session.execute(cql)

        session.shutdown()
        hook.shutdown_cluster()
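
The connections merged above can also be supplied without touching the metadata database: Airflow resolves AIRFLOW_CONN_<CONN_ID> environment variables as connection URIs. A hedged sketch of the 'cassandra_test' equivalent (the exact URI encoding of multiple contact points and of extras can vary by Airflow version, so treat the string below as an assumption):

import os

# Hypothetical env-var form of the 'cassandra_test' Connection; extras
# become URI query parameters.
os.environ['AIRFLOW_CONN_CASSANDRA_TEST'] = (
    'cassandra://host-1%2Chost-2:9042/test_keyspace'
    '?load_balancing_policy=TokenAwarePolicy'
)
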
Example #3
 def _query_cassandra(self):
     """
     Queries Cassandra and returns a cursor to the results.
     """
     self.hook = CassandraHook(cassandra_conn_id=self.cassandra_conn_id)
     session = self.hook.get_conn()
     cursor = session.execute(self.cql)
     return cursor
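
The "cursor" returned here is the cassandra-driver ResultSet produced by Session.execute(), so it can be iterated directly. A minimal consumption sketch, where op stands for a hypothetical instance of the operator above:

rows = op._query_cassandra()  # 'op' is a hypothetical operator instance
for row in rows:
    # With the driver's default row factory, each row is a named tuple
    # keyed by the selected columns.
    print(row)
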
Example #4
    def test_get_conn(self):
        with mock.patch.object(Cluster, "__init__") as mock_cluster_ctor:
            mock_cluster_ctor.return_value = None
            CassandraHook(cassandra_conn_id='cassandra_test')
            mock_cluster_ctor.assert_called_once_with(
                contact_points=['host-1', 'host-2'],
                port=9042,
                protocol_version=4,
                load_balancing_policy=mock.ANY,
            )

            assert isinstance(mock_cluster_ctor.call_args[1]['load_balancing_policy'], TokenAwarePolicy)
Example #5
    def test_get_conn(self):
        with mock.patch.object(Cluster, "connect") as mock_connect, \
                mock.patch("socket.getaddrinfo", return_value=[]) as mock_getaddrinfo:
            mock_connect.return_value = 'session'
            hook = CassandraHook(cassandra_conn_id='cassandra_test')
            hook.get_conn()
            assert mock_getaddrinfo.called
            mock_connect.assert_called_once_with('test_keyspace')

            cluster = hook.get_cluster()
            self.assertEqual(cluster.contact_points, ['host-1', 'host-2'])
            self.assertEqual(cluster.port, 9042)
            self.assertTrue(isinstance(cluster.load_balancing_policy, TokenAwarePolicy))
Example #6
    def test_table_exists_with_keyspace_from_session(self):
        hook = CassandraHook("cassandra_default_with_schema")
        session = hook.get_conn()
        cqls = [
            "DROP TABLE IF EXISTS t",
            "CREATE TABLE t (pk1 text PRIMARY KEY)",
        ]
        for cql in cqls:
            session.execute(cql)

        self.assertTrue(hook.table_exists("t"))
        self.assertFalse(hook.table_exists("u"))

        session.shutdown()
        hook.shutdown_cluster()
Example #7
    def test_table_exists_with_keyspace_from_cql(self):
        hook = CassandraHook("cassandra_default")
        session = hook.get_conn()
        cqls = [
            "DROP TABLE IF EXISTS s.t",
            "CREATE TABLE s.t (pk1 text PRIMARY KEY)",
        ]
        for cql in cqls:
            session.execute(cql)

        assert hook.table_exists("s.t")
        assert not hook.table_exists("s.u")

        session.shutdown()
        hook.shutdown_cluster()
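
A hedged usage sketch of the two lookup forms exercised by this test and the previous one: table_exists() accepts either a bare table name, with the keyspace taken from the connection's schema, or a keyspace-qualified "keyspace.table" string.

from airflow.providers.apache.cassandra.hooks.cassandra import CassandraHook

hook = CassandraHook('cassandra_default_with_schema')
print(hook.table_exists('t'))    # keyspace comes from the connection's schema
print(hook.table_exists('s.t'))  # keyspace named explicitly
hook.shutdown_cluster()
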
Example #8
    def test_record_exists_with_keyspace_from_session(self):
        hook = CassandraHook("cassandra_default_with_schema")
        session = hook.get_conn()
        cqls = [
            "DROP TABLE IF EXISTS t",
            "CREATE TABLE t (pk1 text, pk2 text, c text, PRIMARY KEY (pk1, pk2))",
            "INSERT INTO t (pk1, pk2, c) VALUES ('foo', 'bar', 'baz')",
        ]
        for cql in cqls:
            session.execute(cql)

        self.assertTrue(hook.record_exists("t", {"pk1": "foo", "pk2": "bar"}))
        self.assertFalse(hook.record_exists("t", {"pk1": "foo", "pk2": "baz"}))

        session.shutdown()
        hook.shutdown_cluster()
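
Outside the test harness the same check is a short script; a hedged sketch reusing the connection id from this test:

from airflow.providers.apache.cassandra.hooks.cassandra import CassandraHook

hook = CassandraHook('cassandra_default_with_schema')
# record_exists() issues a SELECT filtered on the key columns passed in.
if hook.record_exists('t', {'pk1': 'foo', 'pk2': 'bar'}):
    print('row present')
hook.shutdown_cluster()
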
Example #9
    def execute(self, context: 'Context'):
        hook = CassandraHook(cassandra_conn_id=self.cassandra_conn_id)

        query_extra = {}
        if self.query_timeout is not NOT_SET:
            query_extra['timeout'] = self.query_timeout

        cursor = hook.get_conn().execute(self.cql, **query_extra)

        # If a schema is set, create a BQ schema JSON file.
        if self.schema_filename:
            self.log.info('Writing local schema file')
            schema_file = self._write_local_schema_file(cursor)

            # Flush file before uploading
            schema_file['file_handle'].flush()

            self.log.info('Uploading schema file to GCS.')
            self._upload_to_gcs(schema_file)
            schema_file['file_handle'].close()

        counter = 0
        self.log.info('Writing local data files')
        for file_to_upload in self._write_local_data_files(cursor):
            # Flush file before uploading
            file_to_upload['file_handle'].flush()

            self.log.info('Uploading chunk file #%d to GCS.', counter)
            self._upload_to_gcs(file_to_upload)

            self.log.info('Removing local file')
            file_to_upload['file_handle'].close()
            counter += 1

        # Close all sessions and connections associated with this Cassandra cluster
        hook.shutdown_cluster()
Example #10
 def poke(self, context: Dict[Any, Any]) -> bool:
     self.log.info('Sensor check existence of table: %s', self.table)
     hook = CassandraHook(self.cassandra_conn_id)
     return hook.table_exists(self.table)
Example #11
 def poke(self, context):
     self.log.info('Sensor check existence of record: %s', self.keys)
     hook = CassandraHook(self.cassandra_conn_id)
     return hook.record_exists(self.table, self.keys)
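
These poke() methods back the provider's table and record sensors. A hedged sketch of wiring both into a DAG; the import paths follow the apache-cassandra provider package, but verify them against your Airflow version:

from airflow.providers.apache.cassandra.sensors.record import CassandraRecordSensor
from airflow.providers.apache.cassandra.sensors.table import CassandraTableSensor

wait_for_table = CassandraTableSensor(
    task_id='wait_for_table',
    cassandra_conn_id='cassandra_default',
    table='s.t',
)
wait_for_row = CassandraRecordSensor(
    task_id='wait_for_row',
    cassandra_conn_id='cassandra_default',
    table='s.t',
    keys={'pk1': 'foo', 'pk2': 'bar'},
)
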
Example #12
 def get_hook(self):
     if self.conn_type == 'mysql':
         from airflow.hooks.mysql_hook import MySqlHook
         return MySqlHook(mysql_conn_id=self.conn_id)
     elif self.conn_type == 'google_cloud_platform':
         from airflow.gcp.hooks.bigquery import BigQueryHook
         return BigQueryHook(bigquery_conn_id=self.conn_id)
     elif self.conn_type == 'postgres':
         from airflow.hooks.postgres_hook import PostgresHook
         return PostgresHook(postgres_conn_id=self.conn_id)
     elif self.conn_type == 'pig_cli':
         from airflow.hooks.pig_hook import PigCliHook
         return PigCliHook(pig_cli_conn_id=self.conn_id)
     elif self.conn_type == 'hive_cli':
         from airflow.hooks.hive_hooks import HiveCliHook
         return HiveCliHook(hive_cli_conn_id=self.conn_id)
     elif self.conn_type == 'presto':
         from airflow.hooks.presto_hook import PrestoHook
         return PrestoHook(presto_conn_id=self.conn_id)
     elif self.conn_type == 'hiveserver2':
         from airflow.hooks.hive_hooks import HiveServer2Hook
         return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
     elif self.conn_type == 'sqlite':
         from airflow.hooks.sqlite_hook import SqliteHook
         return SqliteHook(sqlite_conn_id=self.conn_id)
     elif self.conn_type == 'jdbc':
         from airflow.hooks.jdbc_hook import JdbcHook
         return JdbcHook(jdbc_conn_id=self.conn_id)
     elif self.conn_type == 'mssql':
         from airflow.hooks.mssql_hook import MsSqlHook
         return MsSqlHook(mssql_conn_id=self.conn_id)
     elif self.conn_type == 'oracle':
         from airflow.hooks.oracle_hook import OracleHook
         return OracleHook(oracle_conn_id=self.conn_id)
     elif self.conn_type == 'vertica':
         from airflow.contrib.hooks.vertica_hook import VerticaHook
         return VerticaHook(vertica_conn_id=self.conn_id)
     elif self.conn_type == 'cloudant':
         from airflow.contrib.hooks.cloudant_hook import CloudantHook
         return CloudantHook(cloudant_conn_id=self.conn_id)
     elif self.conn_type == 'jira':
         from airflow.providers.jira.hooks.jira import JiraHook
         return JiraHook(jira_conn_id=self.conn_id)
     elif self.conn_type == 'redis':
         from airflow.contrib.hooks.redis_hook import RedisHook
         return RedisHook(redis_conn_id=self.conn_id)
     elif self.conn_type == 'wasb':
         from airflow.contrib.hooks.wasb_hook import WasbHook
         return WasbHook(wasb_conn_id=self.conn_id)
     elif self.conn_type == 'docker':
         from airflow.hooks.docker_hook import DockerHook
         return DockerHook(docker_conn_id=self.conn_id)
     elif self.conn_type == 'azure_data_lake':
         from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
         return AzureDataLakeHook(azure_data_lake_conn_id=self.conn_id)
     elif self.conn_type == 'azure_cosmos':
         from airflow.contrib.hooks.azure_cosmos_hook import AzureCosmosDBHook
         return AzureCosmosDBHook(azure_cosmos_conn_id=self.conn_id)
     elif self.conn_type == 'cassandra':
         from airflow.providers.apache.cassandra.hooks.cassandra import CassandraHook
         return CassandraHook(cassandra_conn_id=self.conn_id)
     elif self.conn_type == 'mongo':
         from airflow.contrib.hooks.mongo_hook import MongoHook
         return MongoHook(conn_id=self.conn_id)
     elif self.conn_type == 'gcpcloudsql':
         from airflow.gcp.hooks.cloud_sql import CloudSqlDatabaseHook
         return CloudSqlDatabaseHook(gcp_cloudsql_conn_id=self.conn_id)
     elif self.conn_type == 'grpc':
         from airflow.contrib.hooks.grpc_hook import GrpcHook
         return GrpcHook(grpc_conn_id=self.conn_id)
     raise AirflowException("Unknown hook type {}".format(self.conn_type))
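
The if/elif chain above grows by two lines for every new connection type. An alternative sketch (illustrative only, not the Airflow implementation) drives the same dispatch from a table of lazy imports:

import importlib

# conn_type -> (module path, class name, conn-id keyword); trimmed to two
# entries here, the remaining hook types follow the same shape.
_HOOK_REGISTRY = {
    'mysql': ('airflow.hooks.mysql_hook', 'MySqlHook', 'mysql_conn_id'),
    'cassandra': ('airflow.providers.apache.cassandra.hooks.cassandra',
                  'CassandraHook', 'cassandra_conn_id'),
}

def get_hook(self):
    try:
        module_path, class_name, conn_id_kwarg = _HOOK_REGISTRY[self.conn_type]
    except KeyError:
        raise AirflowException("Unknown hook type {}".format(self.conn_type))
    hook_class = getattr(importlib.import_module(module_path), class_name)
    return hook_class(**{conn_id_kwarg: self.conn_id})
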
Example #13
default_args = {
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
    # 'wait_for_downstream': False,
    # 'dag': dag,
    # 'sla': timedelta(hours=2),
    # 'execution_timeout': timedelta(seconds=300),
    # 'on_failure_callback': some_function,
    # 'on_success_callback': some_other_function,
    # 'on_retry_callback': another_function,
    # 'sla_miss_callback': yet_another_function,
    # 'trigger_rule': 'all_success'
}

hook = CassandraHook('cassandra_default')
pp = pprint.PrettyPrinter(indent=4)

def check_table_exists(keyspace_name, table_name):
    print("Checking for existence of "+keyspace_name+"."+table_name)
    hook.keyspace = keyspace_name
    return hook.table_exists(table_name)

def execute_query(query):
    pp.pprint(hook.get_conn().execute(query).current_rows)

select_all_query = "SELECT * FROM test.users;"

with DAG(
    'cass_hooks_tutorial',
    default_args=default_args,