def execute(self, context):
    """
    Executes the transfer operation from HDFS to ClickHouse.

    :param context: The context that is being provided when executing.
    :type context: dict
    """
    self.log.info("Connecting to clickhouse using %s connection.", self.clickhouse_conn_id)
    clickhouse = ClickhouseHook(clickhouse_conn_id=self.clickhouse_conn_id, auth=self.auth)

    self.log.info("Connecting to hdfs using %s connection.", self.hdfs_conn_id)
    try:
        hdfs = HDFSHook(hdfs_conn_id=self.hdfs_conn_id)
        hdfs_client = hdfs.get_conn()
    except Exception as e:
        raise AirflowException("Failed to retrieve hdfs client.", e)

    self.log.info("Checking hdfs paths %s", ', '.join(self.hdfs_paths))
    try:
        ls = list(hdfs_client.ls(self.hdfs_paths))
        self.log.info("Total files: %s", len(ls))
        # 'length' is the file size in bytes reported by snakebite's ls();
        # skip the insert when the listed files contain no data.
        total_size = sum(f.get('length', 0) for f in ls)
        if total_size == 0:
            self.log.warning("Files are empty, skipping insert.")
            return
    except Exception as e:
        raise AirflowException("Error checking hdfs paths.", e)

    self.log.info("Reading from hdfs.")
    try:
        hdfs_stdout_gen = hdfs_client.cat(self.hdfs_paths)
        self.log.info("Loading into clickhouse table %s...", self.clickhouse_table)
        total_rows = 0
        for hdfs_stdout in hdfs_stdout_gen:
            r = clickhouse.insert_rows(
                table=self.clickhouse_table,
                data=hdfs_stdout,
                row_format=self.row_format,
                timeout=self.timeout,
                params={'send_progress_in_http_headers': 1},
            )
            # With progress headers enabled, ClickHouse reports the number of
            # inserted rows in the X-ClickHouse-Summary response header.
            written_rows = json.loads(r.headers['X-ClickHouse-Summary']).get('written_rows')
            if written_rows:
                total_rows += int(written_rows)
            else:
                self.log.warning('Failed to retrieve row count.')
        if total_rows == 0:
            raise AirflowException("Inserted rows: 0")
    except Exception as e:
        raise AirflowException("Error inserting into clickhouse", e)

    self.log.info("Successfully inserted %s rows into %s.", total_rows, self.clickhouse_table)
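# The operator class that owns the execute() method above is not shown here.
# The following is a minimal usage sketch only: the class name
# HdfsToClickhouseOperator and its constructor arguments are assumptions
# inferred from the attributes referenced in execute() (hdfs_conn_id,
# clickhouse_conn_id, hdfs_paths, clickhouse_table, row_format, timeout),
# not taken from the source.
from datetime import datetime

from airflow import DAG

with DAG(dag_id='hdfs_to_clickhouse_example',
         start_date=datetime(2021, 1, 1),
         schedule_interval='@daily') as dag:
    load_events = HdfsToClickhouseOperator(  # hypothetical operator class
        task_id='load_events',
        hdfs_conn_id='hdfs_default',              # connection used by HDFSHook
        clickhouse_conn_id='clickhouse_default',  # connection used by ClickhouseHook
        hdfs_paths=['/data/events/{{ ds }}/'],    # paths passed to ls()/cat()
        clickhouse_table='analytics.events',      # target table for insert_rows()
        row_format='TabSeparated',                # forwarded to insert_rows()
        timeout=300,
    )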
def test_get_autoconfig_client(self, mock_get_connections, mock_client):
    conn = Connection(
        conn_id='hdfs',
        conn_type='hdfs',
        host='localhost',
        port=8020,
        login='******',
        extra=json.dumps({'autoconfig': True}),
    )
    mock_get_connections.return_value = [conn]

    HDFSHook(hdfs_conn_id='hdfs').get_conn()

    mock_client.assert_called_once_with(effective_user='******', use_sasl=False)
def test_get_ha_client(self, mock_get_connections):
    conn_1 = Connection(conn_id='hdfs_default', conn_type='hdfs', host='localhost', port=8020)
    conn_2 = Connection(conn_id='hdfs_default', conn_type='hdfs', host='localhost2', port=8020)
    mock_get_connections.return_value = [conn_1, conn_2]

    client = HDFSHook().get_conn()

    self.assertIsInstance(client, snakebite.client.HAClient)
def test_get_autoconfig_client_no_conn(self, mock_client):
    HDFSHook(hdfs_conn_id='hdfs_missing', autoconfig=True).get_conn()

    mock_client.assert_called_once_with(effective_user=None, use_sasl=False)
def test_get_client(self):
    client = HDFSHook(proxy_user='foo').get_conn()

    self.assertIsInstance(client, snakebite.client.Client)
    self.assertEqual('localhost', client.host)
    self.assertEqual(8020, client.port)
    self.assertEqual('foo', client.service.channel.effective_user)