def _get_partition_watermarks(self, table, tableRef, partitions):
    """
    Build the low and high watermark for a time-partitioned table.

    The partition with the smallest ``partition_id`` becomes the low
    watermark and the one with the largest ``partition_id`` the high one.

    :param table: table description; ``table["timePartitioning"]`` is read
    :param tableRef: mapping with ``projectId``, ``datasetId`` and ``tableId``
    :param partitions: non-empty collection of objects exposing
        ``partition_id`` and ``epoch_created``
    :return: tuple ``(low_watermark, high_watermark)``
    """
    partitioning = table["timePartitioning"]
    # Fall back to "_PARTITIONTIME" when no explicit partition field is set.
    field = partitioning["field"] if "field" in partitioning else "_PARTITIONTIME"

    def partition_key(partition):
        return partition.partition_id

    def to_watermark(partition, part_type):
        created = datetime.datetime.fromtimestamp(float(partition.epoch_created))
        return Watermark(
            created.strftime("%Y-%m-%d %H:%M:%S"),
            "bigquery",
            tableRef["datasetId"],
            tableRef["tableId"],
            "{field}={partition_id}".format(
                field=field, partition_id=partition.partition_id
            ),
            part_type=part_type,
            cluster=tableRef["projectId"],
        )

    low_wm = to_watermark(min(partitions, key=partition_key), "low_watermark")
    high_wm = to_watermark(max(partitions, key=partition_key), "high_watermark")
    return low_wm, high_wm
def _get_partition_watermarks(
        self,
        table: Dict[str, Any],
        tableRef: Dict[str, str],
        partitions: List[PartitionInfo]) -> Tuple[Watermark, Watermark]:
    """
    Create the low/high watermark pair of a time-partitioned table.

    :param table: table description whose ``timePartitioning`` entry is read
    :param tableRef: reference holding ``projectId``, ``datasetId``, ``tableId``
    :param partitions: non-empty list of partitions to pick the extremes from
    :return: ``(low_watermark, high_watermark)`` ordered by ``partition_id``
    """
    time_partitioning = table['timePartitioning']
    # Default used when the table does not name a partitioning field.
    field = '_PARTITIONTIME'
    if 'field' in time_partitioning:
        field = time_partitioning['field']

    watermarks = []
    # min() selects the low watermark partition, max() the high one.
    for select, part_type in ((min, 'low_watermark'), (max, 'high_watermark')):
        partition = select(partitions, key=lambda p: p.partition_id)
        created_at = datetime.datetime.fromtimestamp(float(partition.epoch_created))
        watermarks.append(Watermark(
            created_at.strftime('%Y-%m-%d %H:%M:%S'),
            'bigquery',
            tableRef['datasetId'],
            tableRef['tableId'],
            '{field}={partition_id}'.format(field=field,
                                            partition_id=partition.partition_id),
            part_type=part_type,
            cluster=tableRef['projectId']))

    low_wm, high_wm = watermarks
    return low_wm, high_wm
def test_index_with_data(self) -> None:
    """An index with data yields one low and one high watermark."""
    extractor = self._get_extractor([self.index_with_data_1])

    created = datetime.fromtimestamp(1641863003).strftime('%Y-%m-%d %H:%M:%S')
    # (epoch seconds of the expected bound, watermark type)
    bounds = [(1641863055, 'low_watermark'), (1641949455, 'high_watermark')]

    expected = []
    for epoch, part_type in bounds:
        day = datetime.fromtimestamp(epoch).strftime('%Y-%m-%d')
        expected.append(Watermark(database='elasticsearch',
                                  cluster='cluster_name',
                                  schema='schema_name',
                                  table_name='index_with_data_1',
                                  create_time=created,
                                  part_name=f"time={day}",
                                  part_type=part_type))

    self._extract_and_compare(extractor, expected)
def setUp(self) -> None:
    """Create the watermark under test and the expected node/relation fixtures."""
    super(TestWatermark, self).setUp()

    self.watermark = Watermark(
        create_time='2017-09-18T00:00:00',
        database=DATABASE,
        schema=SCHEMA,
        table_name=TABLE,
        cluster=CLUSTER,
        part_type=PART_TYPE,
        part_name=NESTED_PART,
    )
    self.start_key = f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}/{PART_TYPE}/'
    self.end_key = f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}'

    # Attributes shared by the graph node and its serialized form; each
    # consumer gets its own copy so the fixtures stay independent.
    node_attributes = {
        'partition_key': 'ds',
        'partition_value': '2017-09-18/feature_id=9',
        'create_time': '2017-09-18T00:00:00',
    }

    self.expected_node_result = GraphNode(
        key=self.start_key,
        label='Watermark',
        attributes=dict(node_attributes),
    )
    serialized_node = {NODE_KEY: self.start_key, NODE_LABEL: 'Watermark'}
    serialized_node.update(node_attributes)
    self.expected_serialized_node_results = [serialized_node]

    self.expected_relation_result = GraphRelationship(
        start_label='Watermark',
        end_label='Table',
        start_key=self.start_key,
        end_key=self.end_key,
        type='BELONG_TO_TABLE',
        reverse_type='WATERMARK',
        attributes={},
    )
    serialized_relation = {
        RELATION_START_KEY: self.start_key,
        RELATION_START_LABEL: 'Watermark',
        RELATION_END_KEY: self.end_key,
        RELATION_END_LABEL: 'Table',
        RELATION_TYPE: 'BELONG_TO_TABLE',
        RELATION_REVERSE_TYPE: 'WATERMARK',
    }
    self.expected_serialized_relation_results = [serialized_relation]
def _retrieve_tables(self, dataset):
    # type: (Any) -> Any
    """
    Yield low/high watermarks for every matching table of ``dataset``.

    Tables whose id passes ``_is_sharded_table`` are grouped by the id with its
    trailing ``DATE_LENGTH`` characters removed; the smallest and largest
    suffix seen per group become that group's watermarks, which are emitted
    after all pages were processed. Every other table gets its watermarks
    from its partitions and is emitted immediately; tables without
    partitions are skipped.

    :param dataset: dataset handed to ``_page_table_list_results``
    :return: generator of ``Watermark`` objects
    """
    sharded_table_watermarks = {}

    for page in self._page_table_list_results(dataset):
        if 'tables' not in page:
            continue

        for table in page['tables']:
            tableRef = table['tableReference']
            if not self._is_table_match_regex(tableRef):
                continue

            table_id = tableRef['tableId']

            # BigQuery tables that have 8 digits as last characters are
            # considered date range tables and are grouped together in the UI.
            # ( e.g. ga_sessions_20190101, ga_sessions_20190102, etc. )
            # We use these suffixes to determine high and low watermarks
            if self._is_sharded_table(table_id):
                split_at = -BigQueryWatermarkExtractor.DATE_LENGTH
                suffix = table_id[split_at:]
                prefix = table_id[:split_at]

                bounds = sharded_table_watermarks.get(prefix)
                if bounds is None:
                    # The first shard seen also provides the table metadata
                    # (creation time, dataset, project) used when emitting.
                    sharded_table_watermarks[prefix] = {
                        'high': suffix, 'low': suffix, 'table': table}
                else:
                    bounds['low'] = min(bounds['low'], suffix)
                    bounds['high'] = max(bounds['high'], suffix)
                continue

            partitions = self._get_partitions(table, tableRef)
            if not partitions:
                continue
            low, high = self._get_partition_watermarks(table, tableRef, partitions)
            yield low
            yield high

    for prefix, td in sharded_table_watermarks.items():
        table = td['table']
        tableRef = table['tableReference']
        # 'creationTime' is divided by 1000 before being used as an epoch.
        create_time = datetime.datetime.fromtimestamp(
            float(table['creationTime']) / 1000).strftime('%Y-%m-%d %H:%M:%S')

        for bound, part_type in (('low', 'low_watermark'), ('high', 'high_watermark')):
            yield Watermark(
                create_time,
                'bigquery',
                tableRef['datasetId'],
                prefix,
                '__table__={partition_id}'.format(partition_id=td[bound]),
                part_type=part_type,
                cluster=tableRef['projectId']
            )
def setUp(self):
    # type: () -> None
    """Create the watermark under test and the expected node/relation dicts."""
    super(TestWatermark, self).setUp()
    self.watermark = Watermark(
        create_time='2017-09-18T00:00:00',
        database=DATABASE,
        schema=SCHEMA,
        table_name=TABLE,
        cluster=CLUSTER,
        part_type=PART_TYPE,
        part_name=NESTED_PART,
    )

    # Lower-cased key components shared by both expected keys.
    table_parts = dict(database=DATABASE.lower(),
                       cluster=CLUSTER.lower(),
                       schema=SCHEMA.lower(),
                       table=TABLE.lower())
    watermark_key = '{database}://{cluster}.{schema}/{table}/{part_type}/'.format(
        part_type=PART_TYPE.lower(), **table_parts)
    table_key = '{database}://{cluster}.{schema}/{table}'.format(**table_parts)

    self.expected_node_result = {
        NODE_KEY: watermark_key,
        NODE_LABEL: 'Watermark',
        'partition_key': 'ds',
        'partition_value': '2017-09-18/feature_id=9',
        'create_time': '2017-09-18T00:00:00'
    }

    self.expected_relation_result = {
        RELATION_START_KEY: watermark_key,
        RELATION_START_LABEL: 'Watermark',
        RELATION_END_KEY: table_key,
        RELATION_END_LABEL: 'Table',
        RELATION_TYPE: 'BELONG_TO_TABLE',
        RELATION_REVERSE_TYPE: 'WATERMARK'
    }
def _get_extract_iter(self) -> Iterator[Union[Watermark, None]]:
    """
    Yield a low and a high watermark for every index that has the needed data.

    An index is skipped when its creation date or either watermark bound is
    ``None``. Creation date and bounds are divided by 1000 before being
    converted with ``datetime.fromtimestamp``.
    """
    for index_name, index_metadata in self._get_indexes().items():
        creation_date: Optional[float] = self._get_index_creation_date(index_metadata)
        bounds: Optional[Tuple[float, float]] = self._get_index_watermark_bounds(
            index_name=index_name)

        lower: Optional[float] = None
        upper: Optional[float] = None
        if bounds is not None:
            lower = bounds[0]
            upper = bounds[1]

        if any(value is None for value in (creation_date, lower, upper)):
            continue

        created: str = datetime.fromtimestamp(creation_date / 1000).strftime('%Y-%m-%d %H:%M:%S')
        low_day: str = datetime.fromtimestamp(lower / 1000).strftime('%Y-%m-%d')
        high_day: str = datetime.fromtimestamp(upper / 1000).strftime('%Y-%m-%d')

        for day, part_type in ((low_day, 'low_watermark'), (high_day, 'high_watermark')):
            yield Watermark(
                database=self.database,
                cluster=self.cluster,
                schema=self.schema,
                table_name=index_name,
                create_time=created,
                part_name=f'{self._time_field}={day}',
                part_type=part_type
            )
class TestWatermark(unittest.TestCase):
    """Checks the keys, nodes and relationships produced by ``Watermark``."""

    def setUp(self) -> None:
        """Create the watermark under test and the expected fixtures."""
        super(TestWatermark, self).setUp()
        self.watermark = Watermark(create_time='2017-09-18T00:00:00',
                                   database=DATABASE,
                                   schema=SCHEMA,
                                   table_name=TABLE,
                                   cluster=CLUSTER,
                                   part_type=PART_TYPE,
                                   part_name=NESTED_PART)
        self.start_key = f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}/{PART_TYPE}/'
        self.end_key = f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}'

        self.expected_node_result = GraphNode(
            key=self.start_key,
            label='Watermark',
            attributes={
                'partition_key': 'ds',
                'partition_value': '2017-09-18/feature_id=9',
                'create_time': '2017-09-18T00:00:00'
            })
        self.expected_serialized_node_result = {
            NODE_KEY: self.start_key,
            NODE_LABEL: 'Watermark',
            'partition_key': 'ds',
            'partition_value': '2017-09-18/feature_id=9',
            'create_time': '2017-09-18T00:00:00'
        }

        self.expected_relation_result = GraphRelationship(
            start_label='Watermark',
            end_label='Table',
            start_key=self.start_key,
            end_key=self.end_key,
            type='BELONG_TO_TABLE',
            reverse_type='WATERMARK',
            attributes={})
        self.expected_serialized_relation_result = {
            RELATION_START_KEY: self.start_key,
            RELATION_START_LABEL: 'Watermark',
            RELATION_END_KEY: self.end_key,
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: 'BELONG_TO_TABLE',
            RELATION_REVERSE_TYPE: 'WATERMARK'
        }

    def test_get_watermark_model_key(self) -> None:
        """The watermark key is the table key followed by the part type."""
        watermark = self.watermark.get_watermark_model_key()
        self.assertEqual(
            watermark, f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}/{PART_TYPE}/')

    def test_get_metadata_model_key(self) -> None:
        """The metadata key identifies the table the watermark belongs to."""
        metadata = self.watermark.get_metadata_model_key()
        self.assertEqual(metadata, f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}')

    def test_create_nodes(self) -> None:
        """Exactly one node is created and it serializes as expected."""
        nodes = self.watermark.create_nodes()
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0], self.expected_node_result)
        self.assertEqual(neo4_serializer.serialize_node(nodes[0]),
                         self.expected_serialized_node_result)

    def test_create_nodes_neptune(self) -> None:
        """The node converts to the expected Neptune bulk-loader row."""
        nodes = self.watermark.create_nodes()
        expected_serialized_node_result = {
            NEPTUNE_HEADER_ID: self.start_key,
            NEPTUNE_HEADER_LABEL: 'Watermark',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_NODE_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB,
            'partition_key:String(single)': 'ds',
            'partition_value:String(single)': '2017-09-18/feature_id=9',
            'create_time:String(single)': '2017-09-18T00:00:00'
        }
        serialized_node = neptune_serializer.convert_node(nodes[0])
        self.assertDictEqual(expected_serialized_node_result, serialized_node)

    def test_create_relation(self) -> None:
        """Exactly one relation is created and it serializes as expected."""
        relation = self.watermark.create_relation()
        self.assertEqual(len(relation), 1)
        self.assertEqual(relation[0], self.expected_relation_result)
        self.assertEqual(neo4_serializer.serialize_relationship(relation[0]),
                         self.expected_serialized_relation_result)

    def test_create_relation_neptune(self) -> None:
        """The relation converts to a forward and a reverse Neptune edge."""
        relation = self.watermark.create_relation()
        serialized_relation = neptune_serializer.convert_relationship(
            relation[0])
        expected = [{
            NEPTUNE_HEADER_ID:
            "{from_vertex_id}_{to_vertex_id}_{label}".format(
                from_vertex_id=self.start_key,
                to_vertex_id=self.end_key,
                label='BELONG_TO_TABLE'),
            NEPTUNE_RELATIONSHIP_HEADER_FROM: self.start_key,
            NEPTUNE_RELATIONSHIP_HEADER_TO: self.end_key,
            NEPTUNE_HEADER_LABEL: 'BELONG_TO_TABLE',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
        }, {
            NEPTUNE_HEADER_ID:
            "{from_vertex_id}_{to_vertex_id}_{label}".format(
                from_vertex_id=self.end_key,
                to_vertex_id=self.start_key,
                label='WATERMARK'),
            NEPTUNE_RELATIONSHIP_HEADER_FROM: self.end_key,
            NEPTUNE_RELATIONSHIP_HEADER_TO: self.start_key,
            NEPTUNE_HEADER_LABEL: 'WATERMARK',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
        }]
        self.assertListEqual(serialized_relation, expected)

    def test_create_next_node(self) -> None:
        """The first node returned by the iterator API serializes as expected."""
        next_node = self.watermark.create_next_node()
        self.assertEqual(neo4_serializer.serialize_node(next_node),
                         self.expected_serialized_node_result)

    def test_create_next_relation(self) -> None:
        """The first relation returned by the iterator API serializes as expected."""
        next_relation = self.watermark.create_next_relation()
        self.assertEqual(
            neo4_serializer.serialize_relationship(next_relation),
            self.expected_serialized_relation_result)
class TestWatermark(unittest.TestCase):
    """Checks the keys, node dict and relation dict produced by ``Watermark``."""

    def setUp(self):
        # type: () -> None
        """Create the watermark under test and the expected result dicts."""
        super(TestWatermark, self).setUp()
        self.watermark = Watermark(create_time='2017-09-18T00:00:00',
                                   database=DATABASE,
                                   schema_name=SCHEMA,
                                   table_name=TABLE,
                                   cluster=CLUSTER,
                                   part_type=PART_TYPE,
                                   part_name=NESTED_PART)

        # All key components are expected lower-cased in the generated keys.
        self.expected_node_result = {
            NODE_KEY: '{database}://{cluster}.{schema}/{table}/{part_type}/'
            .format(
                database=DATABASE.lower(),
                cluster=CLUSTER.lower(),
                schema=SCHEMA.lower(),
                table=TABLE.lower(),
                part_type=PART_TYPE.lower()),
            NODE_LABEL: 'Watermark',
            'partition_key': 'ds',
            'partition_value': '2017-09-18/feature_id=9',
            'create_time': '2017-09-18T00:00:00'
        }

        self.expected_relation_result = {
            RELATION_START_KEY: '{database}://{cluster}.{schema}/{table}/{part_type}/'
            .format(
                database=DATABASE.lower(),
                cluster=CLUSTER.lower(),
                schema=SCHEMA.lower(),
                table=TABLE.lower(),
                part_type=PART_TYPE.lower()),
            RELATION_START_LABEL: 'Watermark',
            RELATION_END_KEY: '{database}://{cluster}.{schema}/{table}'
            .format(
                database=DATABASE.lower(),
                cluster=CLUSTER.lower(),
                schema=SCHEMA.lower(),
                table=TABLE.lower()),
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: 'BELONG_TO_TABLE',
            RELATION_REVERSE_TYPE: 'WATERMARK'
        }

    def test_get_watermark_model_key(self):
        # type: () -> None
        """The watermark key is the lower-cased table key plus the part type."""
        watermark = self.watermark.get_watermark_model_key()
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(
            watermark, '{database}://{cluster}.{schema}/{table}/{part_type}/'
            .format(database=DATABASE.lower(),
                    cluster=CLUSTER.lower(),
                    schema=SCHEMA.lower(),
                    table=TABLE.lower(),
                    part_type=PART_TYPE.lower()))

    def test_get_metadata_model_key(self):
        # type: () -> None
        """The metadata key is the lower-cased table key."""
        metadata = self.watermark.get_metadata_model_key()
        self.assertEqual(metadata, '{database}://{cluster}.{schema}/{table}'
                         .format(database=DATABASE.lower(),
                                 cluster=CLUSTER.lower(),
                                 schema=SCHEMA.lower(),
                                 table=TABLE.lower()))

    def test_create_nodes(self):
        # type: () -> None
        """Exactly one node dict is created and it matches the fixture."""
        nodes = self.watermark.create_nodes()
        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0], self.expected_node_result)

    def test_create_relation(self):
        # type: () -> None
        """Exactly one relation dict is created and it matches the fixture."""
        relation = self.watermark.create_relation()
        self.assertEqual(len(relation), 1)
        self.assertEqual(relation[0], self.expected_relation_result)

    def test_create_next_node(self):
        # type: () -> None
        """The first node returned by the iterator API matches the fixture."""
        next_node = self.watermark.create_next_node()
        self.assertEqual(next_node, self.expected_node_result)

    def test_create_next_relation(self):
        # type: () -> None
        """The first relation returned by the iterator API matches the fixture."""
        next_relation = self.watermark.create_next_relation()
        self.assertEqual(next_relation, self.expected_relation_result)
def _retrieve_tables(self, dataset: DatasetRef) -> Iterator[Watermark]:
    """
    Yield low/high watermarks for the tables of ``dataset``.

    Only tables whose creation time lies before ``self.cutoff_time`` are
    considered. Sharded tables are grouped by their prefix and the smallest
    and largest date found in their suffixes become the group's watermarks,
    emitted once all pages were read. Other tables get their watermarks from
    their partitions and are emitted immediately.
    """
    shards: Dict[str, Dict[str, Union[str, Any]]] = {}
    cutoff_struct = time.strptime(self.cutoff_time,
                                  BigQueryWatermarkExtractor.DATE_TIME_FORMAT)
    cutoff_time_in_epoch = timegm(cutoff_struct)

    for page in self._page_table_list_results(dataset):
        if 'tables' not in page:
            continue

        for table in page['tables']:
            tableRef = table['tableReference']
            table_id = tableRef['tableId']
            table_creation_time = float(table['creationTime']) / 1000

            # only extract watermark metadata for tables created before the cut-off time
            if not table_creation_time < cutoff_time_in_epoch:
                continue

            # BigQuery tables that have numeric suffix starts with a date are
            # considered date range tables.
            # ( e.g. ga_sessions_20190101, ga_sessions_20190102, etc. )
            # We use these dates in the suffixes to determine high and low watermarks
            if self._is_sharded_table(table_id):
                suffix = self._get_sharded_table_suffix(table_id)
                prefix = table_id[:-len(suffix)]
                date = suffix[:BaseBigQueryExtractor.DATE_LENGTH]

                known = shards.get(prefix)
                if known is None:
                    # First shard of this prefix also supplies the table metadata.
                    shards[prefix] = {'high': date, 'low': date, 'table': table}
                else:
                    known['low'] = min(known['low'], date)
                    known['high'] = max(known['high'], date)
                continue

            partitions = self._get_partitions(table, tableRef)
            if not partitions:
                continue
            low, high = self._get_partition_watermarks(table, tableRef, partitions)
            yield low
            yield high

    for prefix, td in shards.items():
        table = td['table']
        tableRef = table['tableReference']
        created = datetime.datetime.fromtimestamp(
            float(table['creationTime']) / 1000).strftime('%Y-%m-%d %H:%M:%S')

        for bound, part_type in (('low', 'low_watermark'), ('high', 'high_watermark')):
            yield Watermark(created,
                            'bigquery',
                            tableRef['datasetId'],
                            prefix,
                            f'__table__={td[bound]}',
                            part_type=part_type,
                            cluster=tableRef['projectId'])
class TestWatermark(unittest.TestCase):
    """Checks the keys, graph node and graph relationship of ``Watermark``."""

    def setUp(self) -> None:
        """Create the watermark under test and the expected fixtures."""
        super(TestWatermark, self).setUp()
        self.watermark = Watermark(create_time='2017-09-18T00:00:00',
                                   database=DATABASE,
                                   schema=SCHEMA,
                                   table_name=TABLE,
                                   cluster=CLUSTER,
                                   part_type=PART_TYPE,
                                   part_name=NESTED_PART)
        start_key = '{database}://{cluster}.{schema}/{table}/{part_type}/'.format(
            database=DATABASE,
            cluster=CLUSTER,
            schema=SCHEMA,
            table=TABLE,
            part_type=PART_TYPE)
        end_key = '{database}://{cluster}.{schema}/{table}'.format(
            database=DATABASE, cluster=CLUSTER, schema=SCHEMA, table=TABLE)

        self.expected_node_result = GraphNode(
            key=start_key,
            label='Watermark',
            attributes={
                'partition_key': 'ds',
                'partition_value': '2017-09-18/feature_id=9',
                'create_time': '2017-09-18T00:00:00'
            })
        self.expected_serialized_node_result = {
            NODE_KEY: start_key,
            NODE_LABEL: 'Watermark',
            'partition_key': 'ds',
            'partition_value': '2017-09-18/feature_id=9',
            'create_time': '2017-09-18T00:00:00'
        }

        self.expected_relation_result = GraphRelationship(
            start_label='Watermark',
            end_label='Table',
            start_key=start_key,
            end_key=end_key,
            type='BELONG_TO_TABLE',
            reverse_type='WATERMARK',
            attributes={})
        self.expected_serialized_relation_result = {
            RELATION_START_KEY: start_key,
            RELATION_START_LABEL: 'Watermark',
            RELATION_END_KEY: end_key,
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: 'BELONG_TO_TABLE',
            RELATION_REVERSE_TYPE: 'WATERMARK'
        }

    def test_get_watermark_model_key(self) -> None:
        """The watermark key is the table key followed by the part type."""
        watermark = self.watermark.get_watermark_model_key()
        self.assertEqual(
            watermark,
            '{database}://{cluster}.{schema}/{table}/{part_type}/'.format(
                database=DATABASE,
                cluster=CLUSTER,
                schema=SCHEMA,
                table=TABLE,
                part_type=PART_TYPE))

    def test_get_metadata_model_key(self) -> None:
        """The metadata key identifies the table the watermark belongs to."""
        metadata = self.watermark.get_metadata_model_key()
        self.assertEqual(
            metadata,
            '{database}://{cluster}.{schema}/{table}'.format(database=DATABASE,
                                                             cluster=CLUSTER,
                                                             schema=SCHEMA,
                                                             table=TABLE))

    def test_create_nodes(self) -> None:
        """Exactly one node is created and it serializes as expected."""
        nodes = self.watermark.create_nodes()
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0], self.expected_node_result)
        self.assertEqual(neo4_serializer.serialize_node(nodes[0]),
                         self.expected_serialized_node_result)

    def test_create_relation(self) -> None:
        """Exactly one relation is created and it serializes as expected."""
        relation = self.watermark.create_relation()
        self.assertEqual(len(relation), 1)
        self.assertEqual(relation[0], self.expected_relation_result)
        self.assertEqual(neo4_serializer.serialize_relationship(relation[0]),
                         self.expected_serialized_relation_result)

    def test_create_next_node(self) -> None:
        """The first node returned by the iterator API serializes as expected."""
        next_node = self.watermark.create_next_node()
        self.assertEqual(neo4_serializer.serialize_node(next_node),
                         self.expected_serialized_node_result)

    def test_create_next_relation(self) -> None:
        """The first relation returned by the iterator API serializes as expected."""
        next_relation = self.watermark.create_next_relation()
        self.assertEqual(
            neo4_serializer.serialize_relationship(next_relation),
            self.expected_serialized_relation_result)
def create_table_watermarks(
        self, table: ScrapedTableMetadata
) -> Optional[List[Tuple[Watermark, Watermark]]]:  # noqa c901
    """
    Creates the watermark objects that reflect the highest and lowest values in the partition columns.

    Returns ``None`` when the table has no details or no partition columns.
    Otherwise returns one ``(high, low)`` tuple per partition column whose
    data type is one of int, float, date or datetime.
    """

    def _is_show_partitions_supported(t: ScrapedTableMetadata) -> bool:
        # Probe the table; any failure is logged and treated as "not supported".
        try:
            self.spark.sql(f'show partitions {t.schema}.{t.table}')
        except Exception as e:
            # pyspark.sql.utils.AnalysisException: SHOW PARTITIONS is not allowed on a table that is not partitioned
            LOGGER.warning(e)
            return False
        return True

    def _fetch_minmax(table: ScrapedTableMetadata,
                      partition_column: str) -> Tuple[str, str]:
        # Returns (max, min); a value stays "" when it could not be fetched.
        qualified_name = f'{table.schema}.{table.table}'
        LOGGER.info(
            f'Fetching partition info for {partition_column} in {qualified_name}'
        )
        lowest = ""
        highest = ""
        try:
            if is_show_partitions_supported:
                LOGGER.info('Using SHOW PARTITION')
                first_ascending = self.spark.sql(
                    f'show partitions {qualified_name}'
                ).orderBy(partition_column, ascending=True).first()
                lowest = str(first_ascending[partition_column])
                first_descending = self.spark.sql(
                    f'show partitions {qualified_name}'
                ).orderBy(partition_column, ascending=False).first()
                highest = str(first_descending[partition_column])
            else:
                LOGGER.info('Using DESCRIBE EXTENDED')
                rows = self.spark.sql(
                    f'describe extended {qualified_name} {partition_column}'
                ).collect()
                stats = {
                    row['info_name']: row['info_value']
                    for row in rows
                    if row['info_name'] in ['min', 'max']
                }
                lowest = stats['min']
                highest = stats['max']
        except Exception as e:
            LOGGER.warning(f'Failed fetching partition watermarks: {e}')
        return highest, lowest

    detail = table.table_detail
    if not detail:
        LOGGER.info(f'No table details found in {table}, skipping')
        return None

    if 'partitionColumns' not in detail or len(detail['partitionColumns']) < 1:
        LOGGER.info(f'No partitions found in {table}, skipping')
        return None

    # Read by _fetch_minmax through its closure, so the name must stay as is.
    is_show_partitions_supported: bool = _is_show_partitions_supported(table)

    if not is_show_partitions_supported:
        LOGGER.info('Analyzing table, this can take a while...')
        partition_columns = ','.join(detail['partitionColumns'])
        self.spark.sql(
            f"analyze table {table.schema}.{table.table} compute statistics for columns {partition_columns}"
        )

    # It makes little sense to get watermarks from a string value, with no concept of high and low.
    # Just imagine a dataset with a partition by country...
    valid_types = ['int', 'float', 'date', 'datetime']
    known_columns = table.columns or []
    columns_with_valid_type = [
        column.name
        for column in known_columns
        if str(column.data_type).lower() in valid_types
    ]

    watermark_pairs = []
    for partition_column in detail['partitionColumns']:
        if partition_column not in columns_with_valid_type:
            continue

        last, first = _fetch_minmax(table, partition_column)
        shared = dict(create_time=detail['createdAt'],
                      database=self._db,
                      schema=table.schema,
                      table_name=table.table,
                      cluster=self._cluster)
        low = Watermark(part_name=f'{partition_column}={first}',
                        part_type='low_watermark',
                        **shared)
        high = Watermark(part_name=f'{partition_column}={last}',
                         part_type='high_watermark',
                         **shared)
        watermark_pairs.append((high, low))
    return watermark_pairs
class TestWatermark(unittest.TestCase):
    """Checks keys, nodes, relationships and records produced by ``Watermark``."""

    @staticmethod
    def _drain(produce, serialize):
        """Call ``produce`` until it returns a falsy value; serialize each item."""
        collected = []
        item = produce()
        while item:
            collected.append(serialize(item))
            item = produce()
        return collected

    def setUp(self) -> None:
        """Create the watermark under test and the expected fixtures."""
        super(TestWatermark, self).setUp()
        self.watermark = Watermark(create_time='2017-09-18T00:00:00',
                                   database=DATABASE,
                                   schema=SCHEMA,
                                   table_name=TABLE,
                                   cluster=CLUSTER,
                                   part_type=PART_TYPE,
                                   part_name=NESTED_PART)
        self.start_key = f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}/{PART_TYPE}/'
        self.end_key = f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}'

        # Shared by the node and its serialized form; copied per use.
        attributes = {
            'partition_key': 'ds',
            'partition_value': '2017-09-18/feature_id=9',
            'create_time': '2017-09-18T00:00:00'
        }
        self.expected_node_result = GraphNode(key=self.start_key,
                                              label='Watermark',
                                              attributes=dict(attributes))
        serialized_node = {NODE_KEY: self.start_key, NODE_LABEL: 'Watermark'}
        serialized_node.update(attributes)
        self.expected_serialized_node_results = [serialized_node]

        self.expected_relation_result = GraphRelationship(
            start_label='Watermark',
            end_label='Table',
            start_key=self.start_key,
            end_key=self.end_key,
            type='BELONG_TO_TABLE',
            reverse_type='WATERMARK',
            attributes={})
        self.expected_serialized_relation_results = [{
            RELATION_START_KEY: self.start_key,
            RELATION_START_LABEL: 'Watermark',
            RELATION_END_KEY: self.end_key,
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: 'BELONG_TO_TABLE',
            RELATION_REVERSE_TYPE: 'WATERMARK'
        }]

    def test_get_watermark_model_key(self) -> None:
        """The watermark key is the table key followed by the part type."""
        self.assertEqual(self.watermark.get_watermark_model_key(),
                         f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}/{PART_TYPE}/')

    def test_get_metadata_model_key(self) -> None:
        """The metadata key identifies the table the watermark belongs to."""
        self.assertEqual(self.watermark.get_metadata_model_key(),
                         f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}')

    def test_create_nodes(self) -> None:
        """All nodes from the iterator API serialize to the expected dicts."""
        actual = self._drain(self.watermark.create_next_node,
                             neo4_serializer.serialize_node)
        self.assertEqual(actual, self.expected_serialized_node_results)

    def test_create_nodes_neptune(self) -> None:
        """All nodes convert to the expected Neptune bulk-loader rows."""
        vertex_id = 'Watermark:' + self.start_key
        expected_serialized_node_results = [{
            NEPTUNE_HEADER_ID: vertex_id,
            METADATA_KEY_PROPERTY_NAME: vertex_id,
            NEPTUNE_HEADER_LABEL: 'Watermark',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_NODE_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB,
            'partition_key:String(single)': 'ds',
            'partition_value:String(single)': '2017-09-18/feature_id=9',
            'create_time:String(single)': '2017-09-18T00:00:00'
        }]

        actual = self._drain(self.watermark.create_next_node,
                             neptune_serializer.convert_node)
        self.assertEqual(expected_serialized_node_results, actual)

    def test_create_relation(self) -> None:
        """All relations from the iterator API serialize to the expected dicts."""
        actual = self._drain(self.watermark.create_next_relation,
                             neo4_serializer.serialize_relationship)
        self.assertEqual(actual, self.expected_serialized_relation_results)

    def test_create_relation_neptune(self) -> None:
        """Each relation converts to a forward and a reverse Neptune edge."""
        actual = self._drain(self.watermark.create_next_relation,
                             neptune_serializer.convert_relationship)

        watermark_vertex = "Watermark:" + self.start_key
        table_vertex = "Table:" + self.end_key
        forward_id = "{label}:{from_vertex_id}_{to_vertex_id}".format(
            from_vertex_id=watermark_vertex,
            to_vertex_id=table_vertex,
            label='BELONG_TO_TABLE')
        reverse_id = "{label}:{from_vertex_id}_{to_vertex_id}".format(
            from_vertex_id=table_vertex,
            to_vertex_id=watermark_vertex,
            label='WATERMARK')

        forward_edge = {
            NEPTUNE_HEADER_ID: forward_id,
            METADATA_KEY_PROPERTY_NAME: forward_id,
            NEPTUNE_RELATIONSHIP_HEADER_FROM: watermark_vertex,
            NEPTUNE_RELATIONSHIP_HEADER_TO: table_vertex,
            NEPTUNE_HEADER_LABEL: 'BELONG_TO_TABLE',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
        }
        reverse_edge = {
            NEPTUNE_HEADER_ID: reverse_id,
            METADATA_KEY_PROPERTY_NAME: reverse_id,
            NEPTUNE_RELATIONSHIP_HEADER_FROM: table_vertex,
            NEPTUNE_RELATIONSHIP_HEADER_TO: watermark_vertex,
            NEPTUNE_HEADER_LABEL: 'WATERMARK',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
        }
        expected = [[forward_edge, reverse_edge]]

        self.assertListEqual(actual, expected)

    def test_create_records(self) -> None:
        """All records from the iterator API serialize to the expected rows."""
        expected = [{
            'rk': self.start_key,
            'partition_key': 'ds',
            'partition_value': '2017-09-18/feature_id=9',
            'create_time': '2017-09-18T00:00:00',
            'table_rk': self.end_key
        }]

        actual = self._drain(self.watermark.create_next_record,
                             mysql_serializer.serialize_record)
        self.assertEqual(actual, expected)