Example No. 1
    def test_extraction_with_database_specified(self) -> None:
        """
        Test DATABASE_KEY in extractor result
        """
        with patch.object(SQLAlchemyExtractor,
                          '_get_connection') as mock_connection:
            connection = MagicMock()
            mock_connection.return_value = connection
            sql_execute = MagicMock()
            connection.execute = sql_execute

            sql_execute.return_value = [{
                'schema': 'test_schema',
                'table_name': 'test_table',
                'last_updated_time': 1000,
                'cluster': 'MY_CLUSTER',
            }]

            extractor = SnowflakeTableLastUpdatedExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()
            expected = TableLastUpdated(schema='test_schema',
                                        table_name='test_table',
                                        last_updated_time_epoch=1000,
                                        db=self.database_key,
                                        cluster='MY_CLUSTER')
            self.assertEqual(expected.__repr__(), actual.__repr__())
            self.assertIsNone(extractor.extract())
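The setUp for Example No. 1 is not shown. A minimal sketch of the kind of configuration it presumably builds, reusing the key pattern visible in Example No. 2; DATABASE_KEY is named in the docstring above, and 'my_database' is a stand-in for self.database_key:

from pyhocon import ConfigFactory

from databuilder.extractor.snowflake_table_last_updated_extractor import SnowflakeTableLastUpdatedExtractor

scope = 'extractor.snowflake_table_last_updated'
conf = ConfigFactory.from_dict({
    # Cluster name asserted by the tests above.
    '{}.{}'.format(scope, SnowflakeTableLastUpdatedExtractor.CLUSTER_KEY): 'MY_CLUSTER',
    # Stand-in for self.database_key; this is what Example No. 1 tests.
    '{}.{}'.format(scope, SnowflakeTableLastUpdatedExtractor.DATABASE_KEY): 'my_database',
})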
Example No. 2
    def test_extraction_with_single_result(self) -> None:
        """
        Test Extraction with default cluster and database and with one table as result
        """
        with patch.object(SQLAlchemyExtractor,
                          '_get_connection') as mock_connection:
            connection = MagicMock()
            mock_connection.return_value = connection
            sql_execute = MagicMock()
            connection.execute = sql_execute
            sql_execute.return_value = [{
                'schema': 'test_schema',
                'table_name': 'test_table',
                'last_updated_time': 1000,
                'cluster': self.conf['extractor.snowflake_table_last_updated.{}'.format(
                    SnowflakeTableLastUpdatedExtractor.CLUSTER_KEY)],
            }]

            extractor = SnowflakeTableLastUpdatedExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()

            expected = TableLastUpdated(schema='test_schema',
                                        table_name='test_table',
                                        last_updated_time_epoch=1000,
                                        db='snowflake',
                                        cluster='MY_CLUSTER')
            self.assertEqual(expected.__repr__(), actual.__repr__())
            self.assertIsNone(extractor.extract())
Example No. 3
    def setUp(self) -> None:
        super(TestTableLastUpdated, self).setUp()

        self.tableLastUpdated = TableLastUpdated(
            table_name='test_table',
            last_updated_time_epoch=25195665,
            schema='default')

        self.expected_node_results = [{
            NODE_KEY: 'hive://gold.default/test_table/timestamp',
            NODE_LABEL: 'Timestamp',
            'last_updated_timestamp:UNQUOTED': 25195665,
            timestamp_constants.TIMESTAMP_PROPERTY + ":UNQUOTED": 25195665,
            'name': 'last_updated_timestamp'
        }]

        self.expected_relation_results = [{
            RELATION_START_KEY: 'hive://gold.default/test_table',
            RELATION_START_LABEL: 'Table',
            RELATION_END_KEY: 'hive://gold.default/test_table/timestamp',
            RELATION_END_LABEL: 'Timestamp',
            RELATION_TYPE: 'LAST_UPDATED_AT',
            RELATION_REVERSE_TYPE: 'LAST_UPDATED_TIME_OF'
        }]
Example No. 4
    def test_extraction(self) -> None:
        old_datetime = datetime(2018, 8, 14, 4, 12, 3, tzinfo=UTC)
        new_datetime = datetime(2018, 11, 14, 4, 12, 3, tzinfo=UTC)

        fs = MagicMock()
        fs.ls = MagicMock(return_value=['/foo/bar', '/foo/baz'])
        fs.is_file = MagicMock(return_value=True)
        fs.info = MagicMock(side_effect=[
            FileMetadata(
                path='/foo/bar', last_updated=old_datetime, size=15093),
            FileMetadata(
                path='/foo/baz', last_updated=new_datetime, size=15094)
        ])

        pt_alchemy_extractor_instance = MagicMock()
        non_pt_alchemy_extractor_instance = MagicMock()

        with patch.object(HiveTableLastUpdatedExtractor,
                          '_get_partitioned_table_sql_alchemy_extractor', return_value=pt_alchemy_extractor_instance), \
            patch.object(HiveTableLastUpdatedExtractor,
                         '_get_non_partitioned_table_sql_alchemy_extractor',
                         return_value=non_pt_alchemy_extractor_instance), \
            patch.object(HiveTableLastUpdatedExtractor,
                         '_get_filesystem', return_value=fs):
            pt_alchemy_extractor_instance.extract = MagicMock(
                return_value=None)

            non_pt_alchemy_extractor_instance.extract = MagicMock(
                side_effect=null_iterator([
                    {
                        'schema': 'foo_schema',
                        'table_name': 'table_1',
                        'location': '/foo/bar'
                    },
                ]))

            extractor = HiveTableLastUpdatedExtractor()
            extractor.init(ConfigFactory.from_dict({}))

            result = extractor.extract()
            expected = TableLastUpdated(schema='foo_schema',
                                        table_name='table_1',
                                        last_updated_time_epoch=1542168723,
                                        db='hive',
                                        cluster='gold')
            self.assertEqual(result.__repr__(), expected.__repr__())

            self.assertIsNone(extractor.extract())
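The null_iterator helper used above is not part of this excerpt. A minimal sketch of a compatible definition, assuming it only has to feed MagicMock's side_effect with each record once and then None, plus a quick check that the expected epoch matches new_datetime:

from datetime import datetime, timezone
from typing import Any, Iterator, List

def null_iterator(items: List[Any]) -> Iterator[Any]:
    # Yield each record once, then None forever, so the extractor's
    # extract() loop terminates and later calls return None.
    for item in items:
        yield item
    while True:
        yield None

# 1542168723 is the epoch second of new_datetime (2018-11-14 04:12:03 UTC).
assert int(datetime(2018, 11, 14, 4, 12, 3, tzinfo=timezone.utc).timestamp()) == 1542168723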
Example No. 5
    def _retrieve_tables(self, dataset):
        # type: (Any) -> Iterator[TableLastUpdated]
        for page in self._page_table_list_results(dataset):
            if 'tables' not in page:
                continue

            for table in page['tables']:
                tableRef = table['tableReference']
                table_id = tableRef['tableId']

                # BigQuery tables that have 8 digits as last characters are
                # considered date range tables and are grouped together in the UI.
                # ( e.g. ga_sessions_20190101, ga_sessions_20190102, etc. )
                if self._is_sharded_table(table_id):
                    # If the last eight characters are digits, we assume the table is of a table date range type
                    # and then we only need one schema definition
                    table_prefix = table_id[:-BigQueryLastUpdatedExtractor.DATE_LENGTH]
                    if table_prefix in self.grouped_tables:
                        # If one table in the date range is processed, then ignore other ones
                        # (it adds too much metadata)
                        continue

                    table_id = table_prefix
                    self.grouped_tables.add(table_prefix)

                table = self.bigquery_service.tables().get(
                    projectId=tableRef['projectId'],
                    datasetId=tableRef['datasetId'],
                    tableId=tableRef['tableId']).execute(num_retries=BigQueryLastUpdatedExtractor.NUM_RETRIES)

                table_last_upd = TableLastUpdated(
                    table_name=table_id,
                    last_updated_time_epoch=int(table['lastModifiedTime']) // 1000,
                    schema=tableRef['datasetId'],
                    db='bigquery',
                    cluster=tableRef['projectId'])

                yield table_last_upd
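The _is_sharded_table check is not shown here. A minimal sketch consistent with the comments above, assuming BigQueryLastUpdatedExtractor.DATE_LENGTH is 8:

DATE_LENGTH = 8  # assumed value of BigQueryLastUpdatedExtractor.DATE_LENGTH

def _is_sharded_table(table_id: str) -> bool:
    # Date-sharded tables such as ga_sessions_20190101 end in eight digits.
    suffix = table_id[-DATE_LENGTH:]
    return len(suffix) == DATE_LENGTH and suffix.isdigit()

assert _is_sharded_table('ga_sessions_20190101')
assert not _is_sharded_table('users')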
Example No. 6
    def _get_last_updated_datetime_from_filesystem(self,
                                                   table: str,
                                                   schema: str,
                                                   storage_location: str,
                                                   ) -> Union[TableLastUpdated, None]:
        """
        Fetching metadata within files under storage_location to get latest timestamp.
        (First level only under storage_location)
        Utilizes thread pool to enhance performance. Not using processpool, as it's almost entirely IO bound operation.

        :param table:
        :param schema:
        :param storage_location:
        :return:
        """

        if LOGGER.isEnabledFor(logging.DEBUG):
            LOGGER.debug(f'Getting last updated datetime for {schema}.{table} in {storage_location}')

        last_updated = OLDEST_TIMESTAMP

        paths = self._ls(storage_location)
        if not paths:
            LOGGER.info(f'{schema}.{table} does not have any file in path {storage_location}. Skipping')
            return None

        LOGGER.info(f'Fetching metadata for {schema}.{table} of {len(paths)} files')

        if 0 < self._last_updated_filecheck_threshold < len(paths):
            LOGGER.info(f'Skipping {schema}.{table} due to too many files. '
                        f'{len(paths)} files exist in {storage_location}')
            return None

        time_stamp_futures = \
            [self._fs_worker_pool.apply_async(self._get_timestamp, (path, schema, table, storage_location))
             for path in paths]
        for time_stamp_future in time_stamp_futures:
            try:
                time_stamp = time_stamp_future.get(timeout=self._fs_worker_timeout)
                if time_stamp:
                    last_updated = max(time_stamp, last_updated)
            except TimeoutError:
                LOGGER.warning('Timed out on paths %s . Skipping', paths)

        if last_updated == OLDEST_TIMESTAMP:
            LOGGER.info(f'No timestamp was derived on {schema}.{table} from location: {storage_location} . Skipping')
            return None

        result = TableLastUpdated(table_name=table,
                                  last_updated_time_epoch=int((last_updated - OLDEST_TIMESTAMP).total_seconds()),
                                  schema=schema,
                                  db=HiveTableLastUpdatedExtractor.DATABASE,
                                  cluster=self._cluster)

        return result
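A small worked check of the epoch arithmetic above, assuming OLDEST_TIMESTAMP is the Unix epoch start (1970-01-01 UTC), as its use as a "nothing found" sentinel suggests:

from datetime import datetime, timezone

OLDEST_TIMESTAMP = datetime(1970, 1, 1, tzinfo=timezone.utc)  # assumed definition
last_updated = datetime(2018, 11, 14, 4, 12, 3, tzinfo=timezone.utc)
assert int((last_updated - OLDEST_TIMESTAMP).total_seconds()) == 1542168723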
Example No. 7
    def _get_extract_iter(self) -> Iterator[TableLastUpdated]:
        """
        Provides an iterator of result rows from the SQLAlchemy extractor.
        """
        tbl_last_updated_row = self._alchemy_extractor.extract()
        while tbl_last_updated_row:
            yield TableLastUpdated(table_name=tbl_last_updated_row['table_name'],
                                   last_updated_time_epoch=tbl_last_updated_row['last_updated_time'],
                                   schema=tbl_last_updated_row['schema'],
                                   db=self._database,
                                   cluster=tbl_last_updated_row['cluster'])
            tbl_last_updated_row = self._alchemy_extractor.extract()
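The extract-until-None contract driving this loop is the same one the tests assert with assertIsNone. A minimal sketch of how a consumer drains such an extractor, assuming nothing beyond that contract (drain is a hypothetical helper, not part of the library):

from typing import Any, List

def drain(extractor: Any) -> List[Any]:
    # Collect records until extract() signals exhaustion by returning None.
    results = []
    record = extractor.extract()
    while record:
        results.append(record)
        record = extractor.extract()
    return results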
Example No. 8
    def _get_extract_iter(self):
        # type: () -> Iterator[TableLastUpdated]
        """
        An iterator that utilizes Generator pattern. First it provides TableLastUpdated objects for partitioned
        table, straight from partitioned_table_extractor (SQLAlchemyExtractor)

        Once partitioned table is done, it uses non_partitioned_table_extractor to get storage location of table,
        and probing files under storage location to get max timestamp per table.
        :return:
        """

        partitioned_tbl_row = self._partitioned_table_extractor.extract()
        while partitioned_tbl_row:
            yield TableLastUpdated(
                table_name=partitioned_tbl_row['table_name'],
                last_updated_time_epoch=partitioned_tbl_row[
                    'last_updated_time'],
                schema_name=partitioned_tbl_row['schema_name'],
                db=HiveTableLastUpdatedExtractor.DATABASE,
                cluster=self._cluster)
            partitioned_tbl_row = self._partitioned_table_extractor.extract()

        LOGGER.info('Extracting non-partitioned table')
        count = 0
        non_partitioned_tbl_row = self._non_partitioned_table_extractor.extract()
        while non_partitioned_tbl_row:
            count += 1
            if count % 10 == 0:
                LOGGER.info(
                    'Processed {} non-partitioned tables'.format(count))

            if not non_partitioned_tbl_row['location']:
                LOGGER.warning(
                    'Skipping as no storage location available. {}'.format(
                        non_partitioned_tbl_row))
                non_partitioned_tbl_row = self._non_partitioned_table_extractor.extract()
                continue

            start = time.time()
            table_last_updated = self._get_last_updated_datetime_from_filesystem(
                table=non_partitioned_tbl_row['table_name'],
                schema=non_partitioned_tbl_row['schema_name'],
                storage_location=non_partitioned_tbl_row['location'])
            LOGGER.info('Elapsed: {} seconds'.format(time.time() - start))

            if table_last_updated:
                yield table_last_updated

            non_partitioned_tbl_row = self._non_partitioned_table_extractor.extract()
Example No. 9
    def create_table_last_updated(
            self, table: ScrapedTableMetadata) -> Optional[TableLastUpdated]:
        """Creates the Amundsen table last-updated metadata object from the ScrapedTableMetadata object."""
        last_modified = table.get_last_modified()
        if last_modified:
            return TableLastUpdated(table_name=table.table,
                                    last_updated_time_epoch=int(last_modified.timestamp()),
                                    schema=table.schema,
                                    db=self._db,
                                    cluster=self._cluster)
        else:
            return None
Example No. 10
class TestTableLastUpdated(unittest.TestCase):
    def setUp(self) -> None:
        super(TestTableLastUpdated, self).setUp()

        self.tableLastUpdated = TableLastUpdated(
            table_name='test_table',
            last_updated_time_epoch=25195665,
            schema='default')

        self.expected_node_result = {
            NODE_KEY: 'hive://gold.default/test_table/timestamp',
            NODE_LABEL: 'Timestamp',
            'last_updated_timestamp:UNQUOTED': 25195665,
            timestamp_constants.TIMESTAMP_PROPERTY + ":UNQUOTED": 25195665,
            'name': 'last_updated_timestamp'
        }

        self.expected_relation_result = {
            RELATION_START_KEY: 'hive://gold.default/test_table',
            RELATION_START_LABEL: 'Table',
            RELATION_END_KEY: 'hive://gold.default/test_table/timestamp',
            RELATION_END_LABEL: 'Timestamp',
            RELATION_TYPE: 'LAST_UPDATED_AT',
            RELATION_REVERSE_TYPE: 'LAST_UPDATED_TIME_OF'
        }

    def test_create_next_node(self) -> None:
        next_node = self.tableLastUpdated.create_next_node()
        next_node_serialized = neo4_serializer.serialize_node(next_node)
        self.assertEqual(next_node_serialized, self.expected_node_result)

    def test_create_next_relation(self) -> None:
        next_relation = self.tableLastUpdated.create_next_relation()
        next_relation_serialized = neo4_serializer.serialize_relationship(
            next_relation)
        self.assertEqual(next_relation_serialized,
                         self.expected_relation_result)

    def test_get_table_model_key(self) -> None:
        table = self.tableLastUpdated.get_table_model_key()
        self.assertEqual(table, 'hive://gold.default/test_table')

    def test_get_last_updated_model_key(self) -> None:
        last_updated = self.tableLastUpdated.get_last_updated_model_key()
        self.assertEqual(last_updated,
                         'hive://gold.default/test_table/timestamp')

    def test_create_nodes(self) -> None:
        nodes = self.tableLastUpdated.create_nodes()
        self.assertEqual(len(nodes), 1)
        serialize_node = neo4_serializer.serialize_node(nodes[0])
        self.assertEqual(serialize_node, self.expected_node_result)

    def test_create_relation(self) -> None:
        relation = self.tableLastUpdated.create_relation()
        self.assertEqual(len(relation), 1)
        serialized_relation = neo4_serializer.serialize_relationship(
            relation[0])
        self.assertEqual(serialized_relation, self.expected_relation_result)
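The assertions above imply a key convention. A sketch of it, assuming the pattern '{db}://{cluster}.{schema}/{table}' with a '/timestamp' suffix for the Timestamp node (both helpers are hypothetical):

def table_key(db: str, cluster: str, schema: str, table: str) -> str:
    return '{}://{}.{}/{}'.format(db, cluster, schema, table)

def last_updated_key(db: str, cluster: str, schema: str, table: str) -> str:
    return table_key(db, cluster, schema, table) + '/timestamp'

assert table_key('hive', 'gold', 'default', 'test_table') == 'hive://gold.default/test_table'
assert last_updated_key('hive', 'gold', 'default', 'test_table') == 'hive://gold.default/test_table/timestamp'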
Example No. 11
class TestTableLastUpdated(unittest.TestCase):
    def setUp(self):
        # type: () -> None
        super(TestTableLastUpdated, self).setUp()

        self.tableLastUpdated = TableLastUpdated(
            table_name='test_table',
            last_updated_time_epoch=25195665,
            schema='default')

        self.expected_node_result = {
            NODE_KEY: 'hive://gold.default/test_table/timestamp',
            NODE_LABEL: 'Timestamp',
            'last_updated_timestamp': 25195665,
            'name': 'last_updated_timestamp'
        }

        self.expected_relation_result = {
            RELATION_START_KEY: 'hive://gold.default/test_table',
            RELATION_START_LABEL: 'Table',
            RELATION_END_KEY: 'hive://gold.default/test_table/timestamp',
            RELATION_END_LABEL: 'Timestamp',
            RELATION_TYPE: 'LAST_UPDATED_AT',
            RELATION_REVERSE_TYPE: 'LAST_UPDATED_TIME_OF'
        }

    def test_create_next_node(self):
        # type: () -> None
        next_node = self.tableLastUpdated.create_next_node()
        self.assertEqual(next_node, self.expected_node_result)

    def test_create_next_relation(self):
        # type: () -> None
        next_relation = self.tableLastUpdated.create_next_relation()
        self.assertEqual(next_relation, self.expected_relation_result)

    def test_get_table_model_key(self):
        # type: () -> None
        table = self.tableLastUpdated.get_table_model_key()
        self.assertEqual(table, 'hive://gold.default/test_table')

    def test_get_last_updated_model_key(self):
        # type: () -> None
        last_updated = self.tableLastUpdated.get_last_updated_model_key()
        self.assertEqual(last_updated,
                         'hive://gold.default/test_table/timestamp')

    def test_create_nodes(self):
        # type: () -> None
        nodes = self.tableLastUpdated.create_nodes()
        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0], self.expected_node_result)

    def test_create_relation(self):
        # type: () -> None
        relation = self.tableLastUpdated.create_relation()
        self.assertEqual(len(relation), 1)
        self.assertEqual(relation[0], self.expected_relation_result)
Example No. 12
    def setUp(self):
        # type: () -> None
        super(TestTableLastUpdated, self).setUp()

        self.tableLastUpdated = TableLastUpdated(table_name='test_table',
                                                 last_updated_time_epoch=25195665,
                                                 schema_name='default')

        self.expected_node_result = {
            NODE_KEY: 'hive://gold.default/test_table/timestamp',
            NODE_LABEL: 'Timestamp',
            'last_updated_timestamp': 25195665,
            'name': 'last_updated_timestamp'
        }

        self.expected_relation_result = {
            RELATION_START_KEY: 'hive://gold.default/test_table',
            RELATION_START_LABEL: 'Table',
            RELATION_END_KEY: 'hive://gold.default/test_table/timestamp',
            RELATION_END_LABEL: 'Timestamp',
            RELATION_TYPE: 'LAST_UPDATED_AT',
            RELATION_REVERSE_TYPE: 'LAST_UPDATED_TIME_OF'
        }
Example No. 13
    def test_extraction_with_partition_table_result(self):
        # type: () -> None
        config_dict = {
            'filesystem.{}'.format(FileSystem.DASK_FILE_SYSTEM): MagicMock()
        }
        conf = ConfigFactory.from_dict(config_dict)

        pt_alchemy_extractor_instance = MagicMock()
        non_pt_alchemy_extractor_instance = MagicMock()
        with patch.object(HiveTableLastUpdatedExtractor, '_get_partitioned_table_sql_alchemy_extractor',
                          return_value=pt_alchemy_extractor_instance),\
            patch.object(HiveTableLastUpdatedExtractor, '_get_non_partitioned_table_sql_alchemy_extractor',
                         return_value=non_pt_alchemy_extractor_instance):
            pt_alchemy_extractor_instance.extract = MagicMock(
                side_effect=[{
                    'schema': 'foo_schema',
                    'table_name': 'table_1',
                    'last_updated_time': 1
                }, {
                    'schema': 'foo_schema',
                    'table_name': 'table_2',
                    'last_updated_time': 2
                }])

            non_pt_alchemy_extractor_instance.extract = MagicMock(
                return_value=None)

            extractor = HiveTableLastUpdatedExtractor()
            extractor.init(conf)

            result = extractor.extract()
            expected = TableLastUpdated(schema='foo_schema',
                                        table_name='table_1',
                                        last_updated_time_epoch=1,
                                        db='hive',
                                        cluster='gold')
            self.assertEqual(result.__repr__(), expected.__repr__())
            result = extractor.extract()
            expected = TableLastUpdated(schema='foo_schema',
                                        table_name='table_2',
                                        last_updated_time_epoch=2,
                                        db='hive',
                                        cluster='gold')
            self.assertEqual(result.__repr__(), expected.__repr__())

            self.assertIsNone(extractor.extract())
Example No. 14
    def _get_last_updated_datetime_from_filesystem(
            self,
            table,  # type: str
            schema,  # type: str
            storage_location,  # type: str
    ):
        # type: (...) -> Union[TableLastUpdated, None]
        """
        Fetching metadata within files under storage_location to get latest timestamp.
        (First level only under storage_location)
        Utilizes thread pool to enhance performance. Not using processpool, as it's almost entirely IO bound operation.

        :param table:
        :param schema:
        :param storage_location:
        :return:
        """

        if LOGGER.isEnabledFor(logging.DEBUG):
            LOGGER.debug(
                'Getting last updated datetime for {}.{} in {}'.format(
                    schema, table, storage_location))

        last_updated = OLDEST_TIMESTAMP

        paths = self._ls(storage_location)
        if not paths:
            LOGGER.info(
                '{schema}.{table} does not have any file in path {path}. Skipping'
                .format(schema=schema, table=table, path=storage_location))
            return None

        LOGGER.info(
            'Fetching metadata for {schema}.{table} of {num_files} files'.
            format(schema=schema, table=table, num_files=len(paths)))

        if self._last_updated_filecheck_threshold > 0 and len(
                paths) > self._last_updated_filecheck_threshold:
            LOGGER.info(
                'Skipping {schema}.{table} due to too many files. {len_files} files exist in {location}'
                .format(schema=schema,
                        table=table,
                        len_files=len(paths),
                        location=storage_location))
            return None

        time_stamp_futures = \
            [self._fs_worker_pool.apply_async(self._get_timestamp, (path, schema, table, storage_location)) for path in
             paths]
        for time_stamp_future in time_stamp_futures:
            try:
                time_stamp = time_stamp_future.get(
                    timeout=self._fs_worker_timeout)
                if time_stamp:
                    last_updated = max(time_stamp, last_updated)
            except Exception as e:
                if e.__class__.__name__ == 'TimeoutError':
                    LOGGER.warning(
                        'Timed out on paths {} . Skipping'.format(paths))
                else:
                    raise e

        if last_updated == OLDEST_TIMESTAMP:
            LOGGER.info(
                'No timestamp was derived on {schema}.{table} from location: {location} . Skipping'
                .format(schema=schema, table=table, location=storage_location))
            return None

        result = TableLastUpdated(table_name=table,
                                  last_updated_time_epoch=int(
                                      (last_updated -
                                       OLDEST_TIMESTAMP).total_seconds()),
                                  schema_name=schema,
                                  db=HiveTableLastUpdatedExtractor.DATABASE,
                                  cluster=self._cluster)

        return result
Example No. 15
    def test_extraction_with_multiple_result(self) -> None:
        """
        Test Extraction with default cluster and database and with multiple tables as result
        """
        with patch.object(SQLAlchemyExtractor,
                          '_get_connection') as mock_connection:
            connection = MagicMock()
            mock_connection.return_value = connection
            sql_execute = MagicMock()
            connection.execute = sql_execute

            default_cluster = self.conf[
                'extractor.snowflake_table_last_updated.{}'.format(
                    SnowflakeTableLastUpdatedExtractor.CLUSTER_KEY)]

            table = {
                'schema': 'test_schema1',
                'table_name': 'test_table1',
                'last_updated_time': 1000,
                'cluster': default_cluster
            }

            table1 = {
                'schema': 'test_schema1',
                'table_name': 'test_table2',
                'last_updated_time': 2000,
                'cluster': default_cluster
            }

            table2 = {
                'schema': 'test_schema2',
                'table_name': 'test_table3',
                'last_updated_time': 3000,
                'cluster': default_cluster
            }

            sql_execute.return_value = [table, table1, table2]

            extractor = SnowflakeTableLastUpdatedExtractor()
            extractor.init(self.conf)

            expected = TableLastUpdated(schema='test_schema1',
                                        table_name='test_table1',
                                        last_updated_time_epoch=1000,
                                        db='snowflake',
                                        cluster='MY_CLUSTER')
            self.assertEqual(expected.__repr__(),
                             extractor.extract().__repr__())

            expected = TableLastUpdated(schema='test_schema1',
                                        table_name='test_table2',
                                        last_updated_time_epoch=2000,
                                        db='snowflake',
                                        cluster='MY_CLUSTER')
            self.assertEqual(expected.__repr__(),
                             extractor.extract().__repr__())

            expected = TableLastUpdated(schema='test_schema2',
                                        table_name='test_table3',
                                        last_updated_time_epoch=3000,
                                        db='snowflake',
                                        cluster='MY_CLUSTER')
            self.assertEqual(expected.__repr__(),
                             extractor.extract().__repr__())

            self.assertIsNone(extractor.extract())
Example No. 16
class TestTableLastUpdated(unittest.TestCase):

    def setUp(self) -> None:
        super(TestTableLastUpdated, self).setUp()

        self.tableLastUpdated = TableLastUpdated(table_name='test_table',
                                                 last_updated_time_epoch=25195665,
                                                 schema='default')

        self.expected_node_results = [{
            NODE_KEY: 'hive://gold.default/test_table/timestamp',
            NODE_LABEL: 'Timestamp',
            'last_updated_timestamp:UNQUOTED': 25195665,
            timestamp_constants.TIMESTAMP_PROPERTY + ":UNQUOTED": 25195665,
            'name': 'last_updated_timestamp'
        }]

        self.expected_relation_results = [{
            RELATION_START_KEY: 'hive://gold.default/test_table',
            RELATION_START_LABEL: 'Table',
            RELATION_END_KEY: 'hive://gold.default/test_table/timestamp',
            RELATION_END_LABEL: 'Timestamp',
            RELATION_TYPE: 'LAST_UPDATED_AT',
            RELATION_REVERSE_TYPE: 'LAST_UPDATED_TIME_OF'
        }]

    def test_get_table_model_key(self) -> None:
        table = self.tableLastUpdated.get_table_model_key()
        self.assertEqual(table, 'hive://gold.default/test_table')

    def test_get_last_updated_model_key(self) -> None:
        last_updated = self.tableLastUpdated.get_last_updated_model_key()
        self.assertEqual(last_updated, 'hive://gold.default/test_table/timestamp')

    def test_create_nodes(self) -> None:
        actual = []
        node = self.tableLastUpdated.create_next_node()
        while node:
            serialize_node = neo4_serializer.serialize_node(node)
            actual.append(serialize_node)
            node = self.tableLastUpdated.create_next_node()

        self.assertEqual(actual, self.expected_node_results)

    def test_create_nodes_neptune(self) -> None:
        node_id = TableLastUpdated.LAST_UPDATED_NODE_LABEL + ":" + self.tableLastUpdated.get_last_updated_model_key()
        expected_nodes = [{
            NEPTUNE_HEADER_ID: node_id,
            METADATA_KEY_PROPERTY_NAME: node_id,
            NEPTUNE_HEADER_LABEL: TableLastUpdated.LAST_UPDATED_NODE_LABEL,
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_NODE_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB,
            'name:String(single)': 'last_updated_timestamp',
            'last_updated_timestamp:Long(single)': 25195665,
            timestamp_constants.TIMESTAMP_PROPERTY + ":Long(single)": 25195665,
        }]

        actual = []
        next_node = self.tableLastUpdated.create_next_node()
        while next_node:
            next_node_serialized = neptune_serializer.convert_node(next_node)
            actual.append(next_node_serialized)
            next_node = self.tableLastUpdated.create_next_node()

        self.assertEqual(actual, expected_nodes)

    def test_create_relation(self) -> None:
        actual = []
        relation = self.tableLastUpdated.create_next_relation()
        while relation:
            serialized_relation = neo4_serializer.serialize_relationship(relation)
            actual.append(serialized_relation)
            relation = self.tableLastUpdated.create_next_relation()

        self.assertEqual(actual, self.expected_relation_results)

    def test_create_relation_neptune(self) -> None:
        actual = []
        next_relation = self.tableLastUpdated.create_next_relation()
        while next_relation:
            next_relation_serialized = neptune_serializer.convert_relationship(next_relation)
            actual.append(next_relation_serialized)
            next_relation = self.tableLastUpdated.create_next_relation()

        expected = [
            [
                {
                    NEPTUNE_HEADER_ID: "{label}:{from_vertex_id}_{to_vertex_id}".format(
                        from_vertex_id='Table:hive://gold.default/test_table',
                        to_vertex_id='Timestamp:hive://gold.default/test_table/timestamp',
                        label='LAST_UPDATED_AT'
                    ),
                    METADATA_KEY_PROPERTY_NAME: "{label}:{from_vertex_id}_{to_vertex_id}".format(
                        from_vertex_id='Table:hive://gold.default/test_table',
                        to_vertex_id='Timestamp:hive://gold.default/test_table/timestamp',
                        label='LAST_UPDATED_AT'
                    ),
                    NEPTUNE_RELATIONSHIP_HEADER_FROM: 'Table:hive://gold.default/test_table',
                    NEPTUNE_RELATIONSHIP_HEADER_TO: 'Timestamp:hive://gold.default/test_table/timestamp',
                    NEPTUNE_HEADER_LABEL: 'LAST_UPDATED_AT',
                    NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
                    NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
                },
                {
                    NEPTUNE_HEADER_ID: "{label}:{from_vertex_id}_{to_vertex_id}".format(
                        from_vertex_id='Timestamp:hive://gold.default/test_table/timestamp',
                        to_vertex_id='Table:hive://gold.default/test_table',
                        label='LAST_UPDATED_TIME_OF'
                    ),
                    METADATA_KEY_PROPERTY_NAME: "{label}:{from_vertex_id}_{to_vertex_id}".format(
                        from_vertex_id='Timestamp:hive://gold.default/test_table/timestamp',
                        to_vertex_id='Table:hive://gold.default/test_table',
                        label='LAST_UPDATED_TIME_OF'
                    ),
                    NEPTUNE_RELATIONSHIP_HEADER_FROM: 'Timestamp:hive://gold.default/test_table/timestamp',
                    NEPTUNE_RELATIONSHIP_HEADER_TO: 'Table:hive://gold.default/test_table',
                    NEPTUNE_HEADER_LABEL: 'LAST_UPDATED_TIME_OF',
                    NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
                    NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
                }
            ]
        ]

        self.assertEqual(actual, expected)
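The expected fixtures encode the Neptune bulk-loader edge-id convention used throughout this example: '{label}:{from_vertex_id}_{to_vertex_id}', where each vertex id is itself 'Label:key'. A small sketch of that convention (neptune_edge_id is a hypothetical helper):

def neptune_edge_id(label: str, from_vertex_id: str, to_vertex_id: str) -> str:
    # Mirrors the format strings used in the expected fixtures above.
    return '{}:{}_{}'.format(label, from_vertex_id, to_vertex_id)

assert neptune_edge_id(
    'LAST_UPDATED_AT',
    'Table:hive://gold.default/test_table',
    'Timestamp:hive://gold.default/test_table/timestamp',
) == ('LAST_UPDATED_AT:Table:hive://gold.default/test_table'
      '_Timestamp:hive://gold.default/test_table/timestamp')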