# Example 1
def run_csv_job(file_loc, job_name, model):
    """Extract records from a CSV file and bulk-load them into Neptune.

    The loader stages node/relationship CSVs under a per-job temp folder;
    the publisher uploads them to S3 and triggers the Neptune bulk loader.

    :param file_loc: path of the source CSV file
    :param job_name: used to namespace the staging folder under /var/tmp
    :param model: dotted path of the model class each CSV row maps to
    """
    staging_dir = f'/var/tmp/amundsen/{job_name}'
    nodes_dir = f'{staging_dir}/nodes'
    rels_dir = f'{staging_dir}/relationships'

    extractor = CsvExtractor()
    csv_loader = FSNeptuneCSVLoader()
    csv_publisher = NeptuneCSVPublisher()

    conf = ConfigFactory.from_dict({
        'extractor.csv.file_location': file_loc,
        'extractor.csv.model_class': model,
        csv_loader.get_scope(): {
            FSNeptuneCSVLoader.NODE_DIR_PATH: nodes_dir,
            FSNeptuneCSVLoader.RELATION_DIR_PATH: rels_dir,
            FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
            FSNeptuneCSVLoader.JOB_PUBLISHER_TAG: 'unique_tag'
        },
        csv_publisher.get_scope(): {
            NeptuneCSVPublisher.NODE_FILES_DIR: nodes_dir,
            NeptuneCSVPublisher.RELATION_FILES_DIR: rels_dir,
            NeptuneCSVPublisher.AWS_S3_BUCKET_NAME: S3_BUCKET_NAME,
            NeptuneCSVPublisher.AWS_BASE_S3_DATA_PATH: S3_DATA_PATH,
            NeptuneCSVPublisher.NEPTUNE_HOST: NEPTUNE_ENDPOINT,
            NeptuneCSVPublisher.AWS_IAM_ROLE_NAME: neptune_iam_role_name
        },
    })

    etl_task = DefaultTask(extractor=extractor,
                           loader=csv_loader,
                           transformer=NoopTransformer())
    DefaultJob(conf=conf, task=etl_task, publisher=csv_publisher).launch()
# Example 2
def create_last_updated_job():
    """Build (but do not launch) a job that records the last-updated timestamp.

    Wraps EsLastUpdatedExtractor -> FSNeptuneCSVLoader -> NeptuneCSVPublisher
    and returns the assembled DefaultJob for the caller to launch.
    """
    # The loader stages CSVs under this folder; the publisher reads them back.
    staging_dir = '/var/tmp/amundsen/last_updated_data'
    nodes_dir = f'{staging_dir}/nodes'
    rels_dir = f'{staging_dir}/relationships'

    csv_loader = FSNeptuneCSVLoader()
    csv_publisher = NeptuneCSVPublisher()

    conf = ConfigFactory.from_dict({
        'extractor.es_last_updated.model_class':
        'databuilder.models.es_last_updated.ESLastUpdated',
        csv_loader.get_scope(): {
            FSNeptuneCSVLoader.NODE_DIR_PATH: nodes_dir,
            FSNeptuneCSVLoader.RELATION_DIR_PATH: rels_dir,
            FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
            FSNeptuneCSVLoader.JOB_PUBLISHER_TAG: 'unique_tag'
        },
        csv_publisher.get_scope(): {
            NeptuneCSVPublisher.NODE_FILES_DIR: nodes_dir,
            NeptuneCSVPublisher.RELATION_FILES_DIR: rels_dir,
            NeptuneCSVPublisher.AWS_S3_BUCKET_NAME: S3_BUCKET_NAME,
            NeptuneCSVPublisher.AWS_BASE_S3_DATA_PATH: S3_DATA_PATH,
            NeptuneCSVPublisher.NEPTUNE_HOST: NEPTUNE_ENDPOINT,
            NeptuneCSVPublisher.AWS_IAM_ROLE_NAME: neptune_iam_role_name,
            # NOTE(review): loader tags rows with 'unique_tag' while the
            # publisher is handed a different tag here — presumably
            # intentional for this job; confirm against the publisher docs.
            'job_publish_tag': 'unique_lastupdated_tag'
        }
    })

    etl_task = DefaultTask(extractor=EsLastUpdatedExtractor(),
                           loader=csv_loader)
    return DefaultJob(conf=conf, task=etl_task, publisher=csv_publisher)
# Example 3
def run_table_column_job(table_path, column_path):
    """Load table and column metadata from two CSV files into Neptune.

    :param table_path: path of the CSV describing tables
    :param column_path: path of the CSV describing columns
    """
    staging_dir = '/var/tmp/amundsen/table_column'
    nodes_dir = f'{staging_dir}/nodes'
    rels_dir = f'{staging_dir}/relationships'

    tc_extractor = CsvTableColumnExtractor()
    tc_loader = FSNeptuneCSVLoader()
    tc_publisher = NeptuneCSVPublisher()

    etl_task = DefaultTask(tc_extractor,
                           loader=tc_loader,
                           transformer=NoopTransformer())

    conf = ConfigFactory.from_dict({
        'extractor.csvtablecolumn.table_file_location':
        table_path,
        'extractor.csvtablecolumn.column_file_location':
        column_path,
        tc_loader.get_scope(): {
            FSNeptuneCSVLoader.NODE_DIR_PATH: nodes_dir,
            FSNeptuneCSVLoader.RELATION_DIR_PATH: rels_dir,
            FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
            FSNeptuneCSVLoader.JOB_PUBLISHER_TAG: 'unique_tag'
        },
        tc_publisher.get_scope(): {
            NeptuneCSVPublisher.NODE_FILES_DIR: nodes_dir,
            NeptuneCSVPublisher.RELATION_FILES_DIR: rels_dir,
            NeptuneCSVPublisher.AWS_S3_BUCKET_NAME: S3_BUCKET_NAME,
            NeptuneCSVPublisher.AWS_BASE_S3_DATA_PATH: S3_DATA_PATH,
            NeptuneCSVPublisher.NEPTUNE_HOST: NEPTUNE_ENDPOINT,
            NeptuneCSVPublisher.AWS_IAM_ROLE_NAME: neptune_iam_role_name
        }
    })

    DefaultJob(conf=conf, task=etl_task, publisher=tc_publisher).launch()
# Example 4
def create_dashboard_tables_job():
    """Build (but do not launch) a job linking dashboards to the tables they use.

    Reads dashboard/table rows from a sample CSV, splits the 'table_ids'
    column into a list, converts each row dict into a DashboardTable model,
    and wires a loader/publisher pair that ships the result to Neptune.
    """
    # The loader stages CSVs under this folder; the publisher reads them back.
    staging_dir = '/var/tmp/amundsen/dashboard_table'
    nodes_dir = f'{staging_dir}/nodes'
    rels_dir = f'{staging_dir}/relationships'

    extractor = CsvExtractor()
    csv_loader = FSNeptuneCSVLoader()
    csv_publisher = NeptuneCSVPublisher()

    # Two-stage transform: split the id column, then hydrate the model.
    column_splitter = GenericTransformer()
    to_model = DictToModel()
    pipeline = ChainedTransformer(
        transformers=[column_splitter, to_model],
        is_init_transformers=True)

    etl_task = DefaultTask(extractor=extractor,
                           loader=csv_loader,
                           transformer=pipeline)

    conf = ConfigFactory.from_dict({
        extractor.get_scope(): {
            CsvExtractor.FILE_LOCATION:
            'example/sample_data/sample_dashboard_table.csv'
        },
        pipeline.get_scope(): {
            column_splitter.get_scope(): {
                FIELD_NAME: 'table_ids',
                CALLBACK_FUNCTION: _str_to_list
            },
            to_model.get_scope(): {
                MODEL_CLASS:
                'databuilder.models.dashboard.dashboard_table.DashboardTable',
            }
        },
        csv_loader.get_scope(): {
            FSNeptuneCSVLoader.NODE_DIR_PATH: nodes_dir,
            FSNeptuneCSVLoader.RELATION_DIR_PATH: rels_dir,
            FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
            FSNeptuneCSVLoader.JOB_PUBLISHER_TAG: 'unique_tag'
        },
        csv_publisher.get_scope(): {
            NeptuneCSVPublisher.NODE_FILES_DIR: nodes_dir,
            NeptuneCSVPublisher.RELATION_FILES_DIR: rels_dir,
            NeptuneCSVPublisher.AWS_S3_BUCKET_NAME: S3_BUCKET_NAME,
            NeptuneCSVPublisher.AWS_BASE_S3_DATA_PATH: S3_DATA_PATH,
            NeptuneCSVPublisher.NEPTUNE_HOST: NEPTUNE_ENDPOINT,
            NeptuneCSVPublisher.AWS_IAM_ROLE_NAME: neptune_iam_role_name,
            NeptuneCSVPublisher.AWS_REGION: AWS_REGION,
            NeptuneCSVPublisher.AWS_ACCESS_KEY: aws_access_key,
            NeptuneCSVPublisher.AWS_SECRET_ACCESS_KEY: aws_access_secret,
            NeptuneCSVPublisher.AWS_SESSION_TOKEN: aws_token
        }
    })

    return DefaultJob(conf=conf, task=etl_task, publisher=csv_publisher)
    def test_load(self) -> None:
        """Round-trip a Movie through FSNeptuneCSVLoader and diff the CSVs
        it writes against the checked-in fixture files."""
        cast = [Actor('Tom Cruise'), Actor('Meg Ryan')]
        locations = [City('San Diego'), City('Oakland')]
        record = Movie('Top Gun', cast, locations)

        loader = FSNeptuneCSVLoader()
        loader.init(self._conf)
        loader.load(record)

        loader.close()

        fixture_root = os.path.join(os.path.dirname(__file__))
        row_key = itemgetter('~id')

        expected_nodes = self._get_csv_rows(
            '{}/../resources/fs_neptune_csv_loader/nodes'.format(fixture_root),
            row_key)
        actual_nodes = self._get_csv_rows(
            self._conf.get_string(FSNeptuneCSVLoader.NODE_DIR_PATH),
            row_key)
        self.maxDiff = None  # show full diff on failure
        self.assertEqual(expected_nodes, actual_nodes)

        expected_relations = self._get_csv_rows(
            '{}/../resources/fs_neptune_csv_loader/relationships'.format(fixture_root),
            row_key)
        actual_relations = self._get_csv_rows(
            self._conf.get_string(FSNeptuneCSVLoader.RELATION_DIR_PATH),
            row_key)
        self.assertListEqual(list(expected_relations), list(actual_relations))
# Example 6
def run_postgres_job(job_name):
    """Extract Postgres table/column metadata and bulk-load it into Neptune.

    :param job_name: used to namespace the staging folder under /var/tmp
    """
    staging_dir = f'/var/tmp/amundsen/{job_name}'
    nodes_dir = f'{staging_dir}/nodes'
    rels_dir = f'{staging_dir}/relationships'

    csv_loader = FSNeptuneCSVLoader()
    csv_publisher = NeptuneCSVPublisher()

    # NOTE(review): the schema name is interpolated directly into the SQL
    # suffix; presumably trusted operator config — confirm it never carries
    # untrusted input.
    where_clause_suffix = textwrap.dedent("""
        where table_schema = '{}'
    """.format(postgres_schema))

    conf = ConfigFactory.from_dict({
        f'extractor.postgres_metadata.{PostgresMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause_suffix,
        f'extractor.postgres_metadata.{PostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}': True,
        f'extractor.postgres_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(),
        csv_loader.get_scope(): {
            FSNeptuneCSVLoader.NODE_DIR_PATH: nodes_dir,
            FSNeptuneCSVLoader.RELATION_DIR_PATH: rels_dir,
            FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
            FSNeptuneCSVLoader.JOB_PUBLISHER_TAG: 'unique_tag'
        },
        csv_publisher.get_scope(): {
            NeptuneCSVPublisher.NODE_FILES_DIR: nodes_dir,
            NeptuneCSVPublisher.RELATION_FILES_DIR: rels_dir,
            NeptuneCSVPublisher.AWS_S3_BUCKET_NAME: S3_BUCKET_NAME,
            NeptuneCSVPublisher.AWS_BASE_S3_DATA_PATH: S3_DATA_PATH,
            NeptuneCSVPublisher.NEPTUNE_HOST: NEPTUNE_ENDPOINT,
            NeptuneCSVPublisher.AWS_IAM_ROLE_NAME: neptune_iam_role_name,
            NeptuneCSVPublisher.AWS_REGION: AWS_REGION
        },
    })

    etl_task = DefaultTask(extractor=PostgresMetadataExtractor(),
                           loader=csv_loader)
    DefaultJob(conf=conf, task=etl_task, publisher=csv_publisher).launch()