Example #1
def bq_load_events(source_uris: List[String]):
    # Load the given GCS URIs into BigQuery through an aliased load solid.
    return bq_load_solid_for_source(BigQueryLoadSource.GCS).alias(
        'bq_load_events_internal')(source_uris)
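The examples on this page omit their imports. A minimal import block they would rely on is sketched below; the exact module paths are assumptions based on older dagster / dagster-gcp releases (bq_load_solid_for_source and BigQueryLoadSource from dagster_gcp, the DataFrame dagster type from dagster_pandas), and get_dataset, dataset_exists, and bq_modes are test-local helpers that are not shown here.

import sys
from unittest import mock  # assumption: the standalone mock package would also work

import pandas as pd
import pytest

from dagster import (
    InputDefinition,
    List,
    Nothing,
    OutputDefinition,
    Path,
    String,
    execute_pipeline,
    pipeline,
    solid,
)
from dagster.core.errors import DagsterExecutionStepExecutionError  # assumed import path
from dagster_gcp import (  # assumed import path for the BigQuery solid factories
    BigQueryLoadSource,
    bq_create_dataset,
    bq_delete_dataset,
    bq_load_solid_for_source,
    bq_solid_for_queries,
)
from dagster_pandas import DataFrame  # assumed source of the DataFrame type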
Example #2
def test_pd_df_load():
    dataset = get_dataset()
    table = '%s.%s' % (dataset, 'df')

    test_df = pd.DataFrame({'num1': [1, 3], 'num2': [2, 4]})

    create_solid = bq_create_dataset.alias('create_solid')
    load_solid = bq_load_solid_for_source(
        BigQueryLoadSource.DataFrame).alias('load_solid')
    query_solid = bq_solid_for_queries(
        ['SELECT num1, num2 FROM %s' % table]).alias('query_solid')
    delete_solid = bq_delete_dataset.alias('delete_solid')

    @solid(input_defs=[InputDefinition('success', Nothing)],
           output_defs=[OutputDefinition(DataFrame)])
    def return_df(_context):  # pylint: disable=unused-argument
        # Emit the in-memory test DataFrame once dataset creation has succeeded.
        return test_df

    config = {
        'solids': {
            'create_solid': {
                'config': {
                    'dataset': dataset,
                    'exists_ok': True
                }
            },
            'load_solid': {
                'config': {
                    'destination': table
                }
            },
            'delete_solid': {
                'config': {
                    'dataset': dataset,
                    'delete_contents': True
                }
            },
        }
    }

    @pipeline(mode_defs=bq_modes())
    def bq_pipeline():
        # create dataset -> emit DataFrame -> load into BQ -> query -> delete dataset
        delete_solid(query_solid(load_solid(return_df(create_solid()))))

    result = execute_pipeline(bq_pipeline, config)
    assert result.success

    values = result.result_for_solid('query_solid').result_value()
    assert values[0].to_dict() == test_df.to_dict()

    # BQ loads should throw an exception if pyarrow and fastparquet aren't available
    with mock.patch.dict(sys.modules, {'pyarrow': None, 'fastparquet': None}):
        with pytest.raises(DagsterExecutionStepExecutionError) as exc_info:
            result = execute_pipeline(bq_pipeline, config)
        assert (
            'loading data to BigQuery from pandas DataFrames requires either pyarrow or fastparquet'
            ' to be installed' in str(exc_info.value.user_exception))
        cleanup_config = {
            'solids': {
                'delete_solid': {
                    'config': {
                        'dataset': dataset,
                        'delete_contents': True
                    }
                }
            }
        }

        @pipeline(mode_defs=bq_modes())
        def cleanup():
            delete_solid()

        assert execute_pipeline(cleanup, cleanup_config).success

    assert not dataset_exists(dataset)
Example #3
def test_gcs_load():
    dataset = get_dataset()
    table = '%s.%s' % (dataset, 'df')

    create_solid = bq_create_dataset.alias('create_solid')
    load_solid = bq_load_solid_for_source(
        BigQueryLoadSource.GCS).alias('load_solid')
    query_solid = bq_solid_for_queries([
        'SELECT string_field_0, string_field_1 FROM %s ORDER BY string_field_0 ASC LIMIT 1'
        % table
    ]).alias('query_solid')
    delete_solid = bq_delete_dataset.alias('delete_solid')

    @solid(input_defs=[InputDefinition('success', Nothing)],
           output_defs=[OutputDefinition(List[Path])])
    def return_gcs_uri(_context):  # pylint: disable=unused-argument
        # Point the load at a public GCS sample CSV of US states.
        return ['gs://cloud-samples-data/bigquery/us-states/us-states.csv']

    config = {
        'solids': {
            'create_solid': {
                'config': {
                    'dataset': dataset,
                    'exists_ok': True
                }
            },
            'load_solid': {
                'config': {
                    'destination': table,
                    'load_job_config': {
                        'autodetect': True,
                        'skip_leading_rows': 1,
                        'source_format': 'CSV',
                        'write_disposition': 'WRITE_TRUNCATE',
                    },
                }
            },
            'delete_solid': {
                'config': {
                    'dataset': dataset,
                    'delete_contents': True
                }
            },
        }
    }

    @pipeline(mode_defs=bq_modes())
    def bq_pipeline():
        delete_solid(query_solid(load_solid(return_gcs_uri(create_solid()))))

    result = execute_pipeline(bq_pipeline, config)
    assert result.success

    values = result.result_for_solid('query_solid').result_value()
    assert values[0].to_dict() == {
        'string_field_0': {
            0: 'Alabama'
        },
        'string_field_1': {
            0: 'AL'
        }
    }

    assert not dataset_exists(dataset)
Example #4
def _gcs_to_bigquery_solid(source_uris: List[String]):
    # Load the given GCS URIs into BigQuery through an aliased load solid.
    return bq_load_solid_for_source(BigQueryLoadSource.GCS).alias(
        'gcs_to_bigquery_solid_internal')(source_uris)
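A composition helper like _gcs_to_bigquery_solid is meant to be invoked from inside a pipeline definition. A minimal sketch of that usage, assuming a hypothetical emit_source_uris solid and the bq_modes helper used in the tests above:

@solid(output_defs=[OutputDefinition(List[Path])])
def emit_source_uris(_context):
    # Hypothetical solid producing the GCS URIs to load.
    return ['gs://cloud-samples-data/bigquery/us-states/us-states.csv']


@pipeline(mode_defs=bq_modes())
def gcs_to_bq_pipeline():
    _gcs_to_bigquery_solid(emit_source_uris())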