Code example #1
File: bigquery_tests.py  Project: ZedYeung/pydatalab
    def test_extract_cell_table(self, mock_get_notebook_item, mock_get_table,
                                mock_table_extract):
        args = {
            'table': 'test-table',
            'path': 'test-path',
            'format': 'json',
            'delimiter': None,
            'header': None,
            'compress': None,
            'nocache': None
        }
        mock_get_table.return_value = None
        with self.assertRaisesRegexp(Exception,
                                     'Could not find table test-table'):
            bq.commands._bigquery._extract_cell(args, None)

        mock_get_table.return_value = bq.Table('project.test.table',
                                               self._create_context())
        mock_table_extract.return_value.result = lambda: 'test-results'
        mock_table_extract.return_value.failed = False
        mock_table_extract.return_value.errors = None
        self.assertEqual(bq.commands._bigquery._extract_cell(args, None),
                         'test-results')
        mock_table_extract.assert_called_with('test-path',
                                              format='json',
                                              csv_delimiter=None,
                                              csv_header=None,
                                              compress=None)
Code example #2
def test_datalab_load_table_from_dataframe(to_delete):
    # [START bigquery_migration_datalab_load_table_from_dataframe]
    import google.datalab.bigquery as bq
    import pandas

    # Create the dataset
    dataset_id = 'import_sample'
    # [END bigquery_migration_datalab_load_table_from_dataframe]
    # Use unique dataset ID to avoid collisions when running tests
    dataset_id = 'test_dataset_{}'.format(int(time.time() * 1000))
    to_delete.append(dataset_id)
    # [START bigquery_migration_datalab_load_table_from_dataframe]
    bq.Dataset(dataset_id).create()

    # Create the table and load the data
    dataframe = pandas.DataFrame([
        {'title': 'The Meaning of Life', 'release_year': 1983},
        {'title': 'Monty Python and the Holy Grail', 'release_year': 1975},
        {'title': 'Life of Brian', 'release_year': 1979},
        {
            'title': 'And Now for Something Completely Different',
            'release_year': 1971
        },
    ])
    schema = bq.Schema.from_data(dataframe)
    table = bq.Table(
        '{}.monty_python'.format(dataset_id)).create(schema=schema)
    table.insert(dataframe)  # Starts streaming insert of data
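A quick way to spot-check the streaming insert above is to read the table back into a DataFrame. This is a minimal sketch, assuming Table.to_dataframe() is available in google.datalab.bigquery and that the streamed rows have already become visible (streaming inserts can take a moment to flush); it reuses the dataset_id from the example.

import google.datalab.bigquery as bq

# Hypothetical follow-up: pull the loaded rows back out and inspect them.
result_df = bq.Table('{}.monty_python'.format(dataset_id)).to_dataframe()
print(result_df.sort_values('release_year'))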
Code example #3
File: bigquery_tests.py  Project: ZedYeung/pydatalab
    def test_table_viewer(self, mock_table_exists, mock_get_field_list,
                          mock_get_data, mock_tables_get,
                          mock_render_chart_data, mock_next_id):
        test_table = bq.Table('testproject.test.table', self._create_context())

        mock_table_exists.return_value = False
        with self.assertRaisesRegexp(Exception, 'does not exist'):
            bq.commands._bigquery._table_viewer(test_table)

        mock_table_exists.return_value = True
        mock_get_field_list.return_value = ['col1']
        mock_get_data.return_value = ({'cols': ['col1'], 'rows': ['val1']}, 1)
        mock_render_chart_data.return_value = 'test_chart_data'
        mock_next_id.return_value = 'test_id'
        viewer = bq.commands._bigquery._table_viewer(test_table)

        mock_table_exists.assert_called()
        mock_get_field_list.assert_called()
        mock_render_chart_data.assert_called()

        expected_html_header = '''
    <div class="bqtv" id="test_id">test_chart_data</div>
    <br />(testproject.test.table)<br />
    '''
        self.assertIn(expected_html_header, viewer)
Code example #4
    def execute(self, context):
        table = bq.Table(self.table, context=None)
        if not table.exists():
            table.create(schema=self.schema)

        kwargs = {}
        if 'delimiter' in self.csv_options:
            kwargs['delimiter'] = self.csv_options['delimiter']
        if 'skip' in self.csv_options:
            kwargs['skip_leading_rows'] = self.csv_options['skip']
        if 'strict' in self.csv_options:
            kwargs['allow_jagged_rows'] = self.csv_options['strict']
        if 'quote' in self.csv_options:
            kwargs['quote'] = self.csv_options['quote']
        csv_options = bq.CSVOptions(**kwargs)

        job = table.load(
            self.path,
            mode=self.mode,
            source_format=('csv' if self.format == 'csv' else
                           'NEWLINE_DELIMITED_JSON'),
            csv_options=csv_options,
            ignore_unknown_values=not self.csv_options.get('strict'))

        if job.failed:
            raise Exception('Load failed: %s' % str(job.fatal_error))
        elif job.errors:
            raise Exception('Load completed with errors: %s' % str(job.errors))

        return {'result': job.result()}
Code example #5
    def execute(self, context):
        if self._table:
            pydatalab_context = google.datalab.Context.default()
            table = bq.Table(self._table, context=pydatalab_context)

        if self._mode == 'create':
            if table.exists():
                raise Exception(
                    "%s already exists; mode should be \'append\' or \'overwrite\'"
                    % self._table)
            if not self._schema:
                raise Exception(
                    '%s does not exist, and no schema specified in cell; cannot load.'
                    % self._table)
            table.create(schema=self._schema)
        elif not table.exists():
            raise Exception('%s does not exist; mode should be \'create\'' %
                            self._table)

        csv_options = bq.CSVOptions(
            delimiter=self._csv_options.get('delimiter'),
            skip_leading_rows=self._csv_options.get('skip'),
            allow_jagged_rows=self._csv_options.get('strict'),
            quote=self._csv_options.get('quote'))
        job = table.load(
            self._path,
            mode=self._mode,
            source_format=('csv' if self._format == 'csv' else
                           'NEWLINE_DELIMITED_JSON'),
            csv_options=csv_options,
            ignore_unknown_values=not self._csv_options.get('strict'))
        if job.failed:
            raise Exception('Load failed: %s' % str(job.fatal_error))
        elif job.errors:
            raise Exception('Load completed with errors: %s' % str(job.errors))
Code example #6
def test_datalab_load_table_from_gcs_csv(to_delete):
    # [START bigquery_migration_datalab_load_table_from_gcs_csv]
    import google.datalab.bigquery as bq

    # Create the dataset
    dataset_id = 'import_sample'
    # [END bigquery_migration_datalab_load_table_from_gcs_csv]
    # Use unique dataset ID to avoid collisions when running tests
    dataset_id = 'test_dataset_{}'.format(int(time.time() * 1000))
    to_delete.append(dataset_id)
    # [START bigquery_migration_datalab_load_table_from_gcs_csv]
    bq.Dataset(dataset_id).create()

    # Create the table
    schema = [
        {'name': 'name', 'type': 'STRING'},
        {'name': 'post_abbr', 'type': 'STRING'},
    ]
    table = bq.Table(
        '{}.us_states'.format(dataset_id)).create(schema=schema)
    table.load(
        'gs://cloud-samples-data/bigquery/us-states/us-states.csv',
        mode='append',
        source_format='csv',
        csv_options=bq.CSVOptions(skip_leading_rows=1)
    )  # Waits for the job to complete
    # [END bigquery_migration_datalab_load_table_from_gcs_csv]

    assert table.length == 50
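Beyond checking table.length, the loaded rows can also be queried directly. A sketch under the assumption that the usual google.datalab chain Query.execute().result().to_dataframe() applies, reusing the dataset_id from the example:

import google.datalab.bigquery as bq

# Hypothetical follow-up: query a few of the freshly loaded rows.
df = bq.Query(
    'SELECT name, post_abbr FROM `{}.us_states` LIMIT 5'.format(dataset_id)
).execute().result().to_dataframe()
print(df)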
Code example #7
 def test_table_cell_list_dataset(self, mock_dataset, mock_default_context):
   args = {'command': 'list', 'filter': '', 'dataset': 'test-dataset', 'project': None}
   tables = [bq.Table('project.test.' + name) for name in ['t1', 't2']]
   mock_dataset.return_value = iter(tables)
   self.assertEqual(
       bq.commands._bigquery._table_cell(args, None),
       '<ul><li>project.test.t1</li><li>project.test.t2</li></ul>')
Code example #8
def run_analysis(args):
    """Builds an analysis file for training.

  Uses BigQuery tables to do the analysis.

  Args:
    args: command line args

  Raises:
    ValueError if schema contains unknown types.
  """
    import google.datalab.bigquery as bq
    if args.bigquery_table:
        table = bq.Table(args.bigquery_table)
        schema_list = table.schema._bq_schema
    else:
        schema_list = json.loads(
            file_io.read_file_to_string(args.schema_file).decode())
        table = bq.ExternalDataSource(source=args.input_file_pattern,
                                      schema=bq.Schema(schema_list))

    # Check the schema is supported.
    for col_schema in schema_list:
        col_type = col_schema['type'].lower()
        if col_type != 'string' and col_type != 'integer' and col_type != 'float':
            raise ValueError('Schema contains an unsupported type %s.' %
                             col_type)

    run_numerical_analysis(table, schema_list, args)
    run_categorical_analysis(table, schema_list, args)

    # Save a copy of the schema to the output location.
    file_io.write_string_to_file(
        os.path.join(args.output_dir, SCHEMA_FILE),
        json.dumps(schema_list, indent=2, separators=(',', ': ')))
Code example #9
    def test_get_bq_extract_operator_definition(self, mock_table):
        mock_table.return_value = bq.Table(
            'foo_project.foo_dataset.foo_table',
            context=PipelineTest._create_context())
        task_id = 'foo'
        task_details = {}
        task_details['type'] = 'BigQueryToCloudStorage'
        task_details['table'] = 'foo_project.foo_dataset.foo_table'
        task_details['path'] = 'foo_path'
        task_details['format'] = 'csv'
        task_details['delimiter'] = '$'
        task_details['header'] = False
        task_details['compress'] = True
        operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
            task_id, task_details, None)
        self.assertEqual(
            operator_def,
            """foo = BigQueryToCloudStorageOperator(task_id='foo_id', compression=\"\"\"GZIP\"\"\", destination_cloud_storage_uris=[\'foo_path\'], export_format=\"\"\"CSV\"\"\", field_delimiter=\"\"\"$\"\"\", print_header=False, source_project_dataset_table=\"\"\"foo_project.foo_dataset.foo_table\"\"\", dag=dag)
""")  # noqa

        task_details['format'] = 'json'
        operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
            task_id, task_details, None)
        self.assertEqual(
            operator_def,
            """foo = BigQueryToCloudStorageOperator(task_id='foo_id', compression=\"\"\"GZIP\"\"\", destination_cloud_storage_uris=[\'foo_path\'], export_format=\"\"\"NEWLINE_DELIMITED_JSON\"\"\", field_delimiter=\"\"\"$\"\"\", print_header=False, source_project_dataset_table=\"\"\"foo_project.foo_dataset.foo_table\"\"\", dag=dag)
""")  # noqa
Code example #10
    def test_get_bq_load_operator_definition(self, mock_table):
        mock_table.return_value = bq.Table(
            'foo_project.foo_dataset.foo_table',
            context=PipelineTest._create_context())
        task_id = 'foo'
        task_details = {}
        task_details['type'] = 'GoogleCloudStorageToBigQuery'
        task_details['table'] = 'foo_project.foo_dataset.foo_table'
        task_details['path'] = 'gs://foo_bucket/foo_file.csv'
        task_details['format'] = 'csv'
        task_details['delimiter'] = '$'
        task_details['skip'] = False
        operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
            task_id, task_details, None)
        self.assertEqual(
            operator_def,
            """foo = GoogleCloudStorageToBigQueryOperator(task_id='foo_id', bucket=\"\"\"foo_bucket\"\"\", destination_project_dataset_table=\"\"\"foo_project.foo_dataset.foo_table\"\"\", export_format=\"\"\"CSV\"\"\", field_delimiter=\"\"\"$\"\"\", skip_leading_rows=False, source_objects=\"\"\"foo_file.csv\"\"\", dag=dag)
""")  # noqa

        task_details['format'] = 'json'
        operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
            task_id, task_details, None)
        self.assertEqual(
            operator_def,
            """foo = GoogleCloudStorageToBigQueryOperator(task_id='foo_id', bucket=\"\"\"foo_bucket\"\"\", destination_project_dataset_table=\"\"\"foo_project.foo_dataset.foo_table\"\"\", export_format=\"\"\"NEWLINE_DELIMITED_JSON\"\"\", field_delimiter=\"\"\"$\"\"\", skip_leading_rows=False, source_objects=\"\"\"foo_file.csv\"\"\", dag=dag)
""")  # noqa
Code example #11
def main(argv=None):
  args = parse_arguments(sys.argv if argv is None else argv)

  if args.schema:
    schema = json.loads(
        file_io.read_file_to_string(args.schema).decode())
  else:
    import google.datalab.bigquery as bq
    schema = bq.Table(args.bigquery).schema._bq_schema
  features = json.loads(
      file_io.read_file_to_string(args.features).decode())

  file_io.recursive_create_dir(args.output)

  if args.cloud:
    run_cloud_analysis(
        output_dir=args.output,
        csv_file_pattern=args.csv,
        bigquery_table=args.bigquery,
        schema=schema,
        features=features)
  else:
    feature_analysis.run_local_analysis(
        output_dir=args.output,
        csv_file_pattern=args.csv,
        schema=schema,
        features=features)
Code example #12
File: _pipeline.py  Project: freyrsae/pydatalab
    def _get_bq_load_params(operator_task_details):
        if 'table' in operator_task_details:
            table = bigquery.commands._bigquery._get_table(
                operator_task_details['table'])
            if not table:
                table = bigquery.Table(operator_task_details['table'])
                # TODO(rajivpb): Ensure that mode == create here.
            operator_task_details[
                'destination_project_dataset_table'] = table.full_name
            del operator_task_details['table']

        if 'format' in operator_task_details:
            operator_task_details['export_format'] = 'CSV' if operator_task_details['format'] == 'csv' \
              else 'NEWLINE_DELIMITED_JSON'
            del operator_task_details['format']

        if 'delimiter' in operator_task_details:
            operator_task_details['field_delimiter'] = operator_task_details[
                'delimiter']
            del operator_task_details['delimiter']

        if 'skip' in operator_task_details:
            operator_task_details['skip_leading_rows'] = operator_task_details[
                'skip']
            del operator_task_details['skip']

        if 'path' in operator_task_details:
            bucket, source_object = Pipeline._get_bucket_and_source_object(
                operator_task_details['path'])
            operator_task_details['bucket'] = bucket
            operator_task_details['source_objects'] = source_object
            del operator_task_details['path']

        return operator_task_details
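To make the key renaming above concrete, here is an illustrative sketch of what the task-details dict looks like before and after the helper runs. The values are hypothetical, and resolving 'table' into destination_project_dataset_table requires a live BigQuery context, so this is shown as data rather than an actual call:

before = {
    'table': 'proj.ds.tbl',
    'format': 'csv',
    'delimiter': ',',
    'skip': 1,
    'path': 'gs://my_bucket/data/part-000.csv',
}
# After _get_bq_load_params(before), the operator-facing keys would be roughly:
after = {
    'destination_project_dataset_table': 'proj.ds.tbl',
    'export_format': 'CSV',
    'field_delimiter': ',',
    'skip_leading_rows': 1,
    'bucket': 'my_bucket',               # split out of 'path'
    'source_objects': 'data/part-000.csv',
}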
Code example #13
    def test_numerics(self):
        """Build a BQ table, and then call analyze on it."""
        schema = [{
            'name': 'col1',
            'type': 'INTEGER'
        }, {
            'name': 'col2',
            'type': 'FLOAT'
        }]
        project_id = dl.Context.default().project_id
        dataset_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
        table_name = 'temp_table'
        full_table_name = '%s.%s.%s' % (project_id, dataset_name, table_name)

        output_folder = tempfile.mkdtemp()

        try:
            # Make a dataset, a table, and insert data.
            db = bq.Dataset((project_id, dataset_name))
            db.create()

            table = bq.Table(full_table_name)
            table.create(schema=bq.Schema(schema), overwrite=True)

            data = [{'col1': i, 'col2': 10 * i + 0.5} for i in range(100)]
            table.insert(data)

            analyze_data.run_cloud_analysis(output_dir=output_folder,
                                            csv_file_pattern=None,
                                            bigquery_table=full_table_name,
                                            schema=schema,
                                            features={
                                                'col1': {
                                                    'transform': 'scale'
                                                },
                                                'col2': {
                                                    'transform': 'identity'
                                                }
                                            })

            stats = json.loads(
                file_io.read_file_to_string(
                    os.path.join(output_folder,
                                 analyze_data.STATS_FILE)).decode())

            self.assertEqual(stats['num_examples'], 100)
            col = stats['column_stats']['col1']
            self.assertAlmostEqual(col['max'], 99.0)
            self.assertAlmostEqual(col['min'], 0.0)
            self.assertAlmostEqual(col['mean'], 49.5)

            col = stats['column_stats']['col2']
            self.assertAlmostEqual(col['max'], 990.5)
            self.assertAlmostEqual(col['min'], 0.5)
            self.assertAlmostEqual(col['mean'], 495.5)
        finally:
            shutil.rmtree(output_folder)
            db.delete(delete_contents=True)
Code example #14
  def test_table_cell_view(self, mock_get_table, mock_default_context):
    args = {'command': 'view', 'name': 'test-table'}
    table = bq.Table('project.test.table')
    mock_get_table.return_value = None
    with self.assertRaisesRegexp(Exception, 'Could not find table test-table'):
      bq.commands._bigquery._table_cell(args, None)

    mock_get_table.return_value = table
    self.assertEqual(table, bq.commands._bigquery._table_cell(args, None))
Code example #15
File: _bigquery.py  Project: abhinavrpatel/pydatalab
def _load_cell(args, cell_body):
    """Implements the BigQuery load magic used to load data from GCS to a table.

   The supported syntax is:

       %bq load <optional args>

  Args:
    args: the arguments following '%bq load'.
    cell_body: optional contents of the cell interpreted as YAML or JSON.
  Returns:
    A message about whether the load succeeded or failed.
  """
    name = args['table']
    table = _get_table(name)
    if not table:
        table = bigquery.Table(name)

    if args['mode'] == 'create':
        if table.exists():
            raise Exception(
                'table %s already exists; use "append" or "overwrite" as mode.'
                % name)
        if not cell_body or 'schema' not in cell_body:
            raise Exception(
                'Table does not exist, and no schema specified in cell; cannot load.'
            )

        env = google.datalab.utils.commands.notebook_environment()
        config = google.datalab.utils.commands.parse_config(
            cell_body, env, False)
        schema = config['schema']
        # schema can be an instance of bigquery.Schema.
        # For example, user can run "my_schema = bigquery.Schema.from_data(df)" in a previous cell and
        # specify "schema: $my_schema" in cell input.
        if not isinstance(schema, bigquery.Schema):
            jsonschema.validate(config, BigQuerySchema.TABLE_SCHEMA_SCHEMA)
            schema = bigquery.Schema(schema)
        table.create(schema=schema)
    elif not table.exists():
        raise Exception('table %s does not exist; use "create" as mode.' %
                        name)

    csv_options = bigquery.CSVOptions(delimiter=args['delimiter'],
                                      skip_leading_rows=args['skip'],
                                      allow_jagged_rows=not args['strict'],
                                      quote=args['quote'])
    job = table.load(args['path'],
                     mode=args['mode'],
                     source_format=args['format'],
                     csv_options=csv_options,
                     ignore_unknown_values=not args['strict'])
    if job.failed:
        raise Exception('Load failed: %s' % str(job.fatal_error))
    elif job.errors:
        raise Exception('Load completed with errors: %s' % str(job.errors))
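For orientation, a hedged, test-style invocation of this function with a minimal args dict; the keys mirror the unit tests elsewhere on this page, the values are hypothetical, and in a notebook the same call sits behind the %bq load magic:

args = {'table': 'my_dataset.my_table', 'mode': 'append',
        'path': 'gs://my_bucket/data.csv', 'format': 'csv',
        'delimiter': ',', 'skip': 1, 'strict': True, 'quote': '"'}
# With mode='append' the table must already exist, so no cell body is needed.
_load_cell(args, None)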
Code example #16
  def test_load_cell(self, mock_get_table, mock_table_load, mock_table_exists,
                     mock_table_create, mock_default_context):

    args = {'table': 'project.test.table', 'mode': 'create', 'path': 'test/path_%(_ds)s',
            'skip': None, 'csv': None, 'delimiter': None, 'format': 'csv', 'strict': None,
            'quote': None}
    context = self._create_context()
    mock_get_table.return_value = bq.Table('project.test.table')
    job = bq._query_job.QueryJob('test_id', 'project.test.table', 'test_sql', context)

    mock_table_exists.return_value = True
    with self.assertRaisesRegexp(Exception, 'already exists; use "append" or "overwrite" as mode.'):
      bq.commands._bigquery._load_cell(args, None)

    mock_table_exists.return_value = False
    with self.assertRaisesRegexp(Exception, 'Table does not exist, and no schema specified'):
      bq.commands._bigquery._load_cell(args, None)

    cell_body = {
      'schema': [
        {'name': 'col1', 'type': 'int64', 'mode': 'NULLABLE', 'description': 'description1'},
        {'name': 'col1', 'type': 'STRING', 'mode': 'required', 'description': 'description1'}
      ],
      'parameters': [
        {'name': 'custom', 'type': 'FLOAT', 'value': 4.23}
      ]
    }

    mock_table_load.return_value = job
    job._is_complete = True
    job._fatal_error = 'fatal error'
    with self.assertRaisesRegexp(Exception, 'Load failed: fatal error'):
      bq.commands._bigquery._load_cell(args, json.dumps(cell_body))

    job._fatal_error = None
    job._errors = 'error'
    with self.assertRaisesRegexp(Exception, 'Load completed with errors: error'):
      bq.commands._bigquery._load_cell(args, json.dumps(cell_body))

    job._errors = None
    bq.commands._bigquery._load_cell(args, json.dumps(cell_body))
    today = datetime.now().date().isoformat()
    mock_table_load.assert_called_with('test/path_{0}'.format(today), mode='create',
                                       source_format='csv',
                                       csv_options=mock.ANY, ignore_unknown_values=True)

    mock_get_table.return_value = None
    mock_table_exists.return_value = True
    args['mode'] = 'append'
    args['format'] = 'csv'

    bq.commands._bigquery._load_cell(args, None)
    mock_table_load.assert_called_with('test/path_{0}'.format(today), mode='append',
                                       source_format='csv', csv_options=mock.ANY,
                                       ignore_unknown_values=True)
Code example #17
 def test_table_cell_list_bad_filter(self, mock_datasets, mock_default_context):
   args = {'command': 'list', 'filter': 't7', 'dataset': None, 'project': None}
   tables = [bq.Table('project.test.' + name) for name in ['t1', 't2', 't11']]
   ds1 = mock.MagicMock()
   ds1.__iter__.return_value = iter([tables[0], tables[1]])
   ds2 = mock.MagicMock()
   ds2.__iter__.return_value = iter([tables[2]])
   mock_datasets.return_value = iter([ds1, ds2])
   self.assertEqual(
       bq.commands._bigquery._table_cell(args, None),
       '<pre>&lt;empty&gt;</pre>')
Code example #18
 def test_table_cell_list_project(self, mock_datasets, mock_default_context):
   args = {'command': 'list', 'filter': '', 'dataset': None, 'project': 'test-project'}
   tables = [bq.Table('project.test.' + name) for name in ['t1', 't2', 't3']]
   ds1 = mock.MagicMock()
   ds1.__iter__.return_value = iter([tables[0], tables[1]])
   ds2 = mock.MagicMock()
   ds2.__iter__.return_value = iter([tables[2]])
   mock_datasets.return_value = iter([ds1, ds2])
   self.assertEqual(
       bq.commands._bigquery._table_cell(args, None),
       '<ul><li>project.test.t1</li><li>project.test.t2</li><li>project.test.t3</li></ul>')
Code example #19
 def test_get_bq_execute_operator_definition(self, mock_table):
   mock_table.return_value = bq.Table(
       'foo_project.foo_dataset.foo_table',
       context=PipelineTest._create_context())
   task_id = 'foo'
   task_details = {}
   task_details['type'] = 'BigQuery'
   task_details['query'] = google.datalab.bigquery.Query(
     'SELECT * FROM publicdata.samples.wikipedia LIMIT 5')
   operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
       task_id, task_details)
   self.assertEqual(operator_def, "foo = BigQueryOperator(task_id='foo_id', bql='SELECT * FROM publicdata.samples.wikipedia LIMIT 5', use_legacy_sql=False, dag=dag)\n")  # noqa
Code example #20
  def test_local_bigquery_transform(self):
    """Test transfrom locally, but the data comes from bigquery."""
    try:
      self._create_test_data()

      # Make a BQ table, and insert 1 row.
      project_id = dl.Context.default().project_id
      dataset_name = 'test_transform_raw_data_%s' % uuid.uuid4().hex
      table_name = 'tmp_table'

      dataset = bq.Dataset((project_id, dataset_name)).create()
      table = bq.Table((project_id, dataset_name, table_name))
      table.create([{'name': 'num_col', 'type': 'FLOAT'},
                    {'name': 'img_col', 'type': 'STRING'}])
      table.insert(data=[{'num_col': 23.0, 'img_col': self.img_filepath}])

      tfex_dir = os.path.join(self.output_folder, 'test_results')
      cmd = ['python ' + os.path.join(CODE_PATH, 'transform_raw_data.py'),
             '--bigquery-table=%s.%s.%s' % (project_id, dataset_name, table_name),
             '--analyze-output-dir=' + self.output_folder,
             '--output-filename-prefix=features',
             '--project-id=' + project_id,
             '--output-dir=' + tfex_dir]
      subprocess.check_call(' '.join(cmd), shell=True)

      # Read the tf record file. There should only be one file.
      record_filepath = os.path.join(tfex_dir,
                                     'features-00000-of-00001.tfrecord.gz')
      options = tf.python_io.TFRecordOptions(
          compression_type=tf.python_io.TFRecordCompressionType.GZIP)
      serialized_example = next(
          tf.python_io.tf_record_iterator(
              record_filepath,
              options=options))
      example = tf.train.Example()
      example.ParseFromString(serialized_example)

      transformed_number = example.features.feature['num_col'].float_list.value[0]
      self.assertAlmostEqual(transformed_number, 24.0)

      image_bytes = example.features.feature['img_col'].bytes_list.value[0]
      raw_img = Image.open(self.img_filepath).convert('RGB')
      img_file = six.BytesIO()
      raw_img.save(img_file, 'jpeg')
      expected_image_bytes = img_file.getvalue()

      self.assertEqual(image_bytes, expected_image_bytes)
    finally:
      dataset.delete(delete_contents=True)
      shutil.rmtree(self.output_folder)
Code example #21
File: bigquery_tests.py  Project: ZedYeung/pydatalab
    def test_table_cell_describe(self, mock_get_table, mock_default_context):
        args = {'command': 'describe', 'name': 'test-table', 'overwrite': None}
        mock_get_table.return_value = None
        with self.assertRaisesRegexp(Exception, 'Could not find table'):
            bq.commands._bigquery._table_cell(args, None)

        mock_get_table.return_value = bq.Table('project.test.table')
        schema = bq.Schema([{'name': 'col1', 'type': 'string'}])
        mock_get_table.return_value._schema = schema
        rendered = bq.commands._bigquery._table_cell(args, None)
        expected_html1 = 'bq.renderSchema(dom, [{"type": "string", "name": "col1"}]);'
        expected_html2 = 'bq.renderSchema(dom, [{"name": "col1", "type": "string"}]);'
        self.assertTrue(expected_html1 in rendered
                        or expected_html2 in rendered)
Code example #22
  def test_get_bq_execute_operator_definition(self, mock_table):
    mock_table.return_value = bq.Table(
        'foo_project.foo_dataset.foo_table',
        context=PipelineTest._create_context())
    task_id = 'foo'
    task_details = {}
    task_details['type'] = 'BigQuery'

    # Adding newlines to the query to mimic actual usage of %%bq query ...
    task_details['query'] = google.datalab.bigquery.Query("""SELECT *
FROM publicdata.samples.wikipedia
LIMIT 5""")
    operator_def = pipeline.PipelineGenerator._get_operator_definition(task_id, task_details, None)
    self.assertEqual(operator_def, """foo = BigQueryOperator(task_id='foo_id', bql=\"\"\"SELECT *\nFROM publicdata.samples.wikipedia\nLIMIT 5\"\"\", use_legacy_sql=False, dag=dag)
""")  # noqa
Code example #23
def main(argv=None):
  args = parse_arguments(sys.argv if argv is None else argv)

  if args.csv_schema_file:
    schema = json.loads(
        file_io.read_file_to_string(args.csv_schema_file).decode())
  else:
    import google.datalab.bigquery as bq
    schema = bq.Table(args.bigquery_table).schema._bq_schema
  features = json.loads(
      file_io.read_file_to_string(args.features_file).decode())

  expand_defaults(schema, features)  # features are updated.
  check_schema_transforms_match(schema, features)

  file_io.recursive_create_dir(args.output_dir)

  if args.cloud:
    run_cloud_analysis(
        output_dir=args.output_dir,
        csv_file_pattern=args.csv_file_pattern,
        bigquery_table=args.bigquery_table,
        schema=schema,
        features=features)
  else:
    run_local_analysis(
        output_dir=args.output_dir,
        csv_file_pattern=args.csv_file_pattern,
        schema=schema,
        features=features)

  # Also writes the transform fn and tft metadata.
  make_transform_graph(args.output_dir, schema, features)

  # Save a copy of the schema and features in the output folder.
  file_io.write_string_to_file(
    os.path.join(args.output_dir, SCHEMA_FILE),
    json.dumps(schema, indent=2))

  file_io.write_string_to_file(
    os.path.join(args.output_dir, FEATURES_FILE),
    json.dumps(features, indent=2))
Code example #24
File: _bigquery.py  Project: tiravata/pydatalab
def _get_table(name):
  """ Given a variable or table name, get a Table if it exists.

  Args:
    name: the name of the Table or a variable referencing the Table.
  Returns:
    The Table, if found.
  """
  # If name is a variable referencing a table, use that.
  item = google.datalab.utils.commands.get_notebook_item(name)
  if isinstance(item, bigquery.Table):
    return item
  # Else treat this as a BQ table name and return the (cached) table if it exists.
  try:
    return _existing_table_cache[name]
  except KeyError:
    table = bigquery.Table(name)
    if table.exists():
      _existing_table_cache[name] = table
      return table
  return None
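A small sketch of the caching behaviour this helper provides; the table name is hypothetical, and the first successful lookup is assumed to populate _existing_table_cache so the second call skips the exists() round trip:

t = _get_table('myproject.mydataset.mytable')
if t is not None:
    # Served from _existing_table_cache: same object, no second exists() call.
    assert _get_table('myproject.mydataset.mytable') is t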
Code example #25
  def test_get_table(self, mock_get_notebook_item, mock_table_exists, mock_get_credentials,
                     mock_default_context):
    # test bad name
    mock_get_notebook_item.return_value = None
    mock_table_exists.return_value = False
    t = bq.commands._bigquery._get_table('bad.name')
    self.assertIsNone(t)

    # test good table name
    test_table_name = 'testproject.test.table'
    mock_get_notebook_item.return_value = bq.Table(test_table_name)
    t = bq.commands._bigquery._get_table(test_table_name)
    self.assertEqual(t.full_name, test_table_name)

    # test table name reference
    mock_get_notebook_item.return_value = test_table_name
    mock_table_exists.return_value = True
    t = bq.commands._bigquery._get_table(test_table_name)
    self.assertEqual(t.full_name, test_table_name)

    self.assertIn(test_table_name, bq.commands._bigquery._existing_table_cache)
Code example #26
def main(argv=None):
  args = parse_arguments(sys.argv if argv is None else argv)

  if args.schema:
    schema = json.loads(
        file_io.read_file_to_string(args.schema).decode())
  else:
    import google.datalab.bigquery as bq
    schema = bq.Table(args.bigquery).schema._bq_schema
  features = json.loads(
      file_io.read_file_to_string(args.features).decode())

  expand_defaults(schema, features)  # features are updated.
  inverted_features = invert_features(features)
  check_schema_transforms_match(schema, inverted_features)

  file_io.recursive_create_dir(args.output)

  if args.cloud:
    run_cloud_analysis(
        output_dir=args.output,
        csv_file_pattern=args.csv,
        bigquery_table=args.bigquery,
        schema=schema,
        inverted_features=inverted_features)
  else:
    run_local_analysis(
        output_dir=args.output,
        csv_file_pattern=args.csv,
        schema=schema,
        inverted_features=inverted_features)

  # Save a copy of the schema and features in the output folder.
  file_io.write_string_to_file(
    os.path.join(args.output, constant.SCHEMA_FILE),
    json.dumps(schema, indent=2))

  file_io.write_string_to_file(
    os.path.join(args.output, constant.FEATURES_FILE),
    json.dumps(features, indent=2))
Code example #27
def BigQuery_exportation(df, bigquery_dataset_name, bigquery_table_name):

    print('\nBigQuery exportation started ...')
    start_time = time()

    # Export to BigQuery

    # Define BigQuery dataset and table
    dataset = bq.Dataset(bigquery_dataset_name)
    table = bq.Table(bigquery_dataset_name + '.' + bigquery_table_name)

    # Create or overwrite the existing table if it exists
    table_schema = bq.Schema.from_data(df)
    table.create(schema=table_schema, overwrite=True)

    # Write the DataFrame to a BigQuery table
    table.insert(df)

    print(
        'BigQuery Exportation Finished. \nTotal exportation time = {:0.2f} min'
        .format((time() - start_time) / 60))
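A hedged usage sketch of the function above. The dataset is assumed to already exist (the function defines bq.Dataset but never calls create()), and the names are hypothetical:

import pandas as pd

sample_df = pd.DataFrame({'id': [1, 2], 'label': ['a', 'b']})
BigQuery_exportation(sample_df, 'my_existing_dataset', 'my_table')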
Code example #28
File: main.py  Project: BOTTINYA/AutoDMS-V2
#---------------------- rework the classification with hard-coded rules -------------------------

# Apply these rules to the dataframe
Final_predicted_df['FLAG_RUPTURE'] = Final_predicted_df.apply(
    hard_coded_rules.flag_rupture, axis=1)
Final_predicted_df['Class_Prediction'] = Final_predicted_df.apply(
    hard_coded_rules.rejet_cause_prev, axis=1)
Final_predicted_df['Class_Prediction'] = Final_predicted_df.apply(
    hard_coded_rules.flag_livraison, axis=1)

#--------------------- BigQuery Exportation ----------------------------
print('Export to BigQuery table...')
start_time = time()

# Export to BigQuery
bigquery_dataset_name = 'electric-armor-213817.Donnees_journalieres'
bigquery_table_name = 'Classification_journaliere'

# Define BigQuery dataset and table
dataset = bq.Dataset(bigquery_dataset_name)
table = bq.Table(bigquery_dataset_name + '.' + bigquery_table_name)

# Create or overwrite the existing table if it exists
table_schema = bq.Schema.from_data(Final_predicted_df)
table.create(schema=table_schema, overwrite=True)

# Write the DataFrame to a BigQuery table
table.insert(Final_predicted_df)
print('BigQuery export finished. \nExporting process took {:0.2f}min'.format(
    (time() - start_time) / 60))
Code example #29
File: _bigquery.py  Project: tiravata/pydatalab
def _table_cell(args, cell_body):
  """Implements the BigQuery table magic subcommand used to operate on tables

   The supported syntax is:
   %%bq tables <command> <args>

  Commands:
    {list, create, delete, describe, view}

  Args:
    args: the optional arguments following '%%bq tables command'.
    cell_body: optional contents of the cell interpreted as SQL, YAML or JSON.
  Returns:
    The HTML rendering for the table of datasets.
  """
  if args['command'] == 'list':
    filter_ = args['filter'] if args['filter'] else '*'
    if args['dataset']:
      if args['project'] is None:
        datasets = [bigquery.Dataset(args['dataset'])]
      else:
        context = google.datalab.Context(args['project'],
                                         google.datalab.Context.default().credentials)
        datasets = [bigquery.Dataset(args['dataset'], context)]
    else:
      default_context = google.datalab.Context.default()
      context = google.datalab.Context(default_context.project_id, default_context.credentials)
      if args['project']:
        context.set_project_id(args['project'])
      datasets = bigquery.Datasets(context)

    tables = []
    for dataset in datasets:
      tables.extend([table.full_name
                     for table in dataset if fnmatch.fnmatch(table.full_name, filter_)])

    return _render_list(tables)

  elif args['command'] == 'create':
    if cell_body is None:
      print('Failed to create %s: no schema specified' % args['name'])
    else:
      try:
        record = google.datalab.utils.commands.parse_config(
            cell_body, google.datalab.utils.commands.notebook_environment(), as_dict=False)
        jsonschema.validate(record, BigQuerySchema.TABLE_SCHEMA_SCHEMA)
        schema = bigquery.Schema(record['schema'])
        bigquery.Table(args['name']).create(schema=schema, overwrite=args['overwrite'])
      except Exception as e:
        print('Failed to create table %s: %s' % (args['name'], e))

  elif args['command'] == 'describe':
    name = args['name']
    table = _get_table(name)
    if not table:
      raise Exception('Could not find table %s' % name)

    html = _repr_html_table_schema(table.schema)
    return IPython.core.display.HTML(html)

  elif args['command'] == 'delete':
    try:
      bigquery.Table(args['name']).delete()
    except Exception as e:
      print('Failed to delete table %s: %s' % (args['name'], e))

  elif args['command'] == 'view':
    name = args['name']
    table = _get_table(name)
    if not table:
      raise Exception('Could not find table %s' % name)
    return table
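As a rough, test-style illustration of the 'list' subcommand, using the same args shape the unit tests earlier on this page build (values are hypothetical):

args = {'command': 'list', 'filter': 'my_prefix*', 'dataset': 'my_dataset', 'project': None}
html = _table_cell(args, None)  # an HTML <ul> of the matching full table names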
Code example #30
  def test_local_bigquery_transform(self):
    """Test transfrom locally, but the data comes from bigquery."""

    # Make a BQ table, and insert 1 row.
    try:
      bucket_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
      bucket_root = 'gs://%s' % bucket_name
      bucket = storage.Bucket(bucket_name)
      bucket.create()

      project_id = dl.Context.default().project_id

      dataset_name = 'test_transform_raw_data_%s' % uuid.uuid4().hex
      table_name = 'tmp_table'

      dataset = bq.Dataset((project_id, dataset_name)).create()
      table = bq.Table((project_id, dataset_name, table_name))
      table.create([{'name': 'key_col', 'type': 'INTEGER'},
                    {'name': 'target_col', 'type': 'FLOAT'},
                    {'name': 'cat_col', 'type': 'STRING'},
                    {'name': 'num_col', 'type': 'FLOAT'},
                    {'name': 'img_col', 'type': 'STRING'}])

      img1_file = os.path.join(self.source_dir, 'img1.jpg')
      dest_file = os.path.join(bucket_root, 'img1.jpg')
      file_io.copy(img1_file, dest_file)

      data = [
          {
           'key_col': 1,
           'target_col': 1.0,
           'cat_col': 'Monday',
           'num_col': 23.0,
           'img_col': dest_file,
          },
      ]
      table.insert(data=data)

      cmd = ['python ' + os.path.join(CODE_PATH, 'transform.py'),
             '--bigquery=%s.%s.%s' % (project_id, dataset_name, table_name),
             '--analysis=' + self.analysis_dir,
             '--prefix=features',
             '--project-id=' + project_id,
             '--output=' + self.output_dir]
      print('cmd ', ' '.join(cmd))
      subprocess.check_call(' '.join(cmd), shell=True)

      # Read the tf record file. There should only be one file.
      record_filepath = os.path.join(self.output_dir,
                                     'features-00000-of-00001.tfrecord.gz')
      options = tf.python_io.TFRecordOptions(
          compression_type=tf.python_io.TFRecordCompressionType.GZIP)
      serialized_examples = list(tf.python_io.tf_record_iterator(record_filepath, options=options))
      self.assertEqual(len(serialized_examples), 1)

      example = tf.train.Example()
      example.ParseFromString(serialized_examples[0])

      transformed_number = example.features.feature['num_col'].float_list.value[0]
      self.assertAlmostEqual(transformed_number, 23.0)
      transformed_category = example.features.feature['cat_col'].int64_list.value[0]
      self.assertEqual(transformed_category, 2)
      image_bytes = example.features.feature['img_col'].float_list.value
      self.assertEqual(len(image_bytes), 2048)
      self.assertTrue(any(x != 0.0 for x in image_bytes))
    finally:
      dataset.delete(delete_contents=True)

      for obj in bucket.objects():
        obj.delete()
      bucket.delete()