def _dataset_line(args):
    """Implements the BigQuery dataset magic subcommand used to operate on datasets.

    The supported syntax is:
      %bq datasets <command> <args>

    Commands:
      {list, create, delete}

    Args:
      args: the optional arguments following '%bq datasets command'.
    """
    if args['command'] == 'list':
        filter_ = args['filter'] if args['filter'] else '*'
        context = google.datalab.Context.default()
        if args['project']:
            context = google.datalab.Context(args['project'], context.credentials)
        return _render_list([str(dataset) for dataset in bigquery.Datasets(context)
                             if fnmatch.fnmatch(str(dataset), filter_)])

    elif args['command'] == 'create':
        try:
            bigquery.Dataset(args['name']).create(friendly_name=args['friendly'])
        except Exception as e:
            print('Failed to create dataset %s: %s' % (args['name'], e))

    elif args['command'] == 'delete':
        try:
            bigquery.Dataset(args['name']).delete()
        except Exception as e:
            print('Failed to delete dataset %s: %s' % (args['name'], e))
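# Usage sketch (illustrative, not part of the original module): once the magic framework
# has parsed '%bq datasets <command> <args>' into a dict, _dataset_line can be driven
# directly. The key names mirror the lookups in the function above; the dataset name
# 'my_project.scratch_ds' is a hypothetical placeholder.
_dataset_line({'command': 'list', 'filter': 'my_*', 'project': None})
_dataset_line({'command': 'create', 'name': 'my_project.scratch_ds',
               'friendly': 'Scratch dataset'})
_dataset_line({'command': 'delete', 'name': 'my_project.scratch_ds'})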
def test_datalab_load_table_from_dataframe(to_delete):
    # [START bigquery_migration_datalab_load_table_from_dataframe]
    import google.datalab.bigquery as bq
    import pandas

    # Create the dataset
    dataset_id = 'import_sample'
    # [END bigquery_migration_datalab_load_table_from_dataframe]
    # Use unique dataset ID to avoid collisions when running tests
    dataset_id = 'test_dataset_{}'.format(int(time.time() * 1000))
    to_delete.append(dataset_id)
    # [START bigquery_migration_datalab_load_table_from_dataframe]
    bq.Dataset(dataset_id).create()

    # Create the table and load the data
    dataframe = pandas.DataFrame([
        {'title': 'The Meaning of Life', 'release_year': 1983},
        {'title': 'Monty Python and the Holy Grail', 'release_year': 1975},
        {'title': 'Life of Brian', 'release_year': 1979},
        {
            'title': 'And Now for Something Completely Different',
            'release_year': 1971
        },
    ])
    schema = bq.Schema.from_data(dataframe)
    table = bq.Table(
        '{}.monty_python'.format(dataset_id)).create(schema=schema)
    table.insert(dataframe)  # Starts streaming insert of data
def test_datalab_load_table_from_gcs_csv(to_delete):
    # [START bigquery_migration_datalab_load_table_from_gcs_csv]
    import google.datalab.bigquery as bq

    # Create the dataset
    dataset_id = 'import_sample'
    # [END bigquery_migration_datalab_load_table_from_gcs_csv]
    # Use unique dataset ID to avoid collisions when running tests
    dataset_id = 'test_dataset_{}'.format(int(time.time() * 1000))
    to_delete.append(dataset_id)
    # [START bigquery_migration_datalab_load_table_from_gcs_csv]
    bq.Dataset(dataset_id).create()

    # Create the table
    schema = [
        {'name': 'name', 'type': 'STRING'},
        {'name': 'post_abbr', 'type': 'STRING'},
    ]
    table = bq.Table(
        '{}.us_states'.format(dataset_id)).create(schema=schema)
    table.load(
        'gs://cloud-samples-data/bigquery/us-states/us-states.csv',
        mode='append',
        source_format='csv',
        csv_options=bq.CSVOptions(skip_leading_rows=1)
    )  # Waits for the job to complete
    # [END bigquery_migration_datalab_load_table_from_gcs_csv]

    assert table.length == 50
def test_numerics(self):
    """Build a BQ table, and then call analyze on it."""
    schema = [{'name': 'col1', 'type': 'INTEGER'},
              {'name': 'col2', 'type': 'FLOAT'}]
    project_id = dl.Context.default().project_id
    dataset_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
    table_name = 'temp_table'
    full_table_name = '%s.%s.%s' % (project_id, dataset_name, table_name)

    output_folder = tempfile.mkdtemp()

    try:
        # Make a dataset, a table, and insert data.
        db = bq.Dataset((project_id, dataset_name))
        db.create()

        table = bq.Table(full_table_name)
        table.create(schema=bq.Schema(schema), overwrite=True)

        data = [{'col1': i, 'col2': 10 * i + 0.5} for i in range(100)]
        table.insert(data)

        analyze_data.run_cloud_analysis(
            output_dir=output_folder,
            csv_file_pattern=None,
            bigquery_table=full_table_name,
            schema=schema,
            features={'col1': {'transform': 'scale'},
                      'col2': {'transform': 'identity'}})

        stats = json.loads(
            file_io.read_file_to_string(
                os.path.join(output_folder, analyze_data.STATS_FILE)).decode())

        self.assertEqual(stats['num_examples'], 100)
        col = stats['column_stats']['col1']
        self.assertAlmostEqual(col['max'], 99.0)
        self.assertAlmostEqual(col['min'], 0.0)
        self.assertAlmostEqual(col['mean'], 49.5)

        col = stats['column_stats']['col2']
        self.assertAlmostEqual(col['max'], 990.5)
        self.assertAlmostEqual(col['min'], 0.5)
        self.assertAlmostEqual(col['mean'], 495.5)
    finally:
        shutil.rmtree(output_folder)
        db.delete(delete_contents=True)
def test_local_bigquery_transform(self):
    """Test transform locally, but the data comes from bigquery."""
    try:
        self._create_test_data()

        # Make a BQ table, and insert 1 row.
        project_id = dl.Context.default().project_id
        dataset_name = 'test_transform_raw_data_%s' % uuid.uuid4().hex
        table_name = 'tmp_table'

        dataset = bq.Dataset((project_id, dataset_name)).create()
        table = bq.Table((project_id, dataset_name, table_name))
        table.create([{'name': 'num_col', 'type': 'FLOAT'},
                      {'name': 'img_col', 'type': 'STRING'}])
        table.insert(data=[{'num_col': 23.0, 'img_col': self.img_filepath}])

        tfex_dir = os.path.join(self.output_folder, 'test_results')
        cmd = ['python ' + os.path.join(CODE_PATH, 'transform_raw_data.py'),
               '--bigquery-table=%s.%s.%s' % (project_id, dataset_name, table_name),
               '--analyze-output-dir=' + self.output_folder,
               '--output-filename-prefix=features',
               '--project-id=' + project_id,
               '--output-dir=' + tfex_dir]
        subprocess.check_call(' '.join(cmd), shell=True)

        # Read the tf record file. There should only be one file.
        record_filepath = os.path.join(tfex_dir,
                                       'features-00000-of-00001.tfrecord.gz')
        options = tf.python_io.TFRecordOptions(
            compression_type=tf.python_io.TFRecordCompressionType.GZIP)
        serialized_example = next(
            tf.python_io.tf_record_iterator(record_filepath, options=options))
        example = tf.train.Example()
        example.ParseFromString(serialized_example)

        transformed_number = example.features.feature['num_col'].float_list.value[0]
        self.assertAlmostEqual(transformed_number, 24.0)

        image_bytes = example.features.feature['img_col'].bytes_list.value[0]
        raw_img = Image.open(self.img_filepath).convert('RGB')
        img_file = six.BytesIO()
        raw_img.save(img_file, 'jpeg')
        expected_image_bytes = img_file.getvalue()
        self.assertEqual(image_bytes, expected_image_bytes)
    finally:
        dataset.delete(delete_contents=True)
        shutil.rmtree(self.output_folder)
def BigQuery_exportation(df, bigquery_dataset_name, bigquery_table_name):
    print('\nBigQuery exportation started ...')
    start_time = time()

    # Export to BigQuery
    # Define the BigQuery dataset and table
    dataset = bq.Dataset(bigquery_dataset_name)
    table = bq.Table(bigquery_dataset_name + '.' + bigquery_table_name)

    # Create the table, overwriting it if it already exists
    table_schema = bq.Schema.from_data(df)
    table.create(schema=table_schema, overwrite=True)

    # Write the DataFrame to the BigQuery table
    table.insert(df)

    print('BigQuery exportation finished. \nTotal exportation time = {:0.2f} min'
          .format((time() - start_time) / 60))
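# Usage sketch (illustrative): exporting a small DataFrame with the helper above. It assumes
# the surrounding module already imports google.datalab.bigquery as bq and time from the
# time module, as implied by the function; the dataset and table names are hypothetical.
import pandas as pd

sample_df = pd.DataFrame({'id': [1, 2, 3], 'value': [0.5, 1.5, 2.5]})
BigQuery_exportation(sample_df, 'my_dataset', 'my_table')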
def test_local_bigquery_transform(self):
    """Test transform locally, but the data comes from bigquery."""
    # Make a BQ table, and insert 1 row.
    try:
        bucket_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
        bucket_root = 'gs://%s' % bucket_name
        bucket = storage.Bucket(bucket_name)
        bucket.create()

        project_id = dl.Context.default().project_id
        dataset_name = 'test_transform_raw_data_%s' % uuid.uuid4().hex
        table_name = 'tmp_table'

        dataset = bq.Dataset((project_id, dataset_name)).create()
        table = bq.Table((project_id, dataset_name, table_name))
        table.create([{'name': 'key_col', 'type': 'INTEGER'},
                      {'name': 'target_col', 'type': 'FLOAT'},
                      {'name': 'cat_col', 'type': 'STRING'},
                      {'name': 'num_col', 'type': 'FLOAT'},
                      {'name': 'img_col', 'type': 'STRING'}])

        img1_file = os.path.join(self.source_dir, 'img1.jpg')
        dest_file = os.path.join(bucket_root, 'img1.jpg')
        file_io.copy(img1_file, dest_file)

        data = [
            {
                'key_col': 1,
                'target_col': 1.0,
                'cat_col': 'Monday',
                'num_col': 23.0,
                'img_col': dest_file,
            },
        ]
        table.insert(data=data)

        cmd = ['python ' + os.path.join(CODE_PATH, 'transform.py'),
               '--bigquery=%s.%s.%s' % (project_id, dataset_name, table_name),
               '--analysis=' + self.analysis_dir,
               '--prefix=features',
               '--project-id=' + project_id,
               '--output=' + self.output_dir]
        print('cmd ', ' '.join(cmd))
        subprocess.check_call(' '.join(cmd), shell=True)

        # Read the tf record file. There should only be one file.
        record_filepath = os.path.join(self.output_dir,
                                       'features-00000-of-00001.tfrecord.gz')
        options = tf.python_io.TFRecordOptions(
            compression_type=tf.python_io.TFRecordCompressionType.GZIP)
        serialized_examples = list(
            tf.python_io.tf_record_iterator(record_filepath, options=options))
        self.assertEqual(len(serialized_examples), 1)

        example = tf.train.Example()
        example.ParseFromString(serialized_examples[0])

        transformed_number = example.features.feature['num_col'].float_list.value[0]
        self.assertAlmostEqual(transformed_number, 23.0)
        transformed_category = example.features.feature['cat_col'].int64_list.value[0]
        self.assertEqual(transformed_category, 2)
        image_bytes = example.features.feature['img_col'].float_list.value
        self.assertEqual(len(image_bytes), 2048)
        self.assertTrue(any(x != 0.0 for x in image_bytes))
    finally:
        dataset.delete(delete_contents=True)
        for obj in bucket.objects():
            obj.delete()
        bucket.delete()
# ---------------------- Rework the classification with hard-coded rules -------------------------
# Apply these rules to the dataframe
Final_predicted_df['FLAG_RUPTURE'] = Final_predicted_df.apply(
    hard_coded_rules.flag_rupture, axis=1)
Final_predicted_df['Class_Prediction'] = Final_predicted_df.apply(
    hard_coded_rules.rejet_cause_prev, axis=1)
Final_predicted_df['Class_Prediction'] = Final_predicted_df.apply(
    hard_coded_rules.flag_livraison, axis=1)

# --------------------- BigQuery exportation ----------------------------
print('Export to BigQuery table...')
start_time = time()

# Export to BigQuery
bigquery_dataset_name = 'electric-armor-213817.Donnees_journalieres'
bigquery_table_name = 'Classification_journaliere'

# Define the BigQuery dataset and table
dataset = bq.Dataset(bigquery_dataset_name)
table = bq.Table(bigquery_dataset_name + '.' + bigquery_table_name)

# Create the table, overwriting it if it already exists
table_schema = bq.Schema.from_data(Final_predicted_df)
table.create(schema=table_schema, overwrite=True)

# Write the DataFrame to the BigQuery table
table.insert(Final_predicted_df)

print('BigQuery export finished. \nExporting process took {:0.2f}min'.format(
    (time() - start_time) / 60))
def _table_cell(args, cell_body):
    """Implements the BigQuery table magic subcommand used to operate on tables.

    The supported syntax is:
      %%bq tables <command> <args>

    Commands:
      {list, create, delete, describe, view}

    Args:
      args: the optional arguments following '%%bq tables command'.
      cell_body: optional contents of the cell interpreted as SQL, YAML or JSON.

    Returns:
      The HTML rendering for the list of tables.
    """
    if args['command'] == 'list':
        filter_ = args['filter'] if args['filter'] else '*'
        if args['dataset']:
            if args['project'] is None:
                datasets = [bigquery.Dataset(args['dataset'])]
            else:
                context = google.datalab.Context(args['project'],
                                                 google.datalab.Context.default().credentials)
                datasets = [bigquery.Dataset(args['dataset'], context)]
        else:
            default_context = google.datalab.Context.default()
            context = google.datalab.Context(default_context.project_id,
                                             default_context.credentials)
            if args['project']:
                context.set_project_id(args['project'])
            datasets = bigquery.Datasets(context)

        tables = []
        for dataset in datasets:
            tables.extend([table.full_name for table in dataset
                           if fnmatch.fnmatch(table.full_name, filter_)])

        return _render_list(tables)

    elif args['command'] == 'create':
        if cell_body is None:
            print('Failed to create %s: no schema specified' % args['name'])
        else:
            try:
                record = google.datalab.utils.commands.parse_config(
                    cell_body, google.datalab.utils.commands.notebook_environment(),
                    as_dict=False)
                jsonschema.validate(record, BigQuerySchema.TABLE_SCHEMA_SCHEMA)
                schema = bigquery.Schema(record['schema'])
                bigquery.Table(args['name']).create(schema=schema,
                                                    overwrite=args['overwrite'])
            except Exception as e:
                print('Failed to create table %s: %s' % (args['name'], e))

    elif args['command'] == 'describe':
        name = args['name']
        table = _get_table(name)
        if not table:
            raise Exception('Could not find table %s' % name)

        html = _repr_html_table_schema(table.schema)
        return IPython.core.display.HTML(html)

    elif args['command'] == 'delete':
        try:
            bigquery.Table(args['name']).delete()
        except Exception as e:
            print('Failed to delete table %s: %s' % (args['name'], e))

    elif args['command'] == 'view':
        name = args['name']
        table = _get_table(name)
        if not table:
            raise Exception('Could not find table %s' % name)

        return table
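# Usage sketch (illustrative, not part of the original module): driving _table_cell directly
# with pre-parsed arguments, as the '%%bq tables' magic would after argument parsing. The
# table name 'my_dataset.us_states' and the schema below are hypothetical; keys not read by
# the chosen command can be set to None.
cell_body = """
schema:
  - name: name
    type: STRING
  - name: post_abbr
    type: STRING
"""
_table_cell({'command': 'create', 'name': 'my_dataset.us_states', 'overwrite': True},
            cell_body)
_table_cell({'command': 'list', 'filter': '*', 'dataset': 'my_dataset', 'project': None},
            None)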