def index_table(es, index_name, client, table, participant_id_column,
                sample_id_column, sample_file_columns, billing_project_id):
    """Indexes a BigQuery table.

    Args:
        es: Elasticsearch object.
        index_name: Name of Elasticsearch index.
        client: BigQuery client object.
        table: BigQuery table object; its fully-qualified name has the format
            "<project id>.<dataset id>.<table name>".
        participant_id_column: Name of the column containing the participant ID.
        sample_id_column: (optional) Name of the column containing the sample ID
            (only needed on samples tables).
        sample_file_columns: (optional) Mappings for columns which contain
            genomic files of a particular type (specified in ui.json).
        billing_project_id: GCP project ID to bill for reading the table.
    """
    _create_nested_mappings(es, index_name, table, sample_id_column)

    table_name = _table_name_from_table(table)
    start_time = time.time()
    logger.info('Indexing %s into %s.' % (table_name, index_name))

    # There is no easy way to import BigQuery -> Elasticsearch. Instead:
    # BigQuery table -> pandas dataframe -> dict -> Elasticsearch
    df = pd.read_gbq(
        'SELECT * FROM `%s`' % table_name,
        project_id=billing_project_id,
        dialect='standard')
    elapsed_time = time.time() - start_time
    elapsed_time_str = time.strftime('%Hh:%Mm:%Ss', time.gmtime(elapsed_time))
    logger.info('BigQuery -> pandas took %s' % elapsed_time_str)
    logger.info('%s has %d rows' % (table_name, len(df)))

    if participant_id_column not in df.columns:
        raise ValueError(
            'Participant ID column %s not found in BigQuery table %s' %
            (participant_id_column, table_name))

    if sample_id_column in df.columns:
        scripts_by_id = _sample_scripts_by_id(df, table_name,
                                              participant_id_column,
                                              sample_id_column,
                                              sample_file_columns)
        indexer_util.bulk_index_scripts(es, index_name, scripts_by_id)
    else:
        docs_by_id = _docs_by_id(df, table_name, participant_id_column)
        indexer_util.bulk_index_docs(es, index_name, docs_by_id)

    elapsed_time = time.time() - start_time
    elapsed_time_str = time.strftime('%Hh:%Mm:%Ss', time.gmtime(elapsed_time))
    logger.info('pandas -> Elasticsearch index took %s' % elapsed_time_str)
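# Illustrative sketch only: the "dataframe -> dict" step above is performed by
# the _docs_by_id helper, which is defined elsewhere. The hypothetical
# _example_docs_by_id below shows one plausible minimal shape of that step
# (one Elasticsearch doc per row, keyed by participant ID); it is not the
# project's actual implementation and omits the table_name prefixing the real
# helper receives.
def _example_docs_by_id(df, participant_id_column):
    """Yields (participant_id, doc) pairs from a pandas DataFrame."""
    for _, row in df.iterrows():
        row_dict = row.to_dict()
        participant_id = row_dict.pop(participant_id_column)
        # NaN is not valid JSON, so drop empty cells before indexing.
        doc = {k: v for k, v in row_dict.items() if not pd.isnull(v)}
        yield str(participant_id), doc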
def index_fields(es, index_name, table, sample_id_column):
    table_name = _table_name_from_table(table)
    logger.info('Indexing %s into %s.' % (table_name, index_name))
    id_prefix = table_name
    fields = table.schema

    # If the table contains the sample_id_column, prefix the Elasticsearch name
    # of the fields in this table with "samples."
    # This is needed to differentiate the sample facets for special handling.
    for field in fields:
        if field.name == sample_id_column:
            id_prefix = "samples." + id_prefix

    field_docs = _field_docs_by_id(id_prefix, '', fields)
    indexer_util.bulk_index_docs(es, index_name, field_docs)
def index_fields(es, index_name, table, participant_id_column, sample_id_column,
                 columns_to_ignore):
    table_name = _table_name_from_table(table)
    logger.info('Indexing %s into %s.' % (table_name, index_name))
    id_prefix = table_name
    fields = table.schema

    # If the table contains the sample_id_column, prefix the Elasticsearch name
    # of the fields in this table with "samples."
    # This is needed to differentiate the sample facets for special handling.
    for field in fields:
        if field.name == sample_id_column:
            id_prefix = "samples." + id_prefix

    # Use the simple analyzer so underscores are treated as a word delimiter.
    # With the default analyzer, searching for "baseline" would not find a BQ
    # column named "age_at_baseline"; with the simple analyzer it would.
    mappings = {
        'dynamic': False,
        'properties': {
            'name': {
                'type': 'text',
                'fields': {
                    'keyword': {
                        'type': 'keyword',
                        'ignore_above': 256
                    }
                },
                'analyzer': 'simple'
            },
            'description': {
                'type': 'text',
                'fields': {
                    'keyword': {
                        'type': 'keyword',
                        'ignore_above': 256
                    }
                },
                'analyzer': 'simple'
            },
        }
    }

    field_docs = _field_docs_by_id(id_prefix, '', fields, participant_id_column,
                                   sample_id_column, columns_to_ignore)
    es.indices.put_mapping(doc_type='type', index=index_name, body=mappings)
    indexer_util.bulk_index_docs(es, index_name, field_docs)
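# Illustrative sketch only: _field_docs_by_id is defined elsewhere. The
# hypothetical helper below shows the general shape of a field document
# (BigQuery column name plus description, keyed by the prefixed Elasticsearch
# name) that the "name"/"description" mapping above is meant to cover. The
# real helper also receives participant_id_column, sample_id_column and
# columns_to_ignore, presumably to skip those columns; that handling is
# omitted here.
def _example_field_docs_by_id(id_prefix, fields):
    for field in fields:
        doc_id = '%s.%s' % (id_prefix, field.name)
        yield doc_id, {
            'name': field.name,
            'description': field.description,
        }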
def index_table(es, bq_client, storage_client, index_name, table,
                participant_id_column, sample_id_column, sample_file_columns,
                time_series_column, time_series_vals, deploy_project_id):
    table_name = _table_name_from_table(table)

    bucket_name = '%s-table-export' % deploy_project_id
    table_export_bucket = storage_client.lookup_bucket(bucket_name)
    if not table_export_bucket:
        table_export_bucket = storage_client.create_bucket(bucket_name)

    unique_id = str(uuid.uuid4())
    export_obj_prefix = 'export-%s' % unique_id
    job_config = bigquery.job.ExtractJobConfig()
    job_config.destination_format = (
        bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON)
    logger.info('Running extract table job for: %s' % table_name)

    table_is_view = table.table_type == 'VIEW'
    if table_is_view:
        # BigQuery cannot export data from a view. As a workaround, create a
        # table from the view and use that instead.
        logger.info('%s is a view, attempting to create new table' % table_name)
        table = _create_table_from_view(bq_client, table)

    job = bq_client.extract_table(
        table,
        # The '*' enables file sharding, which is required for larger datasets.
        'gs://%s/%s*.json' % (bucket_name, export_obj_prefix),
        job_id=unique_id,
        job_config=job_config)
    # Wait up to 10 minutes for the resulting export files to be created.
    job.result(timeout=600)

    if sample_id_column in [f.name for f in table.schema]:
        # Cannot have time series data for samples.
        assert not time_series_vals
        scripts_by_id = _sample_scripts_by_id_from_export(
            storage_client, bucket_name, export_obj_prefix, table_name,
            participant_id_column, sample_id_column, sample_file_columns)
        indexer_util.bulk_index_scripts(es, index_name, scripts_by_id)
    elif time_series_vals:
        assert time_series_column in [f.name for f in table.schema]
        if time_series_vals[0] == 'Unknown' and len(time_series_vals) == 1:
            time_series_type = type(None)
        elif '_' in ''.join(time_series_vals):
            time_series_type = float
        else:
            time_series_type = int
        scripts_by_id = _tsv_scripts_by_id_from_export(
            storage_client, bucket_name, export_obj_prefix, table_name,
            participant_id_column, time_series_column, time_series_type)
        indexer_util.bulk_index_scripts(es, index_name, scripts_by_id)
    else:
        docs_by_id = _docs_by_id_from_export(storage_client, bucket_name,
                                             export_obj_prefix, table_name,
                                             participant_id_column)
        indexer_util.bulk_index_docs(es, index_name, docs_by_id)

    if table_is_view:
        # Delete the temporary copy table we created.
        bq_client.delete_table(table)
        logger.info('Deleted temporary copy table %s' %
                    _table_name_from_table(table))
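# Illustrative usage only. All project, bucket, index, table and column names
# below are placeholders, and the clients are the standard elasticsearch /
# google-cloud constructors rather than anything this module defines; this is
# a sketch of how a caller might wire up the export-based index_table above,
# not a documented entry point.
#
#     from elasticsearch import Elasticsearch
#     from google.cloud import bigquery, storage
#
#     es = Elasticsearch(['http://localhost:9200'])
#     bq_client = bigquery.Client(project='my-project')
#     storage_client = storage.Client(project='my-deploy-project')
#     table = bq_client.get_table(
#         bigquery.TableReference.from_string('my-project.my_dataset.participants'))
#     index_table(es, bq_client, storage_client, 'my_dataset_index', table,
#                 participant_id_column='participant_id',
#                 sample_id_column=None, sample_file_columns={},
#                 time_series_column=None, time_series_vals=[],
#                 deploy_project_id='my-deploy-project')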