Example #1
def index_table(es, index_name, client, table, participant_id_column,
                sample_id_column, sample_file_columns, billing_project_id):
    """Indexes a BigQuery table.

    Args:
        es: Elasticsearch object.
        index_name: Name of Elasticsearch index.
        client: BigQuery client.
        table: BigQuery table to index. Its fully-qualified name has the
            format "<project id>.<dataset id>.<table name>".
        participant_id_column: Name of the column containing the participant ID.
        sample_id_column: (optional) Name of the column containing the sample ID
            (only needed on samples tables).
        sample_file_columns: (optional) Mappings for columns which contain genomic
            files of a particular type (specified in ui.json).
        billing_project_id: GCP project ID to bill for reading the table.
    """
    _create_nested_mappings(es, index_name, table, sample_id_column)
    table_name = _table_name_from_table(table)
    start_time = time.time()
    logger.info('Indexing %s into %s.' % (table_name, index_name))

    # There is no easy way to import BigQuery -> Elasticsearch. Instead:
    # BigQuery table -> pandas dataframe -> dict -> Elasticsearch
    df = pd.read_gbq('SELECT * FROM `%s`' % table_name,
                     project_id=billing_project_id,
                     dialect='standard')
    elapsed_time = time.time() - start_time
    elapsed_time_str = time.strftime('%Hh:%Mm:%Ss', time.gmtime(elapsed_time))
    logger.info('BigQuery -> pandas took %s' % elapsed_time_str)
    logger.info('%s has %d rows' % (table_name, len(df)))

    if participant_id_column not in df.columns:
        raise ValueError(
            'Participant ID column %s not found in BigQuery table %s' %
            (participant_id_column, table_name))

    if sample_id_column in df.columns:
        scripts_by_id = _sample_scripts_by_id(df, table_name,
                                              participant_id_column,
                                              sample_id_column,
                                              sample_file_columns)
        indexer_util.bulk_index_scripts(es, index_name, scripts_by_id)
    else:
        docs_by_id = _docs_by_id(df, table_name, participant_id_column)
        indexer_util.bulk_index_docs(es, index_name, docs_by_id)

    elapsed_time = time.time() - start_time
    elapsed_time_str = time.strftime("%Hh:%Mm:%Ss", time.gmtime(elapsed_time))
    logger.info('pandas -> ElasticSearch index took %s' % elapsed_time_str)
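
The comment above describes the BigQuery -> pandas -> dict -> Elasticsearch path; below is a minimal standalone sketch of it. The project, dataset, index and ID names are hypothetical, and helpers.bulk stands in for whatever indexer_util.bulk_index_docs does internally (that helper is not shown in these examples).

import pandas as pd
from elasticsearch import Elasticsearch, helpers

# Hypothetical names; substitute a real billing project, table and index.
df = pd.read_gbq('SELECT * FROM `my-project.my_dataset.participants`',
                 project_id='my-billing-project', dialect='standard')
es = Elasticsearch(['localhost:9200'])
actions = ({
    '_index': 'my_index',
    '_type': 'type',
    '_id': row['participant_id'],
    '_source': row,
} for row in df.to_dict(orient='records'))
helpers.bulk(es, actions)
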
Example #2
def index_fields(es, index_name, table, sample_id_column):
    table_name = _table_name_from_table(table)
    logger.info('Indexing %s into %s.' % (table_name, index_name))

    id_prefix = table_name
    fields = table.schema
    # If the table contains the sample_id_column, prefix the Elasticsearch name
    # of the fields in this table with "samples."
    # This is needed to differentiate the sample facets for special handling.
    for field in fields:
        if field.name == sample_id_column:
            id_prefix = "samples." + id_prefix

    field_docs = _field_docs_by_id(id_prefix, '', fields)
    indexer_util.bulk_index_docs(es, index_name, field_docs)
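
For context, table.schema here is a list of google-cloud-bigquery SchemaField objects. A minimal sketch of the "samples." prefixing decision above, using a hypothetical schema:

from google.cloud import bigquery

fields = [
    bigquery.SchemaField('participant_id', 'STRING'),
    bigquery.SchemaField('sample_id', 'STRING'),
    bigquery.SchemaField('age_at_baseline', 'INTEGER'),
]
id_prefix = 'my-project.my_dataset.samples'
if any(f.name == 'sample_id' for f in fields):
    id_prefix = 'samples.' + id_prefix
# id_prefix is now 'samples.my-project.my_dataset.samples'
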
def index_fields(es, index_name, table, participant_id_column,
                 sample_id_column, columns_to_ignore):
    table_name = _table_name_from_table(table)
    logger.info('Indexing %s into %s.' % (table_name, index_name))

    id_prefix = table_name
    fields = table.schema
    # If the table contains the sample_id_column, prefix the Elasticsearch name
    # of the fields in this table with "samples."
    # This is needed to differentiate the sample facets for special handling.
    for field in fields:
        if field.name == sample_id_column:
            id_prefix = "samples." + id_prefix
    # Use the simple analyzer so underscores are treated as word delimiters.
    # With the default analyzer, searching for "baseline" would not find a BQ
    # column named "age_at_baseline"; with the simple analyzer it would.
    mappings = {
        'dynamic': False,
        'properties': {
            'name': {
                'type': 'text',
                'fields': {
                    'keyword': {
                        'type': 'keyword',
                        'ignore_above': 256
                    }
                },
                'analyzer': 'simple'
            },
            'description': {
                'type': 'text',
                'fields': {
                    'keyword': {
                        'type': 'keyword',
                        'ignore_above': 256
                    }
                },
                'analyzer': 'simple'
            },
        }
    }

    field_docs = _field_docs_by_id(id_prefix, '', fields,
                                   participant_id_column, sample_id_column,
                                   columns_to_ignore)
    es.indices.put_mapping(doc_type='type', index=index_name, body=mappings)
    indexer_util.bulk_index_docs(es, index_name, field_docs)
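
The effect of that analyzer choice can be checked against a running cluster; a small sketch, reusing the es client from above:

# The standard analyzer keeps "age_at_baseline" as a single token, so a
# search for "baseline" misses it; the simple analyzer splits on underscores.
standard = es.indices.analyze(body={'analyzer': 'standard',
                                    'text': 'age_at_baseline'})
simple = es.indices.analyze(body={'analyzer': 'simple',
                                  'text': 'age_at_baseline'})
print([t['token'] for t in standard['tokens']])  # ['age_at_baseline']
print([t['token'] for t in simple['tokens']])    # ['age', 'at', 'baseline']
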
def index_table(es, bq_client, storage_client, index_name, table,
                participant_id_column, sample_id_column, sample_file_columns,
                time_series_column, time_series_vals, deploy_project_id):
    table_name = _table_name_from_table(table)
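    # Stage the table export in a per-deployment GCS bucket, creating it if
    # it does not exist yet.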
    bucket_name = '%s-table-export' % deploy_project_id
    table_export_bucket = storage_client.lookup_bucket(bucket_name)
    if not table_export_bucket:
        table_export_bucket = storage_client.create_bucket(bucket_name)

    unique_id = str(uuid.uuid4())
    export_obj_prefix = 'export-%s' % unique_id
    job_config = bigquery.job.ExtractJobConfig()
    job_config.destination_format = (
        bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON)
    logger.info('Running extract table job for: %s' % table_name)

    table_is_view = table.table_type == 'VIEW'
    if table_is_view:
        # BigQuery cannot export data from a view. So as a workaround,
        # create a table from the view and use that instead.
        logger.info('%s is a view, attempting to create new table' %
                    table_name)
        table = _create_table_from_view(bq_client, table)

    job = bq_client.extract_table(
        table,
        # The '*' enables file sharding, which is required for larger datasets.
        'gs://%s/%s*.json' % (bucket_name, export_obj_prefix),
        job_id=unique_id,
        job_config=job_config)
    # Wait up to 10 minutes for the resulting export files to be created.
    job.result(timeout=600)
    if sample_id_column in [f.name for f in table.schema]:
        # Cannot have time series data for samples.
        assert not time_series_vals
        scripts_by_id = _sample_scripts_by_id_from_export(
            storage_client, bucket_name, export_obj_prefix, table_name,
            participant_id_column, sample_id_column, sample_file_columns)
        indexer_util.bulk_index_scripts(es, index_name, scripts_by_id)
    elif time_series_vals:
        assert time_series_column in [f.name for f in table.schema]
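        # Infer the Python type of the time series values: a lone 'Unknown'
        # maps to NoneType; if any value contains '_' the series is typed as
        # float, otherwise as int.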
        if time_series_vals[0] == 'Unknown' and len(time_series_vals) == 1:
            time_series_type = type(None)
        elif '_' in ''.join(time_series_vals):
            time_series_type = float
        else:
            time_series_type = int
        scripts_by_id = _tsv_scripts_by_id_from_export(
            storage_client, bucket_name, export_obj_prefix, table_name,
            participant_id_column, time_series_column, time_series_type)
        indexer_util.bulk_index_scripts(es, index_name, scripts_by_id)
    else:
        docs_by_id = _docs_by_id_from_export(storage_client, bucket_name,
                                             export_obj_prefix, table_name,
                                             participant_id_column)
        indexer_util.bulk_index_docs(es, index_name, docs_by_id)

    if table_is_view:
        # Delete the temporary copy table we created
        bq_client.delete_table(table)
        logger.info('Deleted temporary copy table %s' %
                    _table_name_from_table(table))
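
The various *_from_export helpers are not shown in these examples; presumably they read the exported shards back from GCS along these lines (a sketch reusing names from index_table above):

import json

bucket = storage_client.get_bucket(bucket_name)
# Each shard is a newline-delimited JSON file under the export prefix.
for blob in bucket.list_blobs(prefix=export_obj_prefix):
    for line in blob.download_as_string().splitlines():
        if line:
            row = json.loads(line)
            # row is a dict keyed by column name, e.g. row[participant_id_column]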