Example 1
    def validate(self, parsed_args, client=None):
        if not client:
            credentials = GoogleCredentials.get_application_default(
            ).create_scoped(['https://www.googleapis.com/auth/bigquery'])
            client = bigquery.BigqueryV2(credentials=credentials)

        project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
            parsed_args.input_table)
        if not bigquery_util.table_exist(client, project_id, dataset_id,
                                         table_id):
            raise ValueError('Table {}:{}.{} does not exist.'.format(
                project_id, dataset_id, table_id))
        if parsed_args.custom_sample_info_table:
            sample_project_id, sample_dataset_id, sample_table_id = (
                bigquery_util.parse_table_reference(
                    parsed_args.custom_sample_info_table))
        else:
            if table_id.count(TABLE_SUFFIX_SEPARATOR) != 1:
                raise ValueError(
                    'Input table {} is malformed - exactly one suffix separator "{}" '
                    'is required'.format(parsed_args.input_table,
                                         TABLE_SUFFIX_SEPARATOR))
            base_table_id = table_id[:table_id.find(TABLE_SUFFIX_SEPARATOR)]
            sample_project_id = project_id
            sample_dataset_id = dataset_id
            sample_table_id = bigquery_util.compose_table_name(
                base_table_id, SAMPLE_INFO_TABLE_SUFFIX)

        if not bigquery_util.table_exist(client, sample_project_id,
                                         sample_dataset_id, sample_table_id):
            raise ValueError('Sample table {}:{}.{} does not exist.'.format(
                sample_project_id, sample_dataset_id, sample_table_id))
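
The common thread in all of these examples is bigquery_util.parse_table_reference, whose implementation is not included here. Judging by how the callers use its result (a (project_id, dataset_id, table_id) tuple, with error messages of the form "project:dataset.table"), a minimal sketch might look like the following; this is an illustration under those assumptions, not the library's actual code.

from typing import Tuple


def parse_table_reference(input_table):
    # type: (str) -> Tuple[str, str, str]
    """Splits `project:dataset.table` (or `project.dataset.table`) into parts."""
    if ':' in input_table:
        project_id, _, rest = input_table.partition(':')
    else:
        project_id, _, rest = input_table.partition('.')
    dataset_id, _, table_id = rest.partition('.')
    if not (project_id and dataset_id and table_id):
        raise ValueError('Expected a reference like project:dataset.table, '
                         'got "{}".'.format(input_table))
    return project_id, dataset_id, table_id


# For example:
# parse_table_reference('my-project:my_dataset.my_table__chr1')
# returns ('my-project', 'my_dataset', 'my_table__chr1').
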
def _get_variant_query(known_args, schema):
    # type: (argparse.Namespace, bigquery_v2.TableSchema) -> str
    """Returns a BigQuery query for the interested regions."""
    columns = _get_query_columns(schema)
    base_query = _BASE_QUERY_TEMPLATE.format(
        COLUMNS=', '.join(columns),
        INPUT_TABLE='.'.join(
            bigquery_util.parse_table_reference(known_args.input_table)))
    conditions = []
    if known_args.genomic_regions:
        for region in known_args.genomic_regions:
            ref, start, end = genomic_region_parser.parse_genomic_region(
                region)
            conditions.append(
                _GENOMIC_REGION_TEMPLATE.format(
                    REFERENCE_NAME_ID=bigquery_util.ColumnKeyConstants.REFERENCE_NAME,
                    REFERENCE_NAME_VALUE=ref,
                    START_POSITION_ID=bigquery_util.ColumnKeyConstants.START_POSITION,
                    START_POSITION_VALUE=start,
                    END_POSITION_ID=bigquery_util.ColumnKeyConstants.END_POSITION,
                    END_POSITION_VALUE=end))

    if not conditions:
        return base_query
    return ' '.join([base_query, 'WHERE', ' OR '.join(conditions)])
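
_BASE_QUERY_TEMPLATE and _GENOMIC_REGION_TEMPLATE are module-level constants that this excerpt does not show. Assuming the column names come from bigquery_util.ColumnKeyConstants (reference_name, start_position, end_position), a plausible shape for them is the sketch below; the project's actual constants may differ in quoting and whitespace.

# Assumed shapes, for illustration only.
_BASE_QUERY_TEMPLATE = 'SELECT {COLUMNS} FROM `{INPUT_TABLE}`'
_GENOMIC_REGION_TEMPLATE = (
    '({REFERENCE_NAME_ID}="{REFERENCE_NAME_VALUE}" AND '
    '{START_POSITION_ID}>={START_POSITION_VALUE} AND '
    '{END_POSITION_ID}<={END_POSITION_VALUE})')

# With a single region such as chr1:1000-2000, the returned query would then
# look roughly like:
#   SELECT <columns> FROM `project.dataset.table`
#   WHERE (reference_name="chr1" AND start_position>=1000 AND end_position<=2000)
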
  def __init__(self, base_table_id, suffixes, append):
    # type: (str, List[str], bool) -> None
    """Initialize `FlattenCallColumn` object.

    In preparation to convert variant lookup optimized tables to sample lookup
    optimized tables, we initiate this class with the base table name of variant
    opt table (set using --output_table flag) and the list of suffixes (which
    are extracted from sharding config file).

    Args:
      base_table_id: Base name of variant opt outputs (set by --output_table).
      suffixes: List of suffixes (extracted from sharding config file).
      append: Whether or not we are appending to the destination tables.
    """
    (self._project_id,
     self._dataset_id,
     self._base_table) = bigquery_util.parse_table_reference(base_table_id)
    assert suffixes
    self._suffixes = suffixes[:]

    self._column_names = []
    self._sub_fields = []

    job_config = bigquery.job.QueryJobConfig(
        write_disposition='WRITE_TRUNCATE' if append else 'WRITE_EMPTY')
    self._client = bigquery.Client(project=self._project_id,
                                   default_query_job_config=job_config)
    self._find_one_non_empty_table()
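
A hypothetical instantiation of FlattenCallColumn (the table reference and suffixes below are placeholders, not values from the project) could look like this; copy_to_flatten_table is shown further down in this listing.

# Placeholder values, for illustration only.
flattener = FlattenCallColumn(
    base_table_id='my-project:my_dataset.my_variants',
    suffixes=['chr1', 'chr2'],
    append=False)
flattener.copy_to_flatten_table('my-project:my_dataset.my_variants_flatten')
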
Example 4
def _get_schema(input_table):
  # type: (str) -> bigquery_v2.TableSchema
  project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
      input_table)
  credentials = (client.GoogleCredentials.get_application_default().
                 create_scoped(['https://www.googleapis.com/auth/bigquery']))
  bigquery_client = bigquery_v2.BigqueryV2(credentials=credentials)
  table = bigquery_client.tables.Get(bigquery_v2.BigqueryTablesGetRequest(
      projectId=project_id, datasetId=dataset_id, tableId=table_id))
  return table.schema
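
_get_query_columns (used by _get_variant_query above) is not part of this excerpt. Given that _get_schema returns a bigquery_v2.TableSchema, a minimal sketch consistent with that usage might be the following; the real helper likely filters or handles some fields specially.

def _get_query_columns(schema):
  # type: (bigquery_v2.TableSchema) -> List[str]
  # Sketch only: collects the top-level column names from the table schema.
  return [field.name for field in schema.fields]
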
Example 5
    def __init__(
            self,
            avro_root_path,  # type: str
            output_table,  # type: str
            suffixes,  # type: List[str]
            delete_empty_tables  # type: bool
    ):
        """Initializes `LoadAvro` object.

    This class loads AVRO files generated by the Dataflow pipeline into
    BigQuery. In our default sharding config we have 25 output tables, so here
    `suffixes` will have 25 + 1 (sample_info) values. For each of those
    suffixes this class loads the destination BigQuery table from its AVRO
    files; for example, for the `chr1` suffix:
       gs://TEMP_LOCATION/avro/JOB_NAME/YYYYMMDD_HHMMSS/chr1-*
    will be loaded into:
       PROJECT_ID.DATASET_ID.BASE_TABLE_ID__chr1

    After loading, if 0 rows were loaded (i.e., the AVRO files were empty), the
    destination table will be deleted if `delete_empty_tables` is set.

    Note 1: This class assumes the destination table is already created. This
    is because integer range partitioning and clustering of columns must be
    done when the table is created.
    Note 2: If we run all 26 jobs in parallel, BigQuery gets overwhelmed and
    jobs fail randomly, so we limit concurrency with
    _MAX_NUM_CONCURRENT_AVRO_LOAD_JOBS.

    Args:
      avro_root_path: Location of AVRO files on Google Cloud Storage (GCS).
      output_table: Base table name; `__` + suffix will be appended to it.
      suffixes: List of table suffixes: `__chr1`, `__chr2`, ... `sample_info`.
      delete_empty_tables: Whether or not to delete tables with 0 rows loaded.
    """
        self._avro_root_path = avro_root_path
        project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
            output_table)
        self._table_base_name = '{}.{}.{}'.format(project_id, dataset_id,
                                                  table_id)

        self._num_load_jobs_retries = 0
        self._suffixes_to_load_jobs = {}  # type: Dict[str, bigquery.job.LoadJob]
        self._remaining_load_jobs = suffixes[:]

        self._delete_empty_tables = delete_empty_tables
        self._not_empty_suffixes = []

        self._client = bigquery.Client(project=project_id)
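
The actual load jobs are started elsewhere in the class and are not part of this excerpt. With the google-cloud-bigquery client created above, one per-suffix AVRO load might look roughly like the sketch below; the helper name, URI pattern, and write disposition are assumptions, not the class's actual method.

from google.cloud import bigquery


def _start_avro_load(client, avro_root_path, table_base_name, suffix):
    """Sketch of a single per-suffix AVRO load job (illustration only)."""
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.AVRO,
        write_disposition=bigquery.WriteDisposition.WRITE_APPEND)
    # e.g. gs://TEMP_LOCATION/avro/JOB_NAME/YYYYMMDD_HHMMSS/chr1-*
    source_uri = '{}{}-*'.format(avro_root_path, suffix)
    # e.g. PROJECT_ID.DATASET_ID.BASE_TABLE_ID__chr1
    destination = '{}__{}'.format(table_base_name, suffix)
    return client.load_table_from_uri(source_uri, destination,
                                      job_config=job_config)
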
    def _validate_output_tables(self, client, output_table_base_name,
                                sharding_config_path, append, is_main_output):
        if (output_table_base_name !=
                bigquery_util.get_table_base_name(output_table_base_name)):
            raise ValueError(
                ('Output table cannot contain "{}". We reserve this '
                 'string to mark sharded output tables.').format(
                     bigquery_util.TABLE_SUFFIX_SEPARATOR))

        project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
            output_table_base_name)
        bigquery_util.raise_error_if_dataset_not_exists(
            client, project_id, dataset_id)
        all_output_tables = []
        if is_main_output:
            all_output_tables.append(
                bigquery_util.compose_table_name(table_id,
                                                 SAMPLE_INFO_TABLE_SUFFIX))
        sharding = variant_sharding.VariantSharding(sharding_config_path)
        num_shards = sharding.get_num_shards()
        # If there is no residual shard in the config, we ignore the last shard.
        if not sharding.should_keep_shard(sharding.get_residual_index()):
            num_shards -= 1
        for i in range(num_shards):
            table_suffix = sharding.get_output_table_suffix(i)
            if table_suffix != bigquery_util.get_table_base_name(table_suffix):
                raise ValueError(
                    ('Table suffix cannot contain "{}". We reserve this '
                     'string to mark sharded output tables.').format(
                         bigquery_util.TABLE_SUFFIX_SEPARATOR))
            all_output_tables.append(
                bigquery_util.compose_table_name(table_id, table_suffix))

        for output_table in all_output_tables:
            if append:
                if not bigquery_util.table_exist(client, project_id,
                                                 dataset_id, output_table):
                    raise ValueError(
                        'Table {}:{}.{} does not exist, cannot append to it.'.
                        format(project_id, dataset_id, output_table))
            else:
                if bigquery_util.table_exist(client, project_id, dataset_id,
                                             output_table):
                    raise ValueError((
                        'Table {}:{}.{} already exists, cannot overwrite it. Please '
                        'set `--append True` if you want to append to it.'
                    ).format(project_id, dataset_id, output_table))
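
get_table_base_name and compose_table_name are bigquery_util helpers not shown in these excerpts. Given that TABLE_SUFFIX_SEPARATOR appears to be the double underscore used in the `__chr1`-style table names above, minimal sketches consistent with this validation logic would be:

TABLE_SUFFIX_SEPARATOR = '__'  # assumed value, matching the `__chr1` style names


def get_table_base_name(table_name):
    # type: (str) -> str
    # e.g. 'genomes__chr1' -> 'genomes'
    return table_name.split(TABLE_SUFFIX_SEPARATOR, 1)[0]


def compose_table_name(base_name, suffix):
    # type: (str, str) -> str
    # e.g. ('genomes', 'chr1') -> 'genomes__chr1'
    return TABLE_SUFFIX_SEPARATOR.join([base_name, suffix])
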
Example 7
def _bigquery_to_vcf_shards(
        known_args,  # type: argparse.Namespace
        beam_pipeline_options,  # type: pipeline_options.PipelineOptions
        vcf_data_temp_folder,  # type: str
        vcf_data_header_file_path,  # type: str
):
    # type: (...) -> None
    """Runs BigQuery to VCF shards pipelines.

  It reads the variants from the BigQuery table, groups variants that fall
  within a contiguous region of the genome (the region size is adjustable
  through the flag `--number_of_bases_per_shard`), sorts them, and then writes
  each group to one VCF file. All VCF data files are saved in
  `vcf_data_temp_folder`.

  Also, it writes the data header to `vcf_data_header_file_path`.
  TODO(allieychen): Eventually, it also generates the meta information file.
  """
    bq_source = bigquery.BigQuerySource(
        query=_BASE_QUERY_TEMPLATE.format(INPUT_TABLE='.'.join(
            bigquery_util.parse_table_reference(known_args.input_table))),
        validate=True,
        use_standard_sql=True)

    with beam.Pipeline(options=beam_pipeline_options) as p:
        variants = (p
                    | 'ReadFromBigQuery ' >> beam.io.Read(bq_source)
                    | bigquery_to_variant.BigQueryToVariant())
        call_names = (
            variants
            | 'CombineCallNames' >> combine_call_names.CallNamesCombiner())

        _ = (call_names
             | 'GenerateVcfDataHeader' >> beam.ParDo(
                 _write_vcf_data_header, _VCF_FIXED_COLUMNS,
                 vcf_data_header_file_path))

        _ = (variants
             | densify_variants.DensifyVariants(
                 beam.pvalue.AsSingleton(call_names))
             | 'PairVariantWithKey' >> beam.Map(
                 _pair_variant_with_key, known_args.number_of_bases_per_shard)
             | 'GroupVariantsByKey' >> beam.GroupByKey()
             | beam.ParDo(_get_file_path_and_sorted_variants,
                          vcf_data_temp_folder)
             | vcfio.WriteVcfDataLines())
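
_pair_variant_with_key is a module-level helper that this excerpt does not include. Based on how it is applied with known_args.number_of_bases_per_shard, a plausible sketch (assuming the variant exposes reference_name and start, and picking an arbitrary key format) is:

def _pair_variant_with_key(variant, number_of_bases_per_shard):
    # Sketch only: keys each variant by the genomic bucket it falls into so
    # GroupVariantsByKey can collect variants from one contiguous region.
    bucket_start = (variant.start // number_of_bases_per_shard *
                    number_of_bases_per_shard)
    return ('{}_{:011d}'.format(variant.reference_name, bucket_start), variant)
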
def run(argv=None):
    # type: (List[str]) -> None
    """Runs BigQuery to VCF pipeline."""
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = vcf_to_bq_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)
    bq_source = bigquery.BigQuerySource(
        query=_BASE_QUERY_TEMPLATE.format(INPUT_TABLE='.'.join(
            bigquery_util.parse_table_reference(known_args.input_table))),
        validate=True,
        use_standard_sql=True)

    options = pipeline_options.PipelineOptions(pipeline_args)
    with beam.Pipeline(options=options) as p:
        _ = (p | 'ReadFromBigQuery ' >> beam.io.Read(bq_source)
             | bigquery_to_variant.BigQueryToVariant()
             | densify_variants.DensifyVariants()
             | vcfio.WriteToVcf(known_args.output_file))
    def copy_to_flatten_table(self, output_base_table_id):
        # type: (str) -> None
        """Copies data from variant lookup optimized tables to sample lookup tables.

    Copies rows from _base_table_id__* to output_base_table_id__* for each value
    in _suffixes. Here we assume destination tables are already created and are
    partitioned based on call_sample_id column. The copying process is done via
    a flattening query similar to the one used in get_flatten_table_schema().

    Note that if the source tables have repeated sample_ids, the output table
    will have more rows than the input table. Essentially:
    number of output rows = number of input rows * number of repeated sample_ids

    Args:
      output_base_table_id: Base table name of output tables.
    """
        # Here we assume all output_base_table + suffixes[:] are already created.
        (output_project_id, output_dataset_id, output_base_table) = (
            bigquery_util.parse_table_reference(output_base_table_id))
        select_columns = self._get_flatten_column_names()
        for suffix in self._suffixes:
            input_table_id = bigquery_util.compose_table_name(
                self._base_table, suffix)
            output_table_id = bigquery_util.compose_table_name(
                output_base_table, suffix)

            full_output_table_id = '{}.{}.{}'.format(output_project_id,
                                                     output_dataset_id,
                                                     output_table_id)
            cp_query = _FLATTEN_CALL_QUERY.format(
                SELECT_COLUMNS=select_columns,
                PROJECT_ID=self._project_id,
                DATASET_ID=self._dataset_id,
                TABLE_ID=input_table_id,
                MAIN_TABLE_ALIAS=_MAIN_TABLE_ALIAS,
                CALL_COLUMN=bigquery_util.ColumnKeyConstants.CALLS,
                CALL_TABLE_ALIAS=_CALL_TABLE_ALIAS)

            self._copy_to_flatten_table(full_output_table_id, cp_query)
            logging.info('Flatten table is fully loaded: %s',
                         full_output_table_id)
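
_FLATTEN_CALL_QUERY, _MAIN_TABLE_ALIAS, and _CALL_TABLE_ALIAS are module-level constants not shown here. A plausible shape, consistent with the aliases used above and with the UNNEST-style flattening the docstring describes, is:

# Assumed shapes, for illustration only.
_MAIN_TABLE_ALIAS = 'main_table'
_CALL_TABLE_ALIAS = 'call_table'
_FLATTEN_CALL_QUERY = (
    'SELECT {SELECT_COLUMNS} '
    'FROM `{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}` AS {MAIN_TABLE_ALIAS}, '
    'UNNEST({CALL_COLUMN}) AS {CALL_TABLE_ALIAS}')
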
    def validate(self, parsed_args, client=None):
        # type: (argparse.Namespace, bigquery.BigqueryV2) -> None
        if not parsed_args.output_table and parsed_args.output_avro_path:
            # Writing into BigQuery is not requested; no more BigQuery checks needed.
            return

        project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
            parsed_args.output_table)

        if not client:
            credentials = GoogleCredentials.get_application_default(
            ).create_scoped(['https://www.googleapis.com/auth/bigquery'])
            client = bigquery.BigqueryV2(credentials=credentials)

        bigquery_util.raise_error_if_dataset_not_exists(
            client, project_id, dataset_id)
        # Ensuring given output table doesn't already exist to avoid overwriting it.
        if not parsed_args.append:
            if parsed_args.update_schema_on_append:
                raise ValueError(
                    '--update_schema_on_append requires --append to be '
                    'true.')
            bigquery_util.raise_error_if_table_exists(client, project_id,
                                                      dataset_id, table_id)
def _bigquery_to_vcf_shards(
        known_args,  # type: argparse.Namespace
        beam_pipeline_options,  # type: pipeline_options.PipelineOptions
        vcf_data_temp_folder,  # type: str
        header_file_path,  # type: str
):
    # type: (...) -> None
    """Runs BigQuery to VCF shards pipelines.

  It reads the variants from the BigQuery table, groups variants that fall
  within a contiguous region of the genome (the region size is adjustable
  through the flag `--number_of_bases_per_shard`), sorts them, and then writes
  each group to one VCF file. All VCF data files are saved in
  `vcf_data_temp_folder`.

  Also, it writes the meta info and data header with the sample names to
  `header_file_path`.
  """
    schema = _get_schema(known_args.input_table)
    variant_query = _get_variant_query(known_args, schema)
    logging.info('Processing BigQuery query: %s', variant_query)
    project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
        known_args.input_table)
    bq_variant_source = bigquery.BigQuerySource(query=variant_query,
                                                validate=True,
                                                use_standard_sql=True)
    annotation_names = _extract_annotation_names(schema)

    base_table_id = bigquery_util.get_table_base_name(table_id)
    sample_query = _SAMPLE_INFO_QUERY_TEMPLATE.format(
        PROJECT_ID=project_id,
        DATASET_ID=dataset_id,
        TABLE_NAME=bigquery_util.compose_table_name(base_table_id,
                                                    SAMPLE_INFO_TABLE_SUFFIX))
    bq_sample_source = bigquery.BigQuerySource(query=sample_query,
                                               validate=True,
                                               use_standard_sql=True)
    with beam.Pipeline(options=beam_pipeline_options) as p:
        variants = (p
                    | 'ReadFromBigQuery ' >> beam.io.Read(bq_variant_source)
                    | bigquery_to_variant.BigQueryToVariant(annotation_names))
        sample_table_rows = (
            p
            | 'ReadFromSampleTable' >> beam.io.Read(bq_sample_source))
        if known_args.sample_names:
            temp_sample_names = (p
                                 | transforms.Create(known_args.sample_names,
                                                     reshuffle=False))
        else:
            # Get sample names from sample IDs in the variants and sort.
            id_to_name_hash_table = (sample_table_rows
                                     | 'SampleIdToNameDict' >>
                                     sample_mapping_table.SampleIdToNameDict())
            temp_sample_ids = (
                variants
                | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner(
                    known_args.preserve_sample_order))
            temp_sample_names = (
                temp_sample_ids
                | 'GetSampleNames' >> sample_mapping_table.GetSampleNames(
                    beam.pvalue.AsSingleton(id_to_name_hash_table))
                | 'CombineToList' >> beam.combiners.ToList()
                | 'SortSampleNames' >> beam.ParDo(sorted))

        name_to_id_hash_table = (
            sample_table_rows
            | 'SampleNameToIdDict' >> sample_mapping_table.SampleNameToIdDict())
        sample_ids = (temp_sample_names
                      | 'GetSampleIds' >> sample_mapping_table.GetSampleIds(
                          beam.pvalue.AsSingleton(name_to_id_hash_table))
                      | 'CombineSortedSampleIds' >> beam.combiners.ToList())
        sample_names = temp_sample_names | beam.combiners.ToList()

        _ = (sample_names
             | 'GenerateVcfDataHeader' >> beam.ParDo(
                 _write_vcf_header_with_sample_names, _VCF_FIXED_COLUMNS,
                 known_args.representative_header_file, header_file_path))

        _ = (variants
             | densify_variants.DensifyVariants(
                 beam.pvalue.AsSingleton(sample_ids))
             | 'PairVariantWithKey' >> beam.Map(
                 _pair_variant_with_key, known_args.number_of_bases_per_shard)
             | 'GroupVariantsByKey' >> beam.GroupByKey()
             | beam.ParDo(_get_file_path_and_sorted_variants,
                          vcf_data_temp_folder)
             | vcfio.WriteVcfDataLines(known_args.bq_uses_1_based_coordinate))