    def validate(self, parsed_args, client=None):
        if not client:
            credentials = GoogleCredentials.get_application_default(
            ).create_scoped(['https://www.googleapis.com/auth/bigquery'])
            client = bigquery.BigqueryV2(credentials=credentials)

        project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
            parsed_args.input_table)
        if not bigquery_util.table_exist(client, project_id, dataset_id,
                                         table_id):
            raise ValueError('Table {}:{}.{} does not exist.'.format(
                project_id, dataset_id, table_id))
        if table_id.count(TABLE_SUFFIX_SEPARATOR) != 1:
            raise ValueError(
                'Input table {} is malformed - exactly one suffix separator "{}" is '
                'required'.format(parsed_args.input_table,
                                  TABLE_SUFFIX_SEPARATOR))
        base_table_id = table_id[:table_id.find(TABLE_SUFFIX_SEPARATOR)]
        sample_table_id = bigquery_util.compose_table_name(
            base_table_id, SAMPLE_INFO_TABLE_SUFFIX)

        if not bigquery_util.table_exist(client, project_id, dataset_id,
                                         sample_table_id):
            raise ValueError('Sample table {}:{}.{} does not exist.'.format(
                project_id, dataset_id, sample_table_id))

    def _validate_output_tables(self, client, output_table_base_name,
                                sharding_config_path, append, is_main_output):
        if (output_table_base_name !=
                bigquery_util.get_table_base_name(output_table_base_name)):
            raise ValueError(
                ('Output table cannot contain "{}". We reserve this '
                 'string to mark sharded output tables.').format(
                     bigquery_util.TABLE_SUFFIX_SEPARATOR))

        project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
            output_table_base_name)
        bigquery_util.raise_error_if_dataset_not_exists(
            client, project_id, dataset_id)
        all_output_tables = []
        if is_main_output:
            all_output_tables.append(
                bigquery_util.compose_table_name(table_id,
                                                 SAMPLE_INFO_TABLE_SUFFIX))
        sharding = variant_sharding.VariantSharding(sharding_config_path)
        num_shards = sharding.get_num_shards()
        # If the config does not keep the residual shard, ignore the last shard.
        if not sharding.should_keep_shard(sharding.get_residual_index()):
            num_shards -= 1
        for i in range(num_shards):
            table_suffix = sharding.get_output_table_suffix(i)
            if table_suffix != bigquery_util.get_table_base_name(table_suffix):
                raise ValueError(
                    ('Table suffix cannot contain "{}". We reserve this '
                     'string to mark sharded output tables.').format(
                         bigquery_util.TABLE_SUFFIX_SEPARATOR))
            all_output_tables.append(
                bigquery_util.compose_table_name(table_id, table_suffix))

        for output_table in all_output_tables:
            if append:
                if not bigquery_util.table_exist(client, project_id,
                                                 dataset_id, output_table):
                    raise ValueError(
                        'Table {}:{}.{} does not exist, cannot append to it.'.
                        format(project_id, dataset_id, output_table))
            else:
                if bigquery_util.table_exist(client, project_id, dataset_id,
                                             output_table):
                    raise ValueError((
                        'Table {}:{}.{} already exists, cannot overwrite it. Please '
                        'set `--append True` if you want to append to it.'
                    ).format(project_id, dataset_id, output_table))
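
The two validators above rely on a table naming convention in which sharded output tables share a base name followed by a suffix separator. Below is a minimal sketch of the helpers they call, assuming the separator is the two-underscore string '__'; the value and the helper bodies are assumptions for illustration only, the real constants and functions live in bigquery_util.

TABLE_SUFFIX_SEPARATOR = '__'  # assumed value, for illustration only


def compose_table_name(base_name, suffix):
    # 'genomes' + 'chr1' -> 'genomes__chr1'
    return base_name + TABLE_SUFFIX_SEPARATOR + suffix


def get_table_base_name(table_name):
    # 'genomes__chr1' -> 'genomes'; names without a separator are returned as is.
    return table_name.split(TABLE_SUFFIX_SEPARATOR)[0]


assert compose_table_name('genomes', 'chr1') == 'genomes__chr1'
assert get_table_base_name('genomes__chr1') == 'genomes'
assert get_table_base_name('genomes') == 'genomes'
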
  def _find_one_non_empty_table(self):
    # Any non-empty input table can be used as the source for schema extraction.
    for suffix in self._suffixes:
      table_id = bigquery_util.compose_table_name(self._base_table, suffix)
      if not bigquery_util.table_empty(
          self._project_id, self._dataset_id, table_id):
        self._schema_table_id = table_id
        return
    raise ValueError('All of the variant optimized tables are empty!')
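
For context, bigquery_util.table_empty could be implemented along the lines of the sketch below, which checks table metadata through the google-cloud-bigquery client. This is only a plausible reading of the helper, not the project's actual code; note that num_rows can lag behind for tables with an active streaming buffer.

from google.cloud import bigquery


def table_empty(project_id, dataset_id, table_id):
    # Inspect table metadata instead of running a COUNT(*) query.
    client = bigquery.Client(project=project_id)
    table = client.get_table('{}.{}.{}'.format(project_id, dataset_id, table_id))
    return table.num_rows == 0
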
    def copy_to_flatten_table(self, output_base_table_id):
        # type: (str) -> None
        """Copies data from variant lookup optimized tables to sample lookup tables.

    Copies rows from _base_table_id__* to output_base_table_id__* for each value
    in _suffixes. Here we assume destination tables are already created and are
    partitioned based on call_sample_id column. The copying process is done via
    a flattening query similar to the one used in get_flatten_table_schema().

    Note that if source tables have repeated sample_ids then output table will
    have more rows than input table. Essentially:
    Number of output rows = Number of input rows * Number of repeated sample_ids

    Args:
      output_base_table_id: Base table name of output tables.
    """
        # Here we assume all output_table_base + suffices[:] are already created.
        (output_project_id, output_dataset_id, output_base_table) = (
            bigquery_util.parse_table_reference(output_base_table_id))
        select_columns = self._get_flatten_column_names()
        for suffix in self._suffixes:
            input_table_id = bigquery_util.compose_table_name(
                self._base_table, suffix)
            output_table_id = bigquery_util.compose_table_name(
                output_base_table, suffix)

            full_output_table_id = '{}.{}.{}'.format(output_project_id,
                                                     output_dataset_id,
                                                     output_table_id)
            cp_query = _FLATTEN_CALL_QUERY.format(
                SELECT_COLUMNS=select_columns,
                PROJECT_ID=self._project_id,
                DATASET_ID=self._dataset_id,
                TABLE_ID=input_table_id,
                MAIN_TABLE_ALIAS=_MAIN_TABLE_ALIAS,
                CALL_COLUMN=bigquery_util.ColumnKeyConstants.CALLS,
                CALL_TABLE_ALIAS=_CALL_TABLE_ALIAS)

            self._copy_to_flatten_table(full_output_table_id, cp_query)
            logging.info('Flatten table is fully loaded: %s',
                         full_output_table_id)
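
The _FLATTEN_CALL_QUERY template referenced above is defined elsewhere in the module; the hypothetical query below is only a sketch of the same shape, showing how the repeated call column is flattened with UNNEST. The exact SELECT list comes from _get_flatten_column_names() and the real template may differ.

_EXAMPLE_FLATTEN_CALL_QUERY = '''
SELECT {SELECT_COLUMNS}
FROM `{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}` AS {MAIN_TABLE_ALIAS},
     UNNEST({MAIN_TABLE_ALIAS}.{CALL_COLUMN}) AS {CALL_TABLE_ALIAS}
'''

# Hypothetical substitution, with made-up project, dataset, and column names:
print(_EXAMPLE_FLATTEN_CALL_QUERY.format(
    SELECT_COLUMNS='main_table.* EXCEPT (call), call_table.*',
    PROJECT_ID='my-project', DATASET_ID='my_dataset', TABLE_ID='genomes__chr1',
    MAIN_TABLE_ALIAS='main_table', CALL_COLUMN='call',
    CALL_TABLE_ALIAS='call_table'))
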
Example #5
  def _start_one_load_job(self, suffix):
    # After issue #582 is resolved we can remove the create_disposition flag.
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.AVRO,
        create_disposition='CREATE_NEVER')
    uri = self._avro_root_path + suffix + '-*'
    table_id = bigquery_util.compose_table_name(self._table_base_name, suffix)
    load_job = self._client.load_table_from_uri(
        uri, table_id, job_config=job_config)
    self._suffixes_to_load_jobs.update({suffix: load_job})
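
A self-contained usage sketch of the same load call with the google-cloud-bigquery client is shown below; the bucket, path, and table names are hypothetical. CREATE_NEVER means the destination table must already exist before the load job runs.

from google.cloud import bigquery

client = bigquery.Client()
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.AVRO,
    create_disposition='CREATE_NEVER')  # destination table must already exist
load_job = client.load_table_from_uri(
    'gs://my-bucket/avro-temp/chr1-*',       # hypothetical AVRO shard pattern
    'my-project.my_dataset.genomes__chr1',   # hypothetical destination table
    job_config=job_config)
load_job.result()  # block until the load job finishes (raises on failure)
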
  def __init__(self, output_table_prefix, sample_name_encoding, append=False):
    # type: (str, int, bool) -> None
    """Initializes the transform.

    Args:
      output_table_prefix: The prefix of the output BigQuery table.
      sample_name_encoding: If SampleNameEncoding.WITHOUT_FILE_PATH is supplied,
        sample_id is hashed from sample_name alone; otherwise both sample_name
        and file_name are used.
      append: If true, existing records in output_table will not be
        overwritten. New records will be appended to those that already exist.
    """
    self._output_table = bigquery_util.compose_table_name(
        output_table_prefix, bigquery_util.SAMPLE_INFO_TABLE_SUFFIX)
    self._append = append
    self._sample_name_encoding = sample_name_encoding
    self._schema = sample_info_table_schema_generator.generate_schema()
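
The sample_name_encoding argument controls how sample IDs are derived; the illustration below is purely hypothetical. The constant values, helper name, and hash function are assumptions and not the project's implementation, which lives in its own hashing utilities.

import hashlib

WITHOUT_FILE_PATH = 1  # hypothetical constant values, for illustration
WITH_FILE_PATH = 2


def example_sample_id(sample_name, file_name, encoding):
    # Hash only the sample name, or the (file, sample) pair, depending on the
    # chosen encoding; the real project may use a different hash entirely.
    if encoding == WITHOUT_FILE_PATH:
        key = sample_name
    else:
        key = '{}/{}'.format(file_name, sample_name)
    return int(hashlib.md5(key.encode('utf-8')).hexdigest()[:15], 16)
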
Example #7
  def _handle_failed_load_job(self, suffix, load_job):
    table_id = bigquery_util.compose_table_name(self._table_base_name, suffix)
    logging.warning('Failed to load AVRO to BigQuery table: %s', table_id)
    exception_str = ''
    if load_job.exception():
      exception_str = str(load_job.exception())
      logging.warning('Load job exception: %s', exception_str)
    if self._num_load_jobs_retries < bigquery_util.BQ_NUM_RETRIES:
      logging.warning('Retrying the failed job...')
      self._num_load_jobs_retries += 1
      time.sleep(300)
      self._start_one_load_job(suffix)
    else:
      logging.error('AVRO load jobs have failed more than BQ_NUM_RETRIES times.')
      self._cancel_all_running_load_jobs()
      raise ValueError(
          'Failed to load AVRO to BigQuery table {} \n state: {} \n '
          'job_id: {} \n exception: {}.'.format(
              table_id, load_job.state, load_job.job_id, exception_str))
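
When a load job fails, the google-cloud-bigquery job object also carries structured error details in addition to exception(); a small hedged sketch of inspecting them is shown below (the helper name is made up).

def describe_failed_load_job(load_job):
    # error_result holds the primary error; errors lists all row/file errors.
    if load_job.error_result:
        print('Job {} failed: {}'.format(
            load_job.job_id, load_job.error_result.get('message')))
    for error in load_job.errors or []:
        print('  detail: {}'.format(error.get('message')))
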
def _bigquery_to_vcf_shards(
        known_args,  # type: argparse.Namespace
        beam_pipeline_options,  # type: pipeline_options.PipelineOptions
        vcf_data_temp_folder,  # type: str
        header_file_path,  # type: str
):
    # type: (...) -> None
    """Runs BigQuery to VCF shards pipelines.

  It reads the variants from BigQuery table, groups a collection of variants
  within a contiguous region of the genome (the size of the collection is
  adjustable through flag `--number_of_bases_per_shard`), sorts them, and then
  writes to one VCF file. All VCF data files are saved in
  `vcf_data_temp_folder`.

  Also, it writes the meta info and data header with the sample names to
  `vcf_header_file_path`.
  """
    schema = _get_schema(known_args.input_table)
    variant_query = _get_variant_query(known_args, schema)
    logging.info('Processing BigQuery query: %s', variant_query)
    project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
        known_args.input_table)
    bq_variant_source = bigquery.BigQuerySource(query=variant_query,
                                                validate=True,
                                                use_standard_sql=True)
    annotation_names = _extract_annotation_names(schema)

    base_table_id = bigquery_util.get_table_base_name(table_id)
    sample_query = _SAMPLE_INFO_QUERY_TEMPLATE.format(
        PROJECT_ID=project_id,
        DATASET_ID=dataset_id,
        TABLE_NAME=bigquery_util.compose_table_name(base_table_id,
                                                    SAMPLE_INFO_TABLE_SUFFIX))
    bq_sample_source = bigquery.BigQuerySource(query=sample_query,
                                               validate=True,
                                               use_standard_sql=True)
    with beam.Pipeline(options=beam_pipeline_options) as p:
        variants = (p
                    | 'ReadFromBigQuery' >> beam.io.Read(bq_variant_source)
                    | bigquery_to_variant.BigQueryToVariant(annotation_names))
        sample_table_rows = (
            p
            | 'ReadFromSampleTable' >> beam.io.Read(bq_sample_source))
        if known_args.sample_names:
            temp_sample_names = (p
                                 | transforms.Create(known_args.sample_names,
                                                     reshuffle=False))
        else:
            # Get sample names from sample IDs in the variants and sort.
            id_to_name_hash_table = (sample_table_rows
                                     | 'SampleIdToNameDict' >>
                                     sample_mapping_table.SampleIdToNameDict())
            temp_sample_ids = (
                variants
                | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner(
                    known_args.preserve_sample_order))
            temp_sample_names = (
                temp_sample_ids
                | 'GetSampleNames' >> sample_mapping_table.GetSampleNames(
                    beam.pvalue.AsSingleton(id_to_name_hash_table))
                | 'CombineToList' >> beam.combiners.ToList()
                | 'SortSampleNames' >> beam.ParDo(sorted))

        name_to_id_hash_table = (
            sample_table_rows
            | 'SampleNameToIdDict' >> sample_mapping_table.SampleNameToIdDict())
        sample_ids = (temp_sample_names
                      | 'GetSampleIds' >> sample_mapping_table.GetSampleIds(
                          beam.pvalue.AsSingleton(name_to_id_hash_table))
                      | 'CombineSortedSampleIds' >> beam.combiners.ToList())
        sample_names = temp_sample_names | beam.combiners.ToList()

        _ = (sample_names
             | 'GenerateVcfDataHeader' >> beam.ParDo(
                 _write_vcf_header_with_sample_names, _VCF_FIXED_COLUMNS,
                 known_args.representative_header_file, header_file_path))

        _ = (variants
             | densify_variants.DensifyVariants(
                 beam.pvalue.AsSingleton(sample_ids))
             | 'PairVariantWithKey' >> beam.Map(
                 _pair_variant_with_key, known_args.number_of_bases_per_shard)
             | 'GroupVariantsByKey' >> beam.GroupByKey()
             | beam.ParDo(_get_file_path_and_sorted_variants,
                          vcf_data_temp_folder)
             | vcfio.WriteVcfDataLines(known_args.bq_uses_1_based_coordinate))
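
_pair_variant_with_key and _get_file_path_and_sorted_variants are module-level helpers not shown here. Below is a plausible sketch of the keying step: variants are bucketed into fixed-size genomic windows so that each key maps to one VCF data shard. The field names and key format are assumptions, not the project's exact implementation.

def example_pair_variant_with_key(variant, number_of_bases_per_shard):
    # Bucket the variant into a contiguous genomic window; all variants sharing
    # a key end up in the same (sorted) VCF data shard.
    window_index = variant.start // number_of_bases_per_shard
    return ('{}_{:012d}'.format(variant.reference_name, window_index), variant)
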
Example #9
def run(argv=None):
    # type: (List[str]) -> None
    """Runs VCF to BigQuery pipeline."""
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = pipeline_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)

    if known_args.auto_flags_experiment:
        _get_input_dimensions(known_args, pipeline_args)

    annotated_vcf_pattern = _run_annotation_pipeline(known_args, pipeline_args)

    all_patterns = ([annotated_vcf_pattern]
                    if annotated_vcf_pattern else known_args.all_patterns)

    variant_merger = _get_variant_merge_strategy(known_args)

    pipeline_mode = pipeline_common.get_pipeline_mode(all_patterns)

    beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
    avro_root_path = _get_avro_root_path(beam_pipeline_options)
    # Starts a pipeline to merge VCF headers in Beam if the total number of
    # files that match the input pattern exceeds _SMALL_DATA_THRESHOLD.
    _merge_headers(known_args, pipeline_args, pipeline_mode, avro_root_path,
                   annotated_vcf_pattern)

    # Retrieve merged headers prior to launching the pipeline. This is needed
    # since the BigQuery schema cannot yet be dynamically created based on input.
    # See https://issues.apache.org/jira/browse/BEAM-2801.
    header_fields = vcf_header_parser.get_vcf_headers(
        known_args.representative_header_file)
    counter_factory = metrics_util.CounterFactory()
    processed_variant_factory = processed_variant.ProcessedVariantFactory(
        header_fields, known_args.split_alternate_allele_info_fields,
        known_args.allow_malformed_records, known_args.annotation_fields,
        known_args.use_allele_num, known_args.minimal_vep_alt_matching,
        known_args.infer_annotation_types, counter_factory)

    schema = schema_converter.generate_schema_from_header_fields(
        header_fields, processed_variant_factory, variant_merger,
        known_args.use_1_based_coordinate, known_args.include_call_name)

    sharding = variant_sharding.VariantSharding(
        known_args.sharding_config_path)
    if sharding.should_keep_shard(sharding.get_residual_index()):
        num_shards = sharding.get_num_shards()
    else:
        num_shards = sharding.get_num_shards() - 1

    if known_args.update_schema_on_append:
        for i in range(num_shards):
            table_suffix = sharding.get_output_table_suffix(i)
            table_name = bigquery_util.compose_table_name(
                known_args.output_table, table_suffix)
            bigquery_util.update_bigquery_schema_on_append(
                schema.fields, table_name)

    pipeline = beam.Pipeline(options=beam_pipeline_options)
    variants = _read_variants(
        all_patterns,
        pipeline,
        known_args,
        pipeline_mode,
        use_1_based_coordinate=known_args.use_1_based_coordinate)
    if known_args.allow_malformed_records:
        variants |= 'DropMalformedRecords' >> filter_variants.FilterVariants()
    sharded_variants = variants | 'ShardVariants' >> beam.Partition(
        shard_variants.ShardVariants(sharding), sharding.get_num_shards())
    variants = []
    for i in range(num_shards):
        suffix = sharding.get_output_table_suffix(i)
        # Convert the partition tuple to a list so each shard can be updated in place.
        variants.append(sharded_variants[i])
        if variant_merger:
            variants[i] |= ('MergeVariants' + suffix >>
                            merge_variants.MergeVariants(variant_merger))
        variants[i] |= (
            'ProcessVariants' + suffix >>
            beam.Map(processed_variant_factory.create_processed_variant)
            .with_output_types(processed_variant.ProcessedVariant))
        _ = (variants[i]
             | 'VariantToAvro' + suffix >> variant_to_avro.VariantToAvroFiles(
                 avro_root_path + suffix,
                 schema,
                 allow_incompatible_records=known_args.
                 allow_incompatible_records,
                 omit_empty_sample_calls=known_args.omit_empty_sample_calls,
                 null_numeric_value_replacement=(
                     known_args.null_numeric_value_replacement),
                 include_call_name=known_args.include_call_name))
    result = pipeline.run()
    try:
        state = result.wait_until_finish()
        if state != beam.runners.runner.PipelineState.DONE:
            logging.error(
                'Dataflow pipeline terminated in an unexpected state: %s',
                state)
            raise AssertionError(
                'Dataflow pipeline terminated in {} state'.format(state))
    except Exception as e:
        logging.error('Dataflow pipeline failed.')
        raise e
    else:
        logging.info('Dataflow pipeline finished successfully.')
        metrics_util.log_all_counters(result)

    # After the pipeline is done, create output tables and load AVRO files into them.
    schema_file = _write_schema_to_temp_file(schema, avro_root_path)
    suffixes = []
    try:
        for i in range(num_shards):
            suffixes.append(sharding.get_output_table_suffix(i))
            partition_range_end = (
                sharding.get_output_table_partition_range_end(i))
            if not known_args.append:
                table_name = bigquery_util.compose_table_name(
                    known_args.output_table, suffixes[i])
                partitioning.create_bq_table(
                    table_name, schema_file,
                    bigquery_util.ColumnKeyConstants.START_POSITION,
                    partition_range_end)
                _record_newly_created_table(table_name)
                logging.info('Integer range partitioned table %s was created.',
                             table_name)
        if not known_args.append:
            _record_newly_created_table(
                sample_info_table_schema_generator.create_sample_info_table(
                    known_args.output_table))

        suffixes.append(
            sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
        load_avro = avro_util.LoadAvro(avro_root_path, known_args.output_table,
                                       suffixes, False)
        not_empty_variant_suffixes = load_avro.start_loading()
        logging.info('The following tables were loaded with at least 1 row:')
        for suffix in not_empty_variant_suffixes:
            logging.info(
                bigquery_util.compose_table_name(known_args.output_table,
                                                 suffix))
        # Remove the sample_info table from both lists to avoid duplicating it
        # when the --sample_lookup_optimized_output_table flag is set.
        suffixes.remove(
            sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
        if (sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX in
                not_empty_variant_suffixes):
            not_empty_variant_suffixes.remove(
                sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
    except Exception as e:
        logging.error(
            'Something unexpected happened during the loading of AVRO '
            'files to BigQuery: %s', str(e))
        logging.info(
            'Since the write to BigQuery stage failed, we did not delete '
            'the AVRO files in your GCS bucket. You can manually import them '
            'to BigQuery. To avoid extra storage charges, delete them if '
            'you do not need them. The AVRO files are located at: %s',
            avro_root_path)
        raise e
    else:
        logging.info('All AVRO files were successfully loaded to BigQuery.')
        if known_args.keep_intermediate_avro_files:
            logging.info(
                'Since "--keep_intermediate_avro_files" flag is set, the '
                'AVRO files are kept and stored at: %s', avro_root_path)
        else:
            if bigquery_util.delete_gcs_files(avro_root_path) != 0:
                logging.error(
                    'Deletion of intermediate AVRO files located at "%s" has '
                    'failed.', avro_root_path)

    if known_args.sample_lookup_optimized_output_table:
        flatten_call_column = partitioning.FlattenCallColumn(
            known_args.output_table, not_empty_variant_suffixes,
            known_args.append)
        try:
            flatten_schema_file = tempfile.mkstemp(
                suffix=_BQ_SCHEMA_FILE_SUFFIX)[1]
            if not flatten_call_column.get_flatten_table_schema(
                    flatten_schema_file):
                raise ValueError('Failed to extract schema of flatten table')
            # Create output flatten tables if needed
            if not known_args.append:
                # Create all sample optimized tables including those that will be empty.
                for suffix in suffixes:
                    output_table_id = bigquery_util.compose_table_name(
                        known_args.sample_lookup_optimized_output_table,
                        suffix)
                    partitioning.create_bq_table(
                        output_table_id, flatten_schema_file,
                        bigquery_util.ColumnKeyConstants.CALLS_SAMPLE_ID,
                        partitioning.MAX_RANGE_END)
                    _record_newly_created_table(output_table_id)
                    logging.info(
                        'Sample lookup optimized table %s was created.',
                        output_table_id)
            # Copy to flatten sample lookup tables from the variant lookup tables.
            # Note: uses WRITE_TRUNCATE to overwrite the existing tables (issue #607).
            flatten_call_column.copy_to_flatten_table(
                known_args.sample_lookup_optimized_output_table)
            logging.info(
                'All sample lookup optimized tables are fully loaded.')
        except Exception as e:
            logging.error(
                'Something unexpected happened while loading rows into the '
                'sample optimized tables: %s', str(e))
            raise e
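
partitioning.create_bq_table, used throughout run() above, creates integer range partitioned tables keyed on the start position or on the sample ID. Roughly, that can be achieved with the google-cloud-bigquery API as in the sketch below; the function name, field name, and range parameters are assumptions, not the project's implementation.

from google.cloud import bigquery


def example_create_partitioned_table(full_table_id, schema, range_end, interval):
    # Create a table partitioned on integer ranges of 'start_position'.
    client = bigquery.Client()
    table = bigquery.Table(full_table_id, schema=schema)
    table.range_partitioning = bigquery.RangePartitioning(
        field='start_position',  # assumed partitioning column
        range_=bigquery.PartitionRange(start=0, end=range_end, interval=interval))
    return client.create_table(table)
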
def create_sample_info_table(output_table):
  full_table_id = bigquery_util.compose_table_name(output_table,
                                                   SAMPLE_INFO_TABLE_SUFFIX)
  partitioning.create_bq_table(full_table_id,
                               SAMPLE_INFO_TABLE_SCHEMA_FILE_PATH)
  return full_table_id
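
A hypothetical call, assuming a table reference of the form project:dataset.table:

sample_info_table = create_sample_info_table('my-project:my_dataset.genomes')
# -> 'my-project:my_dataset.genomes__sample_info' (assuming that suffix value)
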