Exemple #1
0
def _get_input_dimensions(known_args, pipeline_args):
    """Estimates the input dimensions and records them on `known_args`.

    A dedicated Beam pipeline computes five estimates from the input
    patterns. `extract_input_size.print_estimates_to_file` is handed a
    temporary file path; once the pipeline has finished, that file is read
    back and each of its lines is parsed as an integer.

    Args:
      known_args: Parsed flags. `all_patterns` is read; the attributes
        `estimated_variant_count`, `estimated_sample_count`,
        `estimated_value_count`, `files_size` and `file_count` are set.
      pipeline_args: Arguments used to build the Beam pipeline options.

    Raises:
      ValueError: If the estimates file does not hold exactly 5 lines.
    """
    mode = pipeline_common.get_pipeline_mode(known_args.all_patterns)
    options = pipeline_options.PipelineOptions(pipeline_args)
    cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)

    # Suffix any user supplied job name so this helper job is recognizable.
    unique_job_name = pipeline_common.generate_unique_name(
        _ESTIMATE_SIZES_JOB_NAME)
    existing_job_name = cloud_options.job_name
    cloud_options.job_name = (
        existing_job_name + '-' + unique_job_name
        if existing_job_name else unique_job_name)

    # Fall back to a fresh local directory when no temp location is set.
    output_directory = cloud_options.temp_location or tempfile.mkdtemp()
    output_path = filesystems.FileSystems.join(
        output_directory,
        '-'.join([cloud_options.job_name, _ESTIMATE_SIZES_FILE_NAME]))

    with beam.Pipeline(options=options) as pipeline:
        estimates = pipeline_common.get_estimates(
            pipeline, mode, known_args.all_patterns)

        files_size = (
            estimates | 'GetFilesSize' >> extract_input_size.GetFilesSize())
        file_count = (
            estimates | 'CountAllFiles' >> beam.combiners.Count.Globally())
        sample_map = (
            estimates
            | 'ExtractSampleMap' >> extract_input_size.GetSampleMap())
        value_count = (
            sample_map | extract_input_size.GetEstimatedValueCount())
        sample_count = (
            sample_map | extract_input_size.GetEstimatedSampleCount())
        variant_count = (
            estimates
            | 'GetEstimatedVariantCount' >>
            extract_input_size.GetEstimatedVariantCount())

        # Extra arguments of the writer, in the order it receives them.
        writer_args = [
            beam.pvalue.AsSingleton(side_input)
            for side_input in (sample_count, value_count, files_size,
                               file_count)
        ]
        writer_args.append(output_path)
        _ = (variant_count
             | beam.ParDo(extract_input_size.print_estimates_to_file,
                          *writer_args))

    with filesystems.FileSystems.open(output_path) as estimates_file:
        lines = estimates_file.readlines()
    if len(lines) != 5:
        raise ValueError(
            'Exactly 5 estimates were expected in {}.'.format(output_path))

    # Line i of the file populates the i-th attribute below.
    attribute_names = (
        'estimated_variant_count',
        'estimated_sample_count',
        'estimated_value_count',
        'files_size',
        'file_count',
    )
    for attribute_name, line in zip(attribute_names, lines):
        setattr(known_args, attribute_name, int(line.strip()))
def run(argv=None):
    # type: (List[str]) -> None
    """Runs preprocess pipeline.

    Reads the headers of the input files, merges their definitions and
    generates a conflicts report at `known_args.report_path`. When
    `--report_all_conflicts` is set the variants are read as well (malformed
    records allowed), so the report additionally receives the inferred
    headers and the malformed records. The merged headers are written out
    when `known_args.resolved_headers_path` is set.

    Args:
      argv: Command line arguments; `sys.argv` is only used for logging the
        command when this is empty.
    """
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = pipeline_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)
    options = pipeline_options.PipelineOptions(pipeline_args)
    patterns = known_args.all_patterns
    mode = pipeline_common.get_pipeline_mode(patterns)

    with beam.Pipeline(options=options) as root:
        headers = pipeline_common.read_headers(root, mode, patterns)
        merged_headers = pipeline_common.get_merged_headers(headers)
        merged_definitions = (
            headers
            | 'MergeDefinitions' >>
            merge_header_definitions.MergeDefinitions())

        report_all_conflicts = known_args.report_all_conflicts
        if report_all_conflicts:
            if len(patterns) == 1:
                variants = root | 'ReadFromVcf' >> vcfio.ReadFromVcf(
                    patterns[0], allow_malformed_records=True)
            else:
                variants = (
                    root
                    | 'InputFilePattern' >> beam.Create(patterns)
                    | 'ReadAllFromVcf' >>
                    vcfio.ReadAllFromVcf(allow_malformed_records=True))

            malformed_records = (
                variants | filter_variants.ExtractMalformedVariants())
            # The merged headers are replaced by the ones returned here.
            inferred_headers, merged_headers = _get_inferred_headers(
                variants, merged_headers)

        # Arguments handed to the report generator after each element.
        report_args = [
            known_args.report_path,
            beam.pvalue.AsSingleton(merged_headers),
        ]
        if report_all_conflicts:
            report_args.append(beam.pvalue.AsSingleton(inferred_headers))
            report_args.append(beam.pvalue.AsIter(malformed_records))
        _ = (merged_definitions
             | 'GenerateConflictsReport' >> beam.ParDo(
                 preprocess_reporter.generate_report, *report_args))

        if known_args.resolved_headers_path:
            pipeline_common.write_headers(merged_headers,
                                          known_args.resolved_headers_path)
Exemple #3
0
def _run_annotation_pipeline(known_args, pipeline_args):
    # type: (argparse.Namespace, List[str]) -> str
    """Annotates the input VCF files when the annotation pipeline is enabled.

    Args:
      known_args: Parsed flags. `omit_empty_sample_calls` is forced to True
        whenever annotation runs.
      pipeline_args: Arguments forwarded to the helper pipelines.

    Returns:
      The value returned by `_annotate_vcf_files`, or None when
      `known_args.run_annotation_pipeline` is not set.
    """
    if not known_args.run_annotation_pipeline:
        return None

    _validate_annotation_pipeline_args(known_args, pipeline_args)
    known_args.omit_empty_sample_calls = True

    input_files = known_args.all_patterns
    if known_args.shard_variants:
        # Annotate the sharded output instead of the original inputs.
        input_files = _shard_variants(
            known_args, pipeline_args,
            pipeline_common.get_pipeline_mode(input_files))
    return _annotate_vcf_files(input_files, known_args, pipeline_args)
Exemple #4
0
 def _get_pipeline_mode(self, args):
     """Returns the pipeline mode for the input patterns described by `args`.

     The patterns are resolved from `args.input_pattern` and
     `args.input_file`; `args.pipeline_mode` is forwarded as the second
     argument of `pipeline_common.get_pipeline_mode`.
     """
     return pipeline_common.get_pipeline_mode(
         pipeline_common._get_all_patterns(args.input_pattern,
                                           args.input_file),
         args.pipeline_mode)
Exemple #5
0
def run(argv=None):
  # type: (List[str]) -> None
  """Runs VCF to BigQuery pipeline.

  Steps, in order:
    1. Parse flags; optionally estimate input dimensions
       (`--auto_flags_experiment`) and run the annotation pipeline.
    2. Merge headers and build the `ProcessedVariantFactory`.
    3. Build the main pipeline: read, filter, optionally partition and merge
       variants, then convert them to processed variants.
    4. Write to BigQuery (`--output_table`) and/or Avro files
       (`--output_avro_path`).
    5. Run the pipeline, wait for it to finish and log all counters.

  Args:
    argv: Command line arguments; `sys.argv` is only used for logging the
      command when this is empty.
  """
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = pipeline_common.parse_args(argv,
                                                         _COMMAND_LINE_OPTIONS)

  if known_args.auto_flags_experiment:
    _get_input_dimensions(known_args, pipeline_args)

  annotated_vcf_pattern = _run_annotation_pipeline(known_args, pipeline_args)

  # When annotation ran, its output pattern replaces the original inputs.
  all_patterns = (
      [annotated_vcf_pattern] if annotated_vcf_pattern
      else known_args.all_patterns)

  variant_merger = _get_variant_merge_strategy(known_args)

  pipeline_mode = pipeline_common.get_pipeline_mode(
      all_patterns,
      known_args.optimize_for_large_inputs)
  # Starts a pipeline to merge VCF headers in beam if the total files that
  # match the input pattern exceeds _SMALL_DATA_THRESHOLD
  _merge_headers(known_args, pipeline_args,
                 pipeline_mode, annotated_vcf_pattern)


  # Retrieve merged headers prior to launching the pipeline. This is needed
  # since the BigQuery schema cannot yet be dynamically created based on input.
  # See https://issues.apache.org/jira/browse/BEAM-2801.
  # NOTE(review): presumably _merge_headers populates
  # known_args.representative_header_file when needed -- confirm.
  header_fields = vcf_header_parser.get_vcf_headers(
      known_args.representative_header_file)
  counter_factory = metrics_util.CounterFactory()
  processed_variant_factory = processed_variant.ProcessedVariantFactory(
      header_fields,
      known_args.split_alternate_allele_info_fields,
      known_args.allow_malformed_records,
      known_args.annotation_fields,
      known_args.use_allele_num,
      known_args.minimal_vep_alt_matching,
      known_args.infer_annotation_types,
      counter_factory)

  # Partitioning is enabled either by an explicit config file, or when
  # optimizing for large inputs while a variant merge strategy is in use.
  partitioner = None
  if ((known_args.optimize_for_large_inputs and variant_merger) or
      known_args.partition_config_path):
    partitioner = variant_partition.VariantPartition(
        known_args.partition_config_path)

  beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
  pipeline = beam.Pipeline(options=beam_pipeline_options)
  variants = _read_variants(all_patterns, pipeline, known_args, pipeline_mode)
  variants |= 'FilterVariants' >> filter_variants.FilterVariants(
      reference_names=known_args.reference_names)
  # From here on `variants` is a list of PCollections, one per kept partition.
  if partitioner:
    num_partitions = partitioner.get_num_partitions()
    partitioned_variants = variants | 'PartitionVariants' >> beam.Partition(
        partition_variants.PartitionVariants(partitioner), num_partitions)
    variants = []
    # range() is evaluated once with the full partition count, so decrementing
    # num_partitions inside the loop only affects the loops further below.
    for i in range(num_partitions):
      if partitioner.should_keep_partition(i):
        variants.append(partitioned_variants[i])
      else:
        num_partitions -= 1
  else:
    # By default we don't partition the data, so we have only 1 partition.
    num_partitions = 1
    variants = [variants]

  # The index is appended to each label so transform names stay unique.
  for i in range(num_partitions):
    if variant_merger:
      variants[i] |= ('MergeVariants' + str(i) >>
                      merge_variants.MergeVariants(variant_merger))
    variants[i] |= (
        'ProcessVariants' + str(i) >>
        beam.Map(processed_variant_factory.create_processed_variant).\
            with_output_types(processed_variant.ProcessedVariant))
  if partitioner and partitioner.should_flatten():
    # Collapse all kept partitions back into a single PCollection.
    variants = [variants | 'FlattenPartitions' >> beam.Flatten()]
    num_partitions = 1

  if known_args.output_table:
    # NOTE(review): `i` indexes the list of *kept* partitions, but is passed
    # to get_partition_name() as a partition index. These only agree if no
    # dropped partition precedes a kept one -- confirm against
    # should_keep_partition().
    for i in range(num_partitions):
      table_suffix = ''
      if partitioner and partitioner.get_partition_name(i):
        table_suffix = '_' + partitioner.get_partition_name(i)
      table_name = known_args.output_table + table_suffix
      _ = (variants[i] | 'VariantToBigQuery' + table_suffix >>
           variant_to_bigquery.VariantToBigQuery(
               table_name,
               header_fields,
               variant_merger,
               processed_variant_factory,
               append=known_args.append,
               update_schema_on_append=known_args.update_schema_on_append,
               allow_incompatible_records=known_args.allow_incompatible_records,
               omit_empty_sample_calls=known_args.omit_empty_sample_calls,
               num_bigquery_write_shards=known_args.num_bigquery_write_shards,
               null_numeric_value_replacement=(
                   known_args.null_numeric_value_replacement)))

  if known_args.output_avro_path:
    # TODO(bashir2): Add an integration test that outputs to Avro files and
    # also imports to BigQuery. Then import those Avro outputs using the bq
    # tool and verify that the two tables are identical.
    # All partitions are flattened first, so a single Avro output is written.
    _ = (
        variants | 'FlattenToOnePCollection' >> beam.Flatten()
        | 'VariantToAvro' >>
        variant_to_avro.VariantToAvroFiles(
            known_args.output_avro_path,
            header_fields,
            processed_variant_factory,
            variant_merger=variant_merger,
            allow_incompatible_records=known_args.allow_incompatible_records,
            omit_empty_sample_calls=known_args.omit_empty_sample_calls,
            null_numeric_value_replacement=(
                known_args.null_numeric_value_replacement))
    )

  # The pipeline is not used as a context manager, so run and wait explicitly.
  result = pipeline.run()
  result.wait_until_finish()

  metrics_util.log_all_counters(result)
 def _get_pipeline_mode(self, args):
     """Returns the pipeline mode for the input patterns described by `args`.

     The patterns are resolved from `args.input_pattern` and
     `args.input_file`; `args.optimize_for_large_inputs` is forwarded as the
     second argument of `pipeline_common.get_pipeline_mode`.
     """
     return pipeline_common.get_pipeline_mode(
         pipeline_common._get_all_patterns(args.input_pattern,
                                           args.input_file),
         args.optimize_for_large_inputs)
Exemple #7
0
def run(argv=None):
    # type: (List[str]) -> None
    """Runs VCF to BigQuery pipeline.

    Steps, in order:
      1. Parse flags; optionally estimate input dimensions
         (`--auto_flags_experiment`) and run the annotation pipeline.
      2. Merge headers, then build the `ProcessedVariantFactory` and the
         output schema.
      3. Run a Beam pipeline that shards the variants and writes one set of
         AVRO files per shard under the AVRO root path.
      4. Create the output tables (unless `--append`) and load the AVRO
         files into them; delete the AVRO files afterwards unless
         `--keep_intermediate_avro_files` is set.
      5. If `--sample_lookup_optimized_output_table` is set, create and fill
         the sample lookup optimized tables.

    Args:
      argv: Command line arguments; `sys.argv` is only used for logging the
        command when this is empty.

    Raises:
      AssertionError: If the pipeline finishes in a state other than DONE.
      ValueError: If the schema of the flatten table cannot be extracted.
      Exception: Any error raised while running the pipeline, loading AVRO
        files or filling the sample optimized tables is logged and
        re-raised.
    """
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = pipeline_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)

    if known_args.auto_flags_experiment:
        _get_input_dimensions(known_args, pipeline_args)

    annotated_vcf_pattern = _run_annotation_pipeline(known_args, pipeline_args)

    # When annotation ran, its output pattern replaces the original inputs.
    all_patterns = ([annotated_vcf_pattern]
                    if annotated_vcf_pattern else known_args.all_patterns)

    variant_merger = _get_variant_merge_strategy(known_args)

    pipeline_mode = pipeline_common.get_pipeline_mode(all_patterns)

    beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
    avro_root_path = _get_avro_root_path(beam_pipeline_options)
    # Starts a pipeline to merge VCF headers in beam if the total files that
    # match the input pattern exceeds _SMALL_DATA_THRESHOLD
    _merge_headers(known_args, pipeline_args, pipeline_mode, avro_root_path,
                   annotated_vcf_pattern)

    # Retrieve merged headers prior to launching the pipeline. This is needed
    # since the BigQuery schema cannot yet be dynamically created based on input.
    # See https://issues.apache.org/jira/browse/BEAM-2801.
    header_fields = vcf_header_parser.get_vcf_headers(
        known_args.representative_header_file)
    counter_factory = metrics_util.CounterFactory()
    processed_variant_factory = processed_variant.ProcessedVariantFactory(
        header_fields, known_args.split_alternate_allele_info_fields,
        known_args.allow_malformed_records, known_args.annotation_fields,
        known_args.use_allele_num, known_args.minimal_vep_alt_matching,
        known_args.infer_annotation_types, counter_factory)

    schema = schema_converter.generate_schema_from_header_fields(
        header_fields, processed_variant_factory, variant_merger,
        known_args.use_1_based_coordinate, known_args.include_call_name)

    sharding = variant_sharding.VariantSharding(
        known_args.sharding_config_path)
    # One fewer shard is processed when the residual shard is not kept.
    # NOTE(review): the range(num_shards) loops below assume the residual
    # shard has the highest index -- confirm against VariantSharding.
    if sharding.should_keep_shard(sharding.get_residual_index()):
        num_shards = sharding.get_num_shards()
    else:
        num_shards = sharding.get_num_shards() - 1

    if known_args.update_schema_on_append:
        for i in range(num_shards):
            table_suffix = sharding.get_output_table_suffix(i)
            table_name = bigquery_util.compose_table_name(
                known_args.output_table, table_suffix)
            bigquery_util.update_bigquery_schema_on_append(
                schema.fields, table_name)

    pipeline = beam.Pipeline(options=beam_pipeline_options)
    variants = _read_variants(
        all_patterns,
        pipeline,
        known_args,
        pipeline_mode,
        use_1_based_coordinate=known_args.use_1_based_coordinate)
    if known_args.allow_malformed_records:
        variants |= 'DropMalformedRecords' >> filter_variants.FilterVariants()
    # Partition into every configured shard; only the first `num_shards`
    # outputs are consumed below.
    sharded_variants = variants | 'ShardVariants' >> beam.Partition(
        shard_variants.ShardVariants(sharding), sharding.get_num_shards())
    variants = []
    for i in range(num_shards):
        # The table suffix doubles as a label suffix to keep names unique.
        suffix = sharding.get_output_table_suffix(i)
        # Collect the shard outputs in a list so each entry can be rebound.
        variants.append(sharded_variants[i])
        if variant_merger:
            variants[i] |= ('MergeVariants' + suffix >>
                            merge_variants.MergeVariants(variant_merger))
        variants[i] |= (
            'ProcessVariants' + suffix >>
            beam.Map(processed_variant_factory.create_processed_variant). \
            with_output_types(processed_variant.ProcessedVariant))
        _ = (variants[i]
             | 'VariantToAvro' + suffix >> variant_to_avro.VariantToAvroFiles(
                 avro_root_path + suffix,
                 schema,
                 allow_incompatible_records=known_args.
                 allow_incompatible_records,
                 omit_empty_sample_calls=known_args.omit_empty_sample_calls,
                 null_numeric_value_replacement=(
                     known_args.null_numeric_value_replacement),
                 include_call_name=known_args.include_call_name))
    result = pipeline.run()
    try:
        state = result.wait_until_finish()
        if state != beam.runners.runner.PipelineState.DONE:
            logging.error(
                'Dataflow pipeline terminated in an unexpected state: %s',
                state)
            raise AssertionError(
                'Dataflow pipeline terminated in {} state'.format(state))
    except Exception:
        logging.error('Dataflow pipeline failed.')
        raise
    else:
        logging.info('Dataflow pipeline finished successfully.')
        metrics_util.log_all_counters(result)

    # After pipeline is done, create output tables and load AVRO files into them.
    schema_file = _write_schema_to_temp_file(schema, avro_root_path)
    suffixes = []
    try:
        for i in range(num_shards):
            suffixes.append(sharding.get_output_table_suffix(i))
            partition_range_end = sharding.get_output_table_partition_range_end(
                i)
            if not known_args.append:
                table_name = bigquery_util.compose_table_name(
                    known_args.output_table, suffixes[i])
                partitioning.create_bq_table(
                    table_name, schema_file,
                    bigquery_util.ColumnKeyConstants.START_POSITION,
                    partition_range_end)
                _record_newly_created_table(table_name)
                logging.info('Integer range partitioned table %s was created.',
                             table_name)
        if not known_args.append:
            _record_newly_created_table(
                sample_info_table_schema_generator.create_sample_info_table(
                    known_args.output_table))

        # The sample_info table is loaded together with the variant tables.
        suffixes.append(
            sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
        load_avro = avro_util.LoadAvro(avro_root_path, known_args.output_table,
                                       suffixes, False)
        not_empty_variant_suffixes = load_avro.start_loading()
        logging.info('Following tables were loaded with at least 1 row:')
        for suffix in not_empty_variant_suffixes:
            logging.info(
                bigquery_util.compose_table_name(known_args.output_table,
                                                 suffix))
        # Remove sample_info table from both lists to avoid duplicating it when
        # --sample_lookup_optimized_output_table flag is set
        suffixes.remove(
            sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
        if sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX in\
            not_empty_variant_suffixes:
            not_empty_variant_suffixes.remove(
                sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
    except Exception as e:
        logging.error(
            'Something unexpected happened during the loading of AVRO '
            'files to BigQuery: %s', str(e))
        logging.info(
            'Since the write to BigQuery stage failed, we did not delete '
            'AVRO files in your GCS bucket. You can manually import them '
            'to BigQuery. To avoid extra storage charges, delete them if '
            'you do not need them, AVRO files are located at: %s',
            avro_root_path)
        raise
    else:
        logging.warning('All AVRO files were successfully loaded to BigQuery.')
        if known_args.keep_intermediate_avro_files:
            logging.info(
                'Since "--keep_intermediate_avro_files" flag is set, the '
                'AVRO files are kept and stored at: %s', avro_root_path)
        else:
            # A failed cleanup is only logged; the load itself succeeded.
            if bigquery_util.delete_gcs_files(avro_root_path) != 0:
                logging.error(
                    'Deletion of intermediate AVRO files located at "%s" has '
                    'failed.', avro_root_path)

    if known_args.sample_lookup_optimized_output_table:
        flatten_call_column = partitioning.FlattenCallColumn(
            known_args.output_table, not_empty_variant_suffixes,
            known_args.append)
        try:
            # Only the path is needed. The context manager closes the handle
            # right away (mkstemp()[1] would leak the open file descriptor),
            # while delete=False keeps the file on disk for the calls below.
            with tempfile.NamedTemporaryFile(
                    suffix=_BQ_SCHEMA_FILE_SUFFIX,
                    delete=False) as schema_temp_file:
                flatten_schema_file = schema_temp_file.name
            if not flatten_call_column.get_flatten_table_schema(
                    flatten_schema_file):
                raise ValueError('Failed to extract schema of flatten table')
            # Create output flatten tables if needed
            if not known_args.append:
                # Create all sample optimized tables including those that will be empty.
                for suffix in suffixes:
                    output_table_id = bigquery_util.compose_table_name(
                        known_args.sample_lookup_optimized_output_table,
                        suffix)
                    partitioning.create_bq_table(
                        output_table_id, flatten_schema_file,
                        bigquery_util.ColumnKeyConstants.CALLS_SAMPLE_ID,
                        partitioning.MAX_RANGE_END)
                    _record_newly_created_table(output_table_id)
                    logging.info(
                        'Sample lookup optimized table %s was created.',
                        output_table_id)
            # Copy to flatten sample lookup tables from the variant lookup tables.
            # Note: uses WRITE_TRUNCATE to overwrite the existing tables (issue #607).
            flatten_call_column.copy_to_flatten_table(
                known_args.sample_lookup_optimized_output_table)
            logging.info(
                'All sample lookup optimized tables are fully loaded.')
        except Exception as e:
            logging.error(
                'Something unexpected happened during the loading rows to '
                'sample optimized table stage: %s', str(e))
            raise