Example #1
def _shard_variants(known_args, pipeline_args, pipeline_mode):
  # type: (argparse.Namespace, List[str], int) -> List[str]
  """Reads the variants and writes them to VCF shards.

  Returns:
    A list containing the pattern that matches all written VCF shard files.
  """
  options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  shard_files_job_name = pipeline_common.generate_unique_name(
      _SHARD_VCF_FILES_JOB_NAME)
  _update_google_cloud_job_name(google_cloud_options, shard_files_job_name)
  vcf_shards_output_dir = filesystems.FileSystems.join(
      known_args.annotation_output_dir, _SHARDS_FOLDER)
  with beam.Pipeline(options=options) as p:
    variants = _read_variants(
        known_args.all_patterns, p, known_args, pipeline_mode)
    call_names = (variants
                  | 'CombineCallNames' >>
                  combine_call_names.CallNamesCombiner())
    _ = (variants
         | 'DensifyVariants' >> densify_variants.DensifyVariants(
             beam.pvalue.AsSingleton(call_names))
         | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
             vcf_shards_output_dir,
             beam.pvalue.AsSingleton(call_names),
             known_args.number_of_variants_per_shard))

  return [vep_runner_util.format_dir_path(vcf_shards_output_dir) +
          _GCS_RECURSIVE_WILDCARD]
Example #2
def _annotate_vcf_files(all_patterns, known_args, pipeline_args):
  # type: (List[str], argparse.Namespace, List[str]) -> str
  """Annotates the VCF files using VEP.

  Returns:
    The pattern matching the annotated VCF output files.
  """
  options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  annotate_files_job_name = pipeline_common.generate_unique_name(
      _ANNOTATE_FILES_JOB_NAME)
  _update_google_cloud_job_name(google_cloud_options, annotate_files_job_name)

  with beam.Pipeline(options=options) as p:
    _ = (p
         | beam.Create(all_patterns)
         | 'AnnotateShards' >> beam.ParDo(
             annotate_files.AnnotateFile(known_args, pipeline_args)))
  if known_args.annotation_fields:
    known_args.annotation_fields.append(known_args.vep_info_field)
  else:
    known_args.annotation_fields = [known_args.vep_info_field]
  # TODO(bashir2): The VEP runner by default runs VEP with --allele_number,
  # hence we turn on this feature here. However, this might be inconsistent
  # with other annotation fields that are originally present in the input
  # files if they do not have the ALLELE_NUM annotation. The fix is to make
  # annotation ALT matching smarter, falling back on other matching methods
  # when ALLELE_NUM is not present. Once that is implemented, we may even
  # consider removing the use_allele_num flag and always start by checking
  # whether ALLELE_NUM is present.
  known_args.use_allele_num = True
  return vep_runner_util.get_output_pattern(known_args.annotation_output_dir)
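
Examples #1 and #2 are typically chained: the shard patterns returned by
_shard_variants become the input patterns of _annotate_vcf_files. A minimal
sketch of that wiring, assuming known_args, pipeline_args and pipeline_mode are
already in scope; the driver code below is an illustration, not the project's
actual source.

# Hypothetical wiring (assumption): shard the inputs, then annotate the shards.
shard_patterns = _shard_variants(known_args, pipeline_args, pipeline_mode)
annotated_vcf_pattern = _annotate_vcf_files(
    shard_patterns, known_args, pipeline_args)
# The resulting pattern can then be handed to helpers that accept an optional
# annotated_vcf_pattern argument, such as _merge_headers in Example #3 below.
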
Example #3
def _merge_headers(known_args,
                   pipeline_args,
                   pipeline_mode,
                   annotated_vcf_pattern=None):
    # type: (argparse.Namespace, List[str], int, str) -> None
    """Merges VCF headers using Beam, based on pipeline_mode."""
    options = pipeline_options.PipelineOptions(pipeline_args)

    # Always run pipeline locally if data is small.
    if (pipeline_mode == pipeline_common.PipelineModes.SMALL
            and not known_args.infer_headers
            and not known_args.infer_annotation_types):
        options.view_as(
            pipeline_options.StandardOptions).runner = 'DirectRunner'

    google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
    merge_headers_job_name = pipeline_common.generate_unique_name(
        _MERGE_HEADERS_JOB_NAME)
    if google_cloud_options.job_name:
        google_cloud_options.job_name += '-' + merge_headers_job_name
    else:
        google_cloud_options.job_name = merge_headers_job_name

    temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
    temp_merged_headers_file_name = '-'.join(
        [google_cloud_options.job_name, _MERGE_HEADERS_FILE_NAME])
    temp_merged_headers_file_path = filesystems.FileSystems.join(
        temp_directory, temp_merged_headers_file_name)
    if not known_args.append:
        bigquery_util.create_sample_info_table(known_args.output_table)

    with beam.Pipeline(options=options) as p:
        headers = pipeline_common.read_headers(p, pipeline_mode,
                                               known_args.all_patterns)
        _ = (headers
             | 'SampleInfoToBigQuery' >>
             sample_info_to_bigquery.SampleInfoToBigQuery(
                 known_args.output_table,
                 SampleNameEncoding[known_args.sample_name_encoding],
                 known_args.append))
        if known_args.representative_header_file:
            return
        merged_header = pipeline_common.get_merged_headers(
            headers, known_args.split_alternate_allele_info_fields,
            known_args.allow_incompatible_records)
        if annotated_vcf_pattern:
            merged_header = pipeline_common.add_annotation_headers(
                p, known_args, pipeline_mode, merged_header,
                annotated_vcf_pattern)
        if known_args.infer_headers or known_args.infer_annotation_types:
            infer_headers_input_pattern = ([
                annotated_vcf_pattern
            ] if annotated_vcf_pattern else known_args.all_patterns)
            merged_header = _add_inferred_headers(infer_headers_input_pattern,
                                                  p, known_args, merged_header,
                                                  pipeline_mode)

        pipeline_common.write_headers(merged_header,
                                      temp_merged_headers_file_path)
        known_args.representative_header_file = temp_merged_headers_file_path
Example #4
def _get_input_dimensions(known_args, pipeline_args):
    pipeline_mode = pipeline_common.get_pipeline_mode(known_args.all_patterns)
    beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
    google_cloud_options = beam_pipeline_options.view_as(
        pipeline_options.GoogleCloudOptions)

    estimate_sizes_job_name = pipeline_common.generate_unique_name(
        _ESTIMATE_SIZES_JOB_NAME)
    if google_cloud_options.job_name:
        google_cloud_options.job_name += '-' + estimate_sizes_job_name
    else:
        google_cloud_options.job_name = estimate_sizes_job_name
    temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
    temp_estimated_input_size_file_name = '-'.join(
        [google_cloud_options.job_name, _ESTIMATE_SIZES_FILE_NAME])
    temp_estimated_input_size_file_path = filesystems.FileSystems.join(
        temp_directory, temp_estimated_input_size_file_name)
    with beam.Pipeline(options=beam_pipeline_options) as p:
        estimates = pipeline_common.get_estimates(p, pipeline_mode,
                                                  known_args.all_patterns)

        files_size = (estimates
                      | 'GetFilesSize' >> extract_input_size.GetFilesSize())
        file_count = (estimates
                      | 'CountAllFiles' >> beam.combiners.Count.Globally())
        sample_map = (
            estimates
            | 'ExtractSampleMap' >> extract_input_size.GetSampleMap())
        estimated_value_count = (
            sample_map
            | extract_input_size.GetEstimatedValueCount())
        estimated_sample_count = (
            sample_map
            | extract_input_size.GetEstimatedSampleCount())
        estimated_variant_count = (
            estimates
            | 'GetEstimatedVariantCount' >>
            extract_input_size.GetEstimatedVariantCount())
        _ = (estimated_variant_count
             | beam.ParDo(extract_input_size.print_estimates_to_file,
                          beam.pvalue.AsSingleton(estimated_sample_count),
                          beam.pvalue.AsSingleton(estimated_value_count),
                          beam.pvalue.AsSingleton(files_size),
                          beam.pvalue.AsSingleton(file_count),
                          temp_estimated_input_size_file_path))

    with filesystems.FileSystems.open(
            temp_estimated_input_size_file_path) as f:
        estimates = f.readlines()
    if len(estimates) != 5:
        raise ValueError('Exactly 5 estimates were expected in {}.'.format(
            temp_estimated_input_size_file_path))

    known_args.estimated_variant_count = int(estimates[0].strip())
    known_args.estimated_sample_count = int(estimates[1].strip())
    known_args.estimated_value_count = int(estimates[2].strip())
    known_args.files_size = int(estimates[3].strip())
    known_args.file_count = int(estimates[4].strip())
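
For orientation, the tail of this helper depends on the estimates file written
inside the pipeline by extract_input_size.print_estimates_to_file: exactly five
integers, one per line, in the order consumed above. A minimal sketch of that
contract with made-up values (illustrative only; the real file is written under
the pipeline's temp location):

# Illustrative values only; the order mirrors the known_args assignments above.
fake_estimates = [
    '1000000\n',     # estimated_variant_count
    '2504\n',        # estimated_sample_count
    '2504000000\n',  # estimated_value_count
    '52428800\n',    # files_size
    '3\n',           # file_count
]
assert len(fake_estimates) == 5
estimated_variant_count = int(fake_estimates[0].strip())
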
Example #5
def run(argv=None):
  # type: (List[str]) -> None
  """Runs BigQuery to VCF pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = pipeline_common.parse_args(argv,
                                                         _COMMAND_LINE_OPTIONS)
  options = pipeline_options.PipelineOptions(pipeline_args)
  is_direct_runner = pipeline_common.is_pipeline_direct_runner(
      beam.Pipeline(options=options))
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  if not google_cloud_options.project:
    raise ValueError('project must be set.')
  if not is_direct_runner and not known_args.output_file.startswith('gs://'):
    raise ValueError('Please set the output file {} to GCS when running with '
                     'DataflowRunner.'.format(known_args.output_file))
  if is_direct_runner:
    known_args.number_of_bases_per_shard = sys.maxsize

  temp_folder = google_cloud_options.temp_location or tempfile.mkdtemp()
  unique_temp_id = pipeline_common.generate_unique_name(
      google_cloud_options.job_name or _BQ_TO_VCF_SHARDS_JOB_NAME)
  vcf_data_temp_folder = filesystems.FileSystems.join(
      temp_folder,
      '{}_data_temp_files'.format(unique_temp_id))
  # Create the directory manually. FileSystems cannot create a file if the
  # directory does not exist when using Direct Runner.
  filesystems.FileSystems.mkdirs(vcf_data_temp_folder)
  vcf_header_file_path = filesystems.FileSystems.join(
      temp_folder,
      '{}_header_with_sample_ids.vcf'.format(unique_temp_id))

  if not known_args.representative_header_file:
    known_args.representative_header_file = filesystems.FileSystems.join(
        temp_folder,
        '{}_meta_info.vcf'.format(unique_temp_id))
    _write_vcf_meta_info(known_args.input_table,
                         known_args.representative_header_file,
                         known_args.allow_incompatible_schema)

  _bigquery_to_vcf_shards(known_args,
                          options,
                          vcf_data_temp_folder,
                          vcf_header_file_path)
  if is_direct_runner:
    vcf_file_composer.compose_local_vcf_shards(vcf_header_file_path,
                                               vcf_data_temp_folder,
                                               known_args.output_file)
  else:
    vcf_file_composer.compose_gcs_vcf_shards(google_cloud_options.project,
                                             vcf_header_file_path,
                                             vcf_data_temp_folder,
                                             known_args.output_file)
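
Modules like this one conventionally expose run() through a standard Beam-style
entry point. Whether the original file ends exactly this way is an assumption;
this is the usual pattern:

# Conventional entry point (assumed, not copied from the source module).
if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()
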
Example #6
def _shard_variants(known_args, pipeline_args, pipeline_mode):
    # type: (argparse.Namespace, List[str], int) -> List[str]
    """Reads the variants and writes them to VCF shards.

    Returns:
      A list containing the pattern that matches all written VCF shard files.
    """
    options = pipeline_options.PipelineOptions(pipeline_args)
    google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
    shard_files_job_name = pipeline_common.generate_unique_name(
        _SHARD_VCF_FILES_JOB_NAME)
    _update_google_cloud_job_name(google_cloud_options, shard_files_job_name)
    vcf_shards_output_dir = filesystems.FileSystems.join(
        known_args.annotation_output_dir, _SHARDS_FOLDER)
    with beam.Pipeline(options=options) as p:
        variants = _read_variants(known_args.all_patterns,
                                  p,
                                  known_args,
                                  pipeline_mode,
                                  pre_infer_headers=False,
                                  keep_raw_sample_names=True,
                                  use_1_based_coordinate=False)
        sample_ids = (
            variants
            | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner()
            | 'CombineToList' >> beam.combiners.ToList())
        # TODO(tneymanov): The annotation pipeline currently stores sample IDs
        # instead of sample names in the sharded VCF files, which would lead to
        # double hashing of samples. Needs to be fixed ASAP.
        _ = (variants
             | 'DensifyVariants' >> densify_variants.DensifyVariants(
                 beam.pvalue.AsSingleton(sample_ids))
             | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
                 vcf_shards_output_dir, beam.pvalue.AsSingleton(sample_ids),
                 known_args.number_of_variants_per_shard))

    return [
        vep_runner_util.format_dir_path(vcf_shards_output_dir) +
        _GCS_RECURSIVE_WILDCARD
    ]
Example #7
def _merge_headers(known_args, pipeline_args, pipeline_mode):
    # type: (argparse.Namespace, List[str], int) -> None
    """Merges VCF headers using beam based on pipeline_mode."""
    if known_args.representative_header_file:
        return

    options = pipeline_options.PipelineOptions(pipeline_args)

    # Always run pipeline locally if data is small.
    if (pipeline_mode == pipeline_common.PipelineModes.SMALL
            and not known_args.infer_headers
            and not known_args.infer_annotation_types):
        options.view_as(
            pipeline_options.StandardOptions).runner = 'DirectRunner'

    google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
    merge_headers_job_name = pipeline_common.generate_unique_name(
        _MERGE_HEADERS_JOB_NAME)
    if google_cloud_options.job_name:
        google_cloud_options.job_name += '-' + merge_headers_job_name
    else:
        google_cloud_options.job_name = merge_headers_job_name

    temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
    temp_merged_headers_file_name = '-'.join(
        [google_cloud_options.job_name, _MERGE_HEADERS_FILE_NAME])
    temp_merged_headers_file_path = filesystems.FileSystems.join(
        temp_directory, temp_merged_headers_file_name)

    with beam.Pipeline(options=options) as p:
        headers = pipeline_common.read_headers(p, pipeline_mode, known_args)
        merged_header = pipeline_common.get_merged_headers(
            headers, known_args.split_alternate_allele_info_fields,
            known_args.allow_incompatible_records)
        if known_args.infer_headers or known_args.infer_annotation_types:
            merged_header = _add_inferred_headers(p, known_args, merged_header)
        pipeline_common.write_headers(merged_header,
                                      temp_merged_headers_file_path)
        known_args.representative_header_file = temp_merged_headers_file_path