def _bigquery_to_vcf_shards(
        known_args,  # type: argparse.Namespace
        beam_pipeline_options,  # type: pipeline_options.PipelineOptions
        vcf_data_temp_folder,  # type: str
        header_file_path,  # type: str
):
    # type: (...) -> None
    """Runs BigQuery to VCF shards pipelines.

  It reads the variants from BigQuery table, groups a collection of variants
  within a contiguous region of the genome (the size of the collection is
  adjustable through flag `--number_of_bases_per_shard`), sorts them, and then
  writes to one VCF file. All VCF data files are saved in
  `vcf_data_temp_folder`.

  Also, it writes the meta info and data header with the call names to
  `vcf_header_file_path`.
  """
    schema = _get_schema(known_args.input_table)
    # TODO(allieychen): Modify the SQL query with the specified call_names.
    query = _get_bigquery_query(known_args, schema)
    logging.info('Processing BigQuery query: %s', query)
    bq_source = bigquery.BigQuerySource(query=query,
                                        validate=True,
                                        use_standard_sql=True)
    annotation_names = _extract_annotation_names(schema)
    with beam.Pipeline(options=beam_pipeline_options) as p:
        variants = (p
                    | 'ReadFromBigQuery' >> beam.io.Read(bq_source)
                    | bigquery_to_variant.BigQueryToVariant(annotation_names))
        if known_args.call_names:
            call_names = (p
                          | transforms.Create(known_args.call_names)
                          | beam.combiners.ToList())
        else:
            call_names = (
                variants
                | 'CombineCallNames' >> combine_call_names.CallNamesCombiner(
                    known_args.preserve_call_names_order))

        _ = (call_names
             | 'GenerateVcfDataHeader' >> beam.ParDo(
                 _write_vcf_header_with_call_names, _VCF_FIXED_COLUMNS,
                 known_args.representative_header_file, header_file_path))

        _ = (variants
             | densify_variants.DensifyVariants(
                 beam.pvalue.AsSingleton(call_names))
             | 'PairVariantWithKey' >> beam.Map(
                 _pair_variant_with_key, known_args.number_of_bases_per_shard)
             | 'GroupVariantsByKey' >> beam.GroupByKey()
             | beam.ParDo(_get_file_path_and_sorted_variants,
                          vcf_data_temp_folder)
             | vcfio.WriteVcfDataLines())
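
# `_pair_variant_with_key` is referenced above but not shown in this snippet.
# A minimal sketch, assuming a variant object exposing `reference_name` and
# `start` attributes; the exact key format below is an assumption made for
# illustration only.
def _pair_variant_with_key(variant, number_of_bases_per_shard):
    # Bucket variants into contiguous genomic regions so that each shard
    # spans at most `number_of_bases_per_shard` bases; variants that land in
    # the same bucket end up in the same VCF shard after the GroupByKey.
    shard_index = variant.start // number_of_bases_per_shard
    return ('%s_%d' % (variant.reference_name, shard_index), variant)
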
def run_pipeline(pipeline_args, known_args):
  """A beam pipeline to resize and pad images from urls and save to storage.

  Args:
    pipeline_args: Arguments consumed by the beam pipeline
    known_args: Extra args used to set various fields such as the dataset and
                table from which to read cat urls and labels, and the bucket
                and image directory to write processed images

  Returns:
    [nothing], just writes processed images to the image directory
  """

  # Specify pipeline options
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True

  # Determine bigquery source from dataset and table arguments
  query = ('SELECT ROW_NUMBER() OVER() as index, original_url, label, randnum'
           ' from [' + known_args.dataset + '.' + known_args.table + ']')
  bq_source = bigquery.BigQuerySource(query=query)

  logging.info('Starting image collection into directory %s',
               known_args.output_dir)

  # Create destination directory if it doesn't exist
  output_dir = known_args.output_dir
  if known_args.cloud:
    output_dir = 'gs://' + known_args.storage_bucket + '/' + output_dir

  # Directory needs to be explicitly made on some filesystems.
  if not FileSystems.exists(output_dir):
    FileSystems.mkdirs(output_dir)

  # Run pipeline
  with beam.Pipeline(options=pipeline_options) as p:
    _ = (p
         | 'read_rows_from_cat_info_table'
         >> beam.io.Read(bq_source)
         | 'fetch_images_from_urls'
         >> beam.Map(fetch_image_from_url)
         | 'filter_bad_or_absent_images'
         >> beam.Filter(filter_bad_or_missing_image)
         | 'resize_and_pad_images'
         >> beam.Map(resize_and_pad,
                     output_image_dim=known_args.output_image_dim)
         | 'write_images_to_storage'
         >> beam.Map(write_processed_image,
                     output_dir=output_dir)
         )

  logging.info('Done collecting images')
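
# The helpers `fetch_image_from_url` and `filter_bad_or_missing_image` used by
# the pipeline above are not shown here. A minimal sketch, assuming each
# BigQuery row arrives as a dict with an `original_url` field and that a
# failed download is marked with a None payload; the `image_bytes` field name
# is an assumption for illustration.
import urllib.error
import urllib.request


def fetch_image_from_url(row):
  """Downloads the image for a row, marking failures with None."""
  try:
    with urllib.request.urlopen(row['original_url'], timeout=10) as response:
      row['image_bytes'] = response.read()
  except (urllib.error.URLError, ValueError):
    row['image_bytes'] = None
  return row


def filter_bad_or_missing_image(row):
  """Keeps only rows whose image download succeeded."""
  return row.get('image_bytes') is not None
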
def _bigquery_to_vcf_shards(
        known_args,  # type: argparse.Namespace
        beam_pipeline_options,  # type: pipeline_options.PipelineOptions
        vcf_data_temp_folder,  # type: str
        vcf_data_header_file_path,  # type: str
):
    # type: (...) -> None
    """Runs BigQuery to VCF shards pipelines.

  It reads the variants from BigQuery table, groups a collection of variants
  within a contiguous region of the genome (the size of the collection is
  adjustable through flag `--number_of_bases_per_shard`), sorts them, and then
  writes to one VCF file. All VCF data files are saved in
  `vcf_data_temp_folder`.

  Also, it writes the data header to `vcf_data_header_file_path`.
  TODO(allieychen): Eventually, it also generates the meta information file.
  """
    bq_source = bigquery.BigQuerySource(
        query=_BASE_QUERY_TEMPLATE.format(INPUT_TABLE='.'.join(
            bigquery_util.parse_table_reference(known_args.input_table))),
        validate=True,
        use_standard_sql=True)

    with beam.Pipeline(options=beam_pipeline_options) as p:
        variants = (p
                    | 'ReadFromBigQuery' >> beam.io.Read(bq_source)
                    | bigquery_to_variant.BigQueryToVariant())
        call_names = (
            variants
            | 'CombineCallNames' >> combine_call_names.CallNamesCombiner())

        _ = (call_names
             | 'GenerateVcfDataHeader' >> beam.ParDo(
                 _write_vcf_data_header, _VCF_FIXED_COLUMNS,
                 vcf_data_header_file_path))

        _ = (variants
             | densify_variants.DensifyVariants(
                 beam.pvalue.AsSingleton(call_names))
             | 'PairVariantWithKey' >> beam.Map(
                 _pair_variant_with_key, known_args.number_of_bases_per_shard)
             | 'GroupVariantsByKey' >> beam.GroupByKey()
             | beam.ParDo(_get_file_path_and_sorted_variants,
                          vcf_data_temp_folder)
             | vcfio.WriteVcfDataLines())
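
# `_write_vcf_data_header` is referenced above but not defined in this
# snippet. A minimal sketch, assuming the input element is the combined list
# of call names and that the header is one tab-separated line written through
# Beam's FileSystems API:
from apache_beam.io import filesystems


def _write_vcf_data_header(call_names, vcf_fixed_columns, file_path):
    # Write the fixed VCF columns followed by one column per call name.
    with filesystems.FileSystems.create(file_path) as file_to_write:
        file_to_write.write(
            ('\t'.join(vcf_fixed_columns + call_names) + '\n').encode('utf-8'))
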
def run(argv=None):
    # type: (List[str]) -> None
    """Runs BigQuery to VCF pipeline."""
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = vcf_to_bq_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)
    bq_source = bigquery.BigQuerySource(
        query=_BASE_QUERY_TEMPLATE.format(INPUT_TABLE='.'.join(
            bigquery_util.parse_table_reference(known_args.input_table))),
        validate=True,
        use_standard_sql=True)

    options = pipeline_options.PipelineOptions(pipeline_args)
    with beam.Pipeline(options=options) as p:
        _ = (p | 'ReadFromBigQuery' >> beam.io.Read(bq_source)
             | bigquery_to_variant.BigQueryToVariant()
             | densify_variants.DensifyVariants()
             | vcfio.WriteToVcf(known_args.output_file))
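
# `_BASE_QUERY_TEMPLATE` is not shown in these snippets. A plausible stand-in,
# assuming the pipeline simply selects every column from the variants table
# (standard SQL, hence the backticks around the fully qualified table name);
# the actual template in the source may differ:
_BASE_QUERY_TEMPLATE = 'SELECT * FROM `{INPUT_TABLE}`'
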
def _bigquery_to_vcf_shards(
        known_args,  # type: argparse.Namespace
        beam_pipeline_options,  # type: pipeline_options.PipelineOptions
        vcf_data_temp_folder,  # type: str
        header_file_path,  # type: str
):
    # type: (...) -> None
    """Runs BigQuery to VCF shards pipelines.

  It reads the variants from BigQuery table, groups a collection of variants
  within a contiguous region of the genome (the size of the collection is
  adjustable through flag `--number_of_bases_per_shard`), sorts them, and then
  writes to one VCF file. All VCF data files are saved in
  `vcf_data_temp_folder`.

  Also, it writes the meta info and data header with the sample names to
  `vcf_header_file_path`.
  """
    schema = _get_schema(known_args.input_table)
    variant_query = _get_variant_query(known_args, schema)
    logging.info('Processing BigQuery query: %s', variant_query)
    project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
        known_args.input_table)
    bq_variant_source = bigquery.BigQuerySource(query=variant_query,
                                                validate=True,
                                                use_standard_sql=True)
    annotation_names = _extract_annotation_names(schema)

    base_table_id = bigquery_util.get_table_base_name(table_id)
    sample_query = _SAMPLE_INFO_QUERY_TEMPLATE.format(
        PROJECT_ID=project_id,
        DATASET_ID=dataset_id,
        TABLE_NAME=bigquery_util.compose_table_name(base_table_id,
                                                    SAMPLE_INFO_TABLE_SUFFIX))
    bq_sample_source = bigquery.BigQuerySource(query=sample_query,
                                               validate=True,
                                               use_standard_sql=True)
    with beam.Pipeline(options=beam_pipeline_options) as p:
        variants = (p
                    | 'ReadFromBigQuery' >> beam.io.Read(bq_variant_source)
                    | bigquery_to_variant.BigQueryToVariant(annotation_names))
        sample_table_rows = (
            p
            | 'ReadFromSampleTable' >> beam.io.Read(bq_sample_source))
        if known_args.sample_names:
            temp_sample_names = (p
                                 | transforms.Create(known_args.sample_names,
                                                     reshuffle=False))
        else:
            # Get sample names from sample IDs in the variants and sort.
            id_to_name_hash_table = (sample_table_rows
                                     | 'SampleIdToNameDict' >>
                                     sample_mapping_table.SampleIdToNameDict())
            temp_sample_ids = (
                variants
                | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner(
                    known_args.preserve_sample_order))
            temp_sample_names = (
                temp_sample_ids
                | 'GetSampleNames' >> sample_mapping_table.GetSampleNames(
                    beam.pvalue.AsSingleton(id_to_name_hash_table))
                | 'CombineToList' >> beam.combiners.ToList()
                # `sorted` acts as the DoFn here: it receives the single
                # combined list, sorts it, and Beam re-emits the sorted names
                # one by one, so downstream sees individual names again.
                | 'SortSampleNames' >> beam.ParDo(sorted))

        name_to_id_hash_table = (
            sample_table_rows
            | 'SampleNameToIdDict' >> sample_mapping_table.SampleNameToIdDict())
        sample_ids = (temp_sample_names
                      | 'GetSampleIds' >> sample_mapping_table.GetSampleIds(
                          beam.pvalue.AsSingleton(name_to_id_hash_table))
                      | 'CombineSortedSampleIds' >> beam.combiners.ToList())
        sample_names = temp_sample_names | beam.combiners.ToList()

        _ = (sample_names
             | 'GenerateVcfDataHeader' >> beam.ParDo(
                 _write_vcf_header_with_sample_names, _VCF_FIXED_COLUMNS,
                 known_args.representative_header_file, header_file_path))

        _ = (variants
             | densify_variants.DensifyVariants(
                 beam.pvalue.AsSingleton(sample_ids))
             | 'PairVariantWithKey' >> beam.Map(
                 _pair_variant_with_key, known_args.number_of_bases_per_shard)
             | 'GroupVariantsByKey' >> beam.GroupByKey()
             | beam.ParDo(_get_file_path_and_sorted_variants,
                          vcf_data_temp_folder)
             | vcfio.WriteVcfDataLines(known_args.bq_uses_1_based_coordinate))
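
# `_get_file_path_and_sorted_variants` is used by all three shard pipelines
# but never shown. A minimal sketch, assuming each element is a
# (shard key, variants) pair from the GroupByKey, that variants sort by their
# natural (positional) order, and that the writer downstream consumes
# (file path, sorted variants) tuples:
from apache_beam.io import filesystems


def _get_file_path_and_sorted_variants(shard, vcf_data_temp_folder):
    key, variants = shard
    # One shard becomes one VCF data file named after its region key.
    file_path = filesystems.FileSystems.join(vcf_data_temp_folder, str(key))
    yield file_path, sorted(variants)
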