Code example #1
    def __init__(
            self,
            output_path,  # type: str
            schema,  # type: bigquery.TableSchema
            allow_incompatible_records=False,  # type: bool
            omit_empty_sample_calls=False,  # type: bool
            null_numeric_value_replacement=None  # type: int
    ):
        # type: (...) -> None
        """Initializes the transform.

        Args:
          output_path: The path under which output Avro files are generated.
          schema: Schema of the table to be generated.
          allow_incompatible_records: If true, field values are cast to the
            BigQuery schema if there is a mismatch.
          omit_empty_sample_calls: If true, samples that don't have a given call
            will be omitted.
          null_numeric_value_replacement: The value to use instead of null for
            numeric (float/int/long) lists. For instance, [0, None, 1] will become
            [0, `null_numeric_value_replacement`, 1]. If not set, the value will be
            set to bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
        """
        self._output_path = output_path
        self._avro_schema = avro.schema.parse(
            schema_converter.convert_table_schema_to_json_avro_schema(schema))
        self._bigquery_row_generator = (
            bigquery_row_generator.VariantCallRowGenerator(
                bigquery_schema_descriptor.SchemaDescriptor(schema),
                vcf_field_conflict_resolver.FieldConflictResolver(
                    resolve_always=allow_incompatible_records),
                null_numeric_value_replacement))

        self._allow_incompatible_records = allow_incompatible_records
        self._omit_empty_sample_calls = omit_empty_sample_calls
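
The `null_numeric_value_replacement` behavior described in the docstring ([0, None, 1] becoming [0, `null_numeric_value_replacement`, 1]) is a simple element-wise substitution over numeric lists. Below is a minimal standalone sketch of that substitution; the helper name and the fallback constant are made up for illustration and do not come from bigquery_util:

    # Placeholder fallback; the real default lives in
    # bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
    _ASSUMED_DEFAULT_REPLACEMENT = -1

    def replace_nulls_in_numeric_list(values, replacement=None):
        # [0, None, 1] -> [0, replacement, 1], falling back to the assumed default.
        if replacement is None:
            replacement = _ASSUMED_DEFAULT_REPLACEMENT
        return [replacement if value is None else value for value in values]

    assert replace_nulls_in_numeric_list([0, None, 1], replacement=-1) == [0, -1, 1]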
Code example #2
    def setUp(self):
        self._schema_descriptor = bigquery_schema_descriptor.SchemaDescriptor(
            self._get_table_schema())
        self._conflict_resolver = (
            vcf_field_conflict_resolver.FieldConflictResolver())

        self._row_generator = bigquery_row_generator.VariantCallRowGenerator(
            self._schema_descriptor, self._conflict_resolver)
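
setUp above wires a SchemaDescriptor and a FieldConflictResolver into a shared VariantCallRowGenerator. Tests then either wrap the generator in the ConvertVariantToRow DoFn (as in the pipeline tests later in this section) or call it directly. A minimal direct-call sketch follows; the helper _get_sample_variant() is hypothetical, assumed to return a variant, its expected row, and a header-number dict like the other _get_sample_variant_* helpers in this file:

    def test_variant_converts_to_single_row(self):
        # Hypothetical test: build a ProcessedVariant and materialize the
        # generator output, mirroring the hom-ref test at the end of this section.
        variant, expected_row, header_num_dict = self._get_sample_variant()
        header_fields = vcf_header_util.make_header(header_num_dict)
        proc_var = processed_variant.ProcessedVariantFactory(
            header_fields).create_processed_variant(variant)
        self.assertEqual([expected_row],
                         list(self._row_generator.get_rows(proc_var)))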
Code example #3
    def __init__(
            self,
            output_table,  # type: str
            schema,  # type: bigquery.TableSchema
            append=False,  # type: bool
            allow_incompatible_records=False,  # type: bool
            omit_empty_sample_calls=False,  # type: bool
            num_bigquery_write_shards=1,  # type: int
            null_numeric_value_replacement=None,  # type: int
            include_call_name=False,  # type: bool
            move_hom_ref_calls=False  # type: bool
    ):
        # type: (...) -> None
        """Initializes the transform.

        Args:
          output_table: Full path of the output BigQuery table.
          schema: Schema of the table to be generated.
          append: If true, existing records in output_table will not be
            overwritten. New records will be appended to those that already exist.
          allow_incompatible_records: If true, field values are cast to the
            BigQuery schema if there is a mismatch.
          omit_empty_sample_calls: If true, samples that don't have a given call
            will be omitted.
          num_bigquery_write_shards: If > 1, we will limit the number of sources
            used for writing to the output BigQuery table.
          null_numeric_value_replacement: The value to use instead of null for
            numeric (float/int/long) lists. For instance, [0, None, 1] will become
            [0, `null_numeric_value_replacement`, 1]. If not set, the value will be
            set to bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
          include_call_name: If true, the sample name will be included in addition
            to the sample ID.
          move_hom_ref_calls: If true, filter 0 GT (hom-ref) calls out of the call
            list and add the call names to a hom_ref_calls column.
        """
        self._output_table = output_table
        self._append = append
        self._schema = schema
        # Resolver makes extra effort to resolve conflict when flag
        # allow_incompatible_records is set.
        self._bigquery_row_generator = (
            bigquery_row_generator.VariantCallRowGenerator(
                bigquery_schema_descriptor.SchemaDescriptor(self._schema),
                vcf_field_conflict_resolver.FieldConflictResolver(
                    resolve_always=allow_incompatible_records),
                null_numeric_value_replacement, include_call_name,
                move_hom_ref_calls))

        self._allow_incompatible_records = allow_incompatible_records
        self._omit_empty_sample_calls = omit_empty_sample_calls
        self._num_bigquery_write_shards = num_bigquery_write_shards
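
Only __init__ appears in this excerpt, so how `append` and `num_bigquery_write_shards` are consumed downstream is not visible here. The sketch below shows one common way an `append` flag maps onto a Beam BigQuery write disposition; it is an assumption for illustration, not this transform's actual expand():

    import apache_beam as beam

    def _write_disposition(append):
        # Assumed mapping: appending keeps existing rows, otherwise the table is
        # truncated before writing. WRITE_APPEND and WRITE_TRUNCATE are standard
        # Beam BigQuery dispositions.
        return (beam.io.BigQueryDisposition.WRITE_APPEND if append
                else beam.io.BigQueryDisposition.WRITE_TRUNCATE)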
Code example #4
    def __init__(
            self,
            output_path,  # type: str
            header_fields,  # type: vcf_header_io.VcfHeader
            proc_var_factory,  # type: processed_variant.ProcessedVariantFactory
            variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
            allow_incompatible_records=False,  # type: bool
            omit_empty_sample_calls=False,  # type: bool
            null_numeric_value_replacement=None  # type: int
    ):
        # type: (...) -> None
        """Initializes the transform.

        Args:
          output_path: The path under which output Avro files are generated.
          header_fields: Representative header fields for all variants. This is
            needed for dynamically generating the schema.
          proc_var_factory: The factory class that knows how to convert Variant
            instances to ProcessedVariant. As a side effect it also knows how to
            modify the BigQuery schema based on the ProcessedVariants that it
            generates. The latter functionality is what is needed here.
          variant_merger: The strategy used for merging variants (if any). Some
            strategies may change the schema, which is why this may be needed here.
          allow_incompatible_records: If true, field values are cast to the
            BigQuery schema if there is a mismatch.
          omit_empty_sample_calls: If true, samples that don't have a given call
            will be omitted.
          null_numeric_value_replacement: The value to use instead of null for
            numeric (float/int/long) lists. For instance, [0, None, 1] will become
            [0, `null_numeric_value_replacement`, 1]. If not set, the value will be
            set to bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
        """
        self._output_path = output_path
        self._proc_var_factory = proc_var_factory
        table_schema = (schema_converter.generate_schema_from_header_fields(
            header_fields, proc_var_factory, variant_merger))
        self._avro_schema = avro.schema.parse(
            schema_converter.convert_table_schema_to_json_avro_schema(
                table_schema))
        self._bigquery_row_generator = (
            bigquery_row_generator.VariantCallRowGenerator(
                bigquery_schema_descriptor.SchemaDescriptor(table_schema),
                vcf_field_conflict_resolver.FieldConflictResolver(
                    resolve_always=allow_incompatible_records),
                null_numeric_value_replacement))

        self._allow_incompatible_records = allow_incompatible_records
        self._omit_empty_sample_calls = omit_empty_sample_calls
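
The parsed self._avro_schema and self._output_path stored above are presumably consumed by the transform's expand(), which is not part of this excerpt. A hedged sketch of what such an expand() might look like follows; it is not the actual implementation, and whether WriteToAvro expects an avro.schema object or a plain dict depends on the Beam version in use:

    import apache_beam as beam

    def expand(self, processed_variants):
        # Assumed pipeline shape: one row dict per processed variant, written as
        # Avro files under the configured output path.
        return (
            processed_variants
            | 'ConvertToRow' >> beam.FlatMap(self._bigquery_row_generator.get_rows)
            | 'WriteToAvro' >> beam.io.WriteToAvro(self._output_path,
                                                   self._avro_schema))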
Code example #5
    def test_convert_variant_to_bigquery_row_allow_incompatible_records(self):
        variant, row, header_num_dict = (
            self._get_sample_variant_with_incompatible_records())
        header_fields = vcf_header_util.make_header(header_num_dict)
        proc_var = processed_variant.ProcessedVariantFactory(
            header_fields).create_processed_variant(variant)
        pipeline = TestPipeline(blocking=True)
        bigquery_rows = (
            pipeline
            | Create([proc_var])
            | 'ConvertToRow' >> beam.ParDo(
                ConvertVariantToRow(self._row_generator,
                                    allow_incompatible_records=True)))
        assert_that(bigquery_rows, equal_to([row]))
        pipeline.run()

        self._row_generator = bigquery_row_generator.VariantCallRowGenerator(
            self._schema_descriptor, self._conflict_resolver)
Code example #6
    def test_convert_variant_with_sample_name_to_bigquery_row(self):
        self._row_generator = bigquery_row_generator.VariantCallRowGenerator(
            self._schema_descriptor,
            self._conflict_resolver,
            include_call_name=True)
        variant, row, header_num_dict = (
            self._get_sample_variant_with_sample_name())
        header_fields = vcf_header_util.make_header(header_num_dict)
        proc_var = processed_variant.ProcessedVariantFactory(
            header_fields).create_processed_variant(variant)
        pipeline = TestPipeline(blocking=True)
        bigquery_rows = (
            pipeline
            | Create([proc_var])
            | 'ConvertToRow' >> beam.ParDo(
                ConvertVariantToRow(self._row_generator,
                                    omit_empty_sample_calls=True)))
        assert_that(bigquery_rows, equal_to([row]))
        pipeline.run()
Code example #7
    def __init__(
            self,
            output_table,  # type: str
            header_fields,  # type: vcf_header_io.VcfHeader
            variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
            proc_var_factory=None,  # type: processed_variant.ProcessedVariantFactory
            # TODO(bashir2): proc_var_factory is a required argument and if `None` is
            # supplied this will fail in schema generation.
            append=False,  # type: bool
            update_schema_on_append=False,  # type: bool
            allow_incompatible_records=False,  # type: bool
            omit_empty_sample_calls=False,  # type: bool
            num_bigquery_write_shards=1,  # type: int
            null_numeric_value_replacement=None  # type: int
    ):
        # type: (...) -> None
        """Initializes the transform.

        Args:
          output_table: Full path of the output BigQuery table.
          header_fields: Representative header fields for all variants. This is
            needed for dynamically generating the schema.
          variant_merger: The strategy used for merging variants (if any). Some
            strategies may change the schema, which is why this may be needed here.
          proc_var_factory: The factory class that knows how to convert Variant
            instances to ProcessedVariant. As a side effect it also knows how to
            modify the BigQuery schema based on the ProcessedVariants that it
            generates. The latter functionality is what is needed here.
          append: If true, existing records in output_table will not be
            overwritten. New records will be appended to those that already exist.
          update_schema_on_append: If true, the BigQuery schema will be updated by
            combining the existing schema and the new schema if they are compatible.
          allow_incompatible_records: If true, field values are cast to the
            BigQuery schema if there is a mismatch.
          omit_empty_sample_calls: If true, samples that don't have a given call
            will be omitted.
          num_bigquery_write_shards: If > 1, we will limit the number of sources
            used for writing to the output BigQuery table.
          null_numeric_value_replacement: The value to use instead of null for
            numeric (float/int/long) lists. For instance, [0, None, 1] will become
            [0, `null_numeric_value_replacement`, 1]. If not set, the value will be
            set to bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
        """
        self._output_table = output_table
        self._header_fields = header_fields
        self._variant_merger = variant_merger
        self._proc_var_factory = proc_var_factory
        self._append = append
        self._schema = (schema_converter.generate_schema_from_header_fields(
            self._header_fields, self._proc_var_factory, self._variant_merger))
        # Resolver makes extra effort to resolve conflict when flag
        # allow_incompatible_records is set.
        self._bigquery_row_generator = (
            bigquery_row_generator.VariantCallRowGenerator(
                bigquery_schema_descriptor.SchemaDescriptor(self._schema),
                vcf_field_conflict_resolver.FieldConflictResolver(
                    resolve_always=allow_incompatible_records),
                null_numeric_value_replacement))

        self._allow_incompatible_records = allow_incompatible_records
        self._omit_empty_sample_calls = omit_empty_sample_calls
        self._num_bigquery_write_shards = num_bigquery_write_shards
        if update_schema_on_append:
            bigquery_util.update_bigquery_schema_on_append(
                self._schema.fields, self._output_table)
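
When update_schema_on_append is set, the constructor above delegates to bigquery_util.update_bigquery_schema_on_append, whose implementation is not shown here. As a rough illustration of what "combining the existing schema and the new schema" can mean, the sketch below merges field lists by name; it is not the library's actual logic and omits real compatibility checks:

    def merge_schema_fields(existing_fields, new_fields):
        # Keep every existing field and append any new field whose name is not
        # already present. Type/mode/nested-field compatibility checks are
        # omitted in this sketch.
        existing_names = {field.name for field in existing_fields}
        merged = list(existing_fields)
        for field in new_fields:
            if field.name not in existing_names:
                merged.append(field)
        return merged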
Code example #8
    def test_all_fields_with_hom_ref(self):
        schema_descriptor = bigquery_schema_descriptor.SchemaDescriptor(
            _get_table_schema(move_hom_ref_calls=True))
        conflict_resolver = (
            vcf_field_conflict_resolver.FieldConflictResolver())

        variant = vcfio.Variant(
            reference_name='chr19',
            start=11,
            end=12,
            reference_bases='C',
            alternate_bases=['A', 'TT'],
            names=['rs1', 'rs2'],
            quality=2,
            filters=['PASS'],
            info={
                'IFR': [0.1, 0.2],
                'IFR2': [0.2, 0.3],
                'IS': 'some data',
                'ISR': ['data1', 'data2']
            },
            hom_ref_calls=[('Sample2', hash_name('Sample2')),
                           ('Sample3', hash_name('Sample3'))],
            calls=[
                vcfio.VariantCall(sample_id=hash_name('Sample1'),
                                  name='Sample1',
                                  genotype=[0, 1],
                                  phaseset='*',
                                  info={
                                      'GQ': 20,
                                      'FIR': [10, 20]
                                  })
            ])
        header_num_dict = {'IFR': 'A', 'IFR2': 'A', 'IS': '1', 'ISR': '2'}
        expected_row = {
            ColumnKeyConstants.REFERENCE_NAME: 'chr19',
            ColumnKeyConstants.START_POSITION: 11,
            ColumnKeyConstants.END_POSITION: 12,
            ColumnKeyConstants.REFERENCE_BASES: 'C',
            ColumnKeyConstants.ALTERNATE_BASES: [
                {ColumnKeyConstants.ALTERNATE_BASES_ALT: 'A',
                 'IFR': 0.1, 'IFR2': 0.2},
                {ColumnKeyConstants.ALTERNATE_BASES_ALT: 'TT',
                 'IFR': 0.2, 'IFR2': 0.3}],
            ColumnKeyConstants.NAMES: ['rs1', 'rs2'],
            ColumnKeyConstants.QUALITY: 2,
            ColumnKeyConstants.FILTER: ['PASS'],
            ColumnKeyConstants.HOM_REF_CALLS: [
                {ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample2'),
                 ColumnKeyConstants.CALLS_NAME: 'Sample2'},
                {ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample3'),
                 ColumnKeyConstants.CALLS_NAME: 'Sample3'}],
            ColumnKeyConstants.CALLS: [
                {ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample1'),
                 ColumnKeyConstants.CALLS_NAME: 'Sample1',
                 ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
                 ColumnKeyConstants.CALLS_PHASESET: '*',
                 'GQ': 20, 'FIR': [10, 20]}],
            'IS': 'some data',
            'ISR': ['data1', 'data2']
        }
        proc_variant = _get_processed_variant(variant, header_num_dict)
        row_generator = bigquery_row_generator.VariantCallRowGenerator(
            schema_descriptor,
            conflict_resolver,
            include_call_name=True,
            move_hom_ref_calls=True)
        self.assertEqual([expected_row],
                         list(row_generator.get_rows(proc_variant)))