Esempio n. 1
0
    def setUp(self):
        """Creates the fixtures shared by the tests.

        Builds a schema descriptor from `_get_table_schema()`, a default
        field conflict resolver, and a row generator wired to both.
        """
        descriptor = bigquery_schema_descriptor.SchemaDescriptor(
            _get_table_schema())
        self._schema_descriptor = descriptor

        resolver = vcf_field_conflict_resolver.FieldConflictResolver()
        self._conflict_resolver = resolver

        # The generator shares the very same descriptor/resolver instances
        # that are exposed on the test case.
        generator_cls = bigquery_vcf_data_converter.BigQueryRowGenerator
        self._row_generator = generator_cls(descriptor, resolver)
    def __init__(
            self,
            output_path,  # type: str
            header_fields,  # type: vcf_header_io.VcfHeader
            proc_var_factory,  # type: processed_variant.ProcessedVariantFactory
            variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
            allow_incompatible_records=False,  # type: bool
            omit_empty_sample_calls=False,  # type: bool
            null_numeric_value_replacement=None  # type: int
    ):
        # type: (...) -> None
        """Initializes the transform.

        Args:
          output_path: The path under which output Avro files are generated.
          header_fields: Representative header fields for all variants. This
            is needed for dynamically generating the schema.
          proc_var_factory: The factory class that knows how to convert
            Variant instances to ProcessedVariant. As a side effect it also
            knows how to modify BigQuery schema based on the
            ProcessedVariants that it generates. The latter functionality is
            what is needed here.
          variant_merger: The strategy used for merging variants (if any).
            Some strategies may change the schema, which is why this may be
            needed here.
          allow_incompatible_records: If true, field values are cast to the
            BigQuery schema if there is a mismatch.
          omit_empty_sample_calls: If true, samples that don't have a given
            call will be omitted.
          null_numeric_value_replacement: The value to use instead of null
            for numeric (float/int/long) lists. For instance, [0, None, 1]
            will become [0, `null_numeric_value_replacement`, 1]. If not
            set, the value will be set to
            bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
        """
        self._output_path = output_path
        self._proc_var_factory = proc_var_factory

        # A single table schema drives both the Avro schema and the row
        # generator, so it is generated once and reused below.
        generated_schema = schema_converter.generate_schema_from_header_fields(
            header_fields, proc_var_factory, variant_merger)

        avro_schema_json = (
            schema_converter.convert_table_schema_to_json_avro_schema(
                generated_schema))
        self._avro_schema = avro.schema.parse(avro_schema_json)

        descriptor = bigquery_schema_descriptor.SchemaDescriptor(
            generated_schema)
        # The resolver is told to always resolve when incompatible records
        # are allowed.
        resolver = vcf_field_conflict_resolver.FieldConflictResolver(
            resolve_always=allow_incompatible_records)
        self._bigquery_row_generator = (
            bigquery_vcf_data_converter.BigQueryRowGenerator(
                descriptor, resolver, null_numeric_value_replacement))

        self._allow_incompatible_records = allow_incompatible_records
        self._omit_empty_sample_calls = omit_empty_sample_calls
Esempio n. 3
0
    def __init__(
            self,
            output_table,  # type: str
            header_fields,  # type: vcf_header_io.VcfHeader
            variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
            proc_var_factory=None,  # type: processed_variant.ProcessedVariantFactory
            append=False,  # type: bool
            update_schema_on_append=False,  # type: bool
            allow_incompatible_records=False,  # type: bool
            omit_empty_sample_calls=False,  # type: bool
            num_bigquery_write_shards=1,  # type: int
            null_numeric_value_replacement=None  # type: int
    ):
        # type: (...) -> None
        """Initializes the transform.

        Args:
          output_table: Full path of the output BigQuery table.
          header_fields: Representative header fields for all variants. This
            is needed for dynamically generating the schema.
          variant_merger: The strategy used for merging variants (if any).
            Some strategies may change the schema, which is why this may be
            needed here.
          proc_var_factory: The factory class that knows how to convert
            Variant instances to ProcessedVariant. As a side effect it also
            knows how to modify BigQuery schema based on the
            ProcessedVariants that it generates. The latter functionality is
            what is needed here.
          append: If true, existing records in output_table will not be
            overwritten. New records will be appended to those that already
            exist.
          update_schema_on_append: If true, BigQuery schema will be updated
            by combining the existing schema and the new schema if they are
            compatible.
          allow_incompatible_records: If true, field values are cast to the
            BigQuery schema if there is a mismatch.
          omit_empty_sample_calls: If true, samples that don't have a given
            call will be omitted.
          num_bigquery_write_shards: If > 1, we will limit number of sources
            which are used for writing to the output BigQuery table.
          null_numeric_value_replacement: The value to use instead of null
            for numeric (float/int/long) lists. For instance, [0, None, 1]
            will become [0, `null_numeric_value_replacement`, 1]. If not
            set, the value will be set to
            bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
        """
        self._output_table = output_table
        self._header_fields = header_fields
        self._variant_merger = variant_merger
        self._proc_var_factory = proc_var_factory
        self._append = append

        generated_schema = (
            bigquery_vcf_schema_converter.generate_schema_from_header_fields(
                header_fields, proc_var_factory, variant_merger))
        self._schema = generated_schema

        descriptor = bigquery_schema_descriptor.SchemaDescriptor(
            generated_schema)
        # With allow_incompatible_records set, the resolver makes an extra
        # effort to resolve conflicts.
        resolver = vcf_field_conflict_resolver.FieldConflictResolver(
            resolve_always=allow_incompatible_records)
        self._bigquery_row_generator = (
            bigquery_vcf_data_converter.BigQueryRowGenerator(
                descriptor, resolver, null_numeric_value_replacement))

        self._allow_incompatible_records = allow_incompatible_records
        self._omit_empty_sample_calls = omit_empty_sample_calls
        self._num_bigquery_write_shards = num_bigquery_write_shards

        # Kept as the final step so every attribute assigned above exists
        # before the schema update runs.
        if update_schema_on_append:
            self._update_bigquery_schema_on_append()