def setUp(self):
  self._schema_descriptor = bigquery_schema_descriptor.SchemaDescriptor(
      _get_table_schema())
  self._conflict_resolver = (
      vcf_field_conflict_resolver.FieldConflictResolver())
  self._row_generator = bigquery_vcf_data_converter.BigQueryRowGenerator(
      self._schema_descriptor, self._conflict_resolver)
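# A minimal sketch of how this fixture might be exercised. It assumes
# BigQueryRowGenerator exposes a get_rows(variant) method that yields one row
# dict per variant, and that vcfio.Variant and a 'reference_name' column exist
# as shown; these are assumptions for illustration, not part of the code above.
def test_get_rows_smoke(self):
  variant = vcfio.Variant(
      reference_name='chr19', start=11, end=12, reference_bases='C',
      alternate_bases=['A'])
  rows = list(self._row_generator.get_rows(variant))
  self.assertEqual(1, len(rows))
  self.assertEqual('chr19', rows[0]['reference_name'])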
def __init__(
    self,
    output_path,  # type: str
    header_fields,  # type: vcf_header_io.VcfHeader
    proc_var_factory,  # type: processed_variant.ProcessedVariantFactory
    variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
    allow_incompatible_records=False,  # type: bool
    omit_empty_sample_calls=False,  # type: bool
    null_numeric_value_replacement=None  # type: int
    ):
  # type: (...) -> None
  """Initializes the transform.

  Args:
    output_path: The path under which output Avro files are generated.
    header_fields: Representative header fields for all variants. This is
      needed for dynamically generating the schema.
    proc_var_factory: The factory class that knows how to convert Variant
      instances to ProcessedVariant. As a side effect it also knows how to
      modify the BigQuery schema based on the ProcessedVariants that it
      generates. The latter functionality is what is needed here.
    variant_merger: The strategy used for merging variants (if any). Some
      strategies may change the schema, which is why this may be needed
      here.
    allow_incompatible_records: If true, field values are cast to the
      BigQuery schema if there is a mismatch.
    omit_empty_sample_calls: If true, samples that don't have a given call
      will be omitted.
    null_numeric_value_replacement: The value to use instead of null for
      numeric (float/int/long) lists. For instance, [0, None, 1] will
      become [0, `null_numeric_value_replacement`, 1]. If not set, the
      value will be set to
      bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
  """
  self._output_path = output_path
  self._proc_var_factory = proc_var_factory
  table_schema = (
      schema_converter.generate_schema_from_header_fields(
          header_fields, proc_var_factory, variant_merger))
  self._avro_schema = avro.schema.parse(
      schema_converter.convert_table_schema_to_json_avro_schema(
          table_schema))
  self._bigquery_row_generator = (
      bigquery_vcf_data_converter.BigQueryRowGenerator(
          bigquery_schema_descriptor.SchemaDescriptor(table_schema),
          vcf_field_conflict_resolver.FieldConflictResolver(
              resolve_always=allow_incompatible_records),
          null_numeric_value_replacement))
  self._allow_incompatible_records = allow_incompatible_records
  self._omit_empty_sample_calls = omit_empty_sample_calls
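# A hedged usage sketch: assuming this __init__ belongs to a Beam PTransform
# (called VariantsToAvroFiles here purely for illustration), it would be
# applied to a PCollection of variants roughly as below. known_args,
# header_fields, and proc_var_factory stand for objects built earlier in the
# pipeline setup and are assumptions, not part of the code above.
_ = (
    variants
    | 'VariantToAvro' >> VariantsToAvroFiles(
        known_args.output_avro_path,
        header_fields,
        proc_var_factory,
        allow_incompatible_records=known_args.allow_incompatible_records,
        omit_empty_sample_calls=known_args.omit_empty_sample_calls))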
def __init__(
    self,
    output_table,  # type: str
    header_fields,  # type: vcf_header_io.VcfHeader
    variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
    proc_var_factory=None,  # type: processed_variant.ProcessedVariantFactory
    append=False,  # type: bool
    update_schema_on_append=False,  # type: bool
    allow_incompatible_records=False,  # type: bool
    omit_empty_sample_calls=False,  # type: bool
    num_bigquery_write_shards=1,  # type: int
    null_numeric_value_replacement=None  # type: int
    ):
  # type: (...) -> None
  """Initializes the transform.

  Args:
    output_table: Full path of the output BigQuery table.
    header_fields: Representative header fields for all variants. This is
      needed for dynamically generating the schema.
    variant_merger: The strategy used for merging variants (if any). Some
      strategies may change the schema, which is why this may be needed
      here.
    proc_var_factory: The factory class that knows how to convert Variant
      instances to ProcessedVariant. As a side effect it also knows how to
      modify the BigQuery schema based on the ProcessedVariants that it
      generates. The latter functionality is what is needed here.
    append: If true, existing records in output_table will not be
      overwritten. New records will be appended to those that already
      exist.
    update_schema_on_append: If true, the BigQuery schema will be updated
      by combining the existing schema and the new schema if they are
      compatible.
    allow_incompatible_records: If true, field values are cast to the
      BigQuery schema if there is a mismatch.
    omit_empty_sample_calls: If true, samples that don't have a given call
      will be omitted.
    num_bigquery_write_shards: If > 1, we will limit the number of sources
      which are used for writing to the output BigQuery table.
    null_numeric_value_replacement: The value to use instead of null for
      numeric (float/int/long) lists. For instance, [0, None, 1] will
      become [0, `null_numeric_value_replacement`, 1]. If not set, the
      value will be set to
      bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
  """
  self._output_table = output_table
  self._header_fields = header_fields
  self._variant_merger = variant_merger
  self._proc_var_factory = proc_var_factory
  self._append = append
  self._schema = (
      bigquery_vcf_schema_converter.generate_schema_from_header_fields(
          self._header_fields, self._proc_var_factory, self._variant_merger))
  # The resolver makes an extra effort to resolve conflicts when the
  # allow_incompatible_records flag is set.
  self._bigquery_row_generator = (
      bigquery_vcf_data_converter.BigQueryRowGenerator(
          bigquery_schema_descriptor.SchemaDescriptor(self._schema),
          vcf_field_conflict_resolver.FieldConflictResolver(
              resolve_always=allow_incompatible_records),
          null_numeric_value_replacement))
  self._allow_incompatible_records = allow_incompatible_records
  self._omit_empty_sample_calls = omit_empty_sample_calls
  self._num_bigquery_write_shards = num_bigquery_write_shards
  if update_schema_on_append:
    self._update_bigquery_schema_on_append()
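# A hedged usage sketch for the BigQuery counterpart (called VariantToBigQuery
# here for illustration). The flags mirror the constructor parameters
# documented above; known_args, header_fields, variant_merger, and
# proc_var_factory are assumed pipeline-setup objects, not defined in the
# code above.
_ = (
    variants
    | 'VariantToBigQuery' >> VariantToBigQuery(
        known_args.output_table,
        header_fields,
        variant_merger,
        proc_var_factory,
        append=known_args.append,
        update_schema_on_append=known_args.update_schema_on_append,
        allow_incompatible_records=known_args.allow_incompatible_records,
        omit_empty_sample_calls=known_args.omit_empty_sample_calls,
        num_bigquery_write_shards=known_args.num_bigquery_write_shards))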