def _validate_reserved_field_mode(field_schema, reserved_definition):
  """Checks that a reserved field's schema mode agrees with the VCF spec.

  A missing (falsy) ``mode`` on ``field_schema`` is treated as NULLABLE
  before the comparison.

  Args:
    field_schema: Schema field whose ``mode`` and ``name`` are read.
    reserved_definition: Reserved field definition; its ``num`` is converted
      to the expected BigQuery mode.

  Raises:
    ValueError: If the schema mode differs from the mode derived from
      ``reserved_definition.num``.
  """
  actual_mode = field_schema.mode
  if not actual_mode:
    # An unset mode defaults to NULLABLE.
    actual_mode = bigquery_util.TableFieldConstants.MODE_NULLABLE

  expected_mode = bigquery_util.get_bigquery_mode_from_vcf_num(
      reserved_definition.num)

  if actual_mode != expected_mode:
    template = (
        'The mode of field {} is different from the VCF spec: {} vs {}.')
    raise ValueError(
        template.format(field_schema.name, actual_mode, expected_mode))
def generate_schema_from_header_fields(
    header_fields,  # type: vcf_header_io.VcfHeader
    proc_variant_factory,  # type: processed_variant.ProcessedVariantFactory
    variant_merger=None  # type: variant_merge_strategy.VariantMergeStrategy
    ):
  # type: (...) -> bigquery.TableSchema
  """Returns a ``TableSchema`` for the BigQuery table storing variants.

  The schema is assembled in this order: the fixed variant columns, the
  repeated calls record (fixed call columns followed by one column per FORMAT
  header entry), and finally one column per INFO header entry.

  Args:
    header_fields: Representative header fields for all variants.
    proc_variant_factory: The factory class that knows how to convert Variant
      instances to ProcessedVariant. As a side effect it also knows how to
      modify BigQuery schema based on the ProcessedVariants that it generates.
      The latter functionality is what is needed here.
    variant_merger: The strategy used for merging variants (if any). Some
      strategies may change the schema, which is why this may be needed here.

  Returns:
    The populated ``bigquery.TableSchema``.
  """
  # NOTE(review): this file contains a later definition with the same name;
  # when both are present in one module the later one replaces this one.
  # Confirm which definition is meant to be live.
  columns = bigquery_util.ColumnKeyConstants
  consts = bigquery_util.TableFieldConstants

  def _field(name, field_type, mode, description):
    # Thin wrapper so that each column below reads as a single entry.
    return bigquery.TableFieldSchema(
        name=name, type=field_type, mode=mode, description=description)

  def _field_from_header(key, header_field):
    # Converts one VCF header entry (FORMAT or INFO) to a sanitized field.
    return _field(
        bigquery_sanitizer.SchemaSanitizer.get_sanitized_field_name(key),
        bigquery_util.get_bigquery_type_from_vcf_type(
            header_field[_HeaderKeyConstants.TYPE]),
        bigquery_util.get_bigquery_mode_from_vcf_num(
            header_field[_HeaderKeyConstants.NUM]),
        bigquery_sanitizer.SchemaSanitizer.get_sanitized_string(
            header_field[_HeaderKeyConstants.DESC]))

  schema = bigquery.TableSchema()
  schema.fields.append(_field(
      columns.REFERENCE_NAME, consts.TYPE_STRING, consts.MODE_NULLABLE,
      'Reference name.'))
  schema.fields.append(_field(
      columns.START_POSITION, consts.TYPE_INTEGER, consts.MODE_NULLABLE,
      ('Start position (0-based). Corresponds to the first base '
       'of the string of reference bases.')))
  schema.fields.append(_field(
      columns.END_POSITION, consts.TYPE_INTEGER, consts.MODE_NULLABLE,
      ('End position (0-based). Corresponds to the first base '
       'after the last base in the reference allele.')))
  schema.fields.append(_field(
      columns.REFERENCE_BASES, consts.TYPE_STRING, consts.MODE_NULLABLE,
      'Reference bases.'))
  schema.fields.append(proc_variant_factory.create_alt_bases_field_schema())
  schema.fields.append(_field(
      columns.NAMES, consts.TYPE_STRING, consts.MODE_REPEATED,
      'Variant names (e.g. RefSNP ID).'))
  schema.fields.append(_field(
      columns.QUALITY, consts.TYPE_FLOAT, consts.MODE_NULLABLE,
      ('Phred-scaled quality score (-10log10 prob(call is wrong)). '
       'Higher values imply better quality.')))
  schema.fields.append(_field(
      columns.FILTER, consts.TYPE_STRING, consts.MODE_REPEATED,
      ('List of failed filters (if any) or "PASS" indicating the '
       'variant has passed all filters.')))

  # Add calls.
  calls_record = _field(
      columns.CALLS, consts.TYPE_RECORD, consts.MODE_REPEATED,
      'One record for each call.')
  calls_record.fields.append(_field(
      columns.CALLS_NAME, consts.TYPE_STRING, consts.MODE_NULLABLE,
      'Name of the call.'))
  calls_record.fields.append(_field(
      columns.CALLS_GENOTYPE, consts.TYPE_INTEGER, consts.MODE_REPEATED,
      ('Genotype of the call. "-1" is used in cases where the '
       'genotype is not called.')))
  calls_record.fields.append(_field(
      columns.CALLS_PHASESET, consts.TYPE_STRING, consts.MODE_NULLABLE,
      ('Phaseset of the call (if any). "*" is used in cases where '
       'the genotype is phased, but no phase set ("PS" in FORMAT) '
       'was specified.')))
  # `items()` rather than the Python 2-only `iteritems()`, so that this also
  # runs on Python 3.
  for key, field in header_fields.formats.items():
    # GT and PS are already included in 'genotype' and 'phaseset' fields.
    if key in (vcfio.GENOTYPE_FORMAT_KEY, vcfio.PHASESET_FORMAT_KEY):
      continue
    calls_record.fields.append(_field_from_header(key, field))
  schema.fields.append(calls_record)

  # Add info fields.
  info_keys = set()
  for key, field in header_fields.infos.items():
    # END info is already included by modifying the end_position.
    if (key == vcfio.END_INFO_KEY or
        proc_variant_factory.info_is_in_alt_bases(key)):
      continue
    schema.fields.append(_field_from_header(key, field))
    info_keys.add(key)

  if variant_merger:
    # The merge strategy gets the schema and the INFO keys that were added.
    variant_merger.modify_bigquery_schema(schema, info_keys)
  return schema
def generate_schema_from_header_fields(
    header_fields,  # type: vcf_header_io.VcfHeader
    proc_variant_factory,  # type: processed_variant.ProcessedVariantFactory
    variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
    use_1_based_coordinate=False,  # type: bool
    include_call_name=False,  # type: bool
    move_hom_ref_calls=False  # type: bool
    ):
  # type: (...) -> bigquery.TableSchema
  """Builds the ``TableSchema`` of the BigQuery table that stores variants.

  Column order: the fixed variant columns, an optional hom-ref calls record,
  the calls record (fixed call columns followed by one column per FORMAT
  header entry), then one column per INFO header entry that is not skipped.

  Args:
    header_fields: Representative header fields for all variants.
    proc_variant_factory: The factory class that knows how to convert Variant
      instances to ProcessedVariant. It supplies the alternate bases field
      schema, tells which INFO keys live in alternate bases and which INFO
      keys are annotation type keys.
    variant_merger: The strategy used for merging variants (if any). When
      given, it is handed the schema and the set of added INFO keys so that
      it can modify the schema.
    use_1_based_coordinate: Selects the wording ('1-based' if True, otherwise
      '0-based') used in the start position column description.
    include_call_name: If True, a call name column is added next to the
      sample ID column in the calls record (and in the hom-ref calls record
      when that record is present).
    move_hom_ref_calls: If True, an extra repeated record for hom-ref calls is
      added before the calls record.

  Returns:
    The populated ``bigquery.TableSchema``.
  """
  keys = bigquery_util.ColumnKeyConstants
  consts = bigquery_util.TableFieldConstants

  def _make_field(name, field_type, mode, description):
    # Single place where a TableFieldSchema is constructed.
    return bigquery.TableFieldSchema(
        name=name, type=field_type, mode=mode, description=description)

  def _add_sample_fields(record):
    # Sample ID (and optionally the call name) are shared by the calls record
    # and the hom-ref calls record.
    record.fields.append(_make_field(
        keys.CALLS_SAMPLE_ID, consts.TYPE_INTEGER, consts.MODE_NULLABLE,
        'Unique ID (type INT64) assigned to each sample. Table with '
        '`__sample_info` suffix contains the mapping of sample names '
        '(as read from VCF header) to these assigned IDs.'))
    if include_call_name:
      record.fields.append(_make_field(
          keys.CALLS_NAME, consts.TYPE_STRING, consts.MODE_NULLABLE,
          'Name of the call (sample names in the VCF Header line).'))

  def _make_header_field(header_key, header_entry):
    # Translates one FORMAT/INFO header entry into a sanitized column.
    sanitizer = bigquery_sanitizer.SchemaSanitizer
    return _make_field(
        sanitizer.get_sanitized_field_name(header_key),
        bigquery_util.get_bigquery_type_from_vcf_type(
            header_entry[_HeaderKeyConstants.TYPE]),
        bigquery_util.get_bigquery_mode_from_vcf_num(
            header_entry[_HeaderKeyConstants.NUM]),
        sanitizer.get_sanitized_string(
            header_entry[_HeaderKeyConstants.DESC]))

  schema = bigquery.TableSchema()
  top_level = schema.fields

  # Fixed variant columns.
  top_level.append(_make_field(
      keys.REFERENCE_NAME, consts.TYPE_STRING, consts.MODE_NULLABLE,
      'Reference name.'))
  if use_1_based_coordinate:
    coordinate = '1-based'
  else:
    coordinate = '0-based'
  start_description = (
      'Start position ({}). Corresponds to the first base '
      'of the string of reference bases.').format(coordinate)
  top_level.append(_make_field(
      keys.START_POSITION, consts.TYPE_INTEGER, consts.MODE_NULLABLE,
      start_description))
  top_level.append(_make_field(
      keys.END_POSITION, consts.TYPE_INTEGER, consts.MODE_NULLABLE,
      'End position. Corresponds to the first base '
      'after the last base in the reference allele.'))
  top_level.append(_make_field(
      keys.REFERENCE_BASES, consts.TYPE_STRING, consts.MODE_NULLABLE,
      'Reference bases.'))
  top_level.append(proc_variant_factory.create_alt_bases_field_schema())
  top_level.append(_make_field(
      keys.NAMES, consts.TYPE_STRING, consts.MODE_REPEATED,
      'Variant names (e.g. RefSNP ID).'))
  top_level.append(_make_field(
      keys.QUALITY, consts.TYPE_FLOAT, consts.MODE_NULLABLE,
      'Phred-scaled quality score (-10log10 prob(call is wrong)). '
      'Higher values imply better quality.'))
  top_level.append(_make_field(
      keys.FILTER, consts.TYPE_STRING, consts.MODE_REPEATED,
      'List of failed filters (if any) or "PASS" indicating the '
      'variant has passed all filters.'))

  # Optional hom-ref calls record.
  if move_hom_ref_calls:
    hom_ref_record = _make_field(
        keys.HOM_REF_CALLS, consts.TYPE_RECORD, consts.MODE_REPEATED,
        'One record for each homogeneous call.')
    _add_sample_fields(hom_ref_record)
    top_level.append(hom_ref_record)

  # Calls record.
  calls = _make_field(
      keys.CALLS, consts.TYPE_RECORD, consts.MODE_REPEATED,
      'One record for each call.')
  _add_sample_fields(calls)
  calls.fields.append(_make_field(
      keys.CALLS_GENOTYPE, consts.TYPE_INTEGER, consts.MODE_REPEATED,
      'Genotype of the call. "-1" is used in cases where the '
      'genotype is not called.'))
  calls.fields.append(_make_field(
      keys.CALLS_PHASESET, consts.TYPE_STRING, consts.MODE_NULLABLE,
      'Phaseset of the call (if any). "*" is used in cases where '
      'the genotype is phased, but no phase set ("PS" in FORMAT) '
      'was specified.'))
  for format_key, format_entry in header_fields.formats.items():
    # GT and PS already have dedicated genotype / phaseset columns.
    if format_key in (vcfio.GENOTYPE_FORMAT_KEY, vcfio.PHASESET_FORMAT_KEY):
      continue
    calls.fields.append(_make_header_field(format_key, format_entry))
  top_level.append(calls)

  # INFO columns.
  info_keys = set()
  annotation_type_keys = set(
      proc_variant_factory.gen_annotation_info_type_keys())
  for info_key, info_entry in header_fields.infos.items():
    # END info is already included by modifying the end_position.
    if info_key == vcfio.END_INFO_KEY:
      continue
    # Keys that are folded into the alternate bases record get no column.
    if proc_variant_factory.info_is_in_alt_bases(info_key):
      continue
    # Info type fields exist only to indicate the type of corresponding
    # annotation fields, and should not be added to the schema.
    if info_key in annotation_type_keys:
      continue
    top_level.append(_make_header_field(info_key, info_entry))
    info_keys.add(info_key)

  if variant_merger:
    variant_merger.modify_bigquery_schema(schema, info_keys)
  return schema