def _write_vcf_header_with_sample_names(sample_names,
                                        vcf_fixed_columns,
                                        representative_header_file,
                                        file_path):
    # type: (List[str], List[str], str, str) -> None
    """Saves a VCF header (meta-information plus column line) to `file_path`.

    The output consists of the meta-information lines (those starting with
    `##`) obtained from `representative_header_file`, concatenated without any
    separator, followed by a single tab-separated data header line made of
    `vcf_fixed_columns` and then `sample_names`, and a trailing newline.

    Example of the resulting content:
      ##INFO=<ID=CGA_SDO,Number=1,Type=Integer,Description="Number">
      ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
      #CHROM  POS  ID  REF  ALT  QUAL  FILTER  INFO  SAMPLE1  SAMPLE2

    Args:
      sample_names: Sample names placed after the fixed columns in the data
        header line.
      vcf_fixed_columns: The fixed VCF column names that start the data header
        line.
      representative_header_file: Location of the file whose meta-information
        lines are copied into the output.
      file_path: Destination where the VCF header is written.
    """
    # pylint: disable=redefined-outer-name,reimported
    from apache_beam.io import filesystems
    from gcp_variant_transforms.libs import vcf_header_parser

    meta_info_lines = vcf_header_parser.get_metadata_header_lines(
        representative_header_file)
    with filesystems.FileSystems.create(file_path) as header_sink:
        # Meta-information lines are joined with an empty separator, so any
        # line terminators must already be part of each line.
        header_sink.write(''.join(meta_info_lines))
        column_names = vcf_fixed_columns + sample_names
        header_sink.write(str('\t'.join(column_names)))
        header_sink.write('\n')
def _read_variants(
        all_patterns,  # type: List[str]
        pipeline,  # type: beam.Pipeline
        known_args,  # type: argparse.Namespace
        pipeline_mode,  # type: int
        pre_infer_headers=False,  # type: bool
        keep_raw_sample_names=False,  # type: bool
        use_1_based_coordinate=True  # type: bool
):
    # type: (...) -> pvalue.PCollection
    """Builds the PCollection of Variants read from the input VCFs.

    This is a thin wrapper around `pipeline_common.read_variants` that derives
    the representative header lines and the sample name encoding from
    `known_args`.

    Args:
      all_patterns: Input file patterns forwarded to the reader.
      pipeline: The Beam pipeline the read is attached to.
      known_args: Parsed command line arguments. The attributes read here are
        `representative_header_file`, `allow_malformed_records` and (unless
        `keep_raw_sample_names` is set) `sample_name_encoding`.
      pipeline_mode: Pipeline mode forwarded to the reader.
      pre_infer_headers: Forwarded unchanged to the reader.
      keep_raw_sample_names: When true, `SampleNameEncoding.NONE` is used
        instead of the encoding named by `known_args.sample_name_encoding`.
      use_1_based_coordinate: Forwarded unchanged to the reader.

    Returns:
      Whatever `pipeline_common.read_variants` returns.
    """
    header_file = known_args.representative_header_file
    header_lines = (
        vcf_header_parser.get_metadata_header_lines(header_file)
        if header_file else None)

    if keep_raw_sample_names:
        encoding = SampleNameEncoding.NONE
    else:
        # The argument holds the enum member's name, hence the lookup by key.
        encoding = SampleNameEncoding[known_args.sample_name_encoding]

    return pipeline_common.read_variants(
        pipeline,
        all_patterns,
        pipeline_mode,
        known_args.allow_malformed_records,
        header_lines,
        pre_infer_headers=pre_infer_headers,
        sample_name_encoding=encoding,
        use_1_based_coordinate=use_1_based_coordinate)
def _read_variants(
        all_patterns,  # type: List[str]
        pipeline,  # type: beam.Pipeline
        known_args,  # type: argparse.Namespace
        pipeline_mode  # type: int
):
    # type: (...) -> pvalue.PCollection
    """Builds the PCollection of Variants read from the input VCFs.

    In `PipelineModes.LARGE` mode all patterns are turned into a PCollection
    and read with `vcfio.ReadAllFromVcf`; in every other mode the first
    pattern is read directly with `vcfio.ReadFromVcf`.

    Args:
      all_patterns: Input file patterns.
      pipeline: The Beam pipeline the read is attached to.
      known_args: Parsed command line arguments. The attributes read here are
        `representative_header_file`, `allow_malformed_records` and (outside
        LARGE mode) `vcf_parser`.
      pipeline_mode: One of the `pipeline_common.PipelineModes` values.

    Returns:
      The PCollection produced by the selected VCF read transform.
    """
    header_file = known_args.representative_header_file
    header_lines = (
        vcf_header_parser.get_metadata_header_lines(header_file)
        if header_file else None)

    if pipeline_mode != pipeline_common.PipelineModes.LARGE:
        # NOTE(review): only the first pattern is consumed on this path;
        # presumably callers guarantee a single pattern here - confirm.
        return pipeline | 'ReadFromVcf' >> vcfio.ReadFromVcf(
            all_patterns[0],
            representative_header_lines=header_lines,
            allow_malformed_records=known_args.allow_malformed_records,
            vcf_parser_type=vcfio.VcfParserType[known_args.vcf_parser])

    # NOTE(review): `known_args.vcf_parser` is not forwarded on the LARGE
    # path; confirm whether `ReadAllFromVcf` is expected to honour it.
    pattern_collection = (
        pipeline | 'InputFilePattern' >> beam.Create(all_patterns))
    return pattern_collection | 'ReadAllFromVcf' >> vcfio.ReadAllFromVcf(
        representative_header_lines=header_lines,
        allow_malformed_records=(known_args.allow_malformed_records))
def test_get_metadata_header_lines(self):
    """Checks that only the leading `##` lines of a VCF file are returned.

    The temporary file holds five `##` meta-information lines followed by the
    `#CHROM` column line and one record; the last two must not be part of the
    result.
    """
    metadata_lines = [
        '##fileformat=VCFv4.2\n',
        '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number samples">\n',
        '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
        '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="GQ">\n',
    ]
    non_metadata_lines = [
        '#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample1 Sample2\n',
        '19 1234567 mi1 G T 50 PASS NS=3 GT:GQ:DP 0/1:35:4 0/2:17:2',
    ]
    vcf_lines = metadata_lines + non_metadata_lines

    with temp_dir.TempDir() as tempdir:
        vcf_path = self._create_temp_vcf_file(vcf_lines, tempdir)
        parsed_lines = vcf_header_parser.get_metadata_header_lines(vcf_path)
        # Everything except the trailing column line and record is expected.
        expected_lines = vcf_lines[:-len(non_metadata_lines)]
        self.assertEqual(parsed_lines, expected_lines)
def _read_variants(
        all_patterns,  # type: List[str]
        pipeline,  # type: beam.Pipeline
        known_args,  # type: argparse.Namespace
        pipeline_mode  # type: int
):
    # type: (...) -> pvalue.PCollection
    """Builds the PCollection of Variants read from the input VCFs.

    This is a thin wrapper around `pipeline_common.read_variants` that derives
    the representative header lines and the VCF parser type from `known_args`.

    Args:
      all_patterns: Input file patterns forwarded to the reader.
      pipeline: The Beam pipeline the read is attached to.
      known_args: Parsed command line arguments. The attributes read here are
        `representative_header_file`, `allow_malformed_records` and
        `vcf_parser`.
      pipeline_mode: Pipeline mode forwarded to the reader.

    Returns:
      Whatever `pipeline_common.read_variants` returns.
    """
    header_file = known_args.representative_header_file
    header_lines = (
        vcf_header_parser.get_metadata_header_lines(header_file)
        if header_file else None)

    allow_malformed = known_args.allow_malformed_records
    # The argument holds the enum member's name, hence the lookup by key.
    parser_type = vcfio.VcfParserType[known_args.vcf_parser]
    return pipeline_common.read_variants(
        pipeline,
        all_patterns,
        pipeline_mode,
        allow_malformed,
        header_lines,
        parser_type)
def _read_variants(pipeline, known_args):
    # type: (beam.Pipeline, argparse.Namespace) -> pvalue.PCollection
    """Builds the PCollection of Variants read from the input VCFs.

    When `known_args.optimize_for_large_inputs` is set, the input pattern is
    wrapped in a PCollection and read with `vcfio.ReadAllFromVcf`; otherwise
    it is read directly with `vcfio.ReadFromVcf`.

    Args:
      pipeline: The Beam pipeline the read is attached to.
      known_args: Parsed command line arguments. The attributes read here are
        `representative_header_file`, `optimize_for_large_inputs`,
        `input_pattern` and `allow_malformed_records`.

    Returns:
      The PCollection produced by the selected VCF read transform.
    """
    header_file = known_args.representative_header_file
    header_lines = (
        vcf_header_parser.get_metadata_header_lines(header_file)
        if header_file else None)

    if not known_args.optimize_for_large_inputs:
        return pipeline | 'ReadFromVcf' >> vcfio.ReadFromVcf(
            known_args.input_pattern,
            representative_header_lines=header_lines,
            allow_malformed_records=known_args.allow_malformed_records)

    pattern_collection = (
        pipeline
        | 'InputFilePattern' >> beam.Create([known_args.input_pattern]))
    return pattern_collection | 'ReadAllFromVcf' >> vcfio.ReadAllFromVcf(
        representative_header_lines=header_lines,
        allow_malformed_records=(known_args.allow_malformed_records))