def _write_vcf_header_with_sample_names(sample_names, vcf_fixed_columns,
                                        representative_header_file, file_path):
    # type: (List[str], List[str], str, str) -> None
    """Writes a VCF header: meta-information plus the column header line.

    The meta-information lines (those starting with `##`) are taken from
    `representative_header_file` via
    `vcf_header_parser.get_metadata_header_lines` and written verbatim. They
    are followed by a single tab-separated header line made of
    `vcf_fixed_columns` and then `sample_names`. Example:

      ##INFO=<ID=CGA_SDO,Number=1,Type=Integer,Description="Number">
      ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
      #CHROM  POS  ID  REF  ALT  QUAL  FILTER  INFO  FORMAT  SAMPLE1  SAMPLE2

    Args:
      sample_names: The sample names appended after `vcf_fixed_columns`.
      vcf_fixed_columns: The VCF fixed columns, in output order.
      representative_header_file: The location of the file that provides the
        meta-information lines.
      file_path: The location where the VCF header is saved.
    """
    # pylint: disable=redefined-outer-name,reimported
    from apache_beam.io import filesystems
    from gcp_variant_transforms.libs import vcf_header_parser
    metadata_header_lines = vcf_header_parser.get_metadata_header_lines(
        representative_header_file)
    # The metadata lines are joined without a separator, so they are expected
    # to carry their own line terminators; only the column header line needs an
    # explicit trailing newline.
    chunks = (
        ''.join(metadata_header_lines),
        '\t'.join(vcf_fixed_columns + sample_names),
        '\n',
    )
    with filesystems.FileSystems.create(file_path) as file_to_write:
        for chunk in chunks:
            # FileSystems.create hands back a binary stream: writing text to it
            # raises TypeError on Python 3. Encode text explicitly and pass
            # through anything that is already bytes (e.g. Python 2 `str`).
            if not isinstance(chunk, bytes):
                chunk = chunk.encode('utf-8')
            file_to_write.write(chunk)
Esempio n. 2
0
def _read_variants(
        all_patterns,  # type: List[str]
        pipeline,  # type: beam.Pipeline
        known_args,  # type: argparse.Namespace
        pipeline_mode,  # type: int
        pre_infer_headers=False,  # type: bool
        keep_raw_sample_names=False,  # type: bool
        use_1_based_coordinate=True  # type: bool
):
    # type: (...) -> pvalue.PCollection
    """Returns the PCollection of Variants read from the input VCF files.

    Thin wrapper around `pipeline_common.read_variants`: it derives the
    optional representative header lines and the sample name encoding from
    `known_args` and forwards everything else unchanged.

    Args:
      all_patterns: Input file patterns, forwarded as-is.
      pipeline: The Beam pipeline the read is attached to.
      known_args: Parsed arguments. Reads `representative_header_file`,
        `allow_malformed_records` and `sample_name_encoding`.
      pipeline_mode: Pipeline mode, forwarded as-is.
      pre_infer_headers: Forwarded as-is.
      keep_raw_sample_names: When true, `SampleNameEncoding.NONE` is used
        instead of the encoding named by `known_args.sample_name_encoding`.
      use_1_based_coordinate: Forwarded as-is.

    Returns:
      Whatever `pipeline_common.read_variants` returns.
    """
    header_file = known_args.representative_header_file
    # Header lines stay None when no representative header file was supplied.
    header_lines = (
        vcf_header_parser.get_metadata_header_lines(header_file)
        if header_file else None)

    if keep_raw_sample_names:
        name_encoding = SampleNameEncoding.NONE
    else:
        # Look the enum member up by the name given on the command line.
        name_encoding = SampleNameEncoding[known_args.sample_name_encoding]

    return pipeline_common.read_variants(
        pipeline,
        all_patterns,
        pipeline_mode,
        known_args.allow_malformed_records,
        header_lines,
        pre_infer_headers=pre_infer_headers,
        sample_name_encoding=name_encoding,
        use_1_based_coordinate=use_1_based_coordinate)
Esempio n. 3
0
def _read_variants(
        all_patterns,  # type: List[str]
        pipeline,  # type: beam.Pipeline
        known_args,  # type: argparse.Namespace
        pipeline_mode  # type: int
):
    # type: (...) -> pvalue.PCollection
    """Returns the PCollection of Variants read from the input VCF files.

    In `PipelineModes.LARGE` mode every pattern in `all_patterns` is fed
    through `vcfio.ReadAllFromVcf`; in any other mode only the first pattern
    is read, with `vcfio.ReadFromVcf`.

    Args:
      all_patterns: Input file patterns. Outside LARGE mode only
        `all_patterns[0]` is used.
      pipeline: The Beam pipeline the read is attached to.
      known_args: Parsed arguments. Reads `representative_header_file`,
        `allow_malformed_records` and (outside LARGE mode) `vcf_parser`.
      pipeline_mode: Compared against `pipeline_common.PipelineModes.LARGE`.

    Returns:
      The result of applying the selected read transform to `pipeline`.
    """
    header_file = known_args.representative_header_file
    # Header lines stay None when no representative header file was supplied.
    header_lines = (
        vcf_header_parser.get_metadata_header_lines(header_file)
        if header_file else None)

    if pipeline_mode != pipeline_common.PipelineModes.LARGE:
        return pipeline | 'ReadFromVcf' >> vcfio.ReadFromVcf(
            all_patterns[0],
            representative_header_lines=header_lines,
            allow_malformed_records=known_args.allow_malformed_records,
            vcf_parser_type=vcfio.VcfParserType[known_args.vcf_parser])

    # NOTE(review): `vcf_parser_type` is not forwarded on this path, unlike the
    # ReadFromVcf path above -- confirm that this is intentional.
    patterns = pipeline | 'InputFilePattern' >> beam.Create(all_patterns)
    return patterns | 'ReadAllFromVcf' >> vcfio.ReadAllFromVcf(
        representative_header_lines=header_lines,
        allow_malformed_records=known_args.allow_malformed_records)
Esempio n. 4
0
 def test_get_metadata_header_lines(self):
   """Only the `##` meta-information lines of a VCF file are returned.

   The temporary file also holds a `#CHROM` column header line and one data
   record; neither is expected in the result.
   """
   metadata_lines = [
       '##fileformat=VCFv4.2\n',
       '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number samples">\n',
       '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n',
       '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
       '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="GQ">\n',
   ]
   column_header_line = (
       '#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	Sample1	Sample2\n')
   record_line = '19	1234567	mi1	G	T	50	PASS	NS=3	GT:GQ:DP	0/1:35:4	0/2:17:2'
   vcf_lines = metadata_lines + [column_header_line, record_line]
   with temp_dir.TempDir() as tempdir:
     vcf_path = self._create_temp_vcf_file(vcf_lines, tempdir)
     parsed_lines = vcf_header_parser.get_metadata_header_lines(vcf_path)
     self.assertEqual(parsed_lines, metadata_lines)
Esempio n. 5
0
def _read_variants(all_patterns,  # type: List[str]
                   pipeline,  # type: beam.Pipeline
                   known_args,  # type: argparse.Namespace
                   pipeline_mode  # type: int
                  ):
  # type: (...) -> pvalue.PCollection
  """Returns the PCollection of Variants read from the input VCF files.

  Thin wrapper around `pipeline_common.read_variants`: it derives the optional
  representative header lines and the VCF parser type from `known_args` and
  forwards everything else unchanged.

  Args:
    all_patterns: Input file patterns, forwarded as-is.
    pipeline: The Beam pipeline the read is attached to.
    known_args: Parsed arguments. Reads `representative_header_file`,
      `allow_malformed_records` and `vcf_parser`.
    pipeline_mode: Pipeline mode, forwarded as-is.

  Returns:
    Whatever `pipeline_common.read_variants` returns.
  """
  header_file = known_args.representative_header_file
  # Header lines stay None when no representative header file was supplied.
  header_lines = (
      vcf_header_parser.get_metadata_header_lines(header_file)
      if header_file else None)
  return pipeline_common.read_variants(
      pipeline,
      all_patterns,
      pipeline_mode,
      known_args.allow_malformed_records,
      header_lines,
      vcfio.VcfParserType[known_args.vcf_parser])
Esempio n. 6
0
def _read_variants(pipeline, known_args):
    # type: (beam.Pipeline, argparse.Namespace) -> pvalue.PCollection
    """Returns the PCollection of Variants read from the input VCF files.

    When `known_args.optimize_for_large_inputs` is set, the input pattern is
    wrapped in a one-element PCollection and read with
    `vcfio.ReadAllFromVcf`; otherwise it is read directly with
    `vcfio.ReadFromVcf`.

    Args:
      pipeline: The Beam pipeline the read is attached to.
      known_args: Parsed arguments. Reads `representative_header_file`,
        `optimize_for_large_inputs`, `input_pattern` and
        `allow_malformed_records`.

    Returns:
      The result of applying the selected read transform to `pipeline`.
    """
    header_file = known_args.representative_header_file
    # Header lines stay None when no representative header file was supplied.
    header_lines = (
        vcf_header_parser.get_metadata_header_lines(header_file)
        if header_file else None)

    if not known_args.optimize_for_large_inputs:
        return pipeline | 'ReadFromVcf' >> vcfio.ReadFromVcf(
            known_args.input_pattern,
            representative_header_lines=header_lines,
            allow_malformed_records=known_args.allow_malformed_records)

    patterns = (
        pipeline
        | 'InputFilePattern' >> beam.Create([known_args.input_pattern]))
    return patterns | 'ReadAllFromVcf' >> vcfio.ReadAllFromVcf(
        representative_header_lines=header_lines,
        allow_malformed_records=known_args.allow_malformed_records)