def test_validation_failure_for_empty_input_file(self):
     """Checks that `_get_all_patterns` rejects an input file with no lines.

     A temp file created with `lines=[]` is passed as `input_file`; the call
     is expected to raise ValueError matching 'Input file .* is empty.'.
     """
     expected_error = 'Input file .* is empty.'
     with temp_dir.TempDir() as scratch:
         empty_file = scratch.create_temp_file(lines=[])
         with self.assertRaisesRegexp(ValueError, expected_error):
             pipeline_common._get_all_patterns(
                 input_pattern=None, input_file=empty_file)
Exemple #2
0
 def test_write_headers(self):
   """Checks that `WriteVcfHeaderFn.process` writes `self.lines` to a file.

   The header is built from `self.lines`, processed by the DoFn pointed at
   a temp `.vcf` file, and the file contents are compared to `self.lines`.
   """
   parsed_header = _get_vcf_header_from_lines(self.lines)
   with temp_dir.TempDir() as scratch:
     output_path = scratch.create_temp_file(suffix='.vcf')
     writer = WriteVcfHeaderFn(output_path)
     writer.process(parsed_header)
     self._assert_file_contents_equal(output_path, self.lines)
Exemple #3
0
 def test_config_failed_missing_partition_name(self):
   """Checks that a missing or blank `partition_name` raises ValueError.

   Two YAML configs are written to temp files and loaded with
   `VariantPartition`: one without a `partition_name` key and one whose
   name is only whitespace. The temp directory is entered as a context
   manager so that its exit handler runs once both checks are done.
   """
   missing_par_name = [
       '-  partition:',
       '     regions:',
       '       - "chr1:0-1,000,000"',
   ]
   empty_par_name = [
       '-  partition:',
       '     partition_name: "          "',
       '     regions:',
       '       - "chr1:0-1,000,000"',
   ]
   with temp_dir.TempDir() as tempdir:
     with self.assertRaisesRegexp(
         ValueError,
         'Each partition must have partition_name field.'):
       _ = variant_partition.VariantPartition(
           tempdir.create_temp_file(suffix='.yaml',
                                    lines='\n'.join(missing_par_name)))
     with self.assertRaisesRegexp(
         ValueError,
         'Partition name can not be empty string.'):
       _ = variant_partition.VariantPartition(
           tempdir.create_temp_file(suffix='.yaml',
                                    lines='\n'.join(empty_par_name)))
Exemple #4
0
    def test_pipeline_read_all_file_pattern(self):
        """Checks `ReadAllVcfHeaders` over a glob that matches three files.

        Three temp `.vcf` files are written from different slices of
        `self.lines`; the pipeline output is compared against the headers
        built from the same slices and file names.
        """
        last_line = self.lines[-1]
        header_groups = [
            [self.lines[1], last_line],
            [self.lines[2], self.lines[3], last_line],
            [self.lines[4], last_line],
        ]
        with temp_dir.TempDir() as scratch:
            file_names = [
                scratch.create_temp_file(suffix='.vcf', lines=group)
                for group in header_groups
            ]
            vcf_glob = os.path.join(scratch.get_path(), '*.vcf')

            pipeline = TestPipeline()
            pcoll = (pipeline
                     | 'Create' >> beam.Create([vcf_glob])
                     | 'ReadHeaders' >> ReadAllVcfHeaders())

            expected = [
                _get_vcf_header_from_lines(group, file_name=name)
                for group, name in zip(header_groups, file_names)
            ]
            assert_that(pcoll, asserts.header_vars_equal(expected))
            pipeline.run()
Exemple #5
0
    def test_pipeline_read_all_file_pattern(self):
        """Checks `GetAllEstimates` over a glob that matches three files.

        Each temp `.vcf` file combines slices of `self.headers` and
        `self.records`; the pipeline output is compared against estimates
        built from the same lines and file names.
        """
        column_line = self.headers[-1:]
        line_groups = [
            self.headers[1:2] + column_line + self.records[:2],
            self.headers[2:4] + column_line + self.records[2:4],
            self.headers[4:5] + column_line + self.records[4:],
        ]
        with temp_dir.TempDir() as scratch:
            file_names = [
                scratch.create_temp_file(suffix='.vcf', lines=group)
                for group in line_groups
            ]
            vcf_glob = os.path.join(scratch.get_path(), '*.vcf')

            pipeline = TestPipeline()
            # NOTE(review): this result is never asserted on; the transform
            # is still applied because doing so adds it to the pipeline.
            _ = pipeline | 'ReadHeaders' >> GetEstimates(vcf_glob)
            pcoll = (pipeline
                     | 'Create' >> beam.Create([vcf_glob])
                     | 'GetAllEstimates' >> GetAllEstimates())

            expected = [
                _get_estimate_from_lines(group, file_name=name)
                for group, name in zip(line_groups, file_names)
            ]
            assert_that(pcoll, asserts.header_vars_equal(expected))
            pipeline.run()
 def test_config_failed_duplicate_residual_shard(self):
     """Checks that two "residual" output tables raise ValueError.

     The YAML config declares `residual` as a region of two different
     output tables and is loaded with `VariantSharding`. The temp directory
     is entered as a context manager so that its exit handler runs when the
     check is done.
     """
     duplicate_residual = [
         '-  output_table:',
         '     table_name_suffix: "all_remaining"',
         '     regions:',
         '       - "residual"',
         '     partition_range_end: 999999999',
         '-  output_table:',
         '     table_name_suffix: "chr01"',
         '     regions:',
         '       - "chr1"',
         '     partition_range_end: 999999999',
         '-  output_table:',
         '     table_name_suffix: "all_remaining_2"',
         '     regions:',
         '       - "residual"',
         '     partition_range_end: 999999999',
     ]
     with temp_dir.TempDir() as tempdir:
         with self.assertRaisesRegex(
                 ValueError,
                 'Wrong sharding config file, there can be only one residual output*'
         ):
             _ = variant_sharding.VariantSharding(
                 tempdir.create_temp_file(
                     suffix='.yaml', lines='\n'.join(duplicate_residual)))
 def test_config_failed_duplicate_table_name(self):
     """Checks that a repeated `table_name_suffix` raises ValueError.

     The YAML config uses the suffix "duplicate_name" for two output tables
     and is loaded with `VariantSharding`. The temp directory is entered as
     a context manager so that its exit handler runs when the check is done.
     """
     dup_table_name = [
         '-  output_table:',
         '     table_name_suffix: "duplicate_name"',
         '     regions:',
         '       - "chr1:0-1,000,000"',
         '     partition_range_end: 999999999',
         '-  output_table:',
         '     table_name_suffix: "all_remaining"',
         '     regions:',
         '       - "residual"',
         '     partition_range_end: 999999999',
         '-  output_table:',
         '     table_name_suffix: "duplicate_name"',
         '     regions:',
         '       - "chr1:1,000,000-2,000,000"',
         '     partition_range_end: 999999999',
     ]
     with temp_dir.TempDir() as tempdir:
         with self.assertRaisesRegex(
                 ValueError,
                 'Wrong sharding config file, table name suffixes must be unique*'
         ):
             _ = variant_sharding.VariantSharding(
                 tempdir.create_temp_file(
                     suffix='.yaml', lines='\n'.join(dup_table_name)))
 def test_config_failed_missing_shard_name(self):
     """Checks that a missing or blank `table_name_suffix` raises ValueError.

     Two YAML configs are written to temp files and loaded with
     `VariantSharding`: one without a `table_name_suffix` key and one whose
     suffix is only whitespace. The temp directory is entered as a context
     manager so that its exit handler runs once both checks are done.
     """
     missing_par_name = [
         '-  output_table:',
         '     regions:',
         '       - "chr1:0-1,000,000"',
         '     partition_range_end: 999999999',
     ]
     empty_par_name = [
         '-  output_table:',
         '     table_name_suffix: "          "',
         '     regions:',
         '       - "chr1:0-1,000,000"',
         '     partition_range_end: 999999999',
     ]
     with temp_dir.TempDir() as tempdir:
         with self.assertRaisesRegex(
                 ValueError,
                 'Wrong sharding config file, table_name_suffix field missing.'
         ):
             _ = variant_sharding.VariantSharding(
                 tempdir.create_temp_file(
                     suffix='.yaml', lines='\n'.join(missing_par_name)))
         with self.assertRaisesRegex(
                 ValueError,
                 'Wrong sharding config file, table_name_suffix can not be empty.'
         ):
             _ = variant_sharding.VariantSharding(
                 tempdir.create_temp_file(
                     suffix='.yaml', lines='\n'.join(empty_par_name)))
Exemple #9
0
 def _create_file_and_read_headers(self):
     """Writes `self.lines` to a temp `.vcf` file and reads it back.

     Returns:
       The first element produced by reading a `VcfHeaderSource` built on
       that file.
     """
     with temp_dir.TempDir() as scratch:
         vcf_path = scratch.create_temp_file(suffix='.vcf', lines=self.lines)
         source = VcfHeaderSource(vcf_path)
         return source_test_utils.read_from_source(source)[0]
 def test_write_vcf_data_header(self):
     """Checks `_write_vcf_header_with_call_names` against expected output.

     A representative header file is written to a temp dir; the function is
     asked to write a header with two call names and five column names, and
     the resulting file is expected to hold the same metadata lines followed
     by a tab-separated column line ending in the two call names.
     """
     metadata_lines = [
         '##fileformat=VCFv4.2\n',
         '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number samples">\n',
         '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n',
         '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
         '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="GQ">\n',
     ]
     lines = metadata_lines + [
         '#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	\n'
     ]
     expected_content = metadata_lines + [
         '#CHROM\tPOS\tID\tREF\tALT\tSample 1\tSample 2\n'
     ]
     call_names = ['Sample 1', 'Sample 2']
     column_names = ['#CHROM', 'POS', 'ID', 'REF', 'ALT']
     with temp_dir.TempDir() as scratch:
         representative_header = scratch.create_temp_file(lines=lines)
         output_path = filesystems.FileSystems.join(scratch.get_path(),
                                                    'data_header')
         bq_to_vcf._write_vcf_header_with_call_names(
             call_names, column_names, representative_header, output_path)
         with filesystems.FileSystems.open(output_path) as written:
             content = written.readlines()
             self.assertEqual(content, expected_content)
Exemple #11
0
    def test_write_to_shards(self):
        """Checks that encoded variants end up in exactly one shard file.

        Variants from `self._get_variants()` are encoded with the writer's
        coder, written with `_write_variant_lines_to_vcf_shard`, and the one
        file found under the temp dir is compared with the expected lines.
        """
        expected_rows = [
            ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER',
             'INFO', 'FORMAT\n'],
            ['19', '12', 'rs1', 'C', 'A,TT', '2', 'PASS',
             'A1=some data;A2=data1,data2', '.\n'],
            ['19', '12', 'rs1', 'C', 'A,TT', '20', 'q10',
             'A1=some data2;A3=data3,data4', '.'],
        ]
        with temp_dir.TempDir() as scratch:
            shard_writer = write_variants_to_shards._WriteVariantsToVCFShards(
                scratch.get_path(), 3)
            variant_lines = []
            for variant in self._get_variants():
                encoded = shard_writer._coder.encode(variant)
                variant_lines.append(encoded.strip('\n'))
            shard_writer._write_variant_lines_to_vcf_shard(variant_lines)

            expected_content = ['\t'.join(row) for row in expected_rows]

            # Collect the absolute path of every file under the temp dir.
            file_paths = []
            for dirpath, _, filenames in os.walk(scratch.get_path()):
                file_paths.extend(
                    os.path.abspath(os.path.join(dirpath, name))
                    for name in filenames)
            self.assertEqual(1, len(file_paths))
            with filesystems.FileSystems.open(file_paths[0]) as shard:
                content = shard.readlines()
                self.assertEqual(content, expected_content)
 def test_write_to_shards_pipeline(self):
     """Runs `WriteToShards` in a pipeline; passes if nothing raises.

     The variants come from `self._get_variants()` and the output goes to a
     temp dir with two sample names. No output contents are asserted.
     """
     sample_names = ['Sample 1', 'Sample 2']
     with temp_dir.TempDir() as scratch:
         pipeline = TestPipeline()
         variants = pipeline | Create(self._get_variants())
         _ = variants | 'WriteToShards' >> (
             write_variants_to_shards.WriteToShards(scratch.get_path(),
                                                    sample_names))
         pipeline.run()
Exemple #13
0
 def test_print_estimates_to_file(self):
     """Checks that `print_estimates_to_file` writes one integer per line.

     The values 1..5 are written to a file in a temp dir, then each line is
     stripped, parsed as an int and compared with the same five values.
     """
     with temp_dir.TempDir() as scratch:
         output_path = os.path.join(scratch.get_path(), 'test_file_name')
         extract_input_size.print_estimates_to_file(1, 2, 3, 4, 5,
                                                    output_path)
         with FileSystems.open(output_path) as written:
             raw_lines = written.readlines()
         parsed = [int(raw.strip()) for raw in raw_lines]
         self.assertEqual(parsed, [1, 2, 3, 4, 5])
Exemple #14
0
 def test_write_dataflow(self):
   """Checks that `WriteVcfHeaders` in a pipeline writes `self.lines`.

   One header built from `self.lines` is fed through the transform into a
   temp `.vcf` file, whose contents are then compared with `self.lines`.
   """
   parsed_header = _get_vcf_header_from_lines(self.lines)
   with temp_dir.TempDir() as scratch:
     output_path = scratch.create_temp_file(suffix='.vcf')
     pipeline = TestPipeline()
     headers = pipeline | beam.Create([parsed_header])
     _ = headers | 'Write' >> WriteVcfHeaders(output_path)
     pipeline.run()
     self._assert_file_contents_equal(output_path, self.lines)
    def test_get_mode_optimize_set(self):
        """Checks that `optimize_for_large_inputs=True` yields LARGE mode.

        The mock args point at an input file containing `SAMPLE_LINES`.
        """
        with temp_dir.TempDir() as scratch:
            input_file = scratch.create_temp_file(lines=self.SAMPLE_LINES)
            mock_args = self._create_mock_args(
                input_pattern=None,
                input_file=input_file,
                optimize_for_large_inputs=True)

            actual_mode = self._get_pipeline_mode(mock_args)
            self.assertEqual(actual_mode, PipelineModes.LARGE)
Exemple #16
0
 def test_empty_file(self):
   """Checks that `get_vcf_headers` raises ValueError for an empty file."""
   with temp_dir.TempDir() as scratch:
     empty_vcf = self._create_temp_vcf_file([], scratch)
     try:
       vcf_header_parser.get_vcf_headers(empty_vcf)
     except ValueError:
       # Expected outcome: nothing further to check.
       pass
     else:
       self.fail('Empty VCF file must throw an exception.')
 def test_preprocess_no_conflicts(self):
     """Runs the preprocessor on a `gs://` VCF and expects a report file.

     The report path lives in a temp dir; the test only asserts that the
     path exists after `vcf_to_bq_preprocess.run` returns.
     """
     input_pattern = (
         'gs://gcp-variant-transforms-testfiles/small_tests/valid-4.0.vcf')
     with temp_dir.TempDir() as scratch:
         report_path = filesystems.FileSystems.join(
             scratch.get_path(), PreprocessTest._REPORT_NAME)
         vcf_to_bq_preprocess.run([
             '--input_pattern', input_pattern,
             '--report_path', report_path,
             '--report_all_conflicts',
         ])
         assert filesystems.FileSystems.exists(report_path)
 def test_write_vcf_data_header(self):
   """Checks that `_write_vcf_data_header` writes one tab-separated line.

   The expected line is the five column names followed by the two sample
   names, joined by tabs and terminated by a newline.
   """
   sample_names = ['Sample 1', 'Sample 2']
   column_names = ['#CHROM', 'POS', 'ID', 'REF', 'ALT']
   expected_lines = ['#CHROM\tPOS\tID\tREF\tALT\tSample 1\tSample 2\n']
   with temp_dir.TempDir() as scratch:
     output_path = filesystems.FileSystems.join(scratch.get_path(),
                                                'data_header')
     bq_to_vcf._write_vcf_data_header(sample_names, column_names,
                                      output_path)
     with filesystems.FileSystems.open(output_path) as written:
       content = written.readlines()
       self.assertEqual(content, expected_lines)
Exemple #19
0
  def test_pipeline_read_file_headers(self):
    """Checks `ReadVcfHeaders` on a sample VCF file inside a pipeline.

    The expected header is built from the value `self.lines` held on entry;
    the file that is actually read contains the sample VCF lines.
    """
    expected_header_lines = self.lines
    # self.lines is rebound here, so later reads of it in this test see the
    # sample file's lines rather than the original value.
    self.lines = testdata_util.get_sample_vcf_file_lines()

    with temp_dir.TempDir() as scratch:
      vcf_path = scratch.create_temp_file(suffix='.vcf', lines=self.lines)

      pipeline = TestPipeline()
      pcoll = pipeline | 'ReadHeaders' >> ReadVcfHeaders(vcf_path)

      expected = [_get_vcf_header_from_lines(expected_header_lines)]
      assert_that(pcoll, equal_to(expected))
      pipeline.run()
Exemple #20
0
    def test_get_mode_small_still_large(self):
        """Checks that a mocked match with 100 entries yields LARGE mode.

        `FileSystems.match` is patched to return a single result whose
        `metadata_list` holds 100 placeholder entries.
        """
        MatchResult = collections.namedtuple('MatchResult',
                                             ['metadata_list'])
        fake_match = MatchResult([None] * 100)
        with temp_dir.TempDir() as scratch:
            input_file = scratch.create_temp_file(lines=self.SAMPLE_LINES)
            mock_args = self._create_mock_args(input_pattern=None,
                                               input_file=input_file)

            with mock.patch.object(FileSystems, 'match',
                                   return_value=[fake_match]):
                actual_mode = self._get_pipeline_mode(mock_args)
                self.assertEqual(actual_mode, PipelineModes.LARGE)
 def test_failure_for_conflicting_flags_no_errors_with_file_input(self):
     """Validates `--input_file` with `--representative_header_file`.

     The input file lists the same local VCF path three times; the test
     passes if `self._options.validate` does not raise.
     """
     vcf_entry = './gcp_variant_transforms/testing/data/vcf/valid-4.0.vcf\n'
     lines = [vcf_entry] * 3
     with temp_dir.TempDir() as scratch:
         input_file = scratch.create_temp_file(lines=lines)
         parsed_args = self._make_args([
             '--input_file', input_file,
             '--representative_header_file', 'gs://some_file',
         ])
         self._options.validate(parsed_args)
Exemple #22
0
 def test_get_metadata_header_lines(self):
   """Checks that only the `##` metadata lines are returned.

   The temp VCF holds five metadata lines, the `#CHROM` column line and one
   record; the expected result is the five metadata lines.
   """
   metadata_lines = [
       '##fileformat=VCFv4.2\n',
       '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number samples">\n',
       '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n',
       '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
       '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="GQ">\n',
   ]
   lines = metadata_lines + [
       '#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	Sample1	Sample2\n',
       '19	1234567	mi1	G	T	50	PASS	NS=3	GT:GQ:DP	0/1:35:4	0/2:17:2',
   ]
   with temp_dir.TempDir() as scratch:
     vcf_path = self._create_temp_vcf_file(lines, scratch)
     header_lines = vcf_header_parser.get_metadata_header_lines(vcf_path)
     self.assertEqual(header_lines, metadata_lines)
Exemple #23
0
 def test_one_file(self):
   """Checks the INFO and FORMAT ids parsed from a single VCF header.

   The header declares INFO ids NS and AF and FORMAT ids GT and GQ; the
   keys of `infos` and `formats` are compared without regard to order.
   """
   lines = [
       '##fileformat=VCFv4.2\n',
       '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number samples">\n',
       '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n',
       '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
       '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="GQ">\n',
       '#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	Sample1	Sample2\n',
   ]
   with temp_dir.TempDir() as scratch:
     vcf_path = self._create_temp_vcf_file(lines, scratch)
     parsed = vcf_header_parser.get_vcf_headers(vcf_path)
     info_ids = parsed.infos.keys()
     self.assertItemsEqual(['NS', 'AF'], info_ids)
     format_ids = parsed.formats.keys()
     self.assertItemsEqual(['GT', 'GQ'], format_ids)
Exemple #24
0
 def test_validation_failure_for_wrong_pattern_in_input_file(self):
     """Checks that an unmatched pattern in the input file raises ValueError.

     The input file lists a local VCF path, then 'non_existent.vcf', then
     the same local VCF path again.
     """
     valid_entry = './gcp_variant_transforms/testing/data/vcf/valid-4.0.vcf\n'
     lines = [valid_entry, 'non_existent.vcf\n', valid_entry]
     expected_error = 'Input pattern .* from .* did not match any files.'
     with temp_dir.TempDir() as scratch:
         input_file = scratch.create_temp_file(lines=lines)
         with self.assertRaisesRegex(ValueError, expected_error):
             pipeline_common._get_all_patterns(
                 input_pattern=None, input_file=input_file)
Exemple #25
0
    def test_pipeline_read_file_headers(self):
        """Checks `GetEstimates` on a single temp `.vcf` file in a pipeline.

        The expected value is the estimate built from `self.lines` and the
        temp file's name.
        """
        with temp_dir.TempDir() as scratch:
            vcf_path = scratch.create_temp_file(suffix='.vcf',
                                                lines=self.lines)

            pipeline = TestPipeline()
            pcoll = pipeline | 'GetEstimates' >> GetEstimates(vcf_path)

            expected = [_get_estimate_from_lines(self.lines, vcf_path)]
            assert_that(pcoll, equal_to(expected))
            pipeline.run()
Exemple #26
0
 def test_invalid_file(self):
   """Checks that `get_vcf_headers` raises ValueError for a broken header.

   The third line of the temp VCF is a truncated `##INFO` definition.
   """
   lines = [
       '##fileformat=VCFv4.2\n',
       '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number samples">\n',
       '##INFO=<ID=AF,Number=A,Type=Float,Desc\n',
       '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\r\n',
       '#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	Sample1	Sample2\n',
   ]
   with temp_dir.TempDir() as scratch:
     vcf_path = self._create_temp_vcf_file(lines, scratch)
     try:
       vcf_header_parser.get_vcf_headers(vcf_path)
     except ValueError:
       # Expected outcome: nothing further to check.
       pass
     else:
       self.fail('Invalid VCF file must throw an exception.')
 def test_preprocess_run_locally(self):
     """Runs the preprocessor and expects report and resolved-header files.

     Both output paths live in a temp dir; the test only asserts that each
     path exists after `vcf_to_bq_preprocess.run` returns.
     """
     input_pattern = (
         'gs://gcp-variant-transforms-testfiles/small_tests/infer-undefined'
         '-header-fields.vcf')
     with temp_dir.TempDir() as scratch:
         out_dir = scratch.get_path()
         report_path = filesystems.FileSystems.join(
             out_dir, PreprocessTest._REPORT_NAME)
         resolved_headers_path = filesystems.FileSystems.join(
             out_dir, PreprocessTest._RESOLVED_HEADERS_FILE_NAME)
         vcf_to_bq_preprocess.run([
             '--input_pattern', input_pattern,
             '--report_all_conflicts',
             '--report_path', report_path,
             '--resolved_headers_path', resolved_headers_path,
         ])
         assert filesystems.FileSystems.exists(report_path)
         assert filesystems.FileSystems.exists(resolved_headers_path)
Exemple #28
0
  def test_read_file_pattern(self):
    """Checks `VcfHeaderSource` over a glob that matches three files.

    Three temp `.vcf` files are written from different slices of
    `self.lines`; the headers read from the source are compared against
    headers built from the same slices and file names.
    """
    last_line = self.lines[-1]
    header_groups = [
        [self.lines[1], last_line],
        [self.lines[2], self.lines[3], last_line],
        [self.lines[4], last_line],
    ]
    with temp_dir.TempDir() as scratch:
      file_names = [scratch.create_temp_file(suffix='.vcf', lines=group)
                    for group in header_groups]

      vcf_glob = os.path.join(scratch.get_path(), '*.vcf')
      actual = source_test_utils.read_from_source(VcfHeaderSource(vcf_glob))

      expected = [_get_vcf_header_from_lines(group, file_name=name)
                  for group, name in zip(header_groups, file_names)]

      matcher = asserts.header_vars_equal(expected)
      matcher(actual)
 def _generate_report_and_assert_contents_equal(
     self,
     expected_content,  # type: List[str]
     header_definitions,  # type: merge_header_definitions.VcfHeaderDefinitions
     resolved_headers=None,  # type: VcfHeader
     inferred_headers=None,  # type: VcfHeader
     malformed_records=None  # type: List[vcfio.MalformedVcfRecord]
 ):
     # type: (...) -> None
     """Generates a report in a temp dir and compares its lines.

     Args:
       expected_content: Lines the report is expected to contain, compared
         without regard to order.
       header_definitions: Passed through to `generate_report`.
       resolved_headers: Passed through to `generate_report`.
       inferred_headers: Passed through to `generate_report`.
       malformed_records: Passed through to `generate_report`.
     """
     with temp_dir.TempDir() as scratch:
         report_path = FileSystems.join(
             scratch.get_path(), PreprocessReporterTest._REPORT_NAME)
         preprocess_reporter.generate_report(
             header_definitions, report_path, resolved_headers,
             inferred_headers, malformed_records)
         with FileSystems.open(report_path) as report:
             report_lines = report.readlines()
             self.assertItemsEqual(report_lines, expected_content)
Exemple #30
0
 def test_config_failed_missing_region(self):
   """Checks that a partition with an empty `regions` list raises ValueError.

   The YAML config's last partition has a `regions:` key with no entries and
   is loaded with `VariantPartition`. The temp directory is entered as a
   context manager so that its exit handler runs when the check is done.
   """
   missing_region = [
       '-  partition:',
       '     partition_name: "chr01_part1"',
       '     regions:',
       '       - "chr1:0-1,000,000"',
       '-  partition:',
       '     partition_name: "all_remaining"',
       '     regions:',
       '       - "residual"',
       '-  partition:',
       '     partition_name: "missing_region"',
       '     regions:',
   ]
   with temp_dir.TempDir() as tempdir:
     with self.assertRaisesRegexp(
         ValueError,
         'Each partition must have at least one region.'):
       _ = variant_partition.VariantPartition(
           tempdir.create_temp_file(suffix='.yaml',
                                    lines='\n'.join(missing_region)))