def test_read_single_file_large(self):
  """Checks INFO/FORMAT header counts for several bundled test files."""
  expected_counts = [
      ('valid-4.0.vcf', 6, 4),
      ('valid-4.0.vcf.gz', 6, 4),
      ('valid-4.0.vcf.bz2', 6, 4),
      ('valid-4.1-large.vcf', 21, 33),
      ('valid-4.2.vcf', 8, 5),
  ]
  for file_name, num_infos, num_formats in expected_counts:
    headers = source_test_utils.read_from_source(
        VcfHeaderSource(testdata_util.get_full_file_path(file_name)))
    self.assertEqual(num_infos, len(headers[0].infos))
    self.assertEqual(num_formats, len(headers[0].formats))
def _create_file_and_read_headers(self):
  """Writes `self.lines` to a temp VCF file and returns its parsed header."""
  with temp_dir.TempDir() as tempdir:
    file_path = tempdir.create_temp_file(suffix='.vcf', lines=self.lines)
    parsed_headers = source_test_utils.read_from_source(
        VcfHeaderSource(file_path))
    # Exactly one file was read, so the first element is its header.
    return parsed_headers[0]
def _read_records(self, file_or_pattern, representative_header_lines=None,
                  vcf_parser_type=VcfParserType.PYVCF, **kwargs):
  """Returns all records read from a VcfSource over `file_or_pattern`.

  Extra keyword arguments are forwarded to the VcfSource constructor.
  """
  source = VcfSource(
      file_or_pattern,
      representative_header_lines=representative_header_lines,
      vcf_parser_type=vcf_parser_type,
      **kwargs)
  return source_test_utils.read_from_source(source)
def _read_records(self, file_or_pattern, representative_header_lines=None,
                  sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH,
                  **kwargs):
  """Returns all records read from a VcfSource over `file_or_pattern`.

  Extra keyword arguments are forwarded to the VcfSource constructor.
  """
  source = VcfSource(
      file_or_pattern,
      representative_header_lines=representative_header_lines,
      sample_name_encoding=sample_name_encoding,
      **kwargs)
  return source_test_utils.read_from_source(source)
def _read_records(self, file_or_pattern, representative_header_lines=None,
                  **kwargs):
  """Returns all records read from a VcfSource over `file_or_pattern`.

  Extra keyword arguments are forwarded to the VcfSource constructor.
  """
  source = VcfSource(
      file_or_pattern,
      representative_header_lines=representative_header_lines,
      **kwargs)
  return source_test_utils.read_from_source(source)
def test_corrupted_file(self):
  """Reading a file with a damaged trailing sync marker must fail."""
  original_path = self._write_data()
  with open(original_path, 'rb') as avro_file:
    raw = bytearray(avro_file.read())
  # Flip the file's final byte, which is also the final byte of the last
  # sync marker.
  # https://avro.apache.org/docs/current/spec.html#Object+Container+Files
  raw[-1] = (raw[-1] + 1) % 256
  with tempfile.NamedTemporaryFile(
      delete=False, prefix=tempfile.template) as out_file:
    out_file.write(raw)
    corrupted_path = out_file.name
  source = _create_avro_source(corrupted_path)
  with self.assertRaisesRegex(ValueError, r'expected sync marker'):
    source_test_utils.read_from_source(source, None, None)
def test_read_after_splitting_skip_header(self):
  """Splitting a header-skipping TextSource must not change what is read."""
  file_name, expected_data = write_data(100)
  assert len(expected_data) == 100
  source = TextSource(
      file_name, 0, CompressionTypes.UNCOMPRESSED, True,
      coders.StrUtf8Coder(), skip_header_lines=2)
  splits = list(source.split(desired_bundle_size=33))
  self.assertGreater(len(splits), 1)
  # Reference pass: read the unsplit source end to end.
  reference_lines = source_test_utils.read_from_source(source, None, None)
  # Split pass: read every split and concatenate in order.
  split_lines = []
  for split in splits:
    split_lines.extend(
        source_test_utils.read_from_source(
            split.source, split.start_position, split.stop_position))
  # The two skipped header lines must be absent from both reads.
  self.assertEqual(expected_data[2:], reference_lines)
  self.assertEqual(reference_lines, split_lines)
def test_read_after_splitting_skip_header(self):
  """Verifies split reads equal the unsplit read when headers are skipped."""
  file_name, expected_data = write_data(100)
  assert len(expected_data) == 100
  source = TextSource(
      file_name, 0, CompressionTypes.UNCOMPRESSED, True,
      coders.StrUtf8Coder(), skip_header_lines=2)
  bundles = list(source.split(desired_bundle_size=33))
  self.assertGreater(len(bundles), 1)
  unsplit_lines = source_test_utils.read_from_source(source, None, None)
  collected = []
  for bundle in bundles:
    collected.extend(
        source_test_utils.read_from_source(
            bundle.source, bundle.start_position, bundle.stop_position))
  # Both reads must skip the first two (header) lines and agree with each
  # other.
  self.assertEqual(expected_data[2:], unsplit_lines)
  self.assertEqual(unsplit_lines, collected)
def test_corrupted_file(self):
  """Reading an Avro file with a damaged trailing sync marker must fail.

  The last byte of the file is also the last byte of the final sync
  marker, so changing it corrupts the marker.
  """
  file_name = self._write_data()
  with open(file_name, 'rb') as f:
    data = f.read()
  # Corrupt the last byte of the file which is also the last byte of
  # the last sync_marker. In Python 3, indexing `bytes` yields ints, so
  # the old str-comparison approach (`data[i] == 'B'`) was always False
  # and `bytes += str` raised TypeError; mutate a bytearray instead.
  corrupted_data = bytearray(data)
  corrupted_data[-1] = (corrupted_data[-1] + 1) % 256
  with tempfile.NamedTemporaryFile(
      delete=False, prefix=tempfile.template) as f:
    f.write(corrupted_data)
    corrupted_file_name = f.name
  source = AvroSource(corrupted_file_name)
  # assertRaisesRegex replaces the Python-2-only `exception.message`
  # introspection, which no longer exists on Python 3 exceptions.
  with self.assertRaisesRegex(ValueError, r'Unexpected sync marker'):
    source_test_utils.read_from_source(source, None, None)
def test_corrupted_file(self):
  """Reading an Avro file with a damaged trailing sync marker must fail.

  The last byte of the file is also the last byte of the final sync
  marker, so changing it corrupts the marker.
  """
  file_name = self._write_data()
  with open(file_name, 'rb') as f:
    data = f.read()
  # Corrupt the last byte of the file which is also the last byte of
  # the last sync_marker. In Python 3, indexing `bytes` yields ints, so
  # the old str-comparison approach (`data[i] == 'B'`) was always False
  # and `bytes += str` raised TypeError; mutate a bytearray instead.
  corrupted_data = bytearray(data)
  corrupted_data[-1] = (corrupted_data[-1] + 1) % 256
  with tempfile.NamedTemporaryFile(
      delete=False, prefix=tempfile.template) as f:
    f.write(corrupted_data)
    corrupted_file_name = f.name
  source = AvroSource(corrupted_file_name)
  # assertRaisesRegex replaces the Python-2-only `exception.message`
  # introspection, which no longer exists on Python 3 exceptions.
  with self.assertRaisesRegex(ValueError, r'Unexpected sync marker'):
    source_test_utils.read_from_source(source, None, None)
def test_read_after_splitting(self):
  """All records must be read exactly once across the generated splits."""
  source = VcfSource(get_full_file_path('valid-4.1-large.vcf'))
  splits = list(source.split(desired_bundle_size=500))
  self.assertGreater(len(splits), 1)
  split_records = []
  for split in splits:
    split_records.extend(
        source_test_utils.read_from_source(
            split.source, split.start_position, split.stop_position))
  # The bundled 4.1 large file contains exactly 9882 records.
  self.assertEqual(9882, len(split_records))
def test_read_after_splitting(self):
  """All records must be read exactly once across the generated splits."""
  file_name = testdata_util.get_full_file_path('valid-4.1-large.vcf')
  splits = list(VcfSource(file_name).split(desired_bundle_size=500))
  self.assertGreater(len(splits), 1)
  collected = []
  for split in splits:
    collected.extend(
        source_test_utils.read_from_source(
            split.source, split.start_position, split.stop_position))
  # The bundled 4.1 large file contains exactly 9882 records.
  self.assertEqual(9882, len(collected))
def test_single_file_1_based_verify_details(self):
  """A single 1-based-coordinate VCF line parses to the expected variant."""
  expected_variant = _get_sample_variant_1(use_1_based_coordinate=True)
  with TempDir() as tempdir:
    file_name = tempdir.create_temp_file(
        suffix='.vcf', lines=_SAMPLE_HEADER_LINES + [VCF_LINE_1])
    read_data = source_test_utils.read_from_source(
        VcfSource(
            file_name,
            representative_header_lines=None,
            sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH,
            use_1_based_coordinate=True))
    self.assertEqual(1, len(read_data))
    self.assertEqual(expected_variant, read_data[0])
def test_file_pattern_1_based_verify_details(self):
  """A glob over two 1-based VCF files yields all three expected variants."""
  expected_variants = [
      _get_sample_variant_1(use_1_based_coordinate=True),
      _get_sample_variant_2(use_1_based_coordinate=True),
      _get_sample_variant_3(use_1_based_coordinate=True),
  ]
  with TempDir() as tempdir:
    # One file with a single record, a second file with two records.
    _ = tempdir.create_temp_file(
        suffix='.vcf', lines=_SAMPLE_HEADER_LINES + [VCF_LINE_1])
    _ = tempdir.create_temp_file(
        suffix='.vcf', lines=_SAMPLE_HEADER_LINES + [VCF_LINE_2, VCF_LINE_3])
    read_data = source_test_utils.read_from_source(
        VcfSource(
            os.path.join(tempdir.get_path(), '*.vcf'),
            representative_header_lines=None,
            sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH,
            use_1_based_coordinate=True))
    self.assertEqual(3, len(read_data))
    self._assert_variants_equal(expected_variants, read_data)
def _run_parquet_test(self, pattern, columns, desired_bundle_size,
                      perform_splitting, expected_result):
  """Reads `pattern` with or without splitting and checks the result.

  When splitting, the concatenated split reads are compared against a
  reference read of the unsplit source; otherwise the records are compared
  directly against `expected_result`.
  """
  source = _create_parquet_source(pattern, columns=columns)
  if not perform_splitting:
    records = source_test_utils.read_from_source(source, None, None)
    self.assertCountEqual(expected_result, records)
    return
  assert desired_bundle_size
  splits_info = [
      (split.source, split.start_position, split.stop_position)
      for split in source.split(desired_bundle_size=desired_bundle_size)
  ]
  if len(splits_info) < 2:
    raise ValueError('Test is trivial. Please adjust it so that at least '
                     'two splits get generated')
  source_test_utils.assert_sources_equal_reference_source(
      (source, None, None), splits_info)
def test_read_file_pattern(self):
  """A glob over three header-only VCF files yields all three headers."""
  with temp_dir.TempDir() as tempdir:
    header_sets = [
        [self.lines[1], self.lines[-1]],
        [self.lines[2], self.lines[3], self.lines[-1]],
        [self.lines[4], self.lines[-1]],
    ]
    file_names = [
        tempdir.create_temp_file(suffix='.vcf', lines=headers)
        for headers in header_sets
    ]
    actual = source_test_utils.read_from_source(
        VcfHeaderSource(os.path.join(tempdir.get_path(), '*.vcf')))
    expected = [
        _get_vcf_header_from_lines(headers, file_name=file_name)
        for headers, file_name in zip(header_sets, file_names)
    ]
    asserts.header_vars_equal(expected)(actual)
def test_read_single_file_large(self):
  """Checks estimated variant count and byte size for bundled test files."""
  expected_estimates = [
      ('valid-4.0.vcf', 4, 1500),
      ('valid-4.0.vcf.gz', 13, 1454),
      ('valid-4.0.vcf.bz2', 14, 1562),
      ('valid-4.1-large.vcf', 14425, 832396),
      ('valid-4.1-large.vcf.gz', 5498, 313430),
      ('valid-4.2.vcf', 10, 3195),
  ]
  for file_name, variant_count, size in expected_estimates:
    read_data = source_test_utils.read_from_source(
        VcfEstimateSource(testdata_util.get_full_file_path(file_name)))
    self.assertEqual(variant_count,
                     int(read_data[0].estimated_variant_count))
    self.assertEqual(size, read_data[0].size_in_bytes)
def _run_avro_test(self, pattern, desired_bundle_size, perform_splitting,
                   expected_result):
  """Reads Avro data from `pattern`, optionally splitting the source first.

  When splitting, the concatenated split reads are compared against a
  reference read of the unsplit source; otherwise the records are compared
  directly against `expected_result`.
  """
  source = _create_avro_source(pattern, use_fastavro=self.use_fastavro)
  if perform_splitting:
    assert desired_bundle_size
    splits = list(source.split(desired_bundle_size=desired_bundle_size))
    if len(splits) < 2:
      raise ValueError(
          'Test is trivial. Please adjust it so that at least '
          'two splits get generated')
    sources_info = [(split.source, split.start_position, split.stop_position)
                    for split in splits]
    source_test_utils.assert_sources_equal_reference_source(
        (source, None, None), sources_info)
  else:
    read_records = source_test_utils.read_from_source(source, None, None)
    # assertItemsEqual was removed in Python 3; assertCountEqual is the
    # order-insensitive equivalent (already used by the parquet test
    # helper in this file). The dead `read_records = []` initializer was
    # dropped as it was never read on the splitting path.
    self.assertCountEqual(expected_result, read_records)
def test_read_file_pattern(self):
  """A glob over three VCF files yields one size estimate per file."""
  with temp_dir.TempDir() as tempdir:
    line_sets = [
        self.headers[1:2] + self.headers[-1:] + self.records[:2],
        self.headers[2:4] + self.headers[-1:] + self.records[2:4],
        self.headers[4:5] + self.headers[-1:] + self.records[4:],
    ]
    file_names = [
        tempdir.create_temp_file(suffix='.vcf', lines=lines)
        for lines in line_sets
    ]
    actual = source_test_utils.read_from_source(
        VcfEstimateSource(os.path.join(tempdir.get_path(), '*.vcf')))
    expected = [
        _get_estimate_from_lines(lines, file_name=file_name)
        for lines, file_name in zip(line_sets, file_names)
    ]
    asserts.header_vars_equal(expected)(actual)
def _run_avro_test(self, pattern, desired_bundle_size, perform_splitting,
                   expected_result):
  """Reads Avro data from `pattern`, optionally splitting the source first.

  When splitting, the concatenated split reads are compared against a
  reference read of the unsplit source; otherwise the records are compared
  directly against `expected_result`.
  """
  source = AvroSource(pattern)
  if perform_splitting:
    assert desired_bundle_size
    splits = list(source.split(desired_bundle_size=desired_bundle_size))
    if len(splits) < 2:
      raise ValueError('Test is trivial. Please adjust it so that at least '
                       'two splits get generated')
    sources_info = [
        (split.source, split.start_position, split.stop_position)
        for split in splits
    ]
    source_test_utils.assert_sources_equal_reference_source(
        (source, None, None), sources_info)
  else:
    read_records = source_test_utils.read_from_source(source, None, None)
    # assertItemsEqual was removed in Python 3; assertCountEqual is the
    # order-insensitive equivalent. The dead `read_records = []`
    # initializer was dropped as it was never read on the splitting path.
    self.assertCountEqual(expected_result, read_records)
def test_read_from_source(self):
  """Reading the full source must yield exactly the generated data."""
  data = self._create_data(100)
  source = self._create_source(data)
  # assertItemsEqual was removed in Python 3; assertCountEqual is its
  # order-insensitive replacement.
  self.assertCountEqual(
      data, source_test_utils.read_from_source(source, None, None))
def check_read(self, values, coder):
  """Round-trips `values` through a Create source and compares contents.

  Ordering is not part of the contract, so both sides are sorted first.
  """
  source = Create._create_source_from_iterable(values, coder)
  actual = source_test_utils.read_from_source(source)
  self.assertEqual(sorted(values), sorted(actual))
def check_read(self, values, coder):
  """Verifies a Create source reads back exactly the values it was built from.

  Ordering is not part of the contract, so both sides are sorted first.
  """
  built_source = Create._create_source_from_iterable(values, coder)
  round_tripped = source_test_utils.read_from_source(built_source)
  self.assertEqual(sorted(values), sorted(round_tripped))
def test_read_from_source(self):
  """Reading the full source must yield exactly the generated data."""
  data = self._create_data(100)
  source = self._create_source(data)
  # assertItemsEqual was removed in Python 3; assertCountEqual is its
  # order-insensitive replacement.
  self.assertCountEqual(
      data, source_test_utils.read_from_source(source, None, None))
def _read_records(self, file_or_pattern, **kwargs):
  """Returns all records read from a VcfSource over `file_or_pattern`.

  Extra keyword arguments are forwarded to the VcfSource constructor.
  """
  source = VcfSource(file_or_pattern, **kwargs)
  return source_test_utils.read_from_source(source)
def _read_records(self, file_or_pattern, **kwargs):
  """Reads every record a VcfSource produces for `file_or_pattern`.

  Extra keyword arguments are forwarded to the VcfSource constructor.
  """
  return source_test_utils.read_from_source(
      VcfSource(file_or_pattern, **kwargs))