def test_shard_variants(self):
  expected_shards = self._get_expected_variant_shards()
  variants = [variant
              for variant_list in expected_shards.values()
              for variant in variant_list]

  sharding = variant_sharding.VariantSharding(
      'gcp_variant_transforms/data/sharding_configs/'
      'homo_sapiens_default.yaml')
  pipeline = TestPipeline()
  shards = (
      pipeline
      | Create(variants, reshuffle=False)
      | 'ShardVariants' >> beam.Partition(
          shard_variants.ShardVariants(sharding),
          sharding.get_num_shards()))
  for i in range(sharding.get_num_shards()):
    assert_that(shards[i], equal_to(expected_shards.get(i, [])),
                label=str(i))
  pipeline.run()
def expand(self, pcoll):
  # This is a composite transform that involves the following steps:
  #   1. Create a singleton of the user-provided `query` and apply a ``ParDo``
  #      that splits the query into `num_splits` queries if possible.
  #
  #      If the value of `num_splits` is 0, the number of splits will be
  #      computed dynamically based on the size of the data for the `query`.
  #
  #   2. The resulting ``PCollection`` is sharded across workers using a
  #      ``Reshuffle`` operation.
  #
  #   3. In the third step, a ``ParDo`` reads entities for each query and
  #      outputs a ``PCollection[Entity]``.
  return (pcoll.pipeline
          | 'UserQuery' >> Create([self._query])
          | 'SplitQuery' >> ParDo(
              ReadFromDatastore._SplitQueryFn(self._num_splits))
          | Reshuffle()
          | 'Read' >> ParDo(ReadFromDatastore._QueryFn()))
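A minimal usage sketch of the transform built by this expand, assuming it is Beam's v1new Datastore connector (which this step layout matches); the project and kind names below are placeholders, not values taken from the code above:

import apache_beam as beam
from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
from apache_beam.io.gcp.datastore.v1new.types import Query

with beam.Pipeline() as p:
  entity_keys = (
      p
      # num_splits=0 lets the connector choose the split count dynamically.
      | 'ReadEntities' >> ReadFromDatastore(
          query=Query(kind='MyKind', project='my-project'), num_splits=0)
      | 'ExtractKeys' >> beam.Map(lambda entity: entity.key))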
def test_partition_variants(self):
  expected_partitions = self._get_standard_variant_partitions()
  expected_partitions.update(self._get_nonstandard_variant_partitions())
  variants = [variant
              for variant_list in expected_partitions.values()
              for variant in variant_list]

  partitioner = variant_partition.VariantPartition()
  pipeline = TestPipeline()
  partitions = (
      pipeline
      | Create(variants)
      | 'PartitionVariants' >> Partition(
          partition_variants.PartitionVariants(partitioner),
          partitioner.get_num_partitions()))
  for i in range(partitioner.get_num_partitions()):
    assert_that(partitions[i], equal_to(expected_partitions.get(i, [])),
                label=str(i))
  pipeline.run()
def test_combine_pipeline(self):
  headers_1 = self._get_header_from_lines(FILE_1_LINES)
  headers_2 = self._get_header_from_lines(FILE_2_LINES)
  # TODO(nmousavi): Either use TestPipeline or combiner_fn.* everywhere.
  # After moving _HeaderMerger out to its own file, it makes sense to use
  # TestPipeline everywhere.
  header_merger = HeaderMerger(
      vcf_field_conflict_resolver.FieldConflictResolver(
          split_alternate_allele_info_fields=True))
  expected = vcf_header_io.VcfHeader()
  header_merger.merge(expected, headers_1)
  header_merger.merge(expected, headers_2)

  pipeline = TestPipeline()
  merged_headers = (
      pipeline
      | Create([headers_1, headers_2])
      | 'MergeHeaders' >> merge_headers.MergeHeaders())
  assert_that(merged_headers, equal_to([expected]))
  pipeline.run()
def test_header_fields_inferred_one_variant(self):
  with TestPipeline() as p:
    variant = self._get_sample_variant_1()
    inferred_headers = (
        p
        | Create([variant])
        | 'InferUndefinedHeaderFields' >>
        infer_undefined_headers.InferUndefinedHeaderFields(
            defined_headers=None))

    expected_infos = {'IS': Info('IS', 1, 'String', '', '', ''),
                      'IF': Info('IF', 0, 'Flag', '', '', ''),
                      'IA': Info('IA', None, 'String', '', '', '')}
    expected_formats = {'FI': Format('FI', 1, 'String', ''),
                        'FU': Format('FU', None, 'String', '')}
    expected = vcf_header_io.VcfHeader(
        infos=expected_infos, formats=expected_formats)
    assert_that(inferred_headers, equal_to([expected]))
    p.run()
def test_pipeline_sdk_not_overridden(self):
  pipeline_options = PipelineOptions([
      '--experiments=beam_fn_api',
      '--experiments=use_unified_worker',
      '--temp_location',
      'gs://any-location/temp',
      '--worker_harness_container_image=dummy_prefix/dummy_name:dummy_tag'
  ])

  pipeline = Pipeline(options=pipeline_options)
  pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

  proto_pipeline, _ = pipeline.to_runner_api(return_context=True)

  dummy_env = beam_runner_api_pb2.Environment(
      urn=common_urns.environments.DOCKER.urn,
      payload=(beam_runner_api_pb2.DockerPayload(
          container_image='dummy_prefix/dummy_name:dummy_tag')
               ).SerializeToString())
  proto_pipeline.components.environments['dummy_env_id'].CopyFrom(dummy_env)

  dummy_transform = beam_runner_api_pb2.PTransform(
      environment_id='dummy_env_id')
  proto_pipeline.components.transforms['dummy_transform_id'].CopyFrom(
      dummy_transform)

  # Accessing non-public method for testing.
  apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
      proto_pipeline, dict(), pipeline_options)

  # Both the default and the dummy environment should still be present.
  self.assertEqual(2, len(proto_pipeline.components.environments))

  from apache_beam.utils import proto_utils
  found_override = False
  for env in proto_pipeline.components.environments.values():
    docker_payload = proto_utils.parse_Bytes(
        env.payload, beam_runner_api_pb2.DockerPayload)
    if docker_payload.container_image.startswith(
        names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY):
      found_override = True

  self.assertFalse(found_override)
def test_memory_usage(self):
  try:
    import resource
  except ImportError:
    # Skip the test if resource module is not available (e.g. non-Unix os).
    self.skipTest('resource module not available.')
  if platform.mac_ver()[0]:
    # Skip the test on macos, depending on version it returns ru_maxrss in
    # different units.
    self.skipTest('ru_maxrss is not in standard units.')

  def get_memory_usage_in_bytes():
    return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * (2**10)

  def check_memory(value, memory_threshold):
    memory_usage = get_memory_usage_in_bytes()
    if memory_usage > memory_threshold:
      raise RuntimeError(
          'High memory usage: %d > %d' % (memory_usage, memory_threshold))
    return value

  len_elements = 1000000
  num_elements = 10
  num_maps = 100

  pipeline = TestPipeline(runner='DirectRunner')

  # Consumed memory should not be proportional to the number of maps.
  memory_threshold = (
      get_memory_usage_in_bytes() + (3 * len_elements * num_elements))

  biglist = pipeline | 'oom:create' >> Create(
      ['x' * len_elements] * num_elements)
  for i in range(num_maps):
    biglist = biglist | ('oom:addone-%d' % i) >> Map(lambda x: x + 'y')
  result = biglist | 'oom:check' >> Map(check_memory, memory_threshold)
  assert_that(
      result,
      equal_to(['x' * len_elements + 'y' * num_maps] * num_elements))
  pipeline.run()
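A rough sketch of the arithmetic behind memory_threshold, using the same constants as the test (byte counts are approximate; ru_maxrss granularity and Python string overhead are ignored):

# Each element is a 1,000,000-character string, so one copy of the data is
# roughly 10 MB; the test allows the baseline usage plus roughly 30 MB growth.
len_elements, num_elements, num_maps = 1000000, 10, 100
one_copy = len_elements * num_elements  # ~10 MB of payload
allowed_growth = 3 * one_copy           # the test's headroom, ~30 MB
worst_case = num_maps * one_copy        # ~1 GB if every Map stage's output
                                        # were retained in memory
assert worst_case > allowed_growth      # so retaining intermediates trips the check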
def test_densify_variants_pipeline(self):
  call_names = ['sample1', 'sample2', 'sample3']
  variant_calls = [
      vcfio.VariantCall(name=call_names[0]),
      vcfio.VariantCall(name=call_names[1]),
      vcfio.VariantCall(name=call_names[2]),
  ]
  variants = [
      vcfio.Variant(calls=[variant_calls[0], variant_calls[1]]),
      vcfio.Variant(calls=[variant_calls[1], variant_calls[2]]),
  ]

  pipeline = TestPipeline()
  densified_variants = (
      pipeline
      | Create(variants)
      | 'DensifyVariants' >> densify_variants.DensifyVariants(call_names))
  assert_that(densified_variants, asserts.has_calls(call_names))

  pipeline.run()
def test_convert_variant_to_bigquery_row(self):
  variant_1, row_1, header_num_dict_1 = self._get_sample_variant_1()
  variant_2, row_2, header_num_dict_2 = self._get_sample_variant_2()
  variant_3, row_3, header_num_dict_3 = self._get_sample_variant_3()
  header_num_dict = header_num_dict_1.copy()
  header_num_dict.update(header_num_dict_2)
  header_num_dict.update(header_num_dict_3)
  header_fields = vcf_header_util.make_header(header_num_dict)

  proc_var_1 = processed_variant.ProcessedVariantFactory(
      header_fields).create_processed_variant(variant_1)
  proc_var_2 = processed_variant.ProcessedVariantFactory(
      header_fields).create_processed_variant(variant_2)
  proc_var_3 = processed_variant.ProcessedVariantFactory(
      header_fields).create_processed_variant(variant_3)

  pipeline = TestPipeline(blocking=True)
  bigquery_rows = (
      pipeline
      | Create([proc_var_1, proc_var_2, proc_var_3])
      | 'ConvertToRow' >> beam.ParDo(
          ConvertVariantToRow(self._row_generator)))

  assert_that(bigquery_rows, equal_to([row_1, row_2, row_3]))
  pipeline.run()
def test_window_param(self):
  class TestDoFn(DoFn):
    def process(self, element, window=DoFn.WindowParam):
      yield (element, (float(window.start), float(window.end)))

  pipeline = TestPipeline()
  pcoll = (pipeline
           | Create([1, 7])
           | Map(lambda x: TimestampedValue(x, x))
           | WindowInto(windowfn=SlidingWindows(10, 5))
           | ParDo(TestDoFn()))
  assert_that(
      pcoll,
      equal_to([(1, (-5, 5)), (1, (0, 10)), (7, (0, 10)), (7, (5, 15))]))

  pcoll2 = pcoll | 'Again' >> ParDo(TestDoFn())
  assert_that(
      pcoll2,
      equal_to([((1, (-5, 5)), (-5, 5)),
                ((1, (0, 10)), (0, 10)),
                ((7, (0, 10)), (0, 10)),
                ((7, (5, 15)), (5, 15))]),
      label='doubled windows')
  pipeline.run()
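Why each element lands in exactly two windows: SlidingWindows(10, 5) assigns a timestamp t to every 10-second window whose start is a multiple of 5 in the range (t - 10, t]. A small illustrative sketch of that assignment (not part of the test, and not Beam's actual implementation):

def sliding_window_bounds(timestamp, size=10, period=5):
  # The latest window containing `timestamp` starts at the greatest multiple
  # of `period` that is <= timestamp; earlier windows follow at `period`
  # intervals until their end no longer covers `timestamp`.
  last_start = (timestamp // period) * period
  return sorted((start, start + size)
                for start in range(last_start, last_start - size, -period))

assert sliding_window_bounds(1) == [(-5, 5), (0, 10)]
assert sliding_window_bounds(7) == [(0, 10), (5, 15)]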
def test_pipeline(self):
  infos = {
      'IS': createInfo('IS', 1, 'String', ''),
      'ISI': createInfo('ISI', 1, 'Integer', ''),
      'ISF': createInfo('ISF', 1, 'Float', ''),
      'IB': createInfo('IB', 0, 'Flag', ''),
      'IA': createInfo('IA', 'A', 'Integer', '')
  }
  formats = OrderedDict([
      ('FS', createFormat('FS', 1, 'String', 'desc')),
      ('FI', createFormat('FI', 2, 'Integer', 'desc')),
      ('GT', createFormat('GT', 2, 'Integer', 'Special GT key')),
      ('PS', createFormat('PS', 1, 'Integer', 'Special PS key'))
  ])

  with TestPipeline() as p:
    variant_1 = self._get_sample_variant_info_ia_cardinality_mismatch()
    variant_2 = self._get_sample_variant_format_fi_float_value()
    inferred_headers = (
        p
        | Create([variant_1, variant_2])
        | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
            defined_headers=vcf_header_io.VcfHeader(
                infos=infos, formats=formats),
            allow_incompatible_records=True,
            infer_headers=True))

    expected_infos = {
        'IA': createInfo('IA', '.', 'Float', ''),
        'IF': createInfo('IF', 1, 'Float', '')
    }
    expected_formats = {
        'FI': createFormat('FI', 2, 'Float', 'desc'),
        'FU': createFormat('FU', '.', 'Float', '')
    }
    expected = vcf_header_io.VcfHeader(
        infos=expected_infos, formats=expected_formats)
    assert_that(
        inferred_headers,
        asserts.header_fields_equal_ignore_order([expected]))
    p.run()
def test_merge_header_definitions_save_five_copies(self):
  lines_1 = [
      '##INFO=<ID=NS,Number=1,Type=Float,Description="Number samples">\n',
      '#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample1 Sample2\n'
  ]
  lines_2 = [
      '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number samples">\n',
      '#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample3\n'
  ]
  vcf_reader_1 = vcf.Reader(fsock=iter(lines_1))
  vcf_reader_2 = vcf.Reader(fsock=iter(lines_2))

  file_names = ['file1', 'file2', 'file3', 'file4', 'file5', 'file6']
  headers = []
  for file_name in file_names:
    headers.append(self._get_vcf_header_from_reader(vcf_reader_1, file_name))
  headers.append(self._get_vcf_header_from_reader(vcf_reader_2, 'file7'))

  pipeline = TestPipeline()
  merged_definitions = (
      pipeline
      | Create(headers)
      | 'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())

  expected = VcfHeaderDefinitions()
  expected._infos = {
      'NS': {
          Definition(1, 'Float'):
              ['file1', 'file2', 'file3', 'file4', 'file5'],
          Definition(1, 'Integer'): ['file7']
      }
  }
  assert_that(merged_definitions, equal_to([expected]))
  pipeline.run()
def test_timestamp_param_map(self):
  with TestPipeline() as p:
    assert_that(
        p | Create([1, 2]) | beam.Map(lambda _, t=DoFn.TimestampParam: t),
        equal_to([MIN_TIMESTAMP, MIN_TIMESTAMP]))
def test_apply_custom_transform(self):
  with TestPipeline() as pipeline:
    pcoll = pipeline | 'pcoll' >> Create([1, 2, 3])
    result = pcoll | PipelineTest.CustomTransform()
    assert_that(result, equal_to([2, 3, 4]))
def test_create_singleton_pcollection(self):
  with TestPipeline() as pipeline:
    pcoll = pipeline | 'label' >> Create([[1, 2, 3]])
    assert_that(pcoll, equal_to([[1, 2, 3]]))
def timestamped_key_values(self, pipeline, key, *timestamps):
  return (pipeline
          | 'start' >> Create(timestamps)
          | Map(lambda x: WindowedValue((key, x), x, [GlobalWindow()])))
def test_eager_pipeline(self):
  p = Pipeline('EagerRunner')
  self.assertEqual([1, 4, 9], p | Create([1, 2, 3]) | Map(lambda x: x * x))
def test_run(self):
  elems = [
      {
          'a': 1,
          'b': 1,
          'c': {'x': 1, 'y': 1},
          'd': {'x': 1, 'y': 1},
          'e': 1,
          'f': 1,
          'g': 1,
          'h': 1
      },
      {
          'a': 2,
          'b': 2,
          'c': {'x': 2, 'y': 2},
          'd': {'x': 2, 'y': 2},
          'e': 2,
          'f': 2,
          'g': 2,
          'h': 2
      },
  ]
  with TestPipeline() as p:
    pc = (
        p
        | Create(elems)
        | RestructDict(
            mappings={
                'a': 'moved_a',
                'b': 'nested.moved_b',
                'c.x': 'nested.moved_c_x',
                'c.y': 'moved_c_y',
                'd': True,
                'e': False,
                'f': None,
            }))
    assert_that(
        pc,
        equal_to([
            {
                'moved_a': 1,
                'nested': {'moved_b': 1, 'moved_c_x': 1},
                'moved_c_y': 1,
                'd': {'x': 1, 'y': 1}
            },
            {
                'moved_a': 2,
                'nested': {'moved_b': 2, 'moved_c_x': 2},
                'moved_c_y': 2,
                'd': {'x': 2, 'y': 2}
            },
        ]))
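Reading the expected output against the mappings argument, the semantics this test exercises appear to be: a string value renames the source field to the destination key, with dots on either side addressing nested dicts ('b' -> 'nested.moved_b', 'c.x' -> 'nested.moved_c_x'); True keeps the field under its original name ('d'); False or None drops it ('e', 'f'); and fields absent from mappings altogether ('g', 'h') are dropped as well. This is inferred from the test data, not from RestructDict's documented contract.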
def test_create_singleton_pcollection(self):
  pipeline = TestPipeline()
  pcoll = pipeline | 'label' >> Create([[1, 2, 3]])
  assert_that(pcoll, equal_to([[1, 2, 3]]))
  pipeline.run()