def test_write_dataflow(self):
  """Writes variants with WriteToVcf and checks the emitted file contents."""
  pipeline = TestPipeline()
  pcoll = pipeline | beam.Create(self.variants)
  _ = pcoll | 'Write' >> vcfio.WriteToVcf(self.path)
  pipeline.run()

  # Collect every line written across all output shards.
  read_result = []
  for file_name in glob.glob(self.path + '*'):
    with open(file_name, 'r') as f:
      read_result.extend(f.read().splitlines())

  # zip() silently truncates to the shorter sequence, so an empty or partial
  # output file would make the loop below pass vacuously; pin the count first.
  self.assertEqual(len(read_result), len(self.variant_lines))
  for actual, expected in zip(read_result, self.variant_lines):
    self._assert_variant_lines_equal(actual, expected)
def test_write_dataflow_auto_compression(self):
  """Writes variants with AUTO compression and checks the gzipped output."""
  pipeline = TestPipeline()
  pcoll = pipeline | beam.Create(self.variants)
  # A '.gz' suffix with CompressionTypes.AUTO selects gzip compression.
  _ = pcoll | 'Write' >> vcfio.WriteToVcf(
      self.path + '.gz',
      compression_type=CompressionTypes.AUTO)
  pipeline.run()

  # Collect every line written across all (gzipped) output shards.
  read_result = []
  for file_name in glob.glob(self.path + '*'):
    with gzip.GzipFile(file_name, 'r') as f:
      read_result.extend(f.read().splitlines())

  # zip() silently truncates to the shorter sequence, so an empty or partial
  # output file would make the loop below pass vacuously; pin the count first.
  self.assertEqual(len(read_result), len(self.variant_lines))
  for actual, expected in zip(read_result, self.variant_lines):
    self._assert_variant_lines_equal(actual, expected)
def test_write_dataflow_header(self):
  """Writes variants with a custom header line and verifies it comes first."""
  pipeline = TestPipeline()
  pcoll = pipeline | 'Create' >> beam.Create(self.variants)
  headers = ['foo\n']
  _ = pcoll | 'Write' >> vcfio.WriteToVcf(
      self.path + '.gz',
      compression_type=CompressionTypes.AUTO,
      headers=headers)
  pipeline.run()

  # Collect every line written across all (gzipped) output shards.
  read_result = []
  for file_name in glob.glob(self.path + '*'):
    with gzip.GzipFile(file_name, 'r') as f:
      read_result.extend(f.read().splitlines())

  # zip() silently truncates to the shorter sequence, and read_result[0]
  # raises IndexError (not a clean assertion) on empty output; pin the
  # expected line count (header + variants) before inspecting content.
  self.assertEqual(len(read_result), len(self.variant_lines) + 1)
  self.assertEqual(read_result[0], 'foo')
  for actual, expected in zip(read_result[1:], self.variant_lines):
    self._assert_variant_lines_equal(actual, expected)
def test_write_dataflow_1_based(self):
  """Writes 1-based-coordinate variants and checks the emitted VCF lines."""
  variants = [
      _get_sample_variant_1(use_1_based_coordinate=True),
      _get_sample_variant_2(use_1_based_coordinate=True),
      _get_sample_variant_3(use_1_based_coordinate=True),
      _get_sample_non_variant(use_1_based_coordinate=True)]
  pipeline = TestPipeline()
  # reshuffle=False keeps the element order deterministic for the zip below.
  pcoll = pipeline | beam.Create(variants, reshuffle=False)
  _ = pcoll | 'Write' >> vcfio.WriteToVcf(self.path)
  pipeline.run()

  # Collect every line written across all output shards.
  read_result = []
  for file_name in glob.glob(self.path + '*'):
    with open(file_name, 'r') as f:
      read_result.extend(f.read().splitlines())

  # zip() silently truncates to the shorter sequence, so an empty or partial
  # output file would make the loop below pass vacuously; pin the count first.
  self.assertEqual(len(read_result), len(self.variant_lines))
  for actual, expected in zip(read_result, self.variant_lines):
    self._assert_variant_lines_equal(actual, expected)
def run(argv=None):
  # type: (List[str]) -> None
  """Runs the BigQuery-to-VCF pipeline.

  Reads variant rows from the configured BigQuery table, converts them to
  Variant records, densifies the call lists so every row has the same sample
  columns, and writes the result as a VCF file.

  Args:
    argv: Command-line arguments; defaults to sys.argv when None.
  """
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = vcf_to_bq_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)
  bq_source = bigquery.BigQuerySource(
      query=_BASE_QUERY_TEMPLATE.format(
          INPUT_TABLE='.'.join(
              bigquery_util.parse_table_reference(known_args.input_table))),
      validate=True,
      use_standard_sql=True)
  options = pipeline_options.PipelineOptions(pipeline_args)
  with beam.Pipeline(options=options) as p:
    # Note: the step label previously had a trailing space
    # ('ReadFromBigQuery '); fixed to keep step names clean.
    _ = (p
         | 'ReadFromBigQuery' >> beam.io.Read(bq_source)
         | bigquery_to_variant.BigQueryToVariant()
         | densify_variants.DensifyVariants()
         | vcfio.WriteToVcf(known_args.output_file))