  def test_write_dataflow(self):
    pipeline = TestPipeline()
    pcoll = pipeline | beam.Create(self.variants)
    _ = pcoll | 'Write' >> vcfio.WriteToVcf(self.path)
    pipeline.run()

    # The sink may shard its output across several files, so gather the
    # lines from every file matching the output path prefix.
    read_result = []
    for file_name in glob.glob(self.path + '*'):
      with open(file_name, 'r') as f:
        read_result.extend(f.read().splitlines())

    for actual, expected in zip(read_result, self.variant_lines):
      self._assert_variant_lines_equal(actual, expected)

  def test_write_dataflow_auto_compression(self):
    pipeline = TestPipeline()
    pcoll = pipeline | beam.Create(self.variants)
    # CompressionTypes.AUTO infers the codec from the file suffix, so the
    # '.gz' extension selects gzip compression.
    _ = pcoll | 'Write' >> vcfio.WriteToVcf(
        self.path + '.gz', compression_type=CompressionTypes.AUTO)
    pipeline.run()

    read_result = []
    for file_name in glob.glob(self.path + '*'):
      # Open in text mode so the lines come back as str, matching the
      # expected variant lines.
      with gzip.open(file_name, 'rt') as f:
        read_result.extend(f.read().splitlines())

    for actual, expected in zip(read_result, self.variant_lines):
      self._assert_variant_lines_equal(actual, expected)

  def test_write_dataflow_header(self):
    pipeline = TestPipeline()
    pcoll = pipeline | 'Create' >> beam.Create(self.variants)
    headers = ['foo\n']
    _ = pcoll | 'Write' >> vcfio.WriteToVcf(
        self.path + '.gz',
        compression_type=CompressionTypes.AUTO,
        headers=headers)
    pipeline.run()

    read_result = []
    for file_name in glob.glob(self.path + '*'):
      with gzip.open(file_name, 'rt') as f:
        read_result.extend(f.read().splitlines())

    # Header lines are written before any variant records.
    self.assertEqual(read_result[0], 'foo')
    for actual, expected in zip(read_result[1:], self.variant_lines):
      self._assert_variant_lines_equal(actual, expected)
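
  # The tests above rely on fixture state defined elsewhere in the test class:
  # self.path (a temporary output prefix), self.variants and self.variant_lines
  # (sample records plus their expected VCF text), and the
  # _assert_variant_lines_equal helper. A minimal, hypothetical sketch of such
  # a fixture (illustrative names only, not the project's actual setUp):
  #
  #   def setUp(self):
  #     self.tempdir = tempfile.mkdtemp()
  #     self.path = os.path.join(self.tempdir, 'output.vcf')
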
  def test_write_dataflow_1_based(self):
    variants = [
        _get_sample_variant_1(use_1_based_coordinate=True),
        _get_sample_variant_2(use_1_based_coordinate=True),
        _get_sample_variant_3(use_1_based_coordinate=True),
        _get_sample_non_variant(use_1_based_coordinate=True)]
    pipeline = TestPipeline()
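    # reshuffle=False keeps the created elements from being reshuffled, so the
    # written output order stays deterministic for the zip comparison below.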
    pcoll = pipeline | beam.Create(variants, reshuffle=False)
    _ = pcoll | 'Write' >> vcfio.WriteToVcf(self.path)
    pipeline.run()

    read_result = []
    for file_name in glob.glob(self.path + '*'):
      with open(file_name, 'r') as f:
        read_result.extend(f.read().splitlines())

    for actual, expected in zip(read_result, self.variant_lines):
      self._assert_variant_lines_equal(actual, expected)


def run(argv=None):
  # type: (Optional[List[str]]) -> None
  """Runs the BigQuery-to-VCF pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = vcf_to_bq_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)
  # Build a validated BigQuery source that queries the fully qualified
  # input table.
  bq_source = bigquery.BigQuerySource(
      query=_BASE_QUERY_TEMPLATE.format(INPUT_TABLE='.'.join(
          bigquery_util.parse_table_reference(known_args.input_table))),
      validate=True,
      use_standard_sql=True)

  options = pipeline_options.PipelineOptions(pipeline_args)
  with beam.Pipeline(options=options) as p:
    _ = (p
         | 'ReadFromBigQuery' >> beam.io.Read(bq_source)
         | bigquery_to_variant.BigQueryToVariant()
         | densify_variants.DensifyVariants()
         | vcfio.WriteToVcf(known_args.output_file))
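

# A typical script entry point for the pipeline above (a sketch; assumes this
# module is meant to be executed directly):
if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()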