Exemple #1
0
 def test_read_variants_large_mode(self):
     """Read 'valid-4.0.vcf' in LARGE pipeline mode and check the count.

     The resulting collection must satisfy ``asserts.count_equals_to(5)``.
     """
     test_pipe = test_pipeline.TestPipeline()
     vcf_path = testdata_util.get_full_file_path('valid-4.0.vcf')
     # NOTE(review): the fourth positional argument (False) looks like the
     # allow-malformed-records switch -- confirm against read_variants.
     read_result = pipeline_common.read_variants(
         test_pipe, [vcf_path], PipelineModes.LARGE, False)
     # The check is declared before run(); the pipeline is executed below.
     assert_that(read_result, asserts.count_equals_to(5))
     test_pipe.run()
Exemple #2
0
def _read_variants(
        all_patterns,  # type: List[str]
        pipeline,  # type: beam.Pipeline
        known_args,  # type: argparse.Namespace
        pipeline_mode,  # type: int
        pre_infer_headers=False,  # type: bool
        keep_raw_sample_names=False,  # type: bool
        use_1_based_coordinate=True  # type: bool
):
    # type: (...) -> pvalue.PCollection
    """Translate parsed flags into a `pipeline_common.read_variants` call.

    Args:
      all_patterns: Input file patterns, forwarded unchanged.
      pipeline: Pipeline the read is attached to, forwarded unchanged.
      known_args: Parsed flags. Reads `representative_header_file`,
        `allow_malformed_records` and (unless `keep_raw_sample_names` is
        set) `sample_name_encoding`.
      pipeline_mode: Forwarded unchanged.
      pre_infer_headers: Forwarded unchanged as a keyword argument.
      keep_raw_sample_names: When true, `SampleNameEncoding.NONE` is used and
        the `sample_name_encoding` flag is not consulted.
      use_1_based_coordinate: Forwarded unchanged as a keyword argument.

    Returns:
      Whatever `pipeline_common.read_variants` returns.
    """
    # Header lines are only loaded when a representative header file is given.
    header_file = known_args.representative_header_file
    header_lines = (
        vcf_header_parser.get_metadata_header_lines(header_file)
        if header_file else None)

    allow_malformed = known_args.allow_malformed_records

    if keep_raw_sample_names:
        encoding = SampleNameEncoding.NONE
    else:
        # The flag holds the enum member's name, hence the lookup by key.
        encoding = SampleNameEncoding[known_args.sample_name_encoding]

    return pipeline_common.read_variants(
        pipeline,
        all_patterns,
        pipeline_mode,
        allow_malformed,
        header_lines,
        pre_infer_headers=pre_infer_headers,
        sample_name_encoding=encoding,
        use_1_based_coordinate=use_1_based_coordinate)
Exemple #3
0
 def test_read_variants_use_1_based_coordinate(self):
     """Read 'valid-4.0.vcf' in SMALL mode with use_1_based_coordinate=True.

     The resulting collection must satisfy ``asserts.count_equals_to(5)``.
     """
     test_pipe = test_pipeline.TestPipeline()
     vcf_path = testdata_util.get_full_file_path('valid-4.0.vcf')
     # NOTE(review): the fourth positional argument (False) looks like the
     # allow-malformed-records switch -- confirm against read_variants.
     read_result = pipeline_common.read_variants(
         test_pipe,
         [vcf_path],
         PipelineModes.SMALL,
         False,
         use_1_based_coordinate=True)
     # The check is declared before run(); the pipeline is executed below.
     assert_that(read_result, asserts.count_equals_to(5))
     test_pipe.run()
Exemple #4
0
def run(argv=None):
    # type: (List[str]) -> None
    """Builds and runs the preprocess pipeline.

    The pipeline reads the headers of all input patterns, merges them, and
    hands the merged definitions to `preprocess_reporter.generate_report`
    together with `known_args.report_path`. When `report_all_conflicts` is
    set, the variants are read as well (malformed records allowed, headers
    pre-inferred) so that inferred headers and malformed records are passed
    to the report as additional side inputs. When `resolved_headers_path` is
    set, the merged headers are written there.

    Args:
      argv: Command-line arguments passed to `pipeline_common.parse_args`.
        When falsy, `sys.argv` is used for the logged command line only.

    Returns:
      None. The pipeline is executed when the `with` block exits.
    """
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = pipeline_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)
    options = pipeline_options.PipelineOptions(pipeline_args)
    all_patterns = known_args.all_patterns
    pipeline_mode = pipeline_common.get_pipeline_mode(all_patterns)

    with beam.Pipeline(options=options) as p:
        headers = pipeline_common.read_headers(p, pipeline_mode, all_patterns)
        merged_headers = pipeline_common.get_merged_headers(headers)
        merged_definitions = (
            headers
            |
            'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())

        if known_args.report_all_conflicts:
            variants = pipeline_common.read_variants(
                p,
                all_patterns,
                pipeline_mode,
                allow_malformed_records=True,
                pre_infer_headers=True)
            malformed_records = (
                variants | filter_variants.ExtractMalformedVariants())
            # merged_headers is deliberately rebound here: both the report and
            # the optional header export below must see the updated value.
            inferred_headers, merged_headers = _get_inferred_headers(
                variants, merged_headers)
            report_side_inputs = [
                beam.pvalue.AsSingleton(merged_headers),
                beam.pvalue.AsSingleton(inferred_headers),
                beam.pvalue.AsIter(malformed_records),
            ]
        else:
            report_side_inputs = [beam.pvalue.AsSingleton(merged_headers)]

        # Single application of the report step; only the side inputs differ
        # between the two modes.
        _ = (merged_definitions
             | 'GenerateConflictsReport' >> beam.ParDo(
                 preprocess_reporter.generate_report,
                 known_args.report_path,
                 *report_side_inputs))

        if known_args.resolved_headers_path:
            pipeline_common.write_headers(merged_headers,
                                          known_args.resolved_headers_path)
Exemple #5
0
def _read_variants(all_patterns,  # type: List[str]
                   pipeline,  # type: beam.Pipeline
                   known_args,  # type: argparse.Namespace
                   pipeline_mode  # type: int
                  ):
  # type: (...) -> pvalue.PCollection
  """Translate parsed flags into a `pipeline_common.read_variants` call.

  Args:
    all_patterns: Input file patterns, forwarded unchanged.
    pipeline: Pipeline the read is attached to, forwarded unchanged.
    known_args: Parsed flags. Reads `representative_header_file`,
      `allow_malformed_records` and `vcf_parser`.
    pipeline_mode: Forwarded unchanged.

  Returns:
    Whatever `pipeline_common.read_variants` returns.
  """
  # Header lines are only loaded when a representative header file is given.
  header_file = known_args.representative_header_file
  header_lines = (
      vcf_header_parser.get_metadata_header_lines(header_file)
      if header_file else None)

  allow_malformed = known_args.allow_malformed_records
  # The flag holds the enum member's name, hence the lookup by key.
  parser_type = vcfio.VcfParserType[known_args.vcf_parser]

  return pipeline_common.read_variants(
      pipeline,
      all_patterns,
      pipeline_mode,
      allow_malformed,
      header_lines,
      parser_type)