# Example 1
def _read_variants(
        all_patterns,  # type: List[str]
        pipeline,  # type: beam.Pipeline
        known_args,  # type: argparse.Namespace
        pipeline_mode  # type: int
):
    # type: (...) -> pvalue.PCollection
    """Reads the input VCF files into a PCollection of Variants.

    A representative header file, when supplied on the command line, is parsed
    once here and handed to the VCF sources.
    """
    header_lines = None
    if known_args.representative_header_file:
        header_lines = vcf_header_parser.get_metadata_header_lines(
            known_args.representative_header_file)

    if pipeline_mode != pipeline_common.PipelineModes.LARGE:
        # Small/medium inputs: read the single pattern with a plain source.
        return pipeline | 'ReadFromVcf' >> vcfio.ReadFromVcf(
            all_patterns[0],
            representative_header_lines=header_lines,
            allow_malformed_records=known_args.allow_malformed_records,
            vcf_parser_type=vcfio.VcfParserType[known_args.vcf_parser])

    # Large inputs: fan the patterns out and read them all in parallel.
    return (pipeline
            | 'InputFilePattern' >> beam.Create(all_patterns)
            | 'ReadAllFromVcf' >> vcfio.ReadAllFromVcf(
                representative_header_lines=header_lines,
                allow_malformed_records=known_args.allow_malformed_records))
# Example 2
def read_variants(
        pipeline,  # type: beam.Pipeline
        all_patterns,  # type: List[str]
        pipeline_mode,  # type: PipelineModes
        allow_malformed_records,  # type: bool
        representative_header_lines=None,  # type: List[str]
        pre_infer_headers=False,  # type: bool
        sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH,  # type: int
        use_1_based_coordinate=False,  # type: bool
        move_hom_ref_calls=False  # type: bool
):
    # type: (...) -> pvalue.PCollection
    """Returns a PCollection of Variants by reading VCFs.

    Splittable BGZF inputs get a dedicated parallel source; everything else
    goes through ReadAllFromVcf (LARGE mode) or ReadFromVcf.
    """
    compression_type = get_compression_type(all_patterns)
    is_gzipped = compression_type == filesystem.CompressionTypes.GZIP
    if is_gzipped:
        splittable_bgzf = _get_splittable_bgzf(all_patterns)
        if splittable_bgzf:
            # BGZF blocks can be split, so no fusion break is needed here.
            return (pipeline
                    | 'ReadVariants' >> vcfio.ReadFromBGZF(
                        splittable_bgzf, representative_header_lines,
                        allow_malformed_records, pre_infer_headers,
                        sample_name_encoding, use_1_based_coordinate,
                        move_hom_ref_calls))

    # Both sources share the same keyword configuration.
    source_kwargs = dict(
        representative_header_lines=representative_header_lines,
        compression_type=compression_type,
        allow_malformed_records=allow_malformed_records,
        pre_infer_headers=pre_infer_headers,
        sample_name_encoding=sample_name_encoding,
        use_1_based_coordinate=use_1_based_coordinate,
        move_hom_ref_calls=move_hom_ref_calls)
    if pipeline_mode == PipelineModes.LARGE:
        variants = (
            pipeline
            | 'InputFilePattern' >> beam.Create(all_patterns)
            | 'ReadAllFromVcf' >> vcfio.ReadAllFromVcf(**source_kwargs))
    else:
        variants = pipeline | 'ReadFromVcf' >> vcfio.ReadFromVcf(
            all_patterns[0], **source_kwargs)

    if is_gzipped:
        # Keep Beam from fusing the unsplittable gzip read with downstream
        # stages, which would serialize all work onto one worker.
        variants |= 'FusionBreak' >> fusion_break.FusionBreak()
    return variants
# Example 3
def _read_variants(pipeline, known_args):
  """Helper method for returning a ``PCollection`` of Variants from VCFs."""
  pattern = known_args.input_pattern
  malformed_ok = known_args.allow_malformed_records
  if not known_args.optimize_for_large_inputs:
    return pipeline | 'ReadFromVcf' >> vcfio.ReadFromVcf(
        pattern, allow_malformed_records=malformed_ok)
  # Large-input mode: route through ReadAllFromVcf for better parallelism.
  return (pipeline
          | 'InputFilePattern' >> beam.Create([pattern])
          | 'ReadAllFromVcf' >> vcfio.ReadAllFromVcf(
              allow_malformed_records=malformed_ok))
def run(argv=None):
    # type: (List[str]) -> None
    """Runs preprocess pipeline.

    Reads the VCF headers (and, when ``--report_all_conflicts`` is set, the
    variant records themselves), merges the header definitions, and writes a
    conflicts report; optionally writes the resolved merged headers.

    Args:
        argv: Command-line arguments; defaults to ``sys.argv`` when ``None``.
    """
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = pipeline_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)
    options = pipeline_options.PipelineOptions(pipeline_args)
    all_patterns = known_args.all_patterns
    # Pipeline mode (e.g. LARGE) controls which read strategy is used below.
    pipeline_mode = pipeline_common.get_pipeline_mode(all_patterns)

    with beam.Pipeline(options=options) as p:
        headers = pipeline_common.read_headers(p, pipeline_mode, all_patterns)
        merged_headers = pipeline_common.get_merged_headers(headers)
        merged_definitions = (
            headers
            |
            'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())
        if known_args.report_all_conflicts:
            # Full report needs the variant records too: read them with
            # malformed records allowed so they can be listed in the report.
            if len(all_patterns) == 1:
                variants = p | 'ReadFromVcf' >> vcfio.ReadFromVcf(
                    all_patterns[0], allow_malformed_records=True)
            else:
                variants = (p
                            | 'InputFilePattern' >> beam.Create(all_patterns)
                            | 'ReadAllFromVcf' >>
                            vcfio.ReadAllFromVcf(allow_malformed_records=True))

            malformed_records = variants | filter_variants.ExtractMalformedVariants(
            )
            # NOTE(review): _get_inferred_headers appears to also fold the
            # inferred headers into merged_headers — confirm against its def.
            inferred_headers, merged_headers = (_get_inferred_headers(
                variants, merged_headers))
            # Headers/records are passed as Beam side inputs to the reporter.
            _ = (merged_definitions
                 | 'GenerateConflictsReport' >> beam.ParDo(
                     preprocess_reporter.generate_report,
                     known_args.report_path,
                     beam.pvalue.AsSingleton(merged_headers),
                     beam.pvalue.AsSingleton(inferred_headers),
                     beam.pvalue.AsIter(malformed_records)))
        else:
            # Header-only report: no variant records are read.
            _ = (merged_definitions
                 | 'GenerateConflictsReport' >> beam.ParDo(
                     preprocess_reporter.generate_report,
                     known_args.report_path,
                     beam.pvalue.AsSingleton(merged_headers)))

        if known_args.resolved_headers_path:
            pipeline_common.write_headers(merged_headers,
                                          known_args.resolved_headers_path)
# Example 5 (example 4 is missing from this extract)
def _read_variants(pipeline, known_args):
    # type: (beam.Pipeline, argparse.Namespace) -> pvalue.PCollection
    """Reads the input VCFs into a PCollection of Variants."""
    header_lines = None
    if known_args.representative_header_file:
        header_lines = vcf_header_parser.get_metadata_header_lines(
            known_args.representative_header_file)

    if not known_args.optimize_for_large_inputs:
        return pipeline | 'ReadFromVcf' >> vcfio.ReadFromVcf(
            known_args.input_pattern,
            representative_header_lines=header_lines,
            allow_malformed_records=known_args.allow_malformed_records)

    # Large-input mode: ReadAllFromVcf parallelizes better across workers.
    return (pipeline
            | 'InputFilePattern' >> beam.Create([known_args.input_pattern])
            | 'ReadAllFromVcf' >> vcfio.ReadAllFromVcf(
                representative_header_lines=header_lines,
                allow_malformed_records=known_args.allow_malformed_records))
def read_variants(
    pipeline,  # type: beam.Pipeline
    all_patterns,  # type: List[str]
    pipeline_mode,  # type: PipelineModes
    allow_malformed_records,  # type: bool
    representative_header_lines=None,  # type: List[str]
    vcf_parser=vcfio.VcfParserType.PYVCF  # type: vcfio.VcfParserType
    ):
  # type: (...) -> pvalue.PCollection
  """Returns a PCollection of Variants by reading VCFs."""
  compression_type = get_compression_type(all_patterns)
  gzipped = compression_type == filesystem.CompressionTypes.GZIP
  if gzipped:
    splittable_bgzf = _get_splittable_bgzf(all_patterns)
    if splittable_bgzf:
      # Splittable BGZF reads in parallel directly; no fusion break needed.
      return (pipeline
              | 'ReadVariants'
              >> vcfio.ReadFromBGZF(splittable_bgzf,
                                    representative_header_lines,
                                    allow_malformed_records))

  if pipeline_mode != PipelineModes.LARGE:
    variants = pipeline | 'ReadFromVcf' >> vcfio.ReadFromVcf(
        all_patterns[0],
        representative_header_lines=representative_header_lines,
        compression_type=compression_type,
        allow_malformed_records=allow_malformed_records,
        vcf_parser_type=vcf_parser)
  else:
    variants = (pipeline
                | 'InputFilePattern' >> beam.Create(all_patterns)
                | 'ReadAllFromVcf' >> vcfio.ReadAllFromVcf(
                    representative_header_lines=representative_header_lines,
                    compression_type=compression_type,
                    allow_malformed_records=allow_malformed_records))

  if gzipped:
    # Break fusion so the unsplittable gzip read does not serialize the
    # downstream stages onto a single worker.
    variants |= 'FusionBreak' >> fusion_break.FusionBreak()
  return variants