Example 1
def _merge_headers(known_args,
                   pipeline_args,
                   pipeline_mode,
                   annotated_vcf_pattern=None):
    # type: (argparse.Namespace, List[str], int, str) -> None
    """Merges VCF headers with Beam, based on pipeline_mode."""
    options = pipeline_options.PipelineOptions(pipeline_args)

    # Always run pipeline locally if data is small.
    if (pipeline_mode == pipeline_common.PipelineModes.SMALL
            and not known_args.infer_headers
            and not known_args.infer_annotation_types):
        options.view_as(
            pipeline_options.StandardOptions).runner = 'DirectRunner'

    google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
    merge_headers_job_name = pipeline_common.generate_unique_name(
        _MERGE_HEADERS_JOB_NAME)
    if google_cloud_options.job_name:
        google_cloud_options.job_name += '-' + merge_headers_job_name
    else:
        google_cloud_options.job_name = merge_headers_job_name

    temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
    temp_merged_headers_file_name = '-'.join(
        [google_cloud_options.job_name, _MERGE_HEADERS_FILE_NAME])
    temp_merged_headers_file_path = filesystems.FileSystems.join(
        temp_directory, temp_merged_headers_file_name)
    if not known_args.append:
        bigquery_util.create_sample_info_table(known_args.output_table)

    with beam.Pipeline(options=options) as p:
        headers = pipeline_common.read_headers(p, pipeline_mode,
                                               known_args.all_patterns)
        _ = (
            headers
            | 'SampleInfoToBigQuery' >>
            sample_info_to_bigquery.SampleInfoToBigQuery(
                known_args.output_table,
                SampleNameEncoding[known_args.sample_name_encoding],
                known_args.append))
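        # When a representative header file is already provided, skip the
        # merge; only the SampleInfoToBigQuery load above is needed.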
        if known_args.representative_header_file:
            return
        merged_header = pipeline_common.get_merged_headers(
            headers, known_args.split_alternate_allele_info_fields,
            known_args.allow_incompatible_records)
        if annotated_vcf_pattern:
            merged_header = pipeline_common.add_annotation_headers(
                p, known_args, pipeline_mode, merged_header,
                annotated_vcf_pattern)
        if known_args.infer_headers or known_args.infer_annotation_types:
            infer_headers_input_pattern = (
                [annotated_vcf_pattern] if annotated_vcf_pattern
                else known_args.all_patterns)
            merged_header = _add_inferred_headers(infer_headers_input_pattern,
                                                  p, known_args, merged_header,
                                                  pipeline_mode)

        pipeline_common.write_headers(merged_header,
                                      temp_merged_headers_file_path)
        known_args.representative_header_file = temp_merged_headers_file_path
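
A minimal call-site sketch for the function above (the driver name _run_merge
is hypothetical; parse_args, get_pipeline_mode and _COMMAND_LINE_OPTIONS are
the same pipeline_common helpers and module constant used in Example 2 below):

def _run_merge(argv=None):
    # type: (List[str]) -> None
    # Parse flags into known_args/pipeline_args, pick a pipeline mode from
    # the input patterns, then hand everything to _merge_headers.
    known_args, pipeline_args = pipeline_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)
    pipeline_mode = pipeline_common.get_pipeline_mode(known_args.all_patterns)
    _merge_headers(known_args, pipeline_args, pipeline_mode)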

Example 2

def run(argv=None):
    # type: (List[str]) -> None
    """Runs the preprocess pipeline."""
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = pipeline_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)
    options = pipeline_options.PipelineOptions(pipeline_args)
    all_patterns = known_args.all_patterns
    pipeline_mode = pipeline_common.get_pipeline_mode(all_patterns)

    with beam.Pipeline(options=options) as p:
        headers = pipeline_common.read_headers(p, pipeline_mode, all_patterns)
        merged_headers = pipeline_common.get_merged_headers(headers)
        merged_definitions = (
            headers
            | 'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())
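        # When report_all_conflicts is set, also read the variant records so
        # the report can cover malformed records and inferred headers.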
        if known_args.report_all_conflicts:
            if len(all_patterns) == 1:
                variants = p | 'ReadFromVcf' >> vcfio.ReadFromVcf(
                    all_patterns[0], allow_malformed_records=True)
            else:
                variants = (
                    p
                    | 'InputFilePattern' >> beam.Create(all_patterns)
                    | 'ReadAllFromVcf' >> vcfio.ReadAllFromVcf(
                        allow_malformed_records=True))

            malformed_records = (
                variants | filter_variants.ExtractMalformedVariants())
            inferred_headers, merged_headers = _get_inferred_headers(
                variants, merged_headers)
            _ = (merged_definitions
                 | 'GenerateConflictsReport' >> beam.ParDo(
                     preprocess_reporter.generate_report,
                     known_args.report_path,
                     beam.pvalue.AsSingleton(merged_headers),
                     beam.pvalue.AsSingleton(inferred_headers),
                     beam.pvalue.AsIter(malformed_records)))
        else:
            _ = (merged_definitions
                 | 'GenerateConflictsReport' >> beam.ParDo(
                     preprocess_reporter.generate_report,
                     known_args.report_path,
                     beam.pvalue.AsSingleton(merged_headers)))

        if known_args.resolved_headers_path:
            pipeline_common.write_headers(merged_headers,
                                          known_args.resolved_headers_path)
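
Example 2's run() is the usual module entry point; a standard Beam-driver
footer (an assumption about the surrounding module, not part of the scraped
snippet) would invoke it as:

if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()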

Example 3
def _merge_headers(known_args, pipeline_args, pipeline_mode):
    # type: (argparse.Namespace, List[str], int) -> None
    """Merges VCF headers using beam based on pipeline_mode."""
    if known_args.representative_header_file:
        return

    options = pipeline_options.PipelineOptions(pipeline_args)

    # Always run pipeline locally if data is small.
    if (pipeline_mode == pipeline_common.PipelineModes.SMALL
            and not known_args.infer_headers
            and not known_args.infer_annotation_types):
        options.view_as(
            pipeline_options.StandardOptions).runner = 'DirectRunner'

    google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
    merge_headers_job_name = pipeline_common.generate_unique_name(
        _MERGE_HEADERS_JOB_NAME)
    if google_cloud_options.job_name:
        google_cloud_options.job_name += '-' + merge_headers_job_name
    else:
        google_cloud_options.job_name = merge_headers_job_name

    temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
    temp_merged_headers_file_name = '-'.join(
        [google_cloud_options.job_name, _MERGE_HEADERS_FILE_NAME])
    temp_merged_headers_file_path = filesystems.FileSystems.join(
        temp_directory, temp_merged_headers_file_name)

    with beam.Pipeline(options=options) as p:
        headers = pipeline_common.read_headers(p, pipeline_mode, known_args)
        merged_header = pipeline_common.get_merged_headers(
            headers, known_args.split_alternate_allele_info_fields,
            known_args.allow_incompatible_records)
        if known_args.infer_headers or known_args.infer_annotation_types:
            merged_header = _add_inferred_headers(p, known_args, merged_header)
        pipeline_common.write_headers(merged_header,
                                      temp_merged_headers_file_path)
        known_args.representative_header_file = temp_merged_headers_file_path
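
Note: Example 3 is an earlier, leaner variant of Example 1's _merge_headers.
It returns before constructing the pipeline when a representative header file
is already given, whereas Example 1 still runs the pipeline to load sample
info into BigQuery before taking the same early exit, and it also threads
annotated_vcf_pattern through for annotation and header-inference support.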