def _annotate_vcf_files(all_patterns, known_args, pipeline_args):
    # type: (List[str], argparse.Namespace, List[str]) -> str
    """Annotates the VCF files using VEP.

    Runs a dedicated Beam pipeline that applies `annotate_files.AnnotateFile`
    to every entry of `all_patterns`, then adjusts `known_args` so that later
    stages pick up the annotation field written by VEP.

    Args:
      all_patterns: Input patterns; each one becomes an element of the
        annotation pipeline.
      known_args: Parsed arguments. This function mutates it:
        `vep_info_field` is added to `annotation_fields` and `use_allele_num`
        is set to True.
      pipeline_args: Raw pipeline arguments used to build the pipeline options
        and also forwarded to `annotate_files.AnnotateFile`.

    Returns:
      The value of `vep_runner_util.get_output_pattern` for
      `known_args.annotation_output_dir`.
    """
    options = pipeline_options.PipelineOptions(pipeline_args)
    # Give the annotation job its own unique name.
    _update_google_cloud_job_name(
        options.view_as(pipeline_options.GoogleCloudOptions),
        pipeline_common.generate_unique_name(_ANNOTATE_FILES_JOB_NAME))

    # Leaving the `with` block runs the pipeline before execution continues.
    with beam.Pipeline(options=options) as pipeline:
        shard_patterns = pipeline | beam.Create(all_patterns)
        annotate_shards = beam.ParDo(
            annotate_files.AnnotateFile(known_args, pipeline_args))
        _ = shard_patterns | 'AnnotateShards' >> annotate_shards

    # Register the VEP info field as an annotation field, keeping any fields
    # that were already requested.
    existing_fields = known_args.annotation_fields
    if not existing_fields:
        known_args.annotation_fields = [known_args.vep_info_field]
    else:
        existing_fields.append(known_args.vep_info_field)

    # TODO(bashir2): The VEP runner by default runs VEP with --allele_number
    # hence we turn on this feature here. However, this might be inconsistent
    # with other annotation fields that are originally present in input files,
    # if they do not have ALLELE_NUM annotation. The fix is to make annotation
    # ALT matching smarter to fall back on other matching methods if
    # ALLELE_NUM is not present. When this is implemented, we may even
    # consider removing use_allele_num flag and always start by checking if
    # ALLELE_NUM is present.
    known_args.use_allele_num = True

    annotated_output_dir = known_args.annotation_output_dir
    return vep_runner_util.get_output_pattern(annotated_output_dir)
def get_output_pattern(self):
    # type: () -> str
    """Returns the output pattern for this instance's output directory.

    Delegates to `vep_runner_util.get_output_pattern`, passing
    `self._output_dir`.
    """
    output_dir = self._output_dir
    return vep_runner_util.get_output_pattern(output_dir)