def extract_pharmcat_pgx_regions(tabix_executable_path, input_vcf, output_dir, input_ref_pgx_vcf):
    '''
    Extract the PGx regions described by input_ref_pgx_vcf from input_vcf.

    One region is built per chromosome of the reference PGx position file
    (CHROM plus the POS aggregate returned by get_vcf_pos_min_max). Every
    record of input_vcf that falls in one of those regions is written to
    "<output_dir>/<prefix>.pgx_regions.vcf.gz", with purely numeric chromosome
    names rewritten to carry a leading "chr". The output is then indexed via
    tabix_index_vcf.

    :param tabix_executable_path: passed through to tabix_index_vcf
    :param input_vcf: path of the VCF to extract from; it is queried by
        region, so it presumably has to be bgzipped and indexed - confirm
    :param output_dir: directory that receives the output file
    :param input_ref_pgx_vcf: path of the reference PGx position VCF
    :return: path of the written ".pgx_regions.vcf.gz" file
    '''
    print(
        'Modify chromosome names.\nExtract PGx regions based on the input reference PGx position file.'
    )
    path_output = os.path.join(
        output_dir,
        obtain_vcf_file_prefix(input_vcf) + '.pgx_regions.vcf.gz')

    # compiled once here instead of being re-resolved for every record
    numeric_chrom = re.compile(r'^([0-9]+)')

    input_vcf_cyvcf2 = VCF(input_vcf)
    try:
        # get pgx regions in each chromosome
        # NOTE(review): astype(int) raises ValueError for non-numeric
        # chromosome names (e.g. X, Y, M) in the reference file, and the
        # rename below only prefixes numeric names - confirm this is intended.
        input_ref_pgx_pos_pandas = allel.vcf_to_dataframe(input_ref_pgx_vcf)
        input_ref_pgx_pos_pandas['CHROM'] = input_ref_pgx_pos_pandas[
            'CHROM'].replace({'chr': ''}, regex=True).astype(str).astype(int)
        ref_pgx_regions = input_ref_pgx_pos_pandas.groupby(
            ['CHROM'])['POS'].agg(get_vcf_pos_min_max).reset_index()

        # one "<CHROM>:<POS aggregate>" region string per chromosome
        ref_pgx_regions = ref_pgx_regions.apply(
            lambda row: ':'.join(row.values.astype(str)), axis=1)

        # fix chr names
        seqnames = input_vcf_cyvcf2.seqnames
        if any(name.startswith('chr') for name in seqnames):
            # chromosomes have leading 'chr' characters in the original VCF,
            # so the regions to be extracted need the same prefix
            ref_pgx_regions = ref_pgx_regions.replace({'^': 'chr'}, regex=True)
        else:
            # chromosomes do not have leading 'chr' characters in the original VCF
            # add chromosome name with leading 'chr' to the VCF header so the
            # renamed records written below refer to declared contigs
            for single_chr in seqnames:
                input_vcf_cyvcf2.add_to_header('##contig=<ID=chr' + single_chr + '>')

        # write to a VCF output file
        # header (taken from the possibly extended input header)
        output_vcf_cyvcf2 = Writer(path_output, input_vcf_cyvcf2, mode="wz")
        try:
            # content
            for single_region in ref_pgx_regions:
                for single_variant in input_vcf_cyvcf2(single_region):
                    single_variant.CHROM = numeric_chrom.sub(
                        r'chr\1', single_variant.CHROM)
                    output_vcf_cyvcf2.write_record(single_variant)
        finally:
            # always flush and close the bgzip stream, even on failure
            output_vcf_cyvcf2.close()
    finally:
        input_vcf_cyvcf2.close()

    # index only after the output has been completely written and closed
    tabix_index_vcf(tabix_executable_path, path_output)

    return path_output
def annotate(vcf, roh, bed, v14, quality_threshold, flag_upd_at_fraction,
             output, verbose):
    """Markup VCF file using rho-calls. Use BED file to mark all variants in AZ
    windows. Alternatively, use a bcftools v>=1.4 file with RG entries to mark
    all vars. With the --no-v14 flag, use an older bcftools v<=1.2 style roh
    TSV to mark only selected AZ variants. Roh is broken in bcftools v1.3 - do
    not use."""
    # NOTE(review): the docstring above is kept verbatim because it looks like
    # it may be surfaced as command-line help text - confirm before editing.

    # verbosity values above 3 are clamped to the level registered for 3
    loglevel = LEVELS.get(min(verbose, 3))
    configure_stream(level=loglevel)

    proband_vcf = VCF(vcf)

    # add this command to VCF header

    ## This is for logging the command line string ##
    # Only the declared call arguments are recorded. Iterating every frame
    # local would also pick up helper variables such as the opened VCF object,
    # whose repr is not reproducible between runs. Arguments with a falsy
    # value are left out.
    args, _, _, values = inspect.getargvalues(inspect.currentframe())
    argument_list = [
        name + '=' + str(values[name]) for name in args if values[name]
    ]

    logger.info("Running rhocall annotate {0}".format(__version__))
    logger.debug("Arguments: {0}".format(', '.join(argument_list)))

    ## add additional tags to VCF header
    proband_vcf.add_to_header('##rhocall_version={0}'.format(__version__))
    proband_vcf.add_to_header("##rhocall_arguments={0}".format(
        ', '.join(argument_list)))

    if roh and bed:
        click.echo("""Cannot use both BED and ROH at once. Please apply them sequentially instead.""")
    elif roh:
        if v14:
            # roh file handed over as bcfroh (bcftools v>=1.4 per the docstring)
            run_annotate_rg(proband_vcf=proband_vcf,
                            bcfroh=roh,
                            quality_threshold=quality_threshold,
                            flag_upd_at_fraction=flag_upd_at_fraction,
                            output=output)
        else:
            # older style roh file (bcftools v<=1.2 per the docstring)
            run_annotate_var(proband_vcf=proband_vcf,
                             roh=roh,
                             quality_threshold=quality_threshold,
                             flag_upd_at_fraction=flag_upd_at_fraction,
                             output=output)
    elif bed:
        run_annotate(proband_vcf=proband_vcf,
                     bed=bed,
                     quality_threshold=quality_threshold,
                     flag_upd_at_fraction=flag_upd_at_fraction,
                     output=output)
    else:
        # neither input given: say so instead of claiming both were supplied
        click.echo("No BED or ROH file given; nothing to annotate.")
raise ValueError(f"{fai} does not exist.") TEMPLATE_VCF = f"""##fileformat=VCFv4.2 ##FILTER=<ID=PASS,Description="All filters passed"> ##fileDate={date.today().strftime("%Y%m%d")} #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{sample} chr1\t1\t.\tA\tT\t.\tPASS\t.\tGT\t0|0 """ TEMPLATE_VCF_FILE = joboutdir / "template.vcf" TEMPLATE_VCF_FILE.write_text(TEMPLATE_VCF) vcf = VCF(TEMPLATE_VCF_FILE) # Add source vcf.add_to_header(f"##source=biopipen.ns.bed.Bed2Vcf") # Add genome assembly if genome: vcf.add_to_header(f"##reference={genome}") vcf.add_info_to_header( { "ID": "END", "Number": "1", "Type": "Integer", "Description": "End position of the variant described in this record" } ) vcf.add_format_to_header(
logger.success(f"Extracted {len(genes)} genes from the panel") logger.info("Extracting intervals for genes from GFF...") with open(snakemake.input.annotation) as istream: ivtree = extract_intervals_for_genes_from_gff(genes, istream, padding) logger.success(f"Intervals extracted for {len(ivtree)} genes") logger.info( "Extracting those VCF records that fall within the gene intervals and altering " "their CHROM and POS accordingly...") vcf_reader = VCF(snakemake.input.vcf) logger.debug("Adding genes to header...") for iv in ivtree: vcf_reader.add_to_header( f"##contig=<ID={iv.data[0]},length={iv.end-iv.begin}>") logger.debug("Genes added to header") with TemporaryDirectory() as tmpdirname: tmpvcf = str(Path(tmpdirname) / "tmp.vcf") vcf_writer = Writer(tmpvcf, tmpl=vcf_reader) for record in vcf_reader: if apply_filters and record.FILTER is not None: continue gt = Genotype.from_arr(record.genotypes[0]) if only_alt and not gt.is_hom_alt(): continue ivs = ivtree[record.start]