def vcf2zarr(vcf_files, pop_file, zarr_path):
    """Convert numbered per-chromosome VCF files into groups of one Zarr store.

    Two very good tutorials:
    http://alimanfoo.github.io/2018/04/09/selecting-variants.html
    http://alimanfoo.github.io/2017/06/14/read-vcf.html

    Parameters
    ----------
    vcf_files : str
        Path template of the VCF files. The files must be numbered, and
        that number must be written as ``{n}`` in the template; it is
        replaced by 1..22.
    pop_file
        Passed to ``samples_from_population`` to obtain the wanted samples.
    zarr_path
        Output location handed to ``allel.vcf_to_zarr``; each chromosome is
        written to the group named after its number.

    Returns
    -------
    None
    """
    # TODO: Refactor to work without pysam and allel

    # Only the header of the first VCF is needed; close the file as soon as
    # the sample names have been read instead of leaking the handle.
    with pysam.VariantFile(vcf_files.replace("{n}", "1")) as first_vcf:
        vcf_samples = list(first_vcf.header.samples)

    # Keep the wanted samples that are actually present in the VCF, in VCF
    # header order so the selection is deterministic between runs.
    wanted_samples = set(samples_from_population(pop_file))
    found_samples = [name for name in vcf_samples if name in wanted_samples]

    # Create one zarr group for each chromosome
    for chrom in range(1, 23):
        vcf = vcf_files.replace("{n}", str(chrom))
        print(f"Creating zarr object for chromosome {chrom}")
        allel.vcf_to_zarr(
            vcf,
            zarr_path,
            group=str(chrom),
            region=str(chrom),
            fields=["POS", "ALT", "samples", "GT"],
            samples=found_samples,
            overwrite=True,
        )
    print("VCF data transformed into Zarr objects")
def vcf_to_zarr(vcf_path, zarr_path, fields=None):
    """Convert a VCF to Zarr, translating our field names to allel's.

    ``fields`` defaults to ``DEF_VCF_FIELDS``. Every field is looked up in
    ``VARIATION_ZARR_FIELD_MAPPING``, and ``'samples'`` is always included
    in the fields requested from ``allel.vcf_to_zarr``.
    """
    requested = DEF_VCF_FIELDS if fields is None else fields

    # convert our fields to allele zarr fields
    zarr_fields = []
    for name in requested:
        zarr_fields.append(VARIATION_ZARR_FIELD_MAPPING[name])

    if 'samples' not in zarr_fields:
        zarr_fields = zarr_fields + ['samples']

    allel.vcf_to_zarr(str(vcf_path), str(zarr_path), fields=zarr_fields)
def vcf_to_zarr_func(ch):
    """Convert region ``ch`` of INFN into the group ``ch`` of OUTFN.

    All other settings (fields, excluded fields, tabix executable and
    transformers) come from module-level names; progress is logged to
    stderr.
    """
    options = dict(
        region=ch,
        group=ch,
        log=sys.stderr,
        fields=FIELDS,
        exclude_fields=EXCLUDE_FIELDS,
        tabix=TABIX_EXEC,
        transformers=transformers,
    )
    allel.vcf_to_zarr(INFN, OUTFN, **options)
def create_allel_vcfzarr(
    shared_datadir: Path,
    tmpdir: Path,
    *,
    vcf_file: str = "sample.vcf.gz",
    **kwargs: Any,
) -> Path:
    """Convert ``shared_datadir / vcf_file`` to vcfzarr with scikit-allel.

    The store is written to ``tmpdir`` under the name
    ``allel_<vcf_file>.zarr``; extra keyword arguments are forwarded to
    ``allel.vcf_to_zarr``. Returns the path of the created store.
    """
    source = str(shared_datadir / vcf_file)
    output_path = tmpdir / ("allel_" + vcf_file + ".zarr")
    allel.vcf_to_zarr(source, str(output_path), **kwargs)
    return output_path
def create_vcfzarr(
    shared_datadir, tmpdir, *, fields=None, grouped_by_contig=False, consolidated=False
):
    """Convert ``sample.vcf`` from ``shared_datadir`` to vcfzarr with scikit-allel.

    With ``grouped_by_contig`` the contigs 19, 20 and X are each written to
    their own group; otherwise the whole file is converted in one call.
    With ``consolidated`` the store's metadata is consolidated afterwards.
    Returns the path of the store, ``tmpdir / "sample.vcf.zarr"``.
    """
    source = str(shared_datadir / "sample.vcf")
    output_path = tmpdir / "sample.vcf.zarr"
    store = str(output_path)

    if not grouped_by_contig:
        allel.vcf_to_zarr(source, store, fields=fields)
    else:
        for contig in ("19", "20", "X"):
            allel.vcf_to_zarr(
                source, store, fields=fields, group=contig, region=contig
            )

    if consolidated:
        zarr.consolidate_metadata(store)
    return output_path
def vcf_to_zarr(vcf_in, tabix_exec, chrom):
    """Convert on-disk VCF to on-disk Zarr database using scikit-allel and zarr modules

    Zarr database written to same directory as input VCF. Its name is the
    VCF file name up to the first '.', with a '.zarr' suffix.

    Args:
        vcf_in (str): Path to input VCF on disk
        tabix_exec (str): Full path to tabix executable
        chrom (str): Chromosome for which Zarr database should be created;
            used both as the region to read and as the Zarr group name

    Returns:
        None
    """
    # allel.vcf_to_zarr returns a directory with Zarr database
    # Set Zarr database outdir
    vcf_dir = os.path.dirname(vcf_in)
    zarr_base = os.path.basename(vcf_in).split('.')[0]
    # os.path.join keeps the output next to the VCF even when vcf_in has no
    # directory part; plain '/' concatenation would point at the filesystem
    # root ('/name.zarr') in that case.
    zarr_out = os.path.join(vcf_dir, zarr_base + '.zarr')

    # Rename 'numalt' field. Required by Zarr to distinguish `NUMALT` from `numalt`
    # `numalt` is automatically computed by scikit-allel
    rename_dict = {'variants/numalt': 'variants/numalt_sci'}

    # Use vcf_to_zarr function from scikit-allel to create zarr database
    # Currently optimized for biallelic SNP VCF but easy to extend functionality
    allel.vcf_to_zarr(
        input=vcf_in,
        output=zarr_out,
        overwrite=True,
        group=chrom,
        rename_fields=rename_dict,
        fields='*',
        alt_number=1,
        tabix=tabix_exec,
        region=chrom,
        compressor=numcodecs.Blosc(cname='zstd', clevel=1, shuffle=False),
    )
def vcf2zarr(chrom, zarr_path, vcf_path):
    """Convert a VCF to Zarr unless the target group directory already exists.

    Parameters
    ----------
    chrom : str
        Name of the Zarr group to write; the conversion is skipped when
        ``zarr_path/chrom`` is already a directory.
    zarr_path : str
        Location of the Zarr store.
    vcf_path : str
        Path of the VCF handed to ``allel.vcf_to_zarr``.

    Returns
    -------
    None.

    """
    # Nothing to do when this group was converted on an earlier run.
    if path.isdir(path.join(zarr_path, chrom)):
        return None

    blosc = numcodecs.Blosc(cname='zstd', clevel=1, shuffle=False)
    allel.vcf_to_zarr(
        vcf_path,
        zarr_path,
        group=chrom,
        fields='*',
        alt_number=2,
        log=sys.stdout,
        compressor=blosc,
    )
    return None
import argparse

import allel

# Command-line entry point: convert one VCF into a Zarr store.
parser = argparse.ArgumentParser()
# Both paths are mandatory: without them allel.vcf_to_zarr would be called
# with None and fail with an unrelated-looking error, so let argparse report
# the missing option instead.
parser.add_argument("--vcf", required=True, help="path to VCF (or .vcf.gz)")
parser.add_argument("--zarr", required=True, help="path for zarr output")
args = parser.parse_args()

allel.vcf_to_zarr(args.vcf, args.zarr)
# How does it work with tri-allelic variants?

############################
## testing scikit-allel
allel.__version__
zarr.__version__
numcodecs.__version__
np.__version__

# !ls -lh {VCFdata_fp}  # list the files in the directory

# format conversion
zarr_path = '../Data/match_combined_filtered/'
allel.vcf_to_zarr(VCFdata_fp + 'a_106_filtered.vcf.gz',
                  zarr_path + 'a_106_filtered_genotypes.zarr',
                  group='106', fields='*', log=sys.stdout, overwrite=True)

# re-open the store read-only and inspect its layout
callset = zarr.open_group(zarr_path + 'a_106_filtered_genotypes.zarr', mode='r')
callset.tree(expand=True)

gt_zarr = callset['106/calldata/GT']
gt_zarr.info

# A raw zarr array has no locate_range(); wrap the positions in a
# SortedIndex first, then use the located slice to load only the genotypes
# that fall inside the region.
pos = allel.SortedIndex(callset['106/variants/POS'])
loc_region = pos.locate_range(20000000, 20100000)
gt_region = allel.GenotypeArray(gt_zarr[loc_region])
import sys
from os.path import join

import zarr
import allel

# Usage: <script> VCF_FILE OUT_FILE
# sys.argv[0] is the script itself, so only the remaining entries are the
# two paths; unpacking the full argv would bind the script name to VCF_FILE
# (or fail outright when both paths are given).
if len(sys.argv) != 3:
    sys.exit("usage: " + sys.argv[0] + " VCF_FILE OUT_FILE")
VCF_FILE, OUT_FILE = sys.argv[1:]

# overwrite=False: refuse to replace data that already exists in OUT_FILE
allel.vcf_to_zarr(VCF_FILE, OUT_FILE, fields='*', overwrite=False)
-i: -o: Date: """ # Import Modules import allel import subprocess import argparse import os import pdb import zarr import shutil os.environ["NUMEXPR_MAX_THREADS"]="272" # Set up command line execution if __name__ == "__main__": parser = argparse.ArgumentParser(description='') parser.add_argument('-vcf', type=str, metavar='vcf_file', required=True, help='') parser.add_argument('-name', type=str, metavar='vcf_name', required=True, help='') parser.add_argument('-o', type=str, metavar='output_path', required=True, help='Output_path') args = parser.parse_args() outdir = os.path.dirname(args.o) if not os.path.exists(outdir): os.mkdir(outdir) print("compressing chromosome " + args.name) allel.vcf_to_zarr(args.vcf, args.o, group=str(args.name), fields='*', overwrite=True) print("done compressing chromosome " + args.name)
def main():
    """Command-line entry point: convert one sample of a VCF to Zarr.

    Each requested contig is converted with ``allel.vcf_to_zarr`` into the
    group ``<sample>/<contig>`` of the output store, using a Blosc
    compressor configured from the command line. Conversion progress is
    logged to stdout, stderr or a log file; with ``--zip`` the finished
    store is handed to ``zip_zarr``.
    """
    import argparse

    cli = argparse.ArgumentParser(
        description="Convert a single sample VCF to Zarr using "
                    "Blosc compression")

    # required inputs
    cli.add_argument("--input", required=True,
                     help="path to input VCF file.")
    cli.add_argument("--output", required=True,
                     help="path to output Zarr directory.")
    cli.add_argument("--sample", required=True,
                     help="Sample identifier.")
    cli.add_argument(
        "--contig", required=True, action='append', dest='contigs',
        help="Contig to extract. Multiple values may be provided.")
    cli.add_argument(
        "--field", required=True, action='append', dest='fields',
        help="Field to extract, e.g., 'variants/MQ' or 'calldata/GT'. Multiple "
             "values may be provided.")

    # compression settings
    cli.add_argument(
        "--compress-algo",
        help="Blosc compression algorithm. Choose from [zstd, blosclz, lz4, "
             "lz4hc, zlib, snappy].",
        default="zstd")
    cli.add_argument("--compress-level",
                     help="Compression level. Choose integer from [0, 9].",
                     default=1, type=int)
    cli.add_argument(
        "--compress-shuffle",
        help="Type of data shuffling used to obtain contiguous runs of same "
             "values for improving compression. Choose integer value from "
             "NOSHUFFLE (0), SHUFFLE (1), BITSHUFFLE (2) or AUTOSHUFFLE (-1). "
             "If -1 (default), bit-shuffle will be used for buffers with "
             "itemsize 1, and byte-shuffle will be used otherwise.",
        default=-1, type=int)

    # conversion tuning
    cli.add_argument("--alt-number",
                     help="Expected maximum number of alternate alleles.",
                     default=3, type=int)
    cli.add_argument("--tabix",
                     help="Path to tabix executable v0.2.5+.",
                     default="tabix")
    cli.add_argument("--chunk-length",
                     help="Chunk length in number of variants.",
                     default=2**18, type=int)
    cli.add_argument("--chunk-width",
                     help="Chunk width in number of samples.",
                     default=64, type=int)

    # logging and post-processing
    cli.add_argument(
        "--log",
        help="Path to logfile, stdout or stderr. Default: stdout.",
        default="stdout")
    cli.add_argument(
        "--zip", action="store_true",
        help="If flag exists, entire zarr folder is zipped with no "
             "compression, and the original zarr folder is deleted. "
             "The zip file name is the value given for the --output "
             "argument, appended with '.zip'.")

    opts = cli.parse_args()

    # Resolve the log destination; only a file we opened ourselves is closed.
    log_target = opts.log.strip()
    standard_streams = {"stderr": sys.stderr, "stdout": sys.stdout}
    owns_log = log_target not in standard_streams
    log_file = open(log_target, "w") if owns_log else standard_streams[log_target]

    try:
        for contig in opts.contigs:
            blosc = zarr.Blosc(cname=opts.compress_algo,
                               clevel=opts.compress_level,
                               shuffle=opts.compress_shuffle)
            allel.vcf_to_zarr(
                input=opts.input,
                output=opts.output,
                group=f"{opts.sample}/{contig}",
                region=contig,
                samples=[opts.sample],
                compressor=blosc,
                overwrite=True,
                tabix=opts.tabix,
                fields=opts.fields,
                alt_number=opts.alt_number,
                chunk_length=opts.chunk_length,
                chunk_width=opts.chunk_width,
                log=log_file,
            )
    finally:
        if owns_log:
            log_file.close()

    if opts.zip:
        zip_zarr(zarr_path=opts.output, del_orig=True)
def _compare_gts(vec):
    """Return (differences, comparisons, missing) for one site's alleles (for pi)."""
    c = Counter(vec)
    diffs = c[1] * c[0]
    gts = c[1] + c[0]
    # anything that's not 1 or 0 is ignored and counted as missing
    missing = len(vec) - gts
    comps = int(special.comb(gts, 2))
    return (diffs, comps, missing)


def _dxy_compare_gts(vec1, vec2):
    """Return (differences, comparisons, missing) between two populations at one site (for dxy)."""
    c1 = Counter(vec1)
    c2 = Counter(vec2)
    gt1zeros = c1[0]
    gt1ones = c1[1]
    gts1 = c1[1] + c1[0]
    gt2zeros = c2[0]
    gt2ones = c2[1]
    gts2 = c2[1] + c2[0]
    # anything that's not 1 or 0 is ignored and counted as missing
    missing = (len(vec1) + len(vec2)) - (gts1 + gts2)
    diffs = (gt1zeros * gt2ones) + (gt1ones * gt2zeros)
    comps = gts1 * gts2
    return (diffs, comps, missing)


def _tally_region(gt_region):
    """Sum the per-site pi tallies over a region; return (avg_pi, diffs, comps, missing)."""
    total_diffs = 0
    total_comps = 0
    total_missing = 0
    for site in gt_region:
        # flatten the site to a 1d vector of alleles and tally it
        site_diffs, site_comps, missing = _compare_gts(site.flatten())
        total_diffs += site_diffs
        total_comps += site_comps
        total_missing += missing
    avg_pi = total_diffs / total_comps if total_comps > 0 else 0
    return (avg_pi, total_diffs, total_comps, total_missing)


def _dxy_tally_region(pop1_gt_region, pop2_gt_region):
    """Sum the per-site dxy tallies over a region; return (avg_dxy, diffs, comps, missing)."""
    total_diffs = 0
    total_comps = 0
    total_missing = 0
    for x in range(0, len(pop1_gt_region)):
        vec1 = pop1_gt_region[x].flatten()
        vec2 = pop2_gt_region[x].flatten()
        site_diffs, site_comps, missing = _dxy_compare_gts(vec1, vec2)
        total_diffs += site_diffs
        total_comps += site_comps
        total_missing += missing
    avg_dxy = total_diffs / total_comps if total_comps > 0 else 0
    return (avg_dxy, total_diffs, total_comps, total_missing)


def _init_output_file(file_path, columns):
    """Remove any previous output at file_path and write a tab-separated header."""
    if os.path.exists(file_path):
        os.remove(file_path)
    with open(file_path, 'a') as outfile:
        outfile.write("\t".join(columns) + "\n")


def _build_genotype_filter(callset, calldata_fields, expression, missing_suffix):
    """AND together the boolean arrays of every comma separated term in expression.

    Each term is a calldata field name, a comparison operator and an
    integer (e.g. 'DP>=10'); the field is read from callset['/calldata/'].
    """
    ops = {
        "<": operator.lt,
        "<=": operator.le,
        ">": operator.gt,
        ">=": operator.ge,
        "==": operator.eq
    }
    combined = None
    for term in expression.split(","):
        stat = re.sub("[^A-Za-z]+", "", term)
        value = int(re.sub("[^0-9]+", "", term))
        compare = re.sub("[A-Za-z0-9]+", "", term)

        # check if the requested filter/format exists in the VCF
        if stat not in calldata_fields:
            raise Exception("[pixy] ERROR: The requested filter \'" + stat +
                            "\' " + missing_suffix)

        passed = ops[compare](callset['/calldata/' + stat][:], value)
        combined = passed if combined is None else np.logical_and(combined, passed)
    return combined


def _iter_windows(interval_start, interval_end, window_size):
    """Yield (window_pos_1, window_pos_2) for successive windows of an interval.

    After the first window the end position is capped at interval_end.
    """
    window_pos_2 = (interval_start + window_size) - 1
    for window_pos_1 in range(interval_start, interval_end, window_size):
        yield window_pos_1, window_pos_2
        window_pos_2 += window_size
        if window_pos_2 > interval_end:
            window_pos_2 = interval_end


def main(args=None):
    """Command-line entry point of pixy: compute pi, dxy and/or fst from a VCF.

    The arguments are validated against the VCF header and contents, each
    requested chromosome is converted to (or reused from) a Zarr array under
    --zarr_path, genotype filters are applied, and windowed statistics are
    appended to <outfile_prefix>_pi.txt / _dxy.txt / _fst.txt.

    Parameters
    ----------
    args : list of str, optional
        Command-line arguments without the program name. Defaults to
        ``sys.argv[1:]``.

    Raises
    ------
    Exception
        When an input file, filter field, chromosome, sample, window size
        or interval fails validation.
    SystemExit
        Raised by argparse for --help, --version or malformed arguments.
    """
    import shlex

    if args is None:
        args = sys.argv[1:]

    # the ascii help image
    help_image = ("█▀▀█ ░▀░ █░█ █░░█\n"
                  "█░░█ ▀█▀ ▄▀▄ █▄▄█\n"
                  "█▀▀▀ ▀▀▀ ▀░▀ ▄▄▄█\n")

    help_text = 'pixy: sensible estimates of pi and dxy from a VCF'
    version_text = 'version 0.95.0'

    # initialize arguments
    parser = argparse.ArgumentParser(
        description=help_image + help_text + '\n' + version_text,
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('--version', action='version', version=version_text)
    parser.add_argument(
        '--stats', nargs='+', choices=['pi', 'dxy', 'fst'],
        help='Which statistics to calculate from the VCF (pi, dxy, and/or fst, separated by spaces)',
        required=True)
    parser.add_argument('--vcf', type=str, nargs='?',
                        help='Path to the input VCF', required=True)
    parser.add_argument('--zarr_path', type=str, nargs='?',
                        help='Folder in which to build the Zarr array(s)',
                        required=True)
    parser.add_argument(
        '--reuse_zarr', choices=['yes', 'no'], default='no',
        help='Use existing Zarr array(s) (saves time if re-running)')
    parser.add_argument('--populations', type=str, nargs='?',
                        help='Path to the populations file', required=True)
    parser.add_argument(
        '--window_size', type=int, nargs='?',
        help='Window size in base pairs over which to calculate pi/dxy')
    parser.add_argument(
        '--chromosomes', type=str, nargs='?', default='all',
        help='A single-quoted, comma separated list of chromosome(s) (e.g. \'X,1,2\')',
        required=False)
    parser.add_argument(
        '--interval_start', type=str, nargs='?',
        help='The start of the interval over which to calculate pi/dxy. Only valid when calculating over a single chromosome.')
    parser.add_argument(
        '--interval_end', type=str, nargs='?',
        help='The end of the interval over which to calculate pi/dxy. Only valid when calculating over a single chromosome.')
    parser.add_argument(
        '--variant_filter_expression', type=str, nargs='?',
        help='A single-quoted, comma separated list of genotype filters (e.g. \'DP>=10,GQ>=20\') to apply to SNPs',
        required=False)
    parser.add_argument(
        '--invariant_filter_expression', type=str, nargs='?',
        help='A single-quoted, comma separated list of genotype filters (e.g. \'DP>=10,RGQ>=20\') to apply to invariant sites',
        required=False)
    parser.add_argument(
        '--outfile_prefix', type=str, nargs='?', default='./pixy_output',
        help='Path and prefix for the output file, e.g. path/to/outfile')
    parser.add_argument(
        '--bypass_filtration', choices=['yes', 'no'], default='no',
        help='Bypass all variant filtration (for data lacking FORMAT fields, use with caution)')
    parser.add_argument(
        '--bypass_invariant_check', choices=['yes', 'no'], default='no',
        help='Allow computation of stats without invariant sites, will result in wildly incorrect estimates most of the time. Use with extreme caution.')
    parser.add_argument(
        '--fst_maf_filter', default=0.05, type=float, nargs='?',
        help='Minor allele frequency filter for FST calculations, with value 0.0-1.0 (default 0.05).')

    # Parse the argument list that was handed in (sys.argv[1:] by default),
    # so that main([...]) can also be driven programmatically.
    args = parser.parse_args(args)

    # CHECK FOR TABIX
    # (disabled until we implement site level and BED support)
    #tabix_path = shutil.which("tabix")
    #if tabix_path is None:
    #    warnings.warn('[pixy] WARNING: tabix is not installed (or cannot be located) -- this may reduce performance. install tabix with "conda install -c bioconda tabix"')
    #if not os.path.exists(args.vcf + ".tbi") and tabix_path is not None:
    #    raise Exception('[pixy] ERROR: your vcf is not indexed with tabix, index the bgzipped vcf with "tabix your.vcf.gz"')

    # VALIDATE ARGUMENTS

    print("[pixy] pixy " + version_text)
    print("[pixy] Validating VCF and input parameters (this may take some time)...")

    # every statistic is computed over windows, so fail early and clearly
    # instead of with a TypeError deep inside the main loop
    if args.window_size is None or args.window_size < 1:
        raise Exception(
            '[pixy] ERROR: --window_size must be specified as a positive integer')

    # expand all file paths
    args.vcf = os.path.expanduser(args.vcf)
    args.zarr_path = os.path.expanduser(args.zarr_path)
    args.populations = os.path.expanduser(args.populations)
    args.outfile_prefix = os.path.expanduser(args.outfile_prefix)

    # CHECK FOR EXISTANCE OF VCF AND POPFILES
    if os.path.exists(args.vcf) is not True:
        raise Exception('[pixy] ERROR: The specified VCF ' + str(args.vcf) +
                        ' does not exist')

    if os.path.exists(args.populations) is not True:
        raise Exception('[pixy] ERROR: The specified populations file ' +
                        str(args.populations) + ' does not exist')

    # VALIDATE FILTER EXPRESSIONS

    # get vcf header info
    vcf_headers = allel.read_vcf_headers(args.vcf)

    # skip invariant check if only asking for FST
    fst_only = len(args.stats) == 1 and (args.stats[0] == 'fst')
    if fst_only:
        args.bypass_invariant_check = "yes"

    # if we are bypassing the invariant check, spoof in a invariant filter
    if args.bypass_invariant_check == "yes":
        args.invariant_filter_expression = "DP>=0"

    if args.bypass_filtration == 'no' and (
            args.variant_filter_expression is None
            or args.invariant_filter_expression is None):
        raise Exception(
            '[pixy] ERROR: One or more filter expression is missing. Provide two filter expressions, or set --bypass_filtration to \'yes\'')

    if args.bypass_filtration == 'no':
        # get the list of format fields and requested filter fields
        format_fields = vcf_headers.formats.keys()
        requested_terms = (args.variant_filter_expression.split(",") +
                           args.invariant_filter_expression.split(","))
        filter_fields = [re.sub("[^A-Za-z]+", "", x) for x in requested_terms]

        missing = list(set(filter_fields) - set(format_fields))

        if len(missing) > 0:
            raise Exception(
                '[pixy] ERROR: the following genotype filters were requested but not occur in the VCF: ',
                missing)
    else:
        print("[pixy] WARNING: --bypass_filtration is set to \'yes\', genotype filtration will be not be performed.")

    # VALIDATE THE VCF

    # check if the vcf is zipped
    # NOTE(review): the unescaped '.' matches any character anywhere in the
    # path; kept as is because changing it alters which files are gunzipped.
    if re.search(".gz", args.vcf):
        cat_prog = "gunzip -c "
    else:
        cat_prog = "cat "

    # the path is spliced into shell pipelines below; quote it so that
    # spaces or shell metacharacters in the file name cannot break them
    quoted_vcf = shlex.quote(args.vcf)

    # check if the vcf contains any invariant sites
    # a very basic check: just looks for at least one invariant site in the alt field
    if args.bypass_invariant_check == 'no':
        alt_list = subprocess.check_output(
            cat_prog + quoted_vcf +
            " | grep -v '#' | head -n 10000 | awk '{print $5}' | sort | uniq",
            shell=True).decode("utf-8").split()
        if "." not in alt_list:
            raise Exception(
                '[pixy] ERROR: the provided VCF appears to contain no invariant sites (ALT = \".\"). This check can be bypassed via --bypass_invariant_check \'yes\'.')
    else:
        if not fst_only:
            print("[pixy] EXTREME WARNING: --bypass_invariant_check is set to \'yes\', which assumes that your VCF contains invariant sites. Lack of invariant sites will result in incorrect estimates.")

    # check if requested chromosomes exist in vcf
    # defaults to all the chromosomes contained in the VCF (first data column)
    # (the VCF is scanned once; both branches need the full chromosome list)
    chrom_all = subprocess.check_output(
        cat_prog + quoted_vcf + " | grep -v '#' | awk '{print $1}' | uniq",
        shell=True).decode("utf-8").split()

    if args.chromosomes == 'all':
        chrom_list = chrom_all
    else:
        chrom_list = list(args.chromosomes.split(","))
        missing = list(set(chrom_list) - set(chrom_all))
        if len(missing) > 0:
            raise Exception(
                '[pixy] ERROR: the following chromosomes were requested but not occur in the VCF: ',
                missing)

    # INTERVALS
    # check if intervals are correctly specified
    if (args.interval_start is None) != (args.interval_end is None):
        raise Exception(
            '[pixy] ERROR: Both --interval_start and --interval_end must be specified')

    if args.interval_start is not None and args.interval_end is not None and len(
            chrom_list) > 1:
        raise Exception(
            '[pixy] ERROR: --interval_start and --interval_end are not valid when calculating over multiple chromosomes. Remove both arguments or specify a single chromosome.')

    # SAMPLES
    # check if requested samples exist in vcf

    # - parse + validate the population file
    # - format is IND POP (tab separated)
    # - throws an error if individuals are missing from VCF

    # read in the list of samples/populations
    poppanel = pandas.read_csv(args.populations, sep='\t', usecols=[0, 1],
                               names=['ID', 'Population'])

    # get a list of samples from the callset
    samples_list = vcf_headers.samples

    # make sure every indiv in the pop file is in the VCF callset
    IDs = list(poppanel['ID'])
    missing = list(set(IDs) - set(samples_list))

    # find the samples in the callset index by matching up the order of samples between the population file and the callset
    # also check if there are invalid samples in the popfile
    try:
        samples_callset_index = [samples_list.index(s) for s in poppanel['ID']]
    except ValueError as e:
        raise Exception(
            '[pixy] ERROR: the following samples are listed in the population file but not in the VCF: ',
            missing) from e
    else:
        poppanel['callset_index'] = samples_callset_index

        # use the popindices dictionary to keep track of the indices for each population
        popindices = {}
        popnames = poppanel.Population.unique()
        for name in popnames:
            popindices[name] = poppanel[poppanel.Population == name].callset_index.values

    print("[pixy] Preparing for calculation of summary statistics: " +
          ','.join(map(str, args.stats)))
    print("[pixy] Data set contains " + str(len(popnames)) +
          " population(s), " + str(len(chrom_list)) + " chromosome(s), and " +
          str(len(IDs)) + " sample(s)")

    # make sure the directory part of the output prefix exists; a prefix
    # without a directory part needs nothing created, and makedirs also
    # handles nested directories that do not exist yet
    output_dir = re.sub(r"[^\/]+$", "", args.outfile_prefix)
    if output_dir and os.path.exists(output_dir) is not True:
        os.makedirs(output_dir, exist_ok=True)

    # initialize the output files for writing (removing any previous output)
    if 'pi' in args.stats:
        pi_file = str(args.outfile_prefix) + "_pi.txt"
        _init_output_file(pi_file, [
            "pop", "chromosome", "window_pos_1", "window_pos_2", "avg_pi",
            "no_sites", "count_diffs", "count_comparisons", "count_missing"])

    if 'dxy' in args.stats:
        dxy_file = str(args.outfile_prefix) + "_dxy.txt"
        _init_output_file(dxy_file, [
            "pop1", "pop2", "chromosome", "window_pos_1", "window_pos_2",
            "avg_dxy", "no_sites", "count_diffs", "count_comparisons",
            "count_missing"])

    if 'fst' in args.stats:
        fst_file = str(args.outfile_prefix) + "_fst.txt"
        _init_output_file(fst_file, [
            "pop1", "pop2", "chromosome", "window_pos_1", "window_pos_2",
            "avg_wc_fst", "no_snps"])

    # initialize the folder structure for the zarr array
    if os.path.exists(args.zarr_path) is not True:
        pathlib.Path(args.zarr_path).mkdir(parents=True, exist_ok=True)

    # main loop for computing summary stats

    # time the calculations
    start_time = time.time()
    print("[pixy] Started calculations at " +
          time.strftime("%H:%M:%S", time.localtime(start_time)))

    for chromosome in chrom_list:

        # Zarr array conversion

        # the chromosome specific zarr path
        zarr_path = args.zarr_path + "/" + chromosome

        # determine the fields that will be included
        # TBD: just reading all fields currently
        # vcf_fields = ['variants/CHROM', 'variants/POS'] + ['calldata/' + s for s in np.unique(filter_fields)]

        # build region string (if using an interval)
        if args.interval_start is not None:
            targ_region = (chromosome + ":" + str(args.interval_start) + "-" +
                           str(args.interval_end))
        else:
            targ_region = chromosome

        # allow for resuse of previously calculated zarr arrays
        if args.reuse_zarr == 'yes' and os.path.exists(zarr_path):
            print("[pixy] If a zarr array exists, it will be reused for chromosome "
                  + chromosome + "...")
        else:
            print("[pixy] Building zarr array for chromosome " + chromosome + "...")
            warnings.filterwarnings("ignore")
            allel.vcf_to_zarr(args.vcf, zarr_path, region=targ_region,
                              fields='*', overwrite=True)
            warnings.resetwarnings()

        print("[pixy] Calculating statistics for chromosome " + targ_region + "...")

        # open the zarr
        callset = zarr.open_group(zarr_path, mode='r')

        # determine the complete list of available calldata fields usable for filtration
        calldata_fields = sorted(callset['/calldata/'].array_keys())

        # check if bypassing filtration, otherwise filter
        if args.bypass_filtration == 'no':

            # VARIANT SITE FILTERS
            var_filters = _build_genotype_filter(
                callset, calldata_fields, args.variant_filter_expression,
                "is not annotated in the input VCF FORMAT field")

            # create a mask for variants only
            # is snp is a site level (1d) array
            # np.tile below creates a column of "is_snp" once for each sample
            # (i.e. makes it the same dimensions as the genotype table)
            is_snp = np.array([callset['/variants/is_snp'][:].flatten()]).transpose()
            snp_mask = np.tile(is_snp, (1, var_filters.shape[1]))

            # force only variant sites (snps, remember we ignore indels) to be included in the filter
            var_filters = np.logical_and(var_filters, snp_mask)

            # INVARIANT SITE FILTERS
            invar_filters = _build_genotype_filter(
                callset, calldata_fields, args.invariant_filter_expression,
                "is not annotated in the input VCF")

            # create a mask for invariant sites by inverting the snp filter
            # join that to the invariant sites filter
            invar_filters = np.logical_and(invar_filters, np.invert(snp_mask))

            # join the variant and invariant filter masks (logical OR)
            filters = np.logical_or(invar_filters, var_filters)

        # applying the filter to the data
        # all the filters are in a boolean array ('filters' above)

        # first, recode the gt matrix as a Dask array (saves memory) -> packed
        # create a packed genotype array
        # this is a array with dims snps x samples
        # genotypes are represented by single byte codes
        # critically, as the same dims as the filters array below
        gt_array = allel.GenotypeArray(
            allel.GenotypeDaskArray(callset['/calldata/GT'])).to_packed()

        # apply filters
        # only if not bypassing filtration
        if args.bypass_filtration == 'no':
            # set all genotypes that fail filters (the inversion of the array)
            # to 'missing', 239 = -1 (i.e. missing) for packed arrays
            gt_array[np.invert(filters)] = 239

        # convert the packed array back to a GenotypeArray
        gt_array = allel.GenotypeArray.from_packed(gt_array)

        # build the position array
        pos_array = allel.SortedIndex(callset['/variants/POS'])

        # a mask for snps and invariant sites
        snp_invar_mask = np.logical_or(
            np.logical_and(callset['/variants/is_snp'][:] == 1,
                           callset['/variants/numalt'][:] == 1),
            callset['/variants/numalt'][:] == 0)

        # remove rows that are NOT snps or invariant sites from the genotype array
        gt_array = np.delete(gt_array, np.where(np.invert(snp_invar_mask)), axis=0)
        gt_array = allel.GenotypeArray(gt_array)

        # select rows that ARE snps or invariant sites in the position array
        pos_array = pos_array[snp_invar_mask]

        # Interval specification check
        # check if computing over specific intervals (otherwise, compute over whole chromosome)

        # window size
        window_size = args.window_size

        first_pos = min(pos_array)
        last_pos = max(pos_array)

        # set intervals based on args
        if args.interval_end is None:
            interval_end = last_pos
        else:
            interval_end = int(args.interval_end)

        if args.interval_start is None:
            interval_start = first_pos
        else:
            interval_start = int(args.interval_start)

        if interval_start > interval_end:
            raise Exception("[pixy] ERROR: The specified interval start (" +
                            str(interval_start) +
                            ") exceeds the interval end (" +
                            str(interval_end) + ")")

        # catch misspecified intervals
        # TBD: harmonize this with the new interval method for the zarr array
        if interval_end > last_pos:
            print("[pixy] WARNING: The specified interval end (" +
                  str(interval_end) +
                  ") exceeds the last position of the chromosome and has been substituted with "
                  + str(last_pos))
            interval_end = last_pos

        if interval_start < first_pos:
            print("[pixy] WARNING: The specified interval start (" +
                  str(interval_start) +
                  ") begins before the first position of the chromosome and has been substituted with "
                  + str(first_pos))
            interval_start = first_pos

        if (interval_end - interval_start + 1) < window_size:
            print("[pixy] WARNING: The requested interval or total number of sites in the VCF ("
                  + str(interval_start) + "-" + str(interval_end) +
                  ") is smaller than the requested window size (" +
                  str(window_size) + ")")

        # PI:
        # AVERAGE NUCLEOTIDE VARIATION WITHIN POPULATIONS

        # Compute pi over a chosen interval and window size
        if (args.populations is not None) and ('pi' in args.stats):

            # open the pi output file for writing
            with open(pi_file, 'a') as outfile:

                # loop over populations and windows, compute stats and write to file
                for pop in popnames:
                    for window_pos_1, window_pos_2 in _iter_windows(
                            interval_start, interval_end, window_size):

                        # if the window has no sites, assign all NAs,
                        # otherwise calculate pi
                        # NOTE(review): this test is strict while
                        # locate_range is called with the same bounds;
                        # confirm sites on a window edge are meant to be
                        # ignored here.
                        if len(pos_array[(pos_array > window_pos_1)
                                         & (pos_array < window_pos_2)]) == 0:
                            avg_pi, total_diffs, total_comps, total_missing, no_sites = "NA", "NA", "NA", "NA", 0
                        else:
                            # pull out the genotypes for the window
                            loc_region = pos_array.locate_range(window_pos_1, window_pos_2)
                            gt_region1 = gt_array[loc_region]
                            no_sites = len(gt_region1)

                            # subset the window for the individuals in each population
                            gt_pop = gt_region1.take(popindices[pop], axis=1)
                            avg_pi, total_diffs, total_comps, total_missing = _tally_region(gt_pop)

                        outfile.write("\t".join(map(str, (
                            pop, chromosome, window_pos_1, window_pos_2,
                            avg_pi, no_sites, total_diffs, total_comps,
                            total_missing))) + "\n")

            print("[pixy] Pi calculations for chromosome " + chromosome +
                  " complete and written to " + args.outfile_prefix + "_pi.txt")

        # DXY:
        # AVERAGE NUCLEOTIDE VARIATION BETWEEN POPULATIONS

        if (args.populations is not None) and ('dxy' in args.stats):

            # create a list of all pairwise comparisons between populations in the popfile
            dxy_pop_list = list(combinations(popnames, 2))

            # open the dxy output file for writing
            with open(dxy_file, 'a') as outfile:

                # interate over all population pairs and compute dxy
                for pop1, pop2 in dxy_pop_list:

                    # perform the dxy calculation for all windows in the range
                    for window_pos_1, window_pos_2 in _iter_windows(
                            interval_start, interval_end, window_size):

                        if len(pos_array[(pos_array > window_pos_1)
                                         & (pos_array < window_pos_2)]) == 0:
                            avg_dxy, total_diffs, total_comps, total_missing, no_sites = "NA", "NA", "NA", "NA", 0
                        else:
                            loc_region = pos_array.locate_range(window_pos_1, window_pos_2)
                            gt_region1 = gt_array[loc_region]
                            no_sites = len(gt_region1)

                            # this region's GTs for each population of the pair
                            pop1_gt_region1 = gt_region1.take(popindices[pop1], axis=1)
                            pop2_gt_region1 = gt_region1.take(popindices[pop2], axis=1)
                            avg_dxy, total_diffs, total_comps, total_missing = _dxy_tally_region(
                                pop1_gt_region1, pop2_gt_region1)

                        outfile.write("\t".join(map(str, (
                            pop1, pop2, chromosome, window_pos_1,
                            window_pos_2, avg_dxy, no_sites, total_diffs,
                            total_comps, total_missing))) + "\n")

            print("[pixy] Dxy calculations chromosome " + chromosome +
                  " complete and written to " + args.outfile_prefix + "_dxy.txt")

        # FST:
        # WEIR AND COCKERHAMS FST
        # This is just a plain wrapper for the scikit-allel fst function

        if (args.populations is not None) and ('fst' in args.stats):

            # determine all the possible population pairings
            pop_names = list(popindices.keys())
            fst_pop_list = list(combinations(pop_names, 2))

            # calculate maf
            allele_counts = gt_array.count_alleles()
            allele_freqs = allele_counts.to_frequencies()
            maf_array = allele_freqs[:, 1] > args.fst_maf_filter

            # apply the maf filter to the genotype array
            gt_array_fst = allel.GenotypeArray(gt_array[maf_array])

            # apply the maf filter to the position array
            pos_array_fst = pos_array[maf_array]

            # open the fst output file for writing
            with open(fst_file, 'a') as outfile:

                # for each pair, compute fst
                for pop_pair in fst_pop_list:

                    # the indices for the individuals in each population
                    fst_pop_indicies = [
                        popindices[pop_pair[0]].tolist(),
                        popindices[pop_pair[1]].tolist()
                    ]

                    # compute FST
                    # windowed_weir_cockerham_fst seems to generate (spurious?) warnings about div/0, so suppressing warnings
                    # (this assumes that the scikit-allel function is working as intended)
                    np.seterr(divide='ignore', invalid='ignore')

                    a, b, c = allel.windowed_weir_cockerham_fst(
                        pos_array_fst, gt_array_fst,
                        subpops=fst_pop_indicies, size=args.window_size,
                        start=interval_start, stop=interval_end)

                    for fst, wind, snps in zip(a, b, c):
                        outfile.write("\t".join(map(str, (
                            pop_pair[0], pop_pair[1], chromosome, wind[0],
                            wind[1], fst, snps))) + "\n")

            print("[pixy] Fst calculations chromosome " + chromosome +
                  " complete and written to " + args.outfile_prefix + "_fst.txt")

    # report the time at which the run finished (not the start time again)
    print("\n[pixy] All calculations complete at " +
          time.strftime("%H:%M:%S", time.localtime()))
    end_time = (time.time() - start_time)
    print("[pixy] Time elapsed: " +
          time.strftime("%H:%M:%S", time.gmtime(end_time)))
#! /usr/bin/python
# This script reads a vcf file and converts it into zarr format, for faster access from scikit-allel.
# Usage: ./vcf2zarr.py <file.vcf>
#
# The zarr store path is derived from the input path: everything before the
# first 'Bypop.' in it, with '.zarr' appended.
import os

# NumExpr picks up NUMEXPR_MAX_THREADS while it is being imported, so the
# variable has to be in the environment *before* `import numexpr`; setting it
# afterwards (as this script used to) has no effect.
os.environ["NUMEXPR_MAX_THREADS"] = "272"

import numexpr
#if 'NUMEXPR_MAX_THREADS' in os.environ: os.environ.pop('NUMEXPR_MAX_THREADS')
#import numexpr
#print('NumExpr.nthreads =' + str(numexpr.nthreads))

import allel as al
import zarr
import numpy as np
import sys

vcfPath = sys.argv[1]

# NOTE: the VCF used to be loaded completely into memory with al.read_vcf()
# here, but the result was never used; vcf_to_zarr() reads the file itself.
zarrPath = str(vcfPath.split('Bypop.')[0] + '.zarr')
al.vcf_to_zarr(vcfPath, zarrPath, fields='*', log=sys.stdout, overwrite=True)
def convert_to_zarr(input_vcf_path, output_zarr_path, conversion_config, benchmark_profiler=None):
    """
    Converts the original data (VCF) to a Zarr format. Only converts a single VCF file.
    If a benchmark profiler is provided, the individual steps (scanning the VCF
    for the ALT number, reducing it to a maximum, and the actual VCF to Zarr
    conversion) are benchmarked.

    Nothing is done (and None is returned) when conversion_config is None.

    :param input_vcf_path: The input VCF file location
    :param output_zarr_path: The desired Zarr output location
    :param conversion_config: Configuration data for the conversion
    :param benchmark_profiler: Profiler object to be used for benchmarking process
    :type input_vcf_path: str
    :type output_zarr_path: str
    :type conversion_config: config.VCFtoZarrConfigurationRepresentation
    :type benchmark_profiler: core.BenchmarkProfiler
    :raises ValueError: if conversion_config.compressor is not "Blosc"
    """
    if conversion_config is None:
        return

    import os  # local import: only needed for the path normalisation below

    def _start_benchmark(operation_name):
        # Benchmarking is optional; do nothing without a profiler.
        if benchmark_profiler is not None:
            benchmark_profiler.start_benchmark(operation_name=operation_name)

    def _end_benchmark():
        if benchmark_profiler is not None:
            benchmark_profiler.end_benchmark()

    # Ensure vars are strings, not pathlib.Path. Only path-like inputs are
    # converted, so any other kind of input object is handed on untouched.
    if isinstance(input_vcf_path, os.PathLike):
        input_vcf_path = os.fspath(input_vcf_path)
    output_zarr_path = str(output_zarr_path)

    # Get fields to extract (for unit testing only)
    fields = conversion_config.fields

    # Get alt number
    if conversion_config.alt_number is None:
        print("[VCF-Zarr] Determining maximum number of ALT alleles by scaling all variants in the VCF file.")

        # Scan VCF file to find max number of alleles in any variant
        _start_benchmark("Read VCF file into memory for alt number")
        callset = allel.read_vcf(input_vcf_path, fields=['numalt'], log=sys.stdout)
        _end_benchmark()

        numalt = callset['variants/numalt']

        _start_benchmark("Determine maximum alt number")
        alt_number = np.max(numalt)
        _end_benchmark()
    else:
        print("[VCF-Zarr] Using alt number provided in configuration.")
        # Use the configuration-provided alt number
        alt_number = conversion_config.alt_number
    print("[VCF-Zarr] Alt number: {}".format(alt_number))

    # Get chunk length (scikit-allel default unless configured)
    chunk_length = allel.io.vcf_read.DEFAULT_CHUNK_LENGTH
    if conversion_config.chunk_length is not None:
        chunk_length = conversion_config.chunk_length
    print("[VCF-Zarr] Chunk length: {}".format(chunk_length))

    # Get chunk width (scikit-allel default unless configured)
    chunk_width = allel.io.vcf_read.DEFAULT_CHUNK_WIDTH
    if conversion_config.chunk_width is not None:
        chunk_width = conversion_config.chunk_width
    print("[VCF-Zarr] Chunk width: {}".format(chunk_width))

    # Blosc is the only compressor this function knows how to build.
    if conversion_config.compressor != "Blosc":
        raise ValueError("Unexpected compressor type specified.")
    compressor = Blosc(
        cname=conversion_config.blosc_compression_algorithm,
        clevel=conversion_config.blosc_compression_level,
        shuffle=conversion_config.blosc_shuffle_mode,
    )

    # Perform the VCF to Zarr conversion
    _start_benchmark("Convert VCF to Zarr")
    allel.vcf_to_zarr(
        input_vcf_path,
        output_zarr_path,
        alt_number=alt_number,
        overwrite=True,
        fields=fields,
        log=sys.stdout,
        compressor=compressor,
        chunk_length=chunk_length,
        chunk_width=chunk_width,
    )
    _end_benchmark()
def calc_obsStats(vcfpath, chrom, pops, coord_bed, zarrpath, outpath):
    """Calculate windowed summary statistics from a VCF file.

    The VCF is converted to a zarr store at ``zarrpath`` unless that store
    already exists. Genotypes are split per population into haplotypes, and
    for every window listed in ``coord_bed`` each statistic named in
    ``stats_dt["calc_stats"]`` is computed through ``PopSumStats``. A mean
    line followed by one line per stored window is written to
    ``<outpath.stem>.Obs.pop_stats.txt`` next to ``outpath``.

    NOTE(review): relies on ``stats_dt``, ``headers``, ``PopSumStats`` and
    ``tqdm`` being defined elsewhere in this module -- not visible here.

    Parameters
    ----------
    vcfpath : path-like
        VCF file to convert when no zarr store exists yet.
    chrom : str
        Name of the zarr group to read samples, positions and genotypes from.
    pops : path-like
        Tab-separated file with ``sampleID`` and ``population`` columns.
    coord_bed : path-like
        Window file; lines starting with "chrom" are treated as headers, the
        other lines carry chrom, start, stop and sites in the first four
        whitespace-separated columns.
    zarrpath : pathlib.Path
        Location of the zarr store (must provide ``exists()``).
    outpath : pathlib.Path
        Used only for its ``parent`` and ``stem`` to name the output file.

    Returns
    -------
    pathlib.Path
        Path of the statistics file that was written.
    """
    # Reuse an existing zarr store; otherwise convert the VCF first.
    zarrfile = zarrpath
    if not zarrpath.exists():
        allel.vcf_to_zarr(str(vcfpath), str(zarrpath), group=chrom, fields='*',
                          alt_number=2, log=sys.stdout,
                          compressor=numcodecs.Blosc(cname='zstd', clevel=1, shuffle=False))

    # load pop info
    panel = pd.read_csv(pops, sep='\t', usecols=['sampleID', 'population'])

    # load zarr and map every panel sample to its column in the callset
    callset = zarr.open_group(str(zarrfile), mode='r')
    samples = callset[f'{chrom}/samples'][:]
    samples_list = list(samples)
    samples_callset_index = [samples_list.index(s) for s in panel['sampleID']]
    panel['callset_index'] = samples_callset_index
    panel = panel.sort_values(by='callset_index')

    # load gt
    pos = allel.SortedIndex(callset[f'{chrom}/variants/POS'])
    gt = allel.GenotypeArray(callset[f'{chrom}/calldata/GT'])

    # separate gt for each population; every sample contributes two
    # haplotypes, so each population owns a contiguous block of indices
    ix_s = 0
    pop_dt = {}
    pop_ix = []
    for p in panel["population"].unique():
        p_ix = panel[panel["population"] == p]["callset_index"].values
        ix_e = len(p_ix) * 2 + ix_s
        pop_ix.append(list(range(ix_s, ix_e)))
        pop_dt[p] = gt.take(p_ix, axis=1).to_haplotypes()
        ix_s = ix_e

    # combine and transpose: one row per haplotype
    haps = np.concatenate(list(pop_dt.values()), axis=1).T

    # prep progress bar: count the non-header lines of the window file
    ln_count = 0
    with open(coord_bed, 'r') as cb:
        for line in cb:
            if not line.startswith("chrom"):
                ln_count += 1
    progressbar = tqdm(total=ln_count, desc="window numb", unit='window')

    pops_outfile = None
    try:
        # update stats_dt; `line` still holds the last line of coord_bed
        stats_dt["num_haps"] = haps.shape[0]
        stats_dt["pop_config"] = pop_ix
        stats_dt["length_bp"] = int(
            line.split()[-1])  # may be shorter than expected due to last window
        stats_dt["reps"] = ln_count

        # write headers
        outfile = outpath.parent / f"{outpath.stem}.Obs.pop_stats.txt"
        pops_outfile = open(outfile, 'w')
        pops_outfile, header_, header_ls = headers(pops_outfile, stats_dt,
                                                   pop_names=list(pop_dt.keys()),
                                                   obs=True)

        # calc stats
        # TODO: parallel
        chrom_ls = []       # chromosome label of every *stored* window
        n_stored = 0        # number of rows of stat_mat filled so far
        last_chrom = chrom  # chromosome of the most recently parsed window
        stat_mat = np.zeros([ln_count, len(header_ls) - 1])
        with open(coord_bed, 'r') as cb:
            for line in cb:
                if line.startswith("chrom"):
                    continue
                cb_lin = line.split()
                last_chrom = cb_lin[0]
                start = int(cb_lin[1])
                stop = int(cb_lin[2])
                stats_dt["length_bp"] = stop - start
                sites = int(cb_lin[3])
                try:
                    pos_ix = pos.locate_range(start, stop)
                except KeyError:
                    # no positions located for this window: skip it
                    continue
                pos_t = pos[pos_ix] - start
                haps_t = haps[:, pos_ix]
                counts_t = haps_t.sum(axis=0).astype(int)
                # run stats
                stats_ls = [start, stop, sites]
                popsumstats = PopSumStats(pos_t, haps_t, counts_t, stats_dt)
                for stat in stats_dt["calc_stats"]:
                    stat_fx = getattr(popsumstats, stat)
                    try:
                        ss = stat_fx()
                    except IndexError:
                        ss = [np.nan] * len(stats_dt["pw_quants"])
                    stats_ls.extend(ss)
                try:
                    stat_mat[n_stored, :] = stats_ls
                except ValueError:
                    # row does not fit the header layout: skip the window
                    continue
                # Record the label only once the row is really stored, so
                # chrom_ls[k] always belongs to stat_mat[k].
                chrom_ls.append(last_chrom)
                n_stored += 1
                progressbar.update()

        # Drop the rows reserved for skipped windows; left in place they would
        # be written as all-zero windows and would bias the mean.
        stat_mat = stat_mat[:n_stored]

        # write stats out (only the headers are written when nothing was stored)
        if n_stored > 0:
            stat_mean = np.round(np.nanmean(stat_mat, axis=0), 5)
            stats_str = "\t".join(map(str, stat_mean[3:]))
            pops_outfile.write(
                f"mean_{last_chrom}\t{int(stat_mat[0, 0])}\t{stop}\t{np.sum(stat_mat[:, 2])}\t{stats_str}\n"
            )
            for win_chrom, row in zip(chrom_ls, stat_mat):
                start = int(row[0])
                stop = int(row[1])
                sites = int(row[2])
                rd = [round(num, 5) for num in row[3:]]
                stats_str = "\t".join(map(str, rd))
                pops_outfile.write(f"{win_chrom}\t{start}\t{stop}\t{sites}\t{stats_str}\n")
    finally:
        # release the progress bar and the output file even on failure
        progressbar.close()
        if pops_outfile is not None:
            pops_outfile.close()

    return outfile
# 'variants/MEND',
# 'variants/MLEN',
# 'variants/MSTART',
# 'variants/SVLEN',
# 'variants/SVTYPE',
# 'variants/TSD',
# 'variants/AC',
# 'variants/AF',
# 'variants/NS',
# 'variants/AN',
# 'variants/EAS_AF',
# 'variants/EUR_AF',
# 'variants/AFR_AF',
# 'variants/AMR_AF',
# 'variants/SAS_AF',
# 'variants/DP',
# 'variants/AA',
# 'variants/VT',
# 'variants/EX_TARGET',
# 'variants/MULTI_ALLELIC']
# test_fields += ['variants/numalt', 'variants/svlen', 'variants/is_snp']

# Extend the field selection (test_fields is defined earlier in the file).
test_fields += [
    'variants/numalt',
    'variants/is_snp',
    'variants/svlen',
]
# test_fields = ['variants/*']

# The store gets the input's name with '.vcf.gz' swapped for '.zarr'.
zarr_store = vcf_file.replace('.vcf.gz', '.zarr')
zstd_codec = numcodecs.Blosc(cname='zstd', clevel=1, shuffle=False)
ska.vcf_to_zarr(
    vcf_file,
    zarr_store,
    fields=test_fields,
    alt_number=8,
    overwrite=True,
    compressor=zstd_codec,
)
#Initial configuration, probably overkill in imports.
import sys
import os
import re
import numpy as np
import allel
import zarr
import dask
import numcodecs
import warnings
from pathlib import Path
import pandas as pd
import argparse

# Command line: two required positionals, the input and the output location.
parser = argparse.ArgumentParser(description='Convert vcf to zarr.')
for positional in ('input', 'output'):
    parser.add_argument(positional)
args = parser.parse_args()

print(f"starting {args.input} {args.output}")

# Convert every field; overwrite=False is passed through unchanged.
allel.vcf_to_zarr(
    args.input,
    args.output,
    fields='*',
    overwrite=False,
)
#zarr.__version__
#numcodecs.__version__
#np.__version__

## Data source
zarr_path = '../Data/match_combined_filtered/'
VCFdata_fp = "../Data/match_combined_filtered/all_chr22.vcf.gz"
ID_table = pd.read_csv('../Data/match_combined_filtered/ID_table.csv')
# !ls -lh {VCFdata_fp} # list the files in the directory

# format conversion: everything goes into the 'chr22' group of one store
chr22_store = zarr_path + 'all_chr22.zarr'
allel.vcf_to_zarr(
    VCFdata_fp,
    chr22_store,
    group='chr22',
    fields='*',
    log=sys.stdout,
    overwrite=True,
)

# re-open the store read-only and inspect it
callset = zarr.open_group(chr22_store, mode='r')
callset.tree(expand=True)
gt_zarr = callset['chr22/calldata/GT']
gt_zarr.info
#pos = callset['chr22/variants/POS']
#loc_region = pos.locate_range(20000, 20100)

# columns
samples = callset['chr22/samples']
# genotypes
gt = allel.GenotypeArray(gt_zarr)
# run in parallel
################################################################################
# conda activate vcf
# VCF_DIR="/mnt/md0/malariaGen_genomic/pf3k_v5/5.1"
# OUT_DIR="/mnt/md0/malariaGen_genomic/pf3k_v5/pf3k_zarr"
# parallel "python3 pf3k_vcf2zarr.py $VCF_DIR $OUT_DIR {}" ::: {01..14}
################################################################################
import os
import sys
import zarr
import allel

# sys.argv[0] is the script name, so the three real arguments start at index 1.
# Unpacking the whole of sys.argv (as before) always failed with
# "too many values to unpack" for the invocation documented above.
if len(sys.argv) != 4:
    sys.exit(f"usage: {sys.argv[0]} VCF_DIR OUT_DIR CHROM")
VCF_DIR, OUT_DIR, CHROM = sys.argv[1:]

# Input and output names follow the per-chromosome naming shown here.
vcf_in = f"{VCF_DIR}/SNP_INDEL_Pf3D7_{CHROM}_v3.combined.filtered.vcf.gz"
zarr_out = f"{OUT_DIR}/SNP_INDEL_Pf3D7_{CHROM}_v3.zarr"

allel.vcf_to_zarr(vcf_in, zarr_out, fields='*', overwrite=False)