def GetVCFHeader(self, filepath_vcf):
    print("begin read header")
    headers = allel.read_vcf_headers(filepath_vcf)
    print("read header complete")
    infofields = []
    ontologyfield = ['HPO', 'DO', 'SO', 'MC', 'GO']
    i = 0
    for ontoele in ontologyfield:
        if ontoele in headers.infos:
            i += 1
    if i == len(ontologyfield):
        # all ontology fields are present; they should be placed at the end
        for ele in headers.infos:
            if ele not in ontologyfield:
                infofields.append(ele)
        infofields.extend(ontologyfield)
    else:
        for ele in headers.infos:
            infofields.append(ele)
    return infofields
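# A minimal standalone sketch of the reordering rule above, checkable without a
# VCF on disk (the INFO keys in the example are invented for illustration):
ONTOLOGY_FIELDS = ['HPO', 'DO', 'SO', 'MC', 'GO']

def order_info_fields(info_keys):
    """Move the ontology fields to the end iff all five are present."""
    if all(f in info_keys for f in ONTOLOGY_FIELDS):
        return [k for k in info_keys if k not in ONTOLOGY_FIELDS] + ONTOLOGY_FIELDS
    return list(info_keys)

# order_info_fields(['DP', 'HPO', 'DO', 'SO', 'MC', 'GO', 'AF'])
# -> ['DP', 'AF', 'HPO', 'DO', 'SO', 'MC', 'GO']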
def generate_vcf_classes(vcfs):
    print("Parsing VCFs")
    parsed_vcf_bodies = list(map(lambda x: allel.read_vcf(x, fields="*"), vcfs))
    parsed_vcf_bodies = list(filter(None, parsed_vcf_bodies))
    deque(map(lambda x: x.update(samples=numpy.char.upper(x['samples'].tolist())),
              parsed_vcf_bodies))
    deque(map(lambda x, y: x.update(FILE=y), parsed_vcf_bodies, vcfs))
    add_headers = lambda x, y: x.update(header=allel.read_vcf_headers(y))
    deque(map(add_headers, parsed_vcf_bodies, vcfs))
    return parsed_vcf_bodies
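# Why deque(map(...)) works above: map() is lazy in Python 3, so wrapping it in
# a deque forces evaluation purely for the update() side effects. A self-contained
# illustration with toy records (not VCF data):
from collections import deque

records = [{"id": 1}, {"id": 2}]
deque(map(lambda r: r.update(tag="done"), records), maxlen=0)  # maxlen=0 discards results
assert all(r["tag"] == "done" for r in records)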
req_grp.add_argument("--outpre", "-o", dest="outpre", help="output prefix", type=str, required=True) args = parser.parse_args() import numpy as np import random import allel random.seed(args.iteration) #read vcf header which contains list of sample names etc. vcf_header = allel.read_vcf_headers(args.vcf) #extract sample names from header samples = vcf_header.samples #get the deme id for each sample demes = [i for i in range(0, 36) for j in range(0, 250)] # ####### Make mate pairs # # (1)randomly across the entire grid # # pairs=[] # this will store mom and dad's IDs # # # list to store the population id for the sibling # #this will be randomly picked to be the pop of one of the parents # sibs_pop=[]
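# A minimal sketch of option (1) above: random mating across the whole grid.
# n_pairs is an assumption; the original never states how many pairs are drawn.
n_pairs = 100
pairs = []     # mom and dad sample IDs
sibs_pop = []  # deme of the sibling: that of a randomly chosen parent
for _ in range(n_pairs):
    mom, dad = random.sample(range(len(samples)), 2)
    pairs.append((samples[mom], samples[dad]))
    sibs_pop.append(demes[random.choice([mom, dad])])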
import matplotlib.cm as cm

# Initializing variables
headers = []
vcfs = glob.glob("VcfData/./*.vcf")
vcf_num = len(vcfs)
snps = [None] * vcf_num
colour_snps = [None] * vcf_num
pos = [None] * vcf_num
labels = [None] * vcf_num
plt.axis('off')

for i in range(vcf_num):
    headers.append(allel.read_vcf_headers(vcfs[i]))
    labels[i] = vcfs[i].split('_')[-1]
    file = allel.read_vcf(vcfs[i])
    gt = allel.GenotypeArray(file['calldata/GT'])
    dim = gt.shape
    #print(gt[0,0,0])
    alt = file['variants/ALT']
    ref = file['variants/REF']
    #print(dim,alt.shape,ref.shape)
    #print(file['variants/numalt'])
    #print(alt)
    allele_to_color = {
        'A': (255 / 2, 255 / 2, 0),
        'C': (0, 255 / 2, 255 / 2),
def check_and_validate_args(args):
    # CHECK FOR TABIX
    tabix_path = shutil.which("tabix")

    if tabix_path is None:
        raise Exception('[pixy] ERROR: tabix is not installed (or cannot be located in the path). Install tabix with "conda install -c bioconda htslib".')

    if args.vcf is None:
        raise Exception('[pixy] ERROR: The --vcf argument is missing or incorrectly specified.')

    if args.populations is None:
        raise Exception('[pixy] ERROR: The --populations argument is missing or incorrectly specified.')

    # reformat file paths for compatibility
    args.vcf = os.path.expanduser(args.vcf)
    args.populations = os.path.expanduser(args.populations)

    if args.output_folder != '':
        output_folder = args.output_folder + "/"
    else:
        output_folder = os.path.expanduser(os.getcwd() + "/")

    output_prefix = output_folder + args.output_prefix

    # get vcf header info
    vcf_headers = allel.read_vcf_headers(args.vcf)

    print("\n[pixy] Validating VCF and input parameters...")

    # CHECK OUTPUT FOLDER
    print("[pixy] Checking write access...", end='')
    check_message = "OK"

    # attempt to create the output folder
    if os.path.exists(output_folder) is not True:
        os.makedirs(output_folder)

    # check if the output folder is writable
    # if not os.access(re.sub(r"[^\/]+$", "", args.outfile_prefix), os.W_OK):
    if not os.access(output_folder, os.W_OK):
        raise Exception('[pixy] ERROR: The output folder ' + output_folder + ' is not writable')

    # check if output_prefix is correctly specified
    if "/" in str(args.output_prefix) or "\\" in str(args.output_prefix):
        raise Exception('[pixy] ERROR: The output prefix \'' + str(args.output_prefix) + '\' contains slashes. Remove them and specify output folder structure with --output_folder if necessary.')

    # generate a name for a unique temp file for collecting output
    temp_file = output_folder + "pixy_tmpfile_" + str(uuid.uuid4().hex) + ".tmp"

    # check if the temp file is writable
    with open(temp_file, 'w') as f:
        pass

    if check_message == "OK":
        print(check_message)

    # CHECK CPU CONFIGURATION
    print("[pixy] Checking CPU configuration...", end='')
    check_message = "OK"

    if args.n_cores > mp.cpu_count():
        check_message = "WARNING"
        print(check_message)
        print('[pixy] WARNING: ' + str(args.n_cores) + ' CPU cores requested but only ' + str(mp.cpu_count()) + ' are available. Using ' + str(mp.cpu_count()) + '.')
        args.n_cores = mp.cpu_count()

    if check_message == "OK":
        print(check_message)

    # CHECK FOR EXISTENCE OF INPUT FILES
    if os.path.exists(args.vcf) is not True:
        raise Exception('[pixy] ERROR: The specified VCF ' + str(args.vcf) + ' does not exist')

    if not re.search(".gz", args.vcf):
        raise Exception('[pixy] ERROR: The vcf is not compressed with bgzip (or has no .gz extension). To fix this, run "bgzip [filename].vcf" first (and then index with "tabix [filename].vcf.gz" if necessary)')

    if not os.path.exists(args.vcf + ".tbi"):
        raise Exception('[pixy] ERROR: The vcf is not indexed with tabix. To fix this, run "tabix [filename].vcf.gz" first')
To fix this, run "tabix [filename].vcf.gz" first') if os.path.exists(args.populations) is not True: raise Exception('[pixy] ERROR: The specified populations file ' + str(args.populations) + ' does not exist') if args.bed_file is not None: args.bed_file = os.path.expanduser(args.bed_file) if os.path.exists(args.bed_file) is not True: raise Exception('[pixy] ERROR: The specified BED file ' + str(args.bed_file) + ' does not exist') else: bed_df = [] if args.sites_file is not None: args.sites_file = os.path.expanduser(args.sites_file) if os.path.exists(args.sites_file) is not True: raise Exception('[pixy] ERROR: The specified sites file ' + str(args.sites_file) + ' does not exist') else: sites_df = [] # VALIDATE THE VCF # check if the vcf contains any invariant sites # a very basic check: just looks for at least one invariant site in the alt field print("[pixy] Checking for invariant sites...", end = '') check_message = "OK" if args.bypass_invariant_check=='no': alt_list = subprocess.check_output("gunzip -c " + args.vcf + " | grep -v '#' | head -n 10000 | awk '{print $5}' | sort | uniq", shell=True).decode("utf-8").split() if "." not in alt_list: raise Exception('[pixy] ERROR: the provided VCF appears to contain no invariant sites (ALT = \".\"). This check can be bypassed via --bypass_invariant_check \'yes\'.') if "." in alt_list and len(alt_list) == 1 : raise Exception('[pixy] ERROR: the provided VCF appears to contain no variable sites. It may have been filtered incorrectly, or otherwise corrupted.') else: if not (len(args.stats) == 1 and (args.stats[0] == 'fst')): check_message = "WARNING" print(check_message) print("[pixy] EXTREME WARNING: --bypass_invariant_check is set to \'yes\'. Note that a lack of invariant sites will result in incorrect estimates.") if check_message == "OK": print(check_message) # check if requested chromosomes exist in vcf # parses the whole CHROM column (!) 
print("[pixy] Checking chromosome data...", end = '') # get the list of all chromosomes in the dataset chrom_all = subprocess.check_output("tabix -l " + args.vcf, shell=True).decode("utf-8").split() if args.chromosomes != 'all': chrom_list = list(args.chromosomes.split(",")) # pretabix method, can remove # chrom_all = subprocess.check_output("gunzip -c " + args.vcf + " | grep -v '#' | awk '{print $1}' | uniq", shell=True).decode("utf-8").split() chrom_all = subprocess.check_output("tabix -l " + args.vcf, shell=True).decode("utf-8").split() missing = list(set(chrom_list)-set(chrom_all)) if len(missing) >0: raise Exception('[pixy] ERROR: the following chromosomes were specified but not occur in the VCF: ', missing) else: #added this else statement (klk) chrom_list = subprocess.check_output("tabix -l " + args.vcf, shell=True).decode("utf-8").split() chrom_all = chrom_list print("OK") # INTERVALS # check if intervals are correctly specified # validate the BED file (if present) print("[pixy] Checking intervals/sites...", end = '') check_message = "OK" if args.bed_file is None: if args.window_size is None: raise Exception('[pixy] ERROR: In the absence of a BED file, a --window_size must be specified.') if args.interval_start is None and args.interval_end is not None: raise Exception('[pixy] ERROR: When specifying an interval, both --interval_start and --interval_end are required.') if args.interval_start is not None and args.interval_end is None: raise Exception('[pixy] ERROR: When specifying an interval, both --interval_start and --interval_end are required.') if (args.interval_start is not None or args.interval_end is not None) and len(chrom_list) > 1: raise Exception('[pixy] ERROR: --interval_start and --interval_end are not valid when calculating over multiple chromosomes. Remove both arguments or specify a single chromosome.') if (args.interval_start is not None and args.interval_end is not None) and ((int(args.interval_end) - int(args.interval_start)) <= int(args.window_size)): check_message = "WARNING" print('[pixy] WARNING: The specified interval ' + str(args.interval_start) + '-' + str(args.interval_end) + ' is smaller than the window size (' + str(args.window_size) + '). 
    else:
        if args.interval_start is not None or args.interval_end is not None or args.window_size is not None:
            check_message = "ERROR"
            print(check_message)
            raise Exception('[pixy] ERROR: --interval_start, --interval_end, and --window_size are not valid when a BED file of windows is provided.')

        # read in the bed file and extract the chromosome column
        bed_df = pandas.read_csv(args.bed_file, sep='\t', usecols=[0, 1, 2], names=['chrom', 'pos1', 'pos2'])
        bed_df['chrom'] = bed_df['chrom'].astype(str)  # force chromosomes to strings

        if bed_df.isnull().values.any():
            check_message = "ERROR"
            print(check_message)
            raise Exception('[pixy] ERROR: your bed file contains missing data, confirm all rows have three fields (chrom, pos1, pos2).')

        if len(bed_df.columns) != 3:
            check_message = "ERROR"
            print(check_message)
            raise Exception('[pixy] ERROR: The bed file has the wrong number of columns (should be 3, is ' + str(len(bed_df.columns)) + ')')
        else:
            bed_df.columns = ['chrom', 'chromStart', 'chromEnd']
            bed_chrom = list(bed_df['chrom'])
            missing = list(set(bed_chrom) - set(chrom_all))
            chrom_list = list(set(chrom_all) & set(bed_chrom))

        if len(missing) > 0:
            check_message = "WARNING"
            print(check_message)
            print('[pixy] WARNING: the following chromosomes in the BED file do not occur in the VCF and will be ignored: ' + str(missing))

    if args.sites_file is not None:
        sites_df = pandas.read_csv(args.sites_file, sep='\t', usecols=[0, 1], names=['chrom', 'pos'])
        sites_df['chrom'] = sites_df['chrom'].astype(str)

        if sites_df.isnull().values.any():
            check_message = "ERROR"
            print(check_message)
            raise Exception('[pixy] ERROR: your sites file contains missing data, confirm all rows have two fields (chrom, pos).')

        if len(sites_df.columns) != 2:
            raise Exception('[pixy] ERROR: The sites file has the wrong number of columns (should be 2, is ' + str(len(sites_df.columns)) + ')')
        else:
            sites_df.columns = ['CHROM', 'POS']
            chrom_sites = list(sites_df['CHROM'])
            missing = list(set(chrom_sites) - set(chrom_all))
            chrom_list = list(set(chrom_all) & set(chrom_sites))

        if len(missing) > 0:
            check_message = "WARNING"
            print(check_message)
            print('[pixy] WARNING: the following chromosomes in the sites file do not occur in the VCF and will be ignored: ' + str(missing))

    if check_message == "OK":
        print(check_message)

    # SAMPLES
    # check if requested samples exist in vcf
    print("[pixy] Checking sample data...", end='')

    # - parse + validate the population file
    # - format is IND POP (tab separated)
    # - throws an error if individuals are missing from the VCF

    # read in the list of samples/populations
    poppanel = pandas.read_csv(args.populations, sep='\t', usecols=[0, 1], names=['ID', 'Population'])
    poppanel['ID'] = poppanel['ID'].astype(str)

    # check for missing values
    if poppanel.isnull().values.any():
        check_message = "ERROR"
        print(check_message)
        raise Exception('[pixy] ERROR: your populations file contains missing data, confirm all samples have population IDs (and vice versa).')

    # get a list of samples from the callset
    samples_list = vcf_headers.samples

    # make sure every indiv in the pop file is in the VCF callset
    IDs = list(poppanel['ID'])
    missing = list(set(IDs) - set(samples_list))

    # find the samples in the callset index by matching up the order of samples
    # between the population file and the callset;
    # also check if there are invalid samples in the popfile
    try:
        samples_callset_index = [samples_list.index(s) for s in poppanel['ID']]
    except ValueError as e:
        check_message = "ERROR"
        print(check_message)
        raise Exception('[pixy] ERROR: the following samples are listed in the population file but not in the VCF: ', missing) from e
    else:
        poppanel['callset_index'] = samples_callset_index

        # use the popindices dictionary to keep track of the indices for each population
        popindices = {}
        popnames = poppanel.Population.unique()
        for name in popnames:
            popindices[name] = poppanel[poppanel.Population == name].callset_index.values

    if len(popnames) == 1 and ("fst" in args.stats or "dxy" in args.stats):
        check_message = "ERROR"
        print(check_message)
        raise Exception('[pixy] ERROR: calculation of fst and/or dxy requires at least two populations to be defined in the population file.')

    print("OK")
    print("[pixy] All initial checks passed!")

    return popnames, popindices, chrom_list, IDs, temp_file, output_folder, output_prefix, bed_df, sites_df
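# Self-contained sketch of how the popindices mapping above behaves
# (toy sample and population names, not pixy's API):
import pandas

toy_panel = pandas.DataFrame({"ID": ["s1", "s3", "s2"],
                              "Population": ["popA", "popA", "popB"]})
toy_samples = ["s1", "s2", "s3", "s4"]  # sample order in the VCF header
toy_panel["callset_index"] = [toy_samples.index(s) for s in toy_panel["ID"]]
toy_indices = {name: toy_panel[toy_panel.Population == name].callset_index.values
               for name in toy_panel.Population.unique()}
# toy_indices == {'popA': array([0, 2]), 'popB': array([1])}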
cmd = f"impute2 -m {geneticMap} -known_haps_g {prephased} -h {self.panel} -l {self.legend} -Ne 20000 -int {regionStart} {regionEnd} -o {outfile} -allow_large_regions" print(cmd) #subprocess.call(cmd) chrom = 22 uaeCount = 153 qtrCount = 1005 kgpCount = 2504 panelBase = 'uqk_all_chr22' # 'uqk_arabSNPs_chr22' vcf = f"{panelBase}.vcf.gz" ## to be updated with merged file ## get SampleIds FullGenome(sampleFile): data = allel.read_vcf_headers(f"{datadir}/{vcf}") ## TODO: filter dataset by SNPs from one chromosome, for starters pi = PlinkInterface(wdir=datadir, base=panelBase, parseChromoPos=False) pi.readDistanceMatrix(upgma=False) ## increasing neighbor set splitter = ShuffleSplit(n_splits=5, test_size=.2, random_state=0) #splitter.get_n_splits(data.samples) fold = 0 ## only split UAE samples ## Cross validation, each round leaves some UAE test samples out for trainIdx0, testIdx in splitter.split(data.samples[:uaeCount]): ## choose around 30 UAE samples as test set, remaining 120 + others as superset from which training sets are selected using trainIdx = list(trainIdx0) + list(range(len(data.samples[uaeCount:]))) Xsuper = [
import matplotlib.cm as cm
import glob

# create variables containing the vcfs
vcfs = glob.glob("/Users/nathanrobins/Documents/UG_proj/EDAR_Data_Splicing/*.vcf")
head = []
num_vcfs = len(vcfs)
position = [None] * num_vcfs
var = [None] * num_vcfs
label = [None] * num_vcfs
#plt.axis('off')

# for loop to separate the vcfs
for n in range(num_vcfs):
    head.append(allel.read_vcf_headers(vcfs[n]))
    callset = allel.read_vcf(vcfs[n])
    # print(sorted(callset.keys()))
    # samples represents individuals
    # POS represents the position
    # calldata/GT = genotype calls
    GT = allel.GenotypeArray(callset['calldata/GT'])
    shape = GT.shape
    # print(shape)
    alt = callset['variants/ALT']
    ###### DOUBLE CHECK WITH MATTEO THAT I ONLY WANT TO TREAT THINGS AS BIALLELIC? -->
    ###### AS THEN I CAN USE ...(callset, numbers={'ALT': 1})
    # print(alt)
    ref = callset['variants/REF']
    # print(ref)
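    # Hedged sketch of the biallelic option raised in the comment above:
    # scikit-allel can read ALT as a single column via the `numbers` parameter
    # (same file as above; whether that treatment is appropriate is the open question):
    callset_bi = allel.read_vcf(vcfs[n], numbers={'ALT': 1})
    alt_single = callset_bi['variants/ALT']  # shape (n_variants,) instead of (n_variants, 3)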
def main(args=None):
    if args is None:
        args = sys.argv[1:]

    # the ascii help image
    help_image = ("█▀▀█ ░▀░ █░█ █░░█\n"
                  "█░░█ ▀█▀ ▄▀▄ █▄▄█\n"
                  "█▀▀▀ ▀▀▀ ▀░▀ ▄▄▄█\n")

    help_text = 'pixy: sensible estimates of pi and dxy from a VCF'
    version_text = 'version 0.95.0'

    # initialize arguments
    parser = argparse.ArgumentParser(
        description=help_image + help_text + '\n' + version_text,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--version', action='version', version=version_text)
    parser.add_argument(
        '--stats', nargs='+', choices=['pi', 'dxy', 'fst'], required=True,
        help='Which statistics to calculate from the VCF (pi, dxy, and/or fst, separated by spaces)')
    parser.add_argument('--vcf', type=str, nargs='?', help='Path to the input VCF', required=True)
    parser.add_argument('--zarr_path', type=str, nargs='?', help='Folder in which to build the Zarr array(s)', required=True)
    parser.add_argument('--reuse_zarr', choices=['yes', 'no'], default='no',
                        help='Use existing Zarr array(s) (saves time if re-running)')
    parser.add_argument('--populations', type=str, nargs='?', help='Path to the populations file', required=True)
    parser.add_argument('--window_size', type=int, nargs='?',
                        help='Window size in base pairs over which to calculate pi/dxy')
    parser.add_argument('--chromosomes', type=str, nargs='?', default='all', required=False,
                        help='A single-quoted, comma separated list of chromosome(s) (e.g. \'X,1,2\')')
    parser.add_argument('--interval_start', type=str, nargs='?',
                        help='The start of the interval over which to calculate pi/dxy. Only valid when calculating over a single chromosome.')
    parser.add_argument('--interval_end', type=str, nargs='?',
                        help='The end of the interval over which to calculate pi/dxy. Only valid when calculating over a single chromosome.')
    parser.add_argument('--variant_filter_expression', type=str, nargs='?', required=False,
                        help='A single-quoted, comma separated list of genotype filters (e.g. \'DP>=10,GQ>=20\') to apply to SNPs')
    parser.add_argument('--invariant_filter_expression', type=str, nargs='?', required=False,
                        help='A single-quoted, comma separated list of genotype filters (e.g. \'DP>=10,RGQ>=20\') to apply to invariant sites')
    parser.add_argument('--outfile_prefix', type=str, nargs='?', default='./pixy_output',
                        help='Path and prefix for the output file, e.g. path/to/outfile')
    parser.add_argument('--bypass_filtration', choices=['yes', 'no'], default='no',
                        help='Bypass all variant filtration (for data lacking FORMAT fields, use with caution)')
    parser.add_argument('--bypass_invariant_check', choices=['yes', 'no'], default='no',
                        help='Allow computation of stats without invariant sites, will result in wildly incorrect estimates most of the time. Use with extreme caution.')
    parser.add_argument('--fst_maf_filter', default=0.05, type=float, nargs='?',
                        help='Minor allele frequency filter for FST calculations, with value 0.0-1.0 (default 0.05).')
    # ag1000g test data
    # args = parser.parse_args('--stats fst --vcf data/vcf/multi_chr.vcf.gz --zarr_path data/vcf/multi --window_size 10000 --populations data/vcf/ag1000/Ag1000_sampleIDs_popfile_3.txt --variant_filter_expression DP>=10,GQ>20 --invariant_filter_expression DP>=10,RGQ>20 --outfile_prefix output/pixy_out'.split())

    # filter test data
    # args = parser.parse_args('--stats pi --vcf data/vcf/filter_test.vcf.gz --zarr_path data/vcf/filter_test --window_size 3 --populations data/vcf/ag1000/Ag1000_sampleIDs_popfile_3.txt --variant_filter_expression DP>=10,GQ>20 --invariant_filter_expression DP>=10,RGQ>20 --fst_maf_filter 0.05 --outfile_prefix output/pixy_out'.split())

    # catch arguments from the command line
    args = parser.parse_args()

    # CHECK FOR TABIX
    # (disabled until we implement site level and BED support)
    #tabix_path = shutil.which("tabix")
    #if tabix_path is None:
    #    warnings.warn('[pixy] WARNING: tabix is not installed (or cannot be located) -- this may reduce performance. install tabix with "conda install -c bioconda tabix"')
    #if not os.path.exists(args.vcf + ".tbi") and tabix_path is not None:
    #    raise Exception('[pixy] ERROR: your vcf is not indexed with tabix, index the bgzipped vcf with "tabix your.vcf.gz"')

    # VALIDATE ARGUMENTS
    print("[pixy] pixy " + version_text)
    print("[pixy] Validating VCF and input parameters (this may take some time)...")

    # expand all file paths
    args.vcf = os.path.expanduser(args.vcf)
    args.zarr_path = os.path.expanduser(args.zarr_path)
    args.populations = os.path.expanduser(args.populations)
    args.outfile_prefix = os.path.expanduser(args.outfile_prefix)

    # CHECK FOR EXISTENCE OF VCF AND POPFILES
    if os.path.exists(args.vcf) is not True:
        raise Exception('[pixy] ERROR: The specified VCF ' + str(args.vcf) + ' does not exist')

    if os.path.exists(args.populations) is not True:
        raise Exception('[pixy] ERROR: The specified populations file ' + str(args.populations) + ' does not exist')

    # VALIDATE FILTER EXPRESSIONS
    # get vcf header info
    vcf_headers = allel.read_vcf_headers(args.vcf)

    # skip invariant check if only asking for FST
    if len(args.stats) == 1 and (args.stats[0] == 'fst'):
        args.bypass_invariant_check = "yes"

    # if we are bypassing the invariant check, spoof in an invariant filter
    if args.bypass_invariant_check == "yes":
        args.invariant_filter_expression = "DP>=0"

    if args.bypass_filtration == 'no' and (args.variant_filter_expression is None or args.invariant_filter_expression is None):
        raise Exception('[pixy] ERROR: One or more filter expression is missing. Provide two filter expressions, or set --bypass_filtration to \'yes\'')

    if args.bypass_filtration == 'no':
        # get the list of format fields and requested filter fields
        format_fields = vcf_headers.formats.keys()
        filter_fields = list()

        for x in args.variant_filter_expression.split(","):
            filter_fields.append(re.sub("[^A-Za-z]+", "", x))

        for x in args.invariant_filter_expression.split(","):
            filter_fields.append(re.sub("[^A-Za-z]+", "", x))

        missing = list(set(filter_fields) - set(format_fields))

        if len(missing) > 0:
            raise Exception('[pixy] ERROR: the following genotype filters were requested but do not occur in the VCF: ', missing)
    else:
        print("[pixy] WARNING: --bypass_filtration is set to \'yes\', genotype filtration will not be performed.")
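    # Mini-demo of the regex used above to pull the FORMAT field name out of a
    # filter expression (the expression here is a toy value, not from the VCF):
    assert re.sub("[^A-Za-z]+", "", "DP>=10") == "DP"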
    # VALIDATE THE VCF

    # check if the vcf is zipped
    if re.search(".gz", args.vcf):
        cat_prog = "gunzip -c "
    else:
        cat_prog = "cat "

    # check if the vcf contains any invariant sites
    # a very basic check: just looks for at least one invariant site in the alt field
    if args.bypass_invariant_check == 'no':
        alt_list = subprocess.check_output(cat_prog + args.vcf + " | grep -v '#' | head -n 10000 | awk '{print $5}' | sort | uniq", shell=True).decode("utf-8").split()
        if "." not in alt_list:
            raise Exception('[pixy] ERROR: the provided VCF appears to contain no invariant sites (ALT = \".\"). This check can be bypassed via --bypass_invariant_check \'yes\'.')
    else:
        if not (len(args.stats) == 1 and (args.stats[0] == 'fst')):
            print("[pixy] EXTREME WARNING: --bypass_invariant_check is set to \'yes\', which assumes that your VCF contains invariant sites. Lack of invariant sites will result in incorrect estimates.")

    # check if requested chromosomes exist in vcf
    # defaults to all the chromosomes contained in the VCF (first data column)
    if args.chromosomes == 'all':
        chrom_list = subprocess.check_output(cat_prog + args.vcf + " | grep -v '#' | awk '{print $1}' | uniq", shell=True).decode("utf-8").split()
        chrom_all = chrom_list
    else:
        chrom_list = list(args.chromosomes.split(","))
        chrom_all = subprocess.check_output(cat_prog + args.vcf + " | grep -v '#' | awk '{print $1}' | uniq", shell=True).decode("utf-8").split()
        missing = list(set(chrom_list) - set(chrom_all))
        if len(missing) > 0:
            raise Exception('[pixy] ERROR: the following chromosomes were requested but do not occur in the VCF: ', missing)

    # INTERVALS
    # check if intervals are correctly specified
    if args.interval_start is not None and args.interval_end is None:
        raise Exception('[pixy] ERROR: Both --interval_start and --interval_end must be specified')

    if args.interval_start is None and args.interval_end is not None:
        raise Exception('[pixy] ERROR: Both --interval_start and --interval_end must be specified')

    if args.interval_start is not None and args.interval_end is not None and len(chrom_list) > 1:
        raise Exception('[pixy] ERROR: --interval_start and --interval_end are not valid when calculating over multiple chromosomes. Remove both arguments or specify a single chromosome.')
    # SAMPLES
    # check if requested samples exist in vcf

    # - parse + validate the population file
    # - format is IND POP (tab separated)
    # - throws an error if individuals are missing from the VCF

    # read in the list of samples/populations
    poppanel = pandas.read_csv(args.populations, sep='\t', usecols=[0, 1], names=['ID', 'Population'])
    poppanel.head()

    # get a list of samples from the callset
    samples_list = vcf_headers.samples

    # make sure every indiv in the pop file is in the VCF callset
    IDs = list(poppanel['ID'])
    missing = list(set(IDs) - set(samples_list))

    # find the samples in the callset index by matching up the order of samples
    # between the population file and the callset;
    # also check if there are invalid samples in the popfile
    try:
        samples_callset_index = [samples_list.index(s) for s in poppanel['ID']]
    except ValueError as e:
        raise Exception('[pixy] ERROR: the following samples are listed in the population file but not in the VCF: ', missing) from e
    else:
        poppanel['callset_index'] = samples_callset_index

        # use the popindices dictionary to keep track of the indices for each population
        popindices = {}
        popnames = poppanel.Population.unique()
        for name in popnames:
            popindices[name] = poppanel[poppanel.Population == name].callset_index.values

    print("[pixy] Preparing for calculation of summary statistics: " + ','.join(map(str, args.stats)))
    print("[pixy] Data set contains " + str(len(popnames)) + " population(s), " + str(len(chrom_list)) + " chromosome(s), and " + str(len(IDs)) + " sample(s)")

    # initialize and remove any previous output files
    if os.path.exists(re.sub(r"[^\/]+$", "", args.outfile_prefix)) is not True:
        os.mkdir(re.sub(r"[^\/]+$", "", args.outfile_prefix))

    # initialize the output files for writing
    if 'pi' in args.stats:
        pi_file = str(args.outfile_prefix) + "_pi.txt"
        if os.path.exists(pi_file):
            os.remove(pi_file)
        outfile = open(pi_file, 'a')
        outfile.write("pop" + "\t" + "chromosome" + "\t" + "window_pos_1" + "\t" + "window_pos_2" + "\t" + "avg_pi" + "\t" + "no_sites" + "\t" + "count_diffs" + "\t" + "count_comparisons" + "\t" + "count_missing" + "\n")
        outfile.close()

    if 'dxy' in args.stats:
        dxy_file = str(args.outfile_prefix) + "_dxy.txt"
        if os.path.exists(dxy_file):
            os.remove(dxy_file)
        outfile = open(dxy_file, 'a')
        outfile.write("pop1" + "\t" + "pop2" + "\t" + "chromosome" + "\t" + "window_pos_1" + "\t" + "window_pos_2" + "\t" + "avg_dxy" + "\t" + "no_sites" + "\t" + "count_diffs" + "\t" + "count_comparisons" + "\t" + "count_missing" + "\n")
        outfile.close()

    if 'fst' in args.stats:
        fst_file = str(args.outfile_prefix) + "_fst.txt"
        if os.path.exists(fst_file):
            os.remove(fst_file)
        outfile = open(fst_file, 'a')
        outfile.write("pop1" + "\t" + "pop2" + "\t" + "chromosome" + "\t" + "window_pos_1" + "\t" + "window_pos_2" + "\t" + "avg_wc_fst" + "\t" + "no_snps" + "\n")
        outfile.close()

    # initialize the folder structure for the zarr array
    if os.path.exists(args.zarr_path) is not True:
        pathlib.Path(args.zarr_path).mkdir(parents=True, exist_ok=True)

    # main loop for computing summary stats

    # time the calculations
    start_time = time.time()
    print("[pixy] Started calculations at " + time.strftime("%H:%M:%S", time.localtime(start_time)))

    for chromosome in chrom_list:

        # Zarr array conversion

        # the chromosome specific zarr path
        zarr_path = args.zarr_path + "/" + chromosome

        # determine the fields that will be included
        # TBD: just reading all fields currently
        # vcf_fields = ['variants/CHROM', 'variants/POS'] + ['calldata/' + s for s in np.unique(filter_fields)]

        # build region string (if using an interval)
        if args.interval_start is not None:
            targ_region = chromosome + ":" + str(args.interval_start) + "-" + str(args.interval_end)
        else:
            targ_region = chromosome

        # allow for reuse of previously calculated zarr arrays
        if args.reuse_zarr == 'yes' and os.path.exists(zarr_path):
            print("[pixy] If a zarr array exists, it will be reused for chromosome " + chromosome + "...")
        elif args.reuse_zarr == 'no' or os.path.exists(zarr_path) is not True:
            print("[pixy] Building zarr array for chromosome " + chromosome + "...")
            warnings.filterwarnings("ignore")
            allel.vcf_to_zarr(args.vcf, zarr_path, region=targ_region, fields='*', overwrite=True)
            warnings.resetwarnings()

        print("[pixy] Calculating statistics for chromosome " + targ_region + "...")

        # open the zarr
        callset = zarr.open_group(zarr_path, mode='r')

        # parse the filtration expression and build the boolean filter array

        # define an operator dictionary for parsing the operator strings
        ops = {
            "<": operator.lt,
            "<=": operator.le,
            ">": operator.gt,
            ">=": operator.ge,
            "==": operator.eq
        }

        # determine the complete list of available calldata fields usable for filtration
        calldata_fields = sorted(callset['/calldata/'].array_keys())

        # check if bypassing filtration, otherwise filter
        if args.bypass_filtration == 'no':

            # VARIANT SITE FILTERS
            var_filters = []

            # iterate over each requested variant filter
            # e.g. "DP>=10" parses to stat="DP", value=10, compare=">="
            for x in args.variant_filter_expression.split(","):
                stat = re.sub("[^A-Za-z]+", "", x)
                value = int(re.sub("[^0-9]+", "", x))
                compare = re.sub("[A-Za-z0-9]+", "", x)

                # check if the requested filter/format exists in the VCF
                try:
                    stat_index = calldata_fields.index(stat)
                except ValueError as e:
                    raise Exception("[pixy] ERROR: The requested filter \'" + stat + "\' is not annotated in the input VCF FORMAT field") from e
                else:
                    # the first expression initializes the mask;
                    # subsequent ones are combined with logical AND
                    if type(var_filters) is list:
                        var_filters = ops[compare](callset['/calldata/' + stat][:], value)
                    elif type(var_filters) is not list:
                        var_filters = np.logical_and(var_filters, ops[compare](callset['/calldata/' + stat][:], value))

            # create a mask for variants only
            # is_snp is a site level (1d) array
            # np.tile below creates a column of "is_snp" once for each sample
            # (i.e. makes it the same dimensions as the genotype table)
            is_snp = np.array([callset['/variants/is_snp'][:].flatten()]).transpose()
            snp_mask = np.tile(is_snp, (1, var_filters.shape[1]))

            # force only variant sites (snps, remember we ignore indels) to be included in the filter
            var_filters = np.logical_and(var_filters, snp_mask)

            # INVARIANT SITE FILTERS
            invar_filters = []

            for x in args.invariant_filter_expression.split(","):
                stat = re.sub("[^A-Za-z]+", "", x)
                value = int(re.sub("[^0-9]+", "", x))
                compare = re.sub("[A-Za-z0-9]+", "", x)

                # check if the requested filter/format exists in the VCF
                try:
                    stat_index = calldata_fields.index(stat)
                except ValueError as e:
                    raise Exception("[pixy] ERROR: The requested filter \'" + stat + "\' is not annotated in the input VCF") from e
                else:
                    if type(invar_filters) is list:
                        invar_filters = ops[compare](callset['/calldata/' + stat][:], value)
                    elif type(invar_filters) is not list:
                        invar_filters = np.logical_and(invar_filters, ops[compare](callset['/calldata/' + stat][:], value))

            # create a mask for invariant sites by inverting the snp filter
            # join that to the invariant sites filter
            invar_filters = np.logical_and(invar_filters, np.invert(snp_mask))

            # join the variant and invariant filter masks (logical OR)
            filters = np.logical_or(invar_filters, var_filters)

        # applying the filter to the data
        # all the filters are in a boolean array ('filters' above)

        # first, recode the gt matrix as a Dask array (saves memory) -> packed

        # create a packed genotype array
        # this is an array with dims snps x samples
        # genotypes are represented by single byte codes
        # critically, it has the same dims as the filters array below
        gt_array = allel.GenotypeArray(allel.GenotypeDaskArray(callset['/calldata/GT'])).to_packed()

        # apply filters
        # only if not bypassing filtration
        if args.bypass_filtration == 'no':
            # set all genotypes that fail filters (the inversion of the array)
            # to 'missing', 239 = -1 (i.e. missing) for packed arrays
            gt_array[np.invert(filters)] = 239

        # convert the packed array back to a GenotypeArray
        gt_array = allel.GenotypeArray.from_packed(gt_array)

        # build the position array
        pos_array = allel.SortedIndex(callset['/variants/POS'])

        # a mask for snps and invariant sites
        snp_invar_mask = np.logical_or(
            np.logical_and(callset['/variants/is_snp'][:] == 1, callset['/variants/numalt'][:] == 1),
            callset['/variants/numalt'][:] == 0)

        # remove rows that are NOT snps or invariant sites from the genotype array
        gt_array = np.delete(gt_array, np.where(np.invert(snp_invar_mask)), axis=0)
        gt_array = allel.GenotypeArray(gt_array)

        # select rows that ARE snps or invariant sites in the position array
        pos_array = pos_array[snp_invar_mask]

        # Basic functions for comparing the genotypes at each site in a region:
        # count differences out of sites with data.
        # For the given region: return average pi, # of differences, # of comparisons, and # missing.
        # this function loops over every site in a region passed to it
        def tallyRegion(gt_region):
            total_diffs = 0
            total_comps = 0
            total_missing = 0
            for site in gt_region:
                vec = site.flatten()
                # now we have an individual site as a numpy.ndarray, pass it to the comparison function
                site_diffs, site_comps, missing = compareGTs(vec)
                total_diffs += site_diffs
                total_comps += site_comps
                total_missing += missing
            if total_comps > 0:
                avg_pi = total_diffs / total_comps
            else:
                avg_pi = 0
            return (avg_pi, total_diffs, total_comps, total_missing)

        # For the given region: return average dxy, # of differences, # of comparisons, and # missing.
        # this function loops over every site in a region passed to it
        def dxyTallyRegion(pop1_gt_region, pop2_gt_region):
            total_diffs = 0
            total_comps = 0
            total_missing = 0
            for x in range(0, len(pop1_gt_region)):
                site1 = pop1_gt_region[x]
                site2 = pop2_gt_region[x]
                vec1 = site1.flatten()
                vec2 = site2.flatten()
                # now we have an individual site as 2 numpy.ndarrays, pass them to the comparison function
                site_diffs, site_comps, missing = dxyCompareGTs(vec1, vec2)
                total_diffs += site_diffs
                total_comps += site_comps
                total_missing += missing
            if total_comps > 0:
                avg_pi = total_diffs / total_comps
            else:
                avg_pi = 0
            return (avg_pi, total_diffs, total_comps, total_missing)

        # Return the number of differences, the number of comparisons, and the missing data count.
        def compareGTs(vec):  # for pi
            c = Counter(vec)
            diffs = c[1] * c[0]
            gts = c[1] + c[0]
            missing = len(vec) - gts  # anything that's not 1 or 0 is ignored and counted as missing
            comps = int(special.comb(gts, 2))
            return (diffs, comps, missing)

        def dxyCompareGTs(vec1, vec2):  # for dxy
            c1 = Counter(vec1)
            c2 = Counter(vec2)
            gt1zeros = c1[0]
            gt1ones = c1[1]
            gts1 = c1[1] + c1[0]
            gt2zeros = c2[0]
            gt2ones = c2[1]
            gts2 = c2[1] + c2[0]
            missing = (len(vec1) + len(vec2)) - (gts1 + gts2)  # anything that's not 1 or 0 is ignored and counted as missing
            diffs = (gt1zeros * gt2ones) + (gt1ones * gt2zeros)
            comps = gts1 * gts2
            return (diffs, comps, missing)

        # Interval specification check
        # check if computing over specific intervals (otherwise, compute over whole chromosome)

        # window size
        window_size = args.window_size

        # set intervals based on args
        if args.interval_end is None:
            interval_end = max(pos_array)
        else:
            interval_end = int(args.interval_end)

        if args.interval_start is None:
            interval_start = min(pos_array)
        else:
            interval_start = int(args.interval_start)

        try:
            if interval_start > interval_end:
                raise ValueError()
        except ValueError as e:
            raise Exception("[pixy] ERROR: The specified interval start (" + str(interval_start) + ") exceeds the interval end (" + str(interval_end) + ")") from e

        # catch misspecified intervals
        # TBD: harmonize this with the new interval method for the zarr array
        if interval_end > max(pos_array):
            print("[pixy] WARNING: The specified interval end (" + str(interval_end) + ") exceeds the last position of the chromosome and has been substituted with " + str(max(pos_array)))
            interval_end = max(pos_array)

        if interval_start < min(pos_array):
            print("[pixy] WARNING: The specified interval start (" + str(interval_start) + ") begins before the first position of the chromosome and has been substituted with " + str(min(pos_array)))
            interval_start = min(pos_array)

        if (interval_end - interval_start + 1) < window_size:
            print("[pixy] WARNING: The requested interval or total number of sites in the VCF (" + str(interval_start) + "-" + str(interval_end) + ") is smaller than the requested window size (" + str(window_size) + ")")
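        # Worked example for compareGTs (toy allele vector, not pixy output):
        # [0, 0, 1, -1] has two REF alleles, one ALT allele, and one missing call, so
        # diffs = 2*1 = 2, comps = C(3, 2) = 3, missing = 1, and site pi = 2/3.
        assert compareGTs(np.array([0, 0, 1, -1])) == (2, 3, 1)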
        # PI:
        # AVERAGE NUCLEOTIDE VARIATION WITHIN POPULATIONS

        # Compute pi over a chosen interval and window size
        if (args.populations is not None) and ('pi' in args.stats):

            # open the pi output file for writing
            outfile = open(pi_file, 'a')

            for pop in popnames:

                # window size:
                window_size = args.window_size

                # initialize window_pos_2
                window_pos_2 = (interval_start + window_size) - 1

                # loop over populations and windows, compute stats and write to file
                for window_pos_1 in range(interval_start, interval_end, window_size):

                    # if the window has no sites, assign all NAs,
                    # otherwise calculate pi
                    if len(pos_array[(pos_array > window_pos_1) & (pos_array < window_pos_2)]) == 0:
                        avg_pi, total_diffs, total_comps, total_missing, no_sites = "NA", "NA", "NA", "NA", 0
                    else:
                        # pull out the genotypes for the window
                        loc_region = pos_array.locate_range(window_pos_1, window_pos_2)
                        gt_region1 = gt_array[loc_region]
                        no_sites = len(gt_region1)

                        # subset the window for the individuals in each population
                        gt_pop = gt_region1.take(popindices[pop], axis=1)
                        avg_pi, total_diffs, total_comps, total_missing = tallyRegion(gt_pop)

                    outfile.write(str(pop) + "\t" + str(chromosome) + "\t" + str(window_pos_1) + "\t" + str(window_pos_2) + "\t" + str(avg_pi) + "\t" + str(no_sites) + "\t" + str(total_diffs) + "\t" + str(total_comps) + "\t" + str(total_missing) + "\n")
                    window_pos_2 += window_size

                    if window_pos_2 > interval_end:
                        window_pos_2 = interval_end

            # close output file and print complete message
            outfile.close()
            print("[pixy] Pi calculations for chromosome " + chromosome + " complete and written to " + args.outfile_prefix + "_pi.txt")

        # DXY:
        # AVERAGE NUCLEOTIDE VARIATION BETWEEN POPULATIONS
        if (args.populations is not None) and ('dxy' in args.stats):

            # create a list of all pairwise comparisons between populations in the popfile
            dxy_pop_list = list(combinations(popnames, 2))

            # open the dxy output file for writing
            outfile = open(dxy_file, 'a')

            # iterate over all population pairs and compute dxy
            for pop_pair in dxy_pop_list:
                pop1 = pop_pair[0]
                pop2 = pop_pair[1]

                # window size:
                window_size = args.window_size

                # initialize window_pos_2
                window_pos_2 = (interval_start + window_size) - 1

                # perform the dxy calculation for all windows in the range
                for window_pos_1 in range(interval_start, interval_end, window_size):

                    if len(pos_array[(pos_array > window_pos_1) & (pos_array < window_pos_2)]) == 0:
                        avg_dxy, total_diffs, total_comps, total_missing, no_sites = "NA", "NA", "NA", "NA", 0
                    else:
                        loc_region = pos_array.locate_range(window_pos_1, window_pos_2)
                        gt_region1 = gt_array[loc_region]
                        no_sites = len(gt_region1)

                        # use the popGTs dictionary to keep track of this region's GTs for each population
                        popGTs = {}
                        for name in pop_pair:
                            gt_pop = gt_region1.take(popindices[name], axis=1)
                            popGTs[name] = gt_pop

                        pop1_gt_region1 = popGTs[pop1]
                        pop2_gt_region1 = popGTs[pop2]
                        avg_dxy, total_diffs, total_comps, total_missing = dxyTallyRegion(pop1_gt_region1, pop2_gt_region1)

                    outfile.write(str(pop1) + "\t" + str(pop2) + "\t" + str(chromosome) + "\t" + str(window_pos_1) + "\t" + str(window_pos_2) + "\t" + str(avg_dxy) + "\t" + str(no_sites) + "\t" + str(total_diffs) + "\t" + str(total_comps) + "\t" + str(total_missing) + "\n")
                    window_pos_2 += window_size

                    if window_pos_2 > interval_end:
                        window_pos_2 = interval_end

            outfile.close()
            print("[pixy] Dxy calculations for chromosome " + chromosome + " complete and written to " + args.outfile_prefix + "_dxy.txt")
        # FST:
        # WEIR AND COCKERHAM'S FST
        # This is just a plain wrapper for the scikit-allel fst function
        if (args.populations is not None) and ('fst' in args.stats):

            # open the fst output file for writing
            outfile = open(fst_file, 'a')

            # determine all the possible population pairings
            pop_names = list(popindices.keys())
            fst_pop_list = list(combinations(pop_names, 2))

            # calculate maf
            allele_counts = gt_array.count_alleles()
            allele_freqs = allele_counts.to_frequencies()
            maf_array = allele_freqs[:, 1] > args.fst_maf_filter

            # apply the maf filter to the genotype array
            gt_array_fst = gt_array[maf_array]
            gt_array_fst = allel.GenotypeArray(gt_array_fst)

            # apply the maf filter to the position array
            pos_array_fst = pos_array[maf_array]

            # for each pair, compute fst
            for pop_pair in fst_pop_list:

                # the indices for the individuals in each population
                fst_pop_indicies = [popindices[pop_pair[0]].tolist(), popindices[pop_pair[1]].tolist()]

                # compute FST
                # windowed_weir_cockerham_fst seems to generate (spurious?) warnings about div/0, so suppressing warnings
                # (this assumes that the scikit-allel function is working as intended)
                np.seterr(divide='ignore', invalid='ignore')

                a, b, c = allel.windowed_weir_cockerham_fst(pos_array_fst, gt_array_fst, subpops=fst_pop_indicies, size=args.window_size, start=interval_start, stop=interval_end)

                for fst, wind, snps in zip(a, b, c):
                    outfile.write(str(pop_pair[0]) + "\t" + str(pop_pair[1]) + "\t" + str(chromosome) + "\t" + str(wind[0]) + "\t" + str(wind[1]) + "\t" + str(fst) + "\t" + str(snps) + "\n")

            outfile.close()
            print("[pixy] Fst calculations for chromosome " + chromosome + " complete and written to " + args.outfile_prefix + "_fst.txt")

    end_time = time.time()
    print("\n[pixy] All calculations complete at " + time.strftime("%H:%M:%S", time.localtime(end_time)))
    print("[pixy] Time elapsed: " + time.strftime("%H:%M:%S", time.gmtime(end_time - start_time)))
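# The snippet ends inside main(); a conventional entry-point guard is assumed
# so the module can also be run as a script:
if __name__ == "__main__":
    main()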