Example #1
def GetVCFHeader(self, filepath_vcf):
    print("begin read header")
    headers = allel.read_vcf_headers(filepath_vcf)
    print("read header complete")
    infofields = []
    ontologyfield = ['HPO', 'DO', 'SO', 'MC', 'GO']
    # if every ontology field is present, move them to the end of the list
    if all(field in headers.infos for field in ontologyfield):
        for ele in headers.infos:
            if ele not in ontologyfield:
                infofields.append(ele)
        infofields.extend(ontologyfield)
    else:
        for ele in headers.infos:
            infofields.append(ele)
    return infofields
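For reference, a minimal sketch of what `allel.read_vcf_headers` returns (the object the method above consumes); the path `example.vcf` is a hypothetical placeholder:

import allel

# read_vcf_headers returns a VCFHeaders namedtuple; .infos maps INFO IDs
# to their header metadata, .samples lists the sample names
headers = allel.read_vcf_headers("example.vcf")  # hypothetical path
print(list(headers.infos))   # INFO field IDs, e.g. ['DP', 'MQ', ...]
print(headers.samples[:5])   # first few sample names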
Example #2
import numpy
import allel


def generate_vcf_classes(vcfs):
    print("Parsing VCFs")
    # parse each VCF, keeping its path alongside so the two stay aligned
    # even after unparseable files are dropped (the original zipped the
    # filtered bodies against the unfiltered path list)
    parsed = [(allel.read_vcf(path, fields="*"), path) for path in vcfs]
    parsed = [(body, path) for body, path in parsed if body is not None]
    parsed_vcf_bodies = []
    for body, path in parsed:
        body.update(samples=numpy.char.upper(body['samples'].tolist()))
        body.update(FILE=path)
        body.update(header=allel.read_vcf_headers(path))
        parsed_vcf_bodies.append(body)
    return parsed_vcf_bodies
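A hypothetical call, just to show the shape of the return value (the file names are placeholders):

bodies = generate_vcf_classes(["a.vcf", "b.vcf"])  # placeholder paths
for body in bodies:
    # each dict now carries its source path and parsed header
    print(body['FILE'], len(body['header'].samples))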
Example #3
req_grp.add_argument("--outpre",
                     "-o",
                     dest="outpre",
                     help="output prefix",
                     type=str,
                     required=True)
args = parser.parse_args()

import numpy as np
import random
import allel

random.seed(args.iteration)

# read the VCF header, which contains the list of sample names etc.
vcf_header = allel.read_vcf_headers(args.vcf)

# extract sample names from the header
samples = vcf_header.samples

# get the deme ID for each sample
demes = [i for i in range(0, 36) for j in range(0, 250)]

# ####### Make mate pairs
# # (1)randomly across the entire grid
#
# pairs=[] # this will store mom and dad's IDs
#
# # list to store the population id for the sibling
# #this will be randomly picked to be the pop of one of the parents
# sibs_pop=[]
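A sketch of the commented-out mate-pairing plan above, assuming both parents are drawn uniformly at random across the whole grid; the offspring count of 100 is made up for illustration:

pairs = []     # mom and dad IDs
sibs_pop = []  # population ID assigned to each sibling

for _ in range(100):  # hypothetical number of offspring
    mom, dad = random.sample(range(len(samples)), 2)
    pairs.append((mom, dad))
    # the sibling's pop is randomly picked from one of the parents' demes
    sibs_pop.append(demes[random.choice([mom, dad])])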
Example #4
import glob

import allel
import matplotlib.cm as cm
import matplotlib.pyplot as plt

# Initializing variables

headers = []
vcfs = glob.glob("VcfData/./*.vcf")
vcf_num = len(vcfs)

snps = [None] * vcf_num
colour_snps = [None] * vcf_num
pos = [None] * vcf_num
labels = [None] * vcf_num

plt.axis('off')

for i in range(vcf_num):
    headers.append(allel.read_vcf_headers(vcfs[i]))
    labels[i] = vcfs[i].split('_')[-1]
    file = allel.read_vcf(vcfs[i])
    gt = allel.GenotypeArray(file['calldata/GT'])
    dim = gt.shape
    #print(gt[0,0,0])
    alt = file['variants/ALT']
    ref = file['variants/REF']

    #print(dim,alt.shape,ref.shape)
    #print(file['variants/numalt'])

    #print(alt)
    allele_to_color = {
        'A': (255 / 2, 255 / 2, 0),
        'C': (0, 255 / 2, 255 / 2),
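The snippet is cut off by the source page mid-dictionary. Purely as illustration (the remaining dictionary entries and the rendering step below are assumptions, not the original code), such a mapping can turn the REF column into an RGB strip:

import numpy as np

allele_to_color = {
    'A': (255 / 2, 255 / 2, 0),
    'C': (0, 255 / 2, 255 / 2),
    'G': (255 / 2, 0, 255 / 2),  # assumed value, not in the original
    'T': (0, 255 / 2, 0),        # assumed value, not in the original
}
# one RGB pixel per variant, black for anything unmapped
strip = np.array([allele_to_color.get(b, (0, 0, 0)) for b in ref],
                 dtype=np.uint8).reshape(-1, 1, 3)
plt.imshow(strip, aspect='auto')
plt.show()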
Example #5
File: core.py  Project: ksamuk/pixy
def check_and_validate_args(args):
    
    # CHECK FOR TABIX
    tabix_path = shutil.which("tabix")

    if tabix_path is None:
        raise Exception('[pixy] ERROR: tabix is not installed (or cannot be located in the path). Install tabix with "conda install -c bioconda htslib".') 

    if args.vcf is None:
        raise Exception('[pixy] ERROR: The --vcf argument is missing or incorrectly specified.') 
        
    if args.populations is None:
        raise Exception('[pixy] ERROR: The --populations argument is missing or incorrectly specified.') 
    
    # reformat file paths for compatibility 
    args.vcf = os.path.expanduser(args.vcf)
    args.populations = os.path.expanduser(args.populations)
    
    if args.output_folder != '':
        output_folder = args.output_folder + "/"
    else:
        output_folder = os.path.expanduser(os.getcwd() + "/")
        
    output_prefix = output_folder + args.output_prefix
    
    # get vcf header info
    vcf_headers = allel.read_vcf_headers(args.vcf)
    
    print("\n[pixy] Validating VCF and input parameters...")
    
    # CHECK OUTPUT FOLDER 
    print("[pixy] Checking write access...", end = '')
    check_message = "OK"
    
    # attempt to create the output folder
    if os.path.exists(output_folder) is not True:
        os.makedirs(output_folder)
    
    # check if output folder is writable
    #if not os.access(re.sub(r"[^\/]+$", "", args.outfile_prefix), os.W_OK):
    if not os.access(output_folder, os.W_OK):
        raise Exception('[pixy] ERROR: The output folder ' + output_folder + ' is not writable')
        
    # check if output_prefix is correctly specified
    if "/" in str(args.output_prefix) or "\\" in str(args.output_prefix):
        raise Exception('[pixy] ERROR: The output prefix \'' + str(args.output_prefix) + '\' contains slashes. Remove them and specify output folder structure with --output_folder if necessary.')
        

    # generate a name for a unique temp file for collecting output
    temp_file = output_folder + "pixy_tmpfile_" + str(uuid.uuid4().hex) + ".tmp"
    
    # check if temp file is writable
    with open(temp_file, 'w') as f:
        pass

    if check_message == "OK":
        print(check_message)

    # CHECK CPU CONFIGURATION
    print("[pixy] Checking CPU configuration...", end = '')
    check_message = "OK"
    
    if (args.n_cores > mp.cpu_count()):
        check_message = "WARNING"
        print(check_message)
        print('[pixy] WARNING: ' + str(args.n_cores) + ' CPU cores requested but only ' + str(mp.cpu_count()) + ' are available. Using '+ str(mp.cpu_count()) +'.')
        args.n_cores = mp.cpu_count()
    
    if check_message == "OK":
        print(check_message)

    # CHECK FOR EXISTENCE OF INPUT FILES

    if os.path.exists(args.vcf) is not True:
        raise Exception('[pixy] ERROR: The specified VCF ' + str(args.vcf) + ' does not exist') 
        
    if not re.search(".gz", args.vcf):
        raise Exception('[pixy] ERROR: The vcf is not compressed with bgzip (or has no .gz extension). To fix this, run "bgzip [filename].vcf" first (and then index with "tabix [filename].vcf.gz" if necessary)') 
    
    if not os.path.exists(args.vcf + ".tbi"):
        raise Exception('[pixy] ERROR: The vcf is not indexed with tabix. To fix this, run "tabix [filename].vcf.gz" first') 

    if os.path.exists(args.populations) is not True:
        raise Exception('[pixy] ERROR: The specified populations file ' + str(args.populations) + ' does not exist') 
    
    if args.bed_file is not None:
        args.bed_file = os.path.expanduser(args.bed_file)
        
        if os.path.exists(args.bed_file) is not True:
            raise Exception('[pixy] ERROR: The specified BED file ' + str(args.bed_file) + ' does not exist') 
            
    else:
        bed_df = []
        
    if args.sites_file is not None:
        args.sites_file = os.path.expanduser(args.sites_file)
        
        if os.path.exists(args.sites_file) is not True:
            raise Exception('[pixy] ERROR: The specified sites file ' + str(args.sites_file) + ' does not exist') 
    else:
        sites_df = []
        
    # VALIDATE THE VCF

    # check if the vcf contains any invariant sites
    # a very basic check: just looks for at least one invariant site in the alt field
    print("[pixy] Checking for invariant sites...", end = '')
    check_message = "OK"

    if args.bypass_invariant_check == 'no':
        alt_list = subprocess.check_output("gunzip -c " + args.vcf + " | grep -v '#' | head -n 10000 | awk '{print $5}' | sort | uniq", shell=True).decode("utf-8").split()
        if "." not in alt_list:
            raise Exception('[pixy] ERROR: the provided VCF appears to contain no invariant sites (ALT = \".\"). This check can be bypassed via --bypass_invariant_check \'yes\'.') 
        if "." in alt_list and len(alt_list) == 1 :
            raise Exception('[pixy] ERROR: the provided VCF appears to contain no variable sites. It may have been filtered incorrectly, or otherwise corrupted.') 
    else:
        if not (len(args.stats) == 1 and (args.stats[0] == 'fst')):
            check_message = "WARNING"
            print(check_message)
            print("[pixy] EXTREME WARNING: --bypass_invariant_check is set to \'yes\'. Note that a lack of invariant sites will result in incorrect estimates.")

    if check_message == "OK":
        print(check_message)

    # check if requested chromosomes exist in vcf
    # parses the whole CHROM column (!)

    print("[pixy] Checking chromosome data...", end = '')
    
    # get the list of all chromosomes in the dataset
    chrom_all = subprocess.check_output("tabix -l " + args.vcf, shell=True).decode("utf-8").split()
    
    if args.chromosomes != 'all': 

        chrom_list = list(args.chromosomes.split(","))
        missing = list(set(chrom_list) - set(chrom_all))
        if len(missing) > 0:
            raise Exception('[pixy] ERROR: the following chromosomes were specified but do not occur in the VCF: ', missing) 

    else:
        chrom_list = chrom_all

    print("OK")

    # INTERVALS
    # check if intervals are correctly specified
    # validate the BED file (if present)

    print("[pixy] Checking intervals/sites...", end = '')
    check_message = "OK"

    if args.bed_file is None:
        
        if args.window_size is None:
            raise Exception('[pixy] ERROR: In the absence of a BED file, a --window_size must be specified.') 

        if args.interval_start is None and args.interval_end is not None:
            raise Exception('[pixy] ERROR: When specifying an interval, both --interval_start and --interval_end are required.') 
            
        if args.interval_start is not None and args.interval_end is None:
            raise Exception('[pixy] ERROR: When specifying an interval, both --interval_start and --interval_end are required.') 

        if (args.interval_start is not None or args.interval_end is not None) and len(chrom_list) > 1:
            raise Exception('[pixy] ERROR: --interval_start and --interval_end are not valid when calculating over multiple chromosomes. Remove both arguments or specify a single chromosome.')  

        if (args.interval_start is not None and args.interval_end is not None) and ((int(args.interval_end) - int(args.interval_start)) <= int(args.window_size)):
            check_message = "WARNING"
            print('[pixy] WARNING: The specified interval ' + str(args.interval_start) + '-' + str(args.interval_end) + ' is smaller than the window size (' + str(args.window_size) + '). A single window will be returned.')
            

    else:
        if args.interval_start is not None or args.interval_end is not None or args.window_size is not None:
            check_message = "ERROR"
            print(check_message)
            raise Exception('[pixy] ERROR: --interval_start, --interval_end, and --window_size are not valid when a BED file of windows is provided.') 

        # read in the BED file, extract the chromosome column, and force chromosomes to strings
        bed_df = pandas.read_csv(args.bed_file, sep='\t', usecols=[0,1,2], names=['chrom', 'pos1', 'pos2'])
        bed_df['chrom'] = bed_df['chrom'].astype(str)

        if bed_df.isnull().values.any():
            check_message = "ERROR"
            print(check_message)
            raise Exception('[pixy] ERROR: your bed file contains missing data, confirm all rows have three fields (chrom, pos1, pos2).') 

        if len(bed_df.columns) != 3:
            check_message = "ERROR"
            print(check_message)
            raise Exception('[pixy] ERROR: The bed file has the wrong number of columns (should be 3, is ' + str(len(bed_df.columns)) + ')') 
        
        else:
            bed_df.columns = ['chrom', 'chromStart', 'chromEnd']
            bed_chrom = list(bed_df['chrom'])
            missing = list(set(bed_chrom)-set(chrom_all))
            chrom_list = list(set(chrom_all) & set(bed_chrom))

        if len(missing) > 0:
            check_message = "WARNING"
            print(check_message)
            print('[pixy] WARNING: the following chromosomes in the BED file do not occur in the VCF and will be ignored: ' + str(missing))
        
            
    if args.sites_file is not None:
        sites_df = pandas.read_csv(args.sites_file, sep='\t', usecols=[0,1], names=['chrom', 'pos'])
        sites_df['chrom'] = sites_df['chrom'].astype(str)
        
        if sites_df.isnull().values.any():
            check_message = "ERROR"
            print(check_message)
            raise Exception('[pixy] ERROR: your sites file contains missing data, confirm all rows have two fields (chrom, pos).') 
        
        if len(sites_df.columns) != 2:
            raise Exception('[pixy] ERROR: The sites file has the wrong number of columns (should be 2, is ' + str(len(sites_df.columns)) + ')') 
            
        else:
            sites_df.columns = ['CHROM', 'POS']
            chrom_sites = list(sites_df['CHROM'])
            missing = list(set(chrom_sites)-set(chrom_all))
            chrom_list = list(set(chrom_all) & set(chrom_sites))
            
        if len(missing) > 0:
            check_message = "WARNING"
            print(check_message)
            print('[pixy] WARNING: the following chromosomes in the sites file do not occur in the VCF and will be ignored: ' + str(missing))

    if check_message == "OK":
        print(check_message)
    
    # SAMPLES
    # check if requested samples exist in vcf

    print("[pixy] Checking sample data...", end = '')

    # - parse + validate the population file
    # - format is IND POP (tab separated)
    # - throws an error if individuals are missing from VCF

    # read in the list of samples/populations
    poppanel = pandas.read_csv(args.populations, sep='\t', usecols=[0,1], names=['ID', 'Population'])
    poppanel['ID'] = poppanel['ID'].astype(str)
    
    # check for missing values
    
    if poppanel.isnull().values.any():
        check_message = "ERROR"
        print(check_message)
        raise Exception('[pixy] ERROR: your populations file contains missing data, confirm all samples have population IDs (and vice versa).') 

    # get a list of samples from the callset
    samples_list = vcf_headers.samples

    # make sure every indiv in the pop file is in the VCF callset
    IDs = list(poppanel['ID'])
    missing = list(set(IDs)-set(samples_list))

    # find the samples in the callset index by matching up the order of samples between the population file and the callset
    # also check if there are invalid samples in the popfile
    try:
        samples_callset_index = [samples_list.index(s) for s in poppanel['ID']]
    except ValueError as e:
        check_message = "ERROR"
        print(check_message)
        raise Exception('[pixy] ERROR: the following samples are listed in the population file but not in the VCF: ', missing) from e
    else:   
        poppanel['callset_index'] = samples_callset_index

        # use the popindices dictionary to keep track of the indices for each population
        popindices={}
        popnames = poppanel.Population.unique()
        for name in popnames:
            popindices[name] = poppanel[poppanel.Population == name].callset_index.values
            
    if len(popnames) == 1 and ("fst" in args.stats or "dxy" in args.stats):
        check_message = "ERROR"
        print(check_message)
        raise Exception('[pixy] ERROR: calculation of fst and/or dxy requires at least two populations to be defined in the population file.')
    
    print("OK")
    print("[pixy] All initial checks past!")
    
    return popnames, popindices, chrom_list, IDs, temp_file, output_folder, output_prefix, bed_df, sites_df
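A hypothetical invocation, assuming `args` is an argparse namespace shaped like pixy's command line (a sketch, not part of the original file):

(popnames, popindices, chrom_list, IDs, temp_file,
 output_folder, output_prefix, bed_df, sites_df) = check_and_validate_args(args)
print("[pixy] populations: " + ", ".join(map(str, popnames)))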
        cmd = f"impute2 -m {geneticMap} -known_haps_g {prephased} -h {self.panel} -l {self.legend} -Ne 20000 -int {regionStart} {regionEnd} -o {outfile} -allow_large_regions"
        print(cmd)
        #subprocess.call(cmd)


chrom = 22

uaeCount = 153
qtrCount = 1005
kgpCount = 2504

panelBase = 'uqk_all_chr22'  # 'uqk_arabSNPs_chr22'
vcf = f"{panelBase}.vcf.gz"  ## to be updated with merged file

## get SampleIds FullGenome(sampleFile):
data = allel.read_vcf_headers(f"{datadir}/{vcf}")

## TODO: filter dataset by SNPs from one chromosome, for starters
pi = PlinkInterface(wdir=datadir, base=panelBase, parseChromoPos=False)
pi.readDistanceMatrix(upgma=False)

## increasing neighbor set
splitter = ShuffleSplit(n_splits=5, test_size=.2, random_state=0)
#splitter.get_n_splits(data.samples)
fold = 0
## only split UAE samples
## Cross validation, each round leaves some UAE test samples out
for trainIdx0, testIdx in splitter.split(data.samples[:uaeCount]):
    ## choose around 30 UAE samples as test set, remaining 120 + others as superset from which training sets are selected using
    trainIdx = list(trainIdx0) + list(range(len(data.samples[uaeCount:])))
    Xsuper = [
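The example is truncated by the source page at `Xsuper = [`. As a standalone sketch of the cross-validation split it sets up (the sample count comes from the constants above; everything else is illustrative):

import numpy as np
from sklearn.model_selection import ShuffleSplit

uaeCount = 153
splitter = ShuffleSplit(n_splits=5, test_size=.2, random_state=0)
for fold, (train_idx, test_idx) in enumerate(splitter.split(np.zeros(uaeCount))):
    # each fold holds out ~20% of the UAE samples as a test set
    print(fold, len(train_idx), len(test_idx))   # 0 122 31, ...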
Example #7
import glob

import allel
import matplotlib.cm as cm

# create variables containing the vcfs
vcfs = glob.glob("/Users/nathanrobins/Documents/UG_proj/EDAR_Data_Splicing/*.vcf")
head = []
num_vcfs = len(vcfs)
position = [None]*num_vcfs
var = [None]*num_vcfs
label = [None]*num_vcfs

#plt.axis('off')

# for loop to separate the VCFs
for n in range(num_vcfs):
	head.append(allel.read_vcf_headers(vcfs[n]))
	callset = allel.read_vcf(vcfs[n])
#	print(sorted(callset.keys()))
	# samples represents individuals
	# POS represents the position
	# calldata/GT = genotype calls
	GT = allel.GenotypeArray(callset['calldata/GT'])
	shape = GT.shape
#	print(shape)
	alt = callset['variants/ALT']
###### DOUBLE CHECK WITH MATTEO THAT I ONLY WANT TO TREAT THINGS AS BIALLELIC? -->
###### AS THEN I CAN USE ...(callset, numbers={'ALT': 1})
#	print(alt)
	ref = callset['variants/REF']
#	print(ref)
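A sketch of the biallelic-only read mentioned in the comment above; `read_vcf` accepts a `numbers` mapping that fixes how many values are parsed per field, so ALT collapses to a single column:

# assumes `vcfs` from the loop above; with numbers={'ALT': 1} each variant
# keeps only its first ALT allele
callset_biallelic = allel.read_vcf(vcfs[0], numbers={'ALT': 1})
print(callset_biallelic['variants/ALT'].shape)   # (n_variants,)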
Example #8
File: __main__.py  Project: huangsunan/pixy
def main(args=None):

    if args is None:
        args = sys.argv[1:]

    # the ascii help image
    help_image = "█▀▀█ ░▀░ █░█ █░░█\n" "█░░█ ▀█▀ ▄▀▄ █▄▄█\n" "█▀▀▀ ▀▀▀ ▀░▀ ▄▄▄█\n"

    help_text = 'pixy: sensible estimates of pi and dxy from a VCF'
    version_text = 'version 0.95.0'

    # initialize arguments
    parser = argparse.ArgumentParser(
        description=help_image + help_text + '\n' + version_text,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--version', action='version', version=version_text)
    parser.add_argument(
        '--stats',
        nargs='+',
        choices=['pi', 'dxy', 'fst'],
        help=
        'Which statistics to calculate from the VCF (pi, dxy, and/or fst, separated by spaces)',
        required=True)
    parser.add_argument('--vcf',
                        type=str,
                        nargs='?',
                        help='Path to the input VCF',
                        required=True)
    parser.add_argument('--zarr_path',
                        type=str,
                        nargs='?',
                        help='Folder in which to build the Zarr array(s)',
                        required=True)
    parser.add_argument(
        '--reuse_zarr',
        choices=['yes', 'no'],
        default='no',
        help='Use existing Zarr array(s) (saves time if re-running)')
    parser.add_argument('--populations',
                        type=str,
                        nargs='?',
                        help='Path to the populations file',
                        required=True)
    parser.add_argument(
        '--window_size',
        type=int,
        nargs='?',
        help='Window size in base pairs over which to calculate pi/dxy')
    parser.add_argument(
        '--chromosomes',
        type=str,
        nargs='?',
        default='all',
        help=
        'A single-quoted, comma separated list of chromosome(s) (e.g. \'X,1,2\')',
        required=False)
    parser.add_argument(
        '--interval_start',
        type=str,
        nargs='?',
        help=
        'The start of the interval over which to calculate pi/dxy. Only valid when calculating over a single chromosome.'
    )
    parser.add_argument(
        '--interval_end',
        type=str,
        nargs='?',
        help=
        'The end of the interval over which to calculate pi/dxy. Only valid when calculating over a single chromosome.'
    )
    parser.add_argument(
        '--variant_filter_expression',
        type=str,
        nargs='?',
        help=
        'A single-quoted, comma separated list of genotype filters (e.g. \'DP>=10,GQ>=20\') to apply to SNPs',
        required=False)
    parser.add_argument(
        '--invariant_filter_expression',
        type=str,
        nargs='?',
        help=
        'A single-quoted, comma separated list of genotype filters (e.g. \'DP>=10,RGQ>=20\') to apply to invariant sites',
        required=False)
    parser.add_argument(
        '--outfile_prefix',
        type=str,
        nargs='?',
        default='./pixy_output',
        help='Path and prefix for the output file, e.g. path/to/outfile')
    parser.add_argument(
        '--bypass_filtration',
        choices=['yes', 'no'],
        default='no',
        help=
        'Bypass all variant filtration (for data lacking FORMAT fields, use with caution)'
    )
    parser.add_argument(
        '--bypass_invariant_check',
        choices=['yes', 'no'],
        default='no',
        help=
        'Allow computation of stats without invariant sites, will result in wildly incorrect estimates most of the time. Use with extreme caution.'
    )
    parser.add_argument(
        '--fst_maf_filter',
        default=0.05,
        type=float,
        nargs='?',
        help=
        'Minor allele frequency filter for FST calculations, with value 0.0-1.0 (default 0.05).'
    )

    # ag1000g test data
    # args = parser.parse_args('--stats fst --vcf data/vcf/multi_chr.vcf.gz --zarr_path data/vcf/multi --window_size 10000 --populations data/vcf/ag1000/Ag1000_sampleIDs_popfile_3.txt --variant_filter_expression DP>=10,GQ>20 --invariant_filter_expression DP>=10,RGQ>20 --outfile_prefix output/pixy_out'.split())

    # filter test data
    # args = parser.parse_args('--stats pi --vcf data/vcf/filter_test.vcf.gz --zarr_path data/vcf/filter_test --window_size 3 --populations data/vcf/ag1000/Ag1000_sampleIDs_popfile_3.txt --variant_filter_expression DP>=10,GQ>20 --invariant_filter_expression DP>=10,RGQ>20 --fst_maf_filter 0.05 --outfile_prefix output/pixy_out'.split())

    # catch arguments from the command line
    args = parser.parse_args()

    # CHECK FOR TABIX
    # (disabled until we implement site level and BED support)
    #tabix_path = shutil.which("tabix")

    #if tabix_path is None:
    #    warnings.warn('[pixy] WARNING: tabix is not installed (or cannot be located) -- this may reduce performance. install tabix with "conda install -c bioconda tabix"')
    #if not os.path.exists(args.vcf + ".tbi") and tabix_path is not None:
    #    raise Exception('[pixy] ERROR: your vcf is not indexed with tabix, index the bgzipped vcf with "tabix your.vcf.gz"')

    # VALIDATE ARGUMENTS

    print("[pixy] pixy " + version_text)
    print(
        "[pixy] Validating VCF and input parameters (this may take some time)..."
    )

    # expand all file paths
    args.vcf = os.path.expanduser(args.vcf)
    args.zarr_path = os.path.expanduser(args.zarr_path)
    args.populations = os.path.expanduser(args.populations)
    args.outfile_prefix = os.path.expanduser(args.outfile_prefix)

    # CHECK FOR EXISTENCE OF VCF AND POPFILES

    if os.path.exists(args.vcf) is not True:
        raise Exception('[pixy] ERROR: The specified VCF ' + str(args.vcf) +
                        ' does not exist')

    if os.path.exists(args.populations) is not True:
        raise Exception('[pixy] ERROR: The specified populations file ' +
                        str(args.populations) + ' does not exist')

    # VALIDATE FILTER EXPRESSIONS

    # get vcf header info
    vcf_headers = allel.read_vcf_headers(args.vcf)

    # skip invariant check if only asking for FST
    if len(args.stats) == 1 and (args.stats[0] == 'fst'):
        args.bypass_invariant_check = "yes"

    # if we are bypassing the invariant check, spoof in an invariant filter
    if args.bypass_invariant_check == "yes":
        args.invariant_filter_expression = "DP>=0"

    if args.bypass_filtration == 'no' and (
            args.variant_filter_expression is None
            or args.invariant_filter_expression is None):
        raise Exception(
            '[pixy] ERROR: One or more filter expression is missing. Provide two filter expressions, or set --bypass_filtration to \'yes\''
        )

    if args.bypass_filtration == 'no':
        # get the list of format fields and requested filter fields
        format_fields = vcf_headers.formats.keys()
        filter_fields = list()

        for x in args.variant_filter_expression.split(","):
            filter_fields.append(re.sub("[^A-Za-z]+", "", x))

        for x in args.invariant_filter_expression.split(","):
            filter_fields.append(re.sub("[^A-Za-z]+", "", x))

        missing = list(set(filter_fields) - set(format_fields))

        if len(missing) > 0:
            raise Exception(
                '[pixy] ERROR: the following genotype filters were requested but do not occur in the VCF: ',
                missing)
    else:
        print(
            "[pixy] WARNING: --bypass_filtration is set to \'yes\', genotype filtration will be not be performed."
        )

    # VALIDATE THE VCF

    # check if the vcf is zipped
    if re.search(".gz", args.vcf):
        cat_prog = "gunzip -c "
    else:
        cat_prog = "cat "

    # check if the vcf contains any invariant sites
    # a very basic check: just looks for at least one invariant site in the alt field

    if args.bypass_invariant_check == 'no':
        alt_list = subprocess.check_output(
            cat_prog + args.vcf +
            " | grep -v '#' | head -n 10000 | awk '{print $5}' | sort | uniq",
            shell=True).decode("utf-8").split()
        if "." not in alt_list:
            raise Exception(
                '[pixy] ERROR: the provided VCF appears to contain no invariant sites (ALT = \".\"). This check can be bypassed via --bypass_invariant_check \'yes\'.'
            )
    else:
        if not (len(args.stats) == 1 and (args.stats[0] == 'fst')):
            print(
                "[pixy] EXTREME WARNING: --bypass_invariant_check is set to \'yes\', which assumes that your VCF contains invariant sites. Lack of invariant sites will result in incorrect estimates."
            )

    # check if requested chromosomes exist in vcf
    # defaults to all the chromosomes contained in the VCF (first data column)

    if args.chromosomes == 'all':
        chrom_list = subprocess.check_output(
            cat_prog + args.vcf + " | grep -v '#' | awk '{print $1}' | uniq",
            shell=True).decode("utf-8").split()
        chrom_all = chrom_list
    else:
        chrom_list = list(args.chromosomes.split(","))
        chrom_all = subprocess.check_output(
            cat_prog + args.vcf + " | grep -v '#' | awk '{print $1}' | uniq",
            shell=True).decode("utf-8").split()
        missing = list(set(chrom_list) - set(chrom_all))
        if len(missing) > 0:
            raise Exception(
                '[pixy] ERROR: the following chromosomes were requested but do not occur in the VCF: ',
                missing)

    # INTERVALS
    # check if intervals are correctly specified

    if args.interval_start is not None and args.interval_end is None:
        raise Exception(
            '[pixy] ERROR: Both --interval_start and --interval_end must be specified'
        )

    if args.interval_start is None and args.interval_end is not None:
        raise Exception(
            '[pixy] ERROR: Both --interval_start and --interval_end must be specified'
        )

    if args.interval_start is not None and args.interval_end is not None and len(
            chrom_list) > 1:
        raise Exception(
            '[pixy] ERROR: --interval_start and --interval_end are not valid when calculating over multiple chromosomes. Remove both arguments or specify a single chromosome.'
        )

    # SAMPLES
    # check if requested samples exist in vcf

    # - parse + validate the population file
    # - format is IND POP (tab separated)
    # - throws an error if individuals are missing from VCF

    # read in the list of samples/populations
    poppanel = pandas.read_csv(args.populations,
                               sep='\t',
                               usecols=[0, 1],
                               names=['ID', 'Population'])
    poppanel.head()

    # get a list of samples from the callset
    samples_list = vcf_headers.samples

    # make sure every indiv in the pop file is in the VCF callset
    IDs = list(poppanel['ID'])
    missing = list(set(IDs) - set(samples_list))

    # find the samples in the callset index by matching up the order of samples between the population file and the callset
    # also check if there are invalid samples in the popfile
    try:
        samples_callset_index = [samples_list.index(s) for s in poppanel['ID']]
    except ValueError as e:
        raise Exception(
            '[pixy] ERROR: the following samples are listed in the population file but not in the VCF: ',
            missing) from e
    else:
        poppanel['callset_index'] = samples_callset_index

        # use the popindices dictionary to keep track of the indices for each population
        popindices = {}
        popnames = poppanel.Population.unique()
        for name in popnames:
            popindices[name] = poppanel[poppanel.Population ==
                                        name].callset_index.values

    print("[pixy] Preparing for calculation of summary statistics: " +
          ','.join(map(str, args.stats)))
    print("[pixy] Data set contains " + str(len(popnames)) +
          " population(s), " + str(len(chrom_list)) + " chromosome(s), and " +
          str(len(IDs)) + " sample(s)")

    # initialize and remove any previous output files
    if os.path.exists(re.sub(r"[^\/]+$", "", args.outfile_prefix)) is not True:
        os.mkdir(re.sub(r"[^\/]+$", "", args.outfile_prefix))

    # initialize the output files for writing
    if 'pi' in args.stats:

        pi_file = str(args.outfile_prefix) + "_pi.txt"

        if os.path.exists(pi_file):
            os.remove(pi_file)

        outfile = open(pi_file, 'a')
        outfile.write("pop" + "\t" + "chromosome" + "\t" + "window_pos_1" +
                      "\t" + "window_pos_2" + "\t" + "avg_pi" + "\t" +
                      "no_sites" + "\t" + "count_diffs" + "\t" +
                      "count_comparisons" + "\t" + "count_missing" + "\n")
        outfile.close()

    if 'dxy' in args.stats:

        dxy_file = str(args.outfile_prefix) + "_dxy.txt"

        if os.path.exists(dxy_file):
            os.remove(dxy_file)

        outfile = open(dxy_file, 'a')
        outfile.write("pop1" + "\t" + "pop2" + "\t" + "chromosome" + "\t" +
                      "window_pos_1" + "\t" + "window_pos_2" + "\t" +
                      "avg_dxy" + "\t" + "no_sites" + "\t" + "count_diffs" +
                      "\t" + "count_comparisons" + "\t" + "count_missing" +
                      "\n")
        outfile.close()

    if 'fst' in args.stats:

        fst_file = str(args.outfile_prefix) + "_fst.txt"

        if os.path.exists(fst_file):
            os.remove(fst_file)

        outfile = open(fst_file, 'a')
        outfile.write("pop1" + "\t" + "pop2" + "\t" + "chromosome" + "\t" +
                      "window_pos_1" + "\t" + "window_pos_2" + "\t" +
                      "avg_wc_fst" + "\t" + "no_snps" + "\n")
        outfile.close()

    # initialize the folder structure for the zarr array
    if os.path.exists(args.zarr_path) is not True:
        pathlib.Path(args.zarr_path).mkdir(parents=True, exist_ok=True)

    # main loop for computing summary stats

    # time the calculations
    start_time = time.time()
    print("[pixy] Started calculations at " +
          time.strftime("%H:%M:%S", time.localtime(start_time)))

    for chromosome in chrom_list:

        # Zarr array conversion

        # the chromosome specific zarr path
        zarr_path = args.zarr_path + "/" + chromosome

        # determine the fields that will be included
        # TBD: just reading all fields currently
        # vcf_fields = ['variants/CHROM', 'variants/POS'] + ['calldata/' + s for s in np.unique(filter_fields)]

        # build region string (if using an interval)
        if args.interval_start is not None:
            targ_region = chromosome + ":" + str(
                args.interval_start) + "-" + str(args.interval_end)
        else:
            targ_region = chromosome

        # allow for reuse of previously calculated zarr arrays
        if args.reuse_zarr == 'yes' and os.path.exists(zarr_path):
            print(
                "[pixy] If a zarr array exists, it will be reused for chromosome "
                + chromosome + "...")
        elif args.reuse_zarr == 'no' or os.path.exists(zarr_path) is not True:
            print("[pixy] Building zarr array for chromosome " + chromosome +
                  "...")
            warnings.filterwarnings("ignore")
            allel.vcf_to_zarr(args.vcf,
                              zarr_path,
                              region=targ_region,
                              fields='*',
                              overwrite=True)
            warnings.resetwarnings()

        print("[pixy] Calculating statistics for chromosome " + targ_region +
              "...")

        # open the zarr
        callset = zarr.open_group(zarr_path, mode='r')

        # parse the filtration expression and build the boolean filter array

        # define an operator dictionary for parsing the operator strings
        ops = {
            "<": operator.lt,
            "<=": operator.le,
            ">": operator.gt,
            ">=": operator.ge,
            "==": operator.eq
        }

        # determine the complete list of available calldata fields usable for filtration
        calldata_fields = sorted(callset['/calldata/'].array_keys())

        # check if bypassing filtration, otherwise filter
        if args.bypass_filtration == 'no':

            # VARIANT SITE FILTERS
            var_filters = []

            # iterate over each requested variant filter
            for x in args.variant_filter_expression.split(","):
                stat = re.sub("[^A-Za-z]+", "", x)
                value = int(re.sub("[^0-9]+", "", x))
                compare = re.sub("[A-Za-z0-9]+", "", x)

                # check if the requested filter/format exists in the VCF
                try:
                    stat_index = calldata_fields.index(stat)
                except ValueError as e:
                    raise Exception(
                        "[pixy] ERROR: The requested filter \'" + stat +
                        "\' is not annotated in the input VCF FORMAT field"
                    ) from e
                else:
                    if type(var_filters) is list:
                        var_filters = ops[compare](callset['/calldata/' +
                                                           stat][:], value)
                    elif type(var_filters) is not list:
                        var_filters = np.logical_and(
                            var_filters,
                            ops[compare](callset['/calldata/' + stat][:],
                                         value))

            # create a mask for variants only
            # is_snp is a site-level (1d) array
            # np.tile below creates a column of "is_snp" once for each sample
            # (i.e. makes it the same dimensions as the genotype table)
            is_snp = np.array([callset['/variants/is_snp'][:].flatten()
                               ]).transpose()
            snp_mask = np.tile(is_snp, (1, var_filters.shape[1]))

            # force only variant sites (snps, remember we ignore indels) to be included in the filter
            var_filters = np.logical_and(var_filters, snp_mask)

            # INVARIANT SITE FILTERS
            invar_filters = []

            for x in args.invariant_filter_expression.split(","):
                stat = re.sub("[^A-Za-z]+", "", x)
                value = int(re.sub("[^0-9]+", "", x))
                compare = re.sub("[A-Za-z0-9]+", "", x)

                # check if the requested filter/format exists in the VCF
                try:
                    stat_index = calldata_fields.index(stat)
                except ValueError as e:
                    raise Exception(
                        "[pixy] ERROR: The requested filter \'" + stat +
                        "\' is not annotated in the input VCF") from e
                else:
                    if type(invar_filters) is list:
                        invar_filters = ops[compare](callset['/calldata/' +
                                                             stat][:], value)
                    elif type(invar_filters) is not list:
                        invar_filters = np.logical_and(
                            invar_filters,
                            ops[compare](callset['/calldata/' + stat][:],
                                         value))

            # create a mask for invariant sites by inverting the snp filter
            # join that to the invariant sites filter

            invar_filters = np.logical_and(invar_filters, np.invert(snp_mask))

            # join the variant and invariant filter masks (logical OR)
            filters = np.logical_or(invar_filters, var_filters)

        # applying the filter to the data
        # all the filters are in a boolean array ('filters' above)

        # first, recode the gt matrix as a Dask array (saves memory), then pack it
        # a packed genotype array has dims snps x samples,
        # with genotypes represented by single-byte codes;
        # critically, it has the same dims as the filters array below

        gt_array = allel.GenotypeArray(
            allel.GenotypeDaskArray(callset['/calldata/GT'])).to_packed()

        # apply filters
        # only if not bypassing filtration
        if args.bypass_filtration == 'no':
            # set all genotypes that fail filters (the inversion of the array)
            # to 'missing', 239 = -1 (i.e. missing) for packed arrays
            gt_array[np.invert(filters)] = 239

        # convert the packed array back to a GenotypeArray
        gt_array = allel.GenotypeArray.from_packed(gt_array)

        # build the position array
        pos_array = allel.SortedIndex(callset['/variants/POS'])

        # a mask for snps and invariant sites
        snp_invar_mask = np.logical_or(
            np.logical_and(callset['/variants/is_snp'][:] == 1,
                           callset['/variants/numalt'][:] == 1),
            callset['/variants/numalt'][:] == 0)

        # remove rows that are NOT snps or invariant sites from the genotype array
        gt_array = np.delete(gt_array,
                             np.where(np.invert(snp_invar_mask)),
                             axis=0)
        gt_array = allel.GenotypeArray(gt_array)

        # select rows that ARE snps or invariant sites in the position array
        pos_array = pos_array[snp_invar_mask]

        #Basic functions for comparing the genotypes at each site in a region: counts differences out of sites with data

        #For the given region: return average pi, # of differences, # of comparisons, and # missing.
        # this function loops over every site in a region passed to it

        def tallyRegion(gt_region):
            total_diffs = 0
            total_comps = 0
            total_missing = 0
            for site in gt_region:
                vec = site.flatten()
                #now we have an individual site as a numpy.ndarray, pass it to the comparison function
                site_diffs, site_comps, missing = compareGTs(vec)
                total_diffs += site_diffs
                total_comps += site_comps
                total_missing += missing
            if total_comps > 0:
                avg_pi = total_diffs / total_comps
            else:
                avg_pi = 0
            return (avg_pi, total_diffs, total_comps, total_missing)

        #For the given region: return average dxy, # of differences, # of comparisons, and # missing.
        # this function loops over every site in a region passed to it
        def dxyTallyRegion(pop1_gt_region, pop2_gt_region):
            total_diffs = 0
            total_comps = 0
            total_missing = 0
            for x in range(0, len(pop1_gt_region)):
                site1 = pop1_gt_region[x]
                site2 = pop2_gt_region[x]
                vec1 = site1.flatten()
                vec2 = site2.flatten()
                #now we have an individual site as 2 numpy.ndarrays, pass them to the comparison function
                site_diffs, site_comps, missing = dxyCompareGTs(vec1, vec2)
                total_diffs += site_diffs
                total_comps += site_comps
                total_missing += missing
            if total_comps > 0:
                avg_dxy = total_diffs / total_comps
            else:
                avg_dxy = 0
            return (avg_dxy, total_diffs, total_comps, total_missing)

        #Return the number of differences, the number of comparisons, and missing data count.
        def compareGTs(vec):  #for pi
            c = Counter(vec)
            diffs = c[1] * c[0]
            gts = c[1] + c[0]
            missing = len(vec) - gts  # anything that's not 1 or 0 is ignored and counted as missing
            comps = int(special.comb(gts, 2))
            return (diffs, comps, missing)

        def dxyCompareGTs(vec1, vec2):  #for dxy
            c1 = Counter(vec1)
            c2 = Counter(vec2)
            gt1zeros = c1[0]
            gt1ones = c1[1]
            gts1 = c1[1] + c1[0]
            gt2zeros = c2[0]
            gt2ones = c2[1]
            gts2 = c2[1] + c2[0]
            missing = (len(vec1) + len(vec2)) - (gts1 + gts2)  # anything that's not 1 or 0 is ignored and counted as missing
            diffs = (gt1zeros * gt2ones) + (gt1ones * gt2zeros)
            comps = gts1 * gts2
            return (diffs, comps, missing)

        # Interval specification check
        # check if computing over specific intervals (otherwise, compute over whole chromosome)

        # window size
        window_size = args.window_size

        # set intervals based on args
        if (args.interval_end is None):
            interval_end = max(pos_array)
        else:
            interval_end = int(args.interval_end)

        if (args.interval_start is None):
            interval_start = min(pos_array)
        else:
            interval_start = int(args.interval_start)

        try:
            if (interval_start > interval_end):
                raise ValueError()
        except ValueError as e:
            raise Exception("[pixy] ERROR: The specified interval start (" +
                            str(interval_start) +
                            ") exceeds the interval end (" +
                            str(interval_end) + ")") from e

        # catch misspecified intervals
        # TBD: harmonize this with the new interval method for the zarr array
        if (interval_end > max(pos_array)):
            print(
                "[pixy] WARNING: The specified interval end (" +
                str(interval_end) +
                ") exceeds the last position of the chromosome and has been substituted with "
                + str(max(pos_array)))
            interval_end = max(pos_array)

        if (interval_start < min(pos_array)):
            print(
                "[pixy] WARNING: The specified interval start (" +
                str(interval_start) +
                ") begins before the first position of the chromosome and has been substituted with "
                + str(min(pos_array)))
            interval_start = min(pos_array)

        if ((interval_end - interval_start + 1) < window_size):
            print(
                "[pixy] WARNING: The requested interval or total number of sites in the VCF ("
                + str(interval_start) + "-" + str(interval_end) +
                ") is smaller than the requested window size (" +
                str(window_size) + ")")

        # PI:
        # AVERAGE NUCLEOTIDE VARIATION WITHIN POPULATIONS

        # Compute pi over a chosen interval and window size

        if (args.populations is not None) and ('pi' in args.stats):

            # open the pi output file for writing
            outfile = open(pi_file, 'a')

            for pop in popnames:

                # window size:
                window_size = args.window_size

                # initialize window_pos_2
                window_pos_2 = (interval_start + window_size) - 1

                # loop over populations and windows, compute stats and write to file
                for window_pos_1 in range(interval_start, interval_end,
                                          window_size):

                    # if the window has no sites, assign all NAs,
                    # otherwise calculate pi
                    if len(pos_array[(pos_array > window_pos_1)
                                     & (pos_array < window_pos_2)]) == 0:
                        avg_pi, total_diffs, total_comps, total_missing, no_sites = "NA", "NA", "NA", "NA", 0
                    else:

                        # pull out the genotypes for the window
                        loc_region = pos_array.locate_range(
                            window_pos_1, window_pos_2)
                        gt_region1 = gt_array[loc_region]
                        no_sites = len(gt_region1)

                        # subset the window for the individuals in each population
                        gt_pop = gt_region1.take(popindices[pop], axis=1)
                        avg_pi, total_diffs, total_comps, total_missing = tallyRegion(
                            gt_pop)

                    outfile.write(
                        str(pop) + "\t" + str(chromosome) + "\t" +
                        str(window_pos_1) + "\t" + str(window_pos_2) + "\t" +
                        str(avg_pi) + "\t" + str(no_sites) + "\t" +
                        str(total_diffs) + "\t" + str(total_comps) + "\t" +
                        str(total_missing) + "\n")
                    window_pos_2 += window_size

                    if window_pos_2 > interval_end:
                        window_pos_2 = interval_end

                # close output file and print complete message
            outfile.close()

            print("[pixy] Pi calculations for chromosome " + chromosome +
                  " complete and written to " + args.outfile_prefix +
                  "_pi.txt")

        # DXY:
        # AVERAGE NUCLEOTIDE VARIATION BETWEEN POPULATIONS

        if (args.populations is not None) and ('dxy' in args.stats):

            # create a list of all pairwise comparisons between populations in the popfile
            dxy_pop_list = list(combinations(popnames, 2))

            # open the dxy output file for writing
            outfile = open(dxy_file, 'a')

            # iterate over all population pairs and compute dxy
            for pop_pair in dxy_pop_list:
                pop1 = pop_pair[0]
                pop2 = pop_pair[1]

                # window size:
                window_size = args.window_size

                # initialize window_pos_2
                window_pos_2 = (interval_start + window_size) - 1

                # perform the dxy calculation for all windows in the range
                for window_pos_1 in range(interval_start, interval_end,
                                          window_size):

                    if len(pos_array[(pos_array > window_pos_1)
                                     & (pos_array < window_pos_2)]) == 0:
                        avg_dxy, total_diffs, total_comps, total_missing, no_sites = "NA", "NA", "NA", "NA", 0
                    else:
                        loc_region = pos_array.locate_range(
                            window_pos_1, window_pos_2)
                        gt_region1 = gt_array[loc_region]
                        no_sites = len(gt_region1)

                        # use the popGTs dictionary to keep track of this region's GTs for each population
                        popGTs = {}
                        for name in pop_pair:
                            gt_pop = gt_region1.take(popindices[name], axis=1)
                            popGTs[name] = gt_pop

                        pop1_gt_region1 = popGTs[pop1]
                        pop2_gt_region1 = popGTs[pop2]
                        avg_dxy, total_diffs, total_comps, total_missing = dxyTallyRegion(
                            pop1_gt_region1, pop2_gt_region1)

                    outfile.write(
                        str(pop1) + "\t" + str(pop2) + "\t" + str(chromosome) +
                        "\t" + str(window_pos_1) + "\t" + str(window_pos_2) +
                        "\t" + str(avg_dxy) + "\t" + str(no_sites) + "\t" +
                        str(total_diffs) + "\t" + str(total_comps) + "\t" +
                        str(total_missing) + "\n")

                    window_pos_2 += window_size

                    if window_pos_2 > interval_end:
                        window_pos_2 = interval_end

            outfile.close()
            print("[pixy] Dxy calculations chromosome " + chromosome +
                  " complete and written to " + args.outfile_prefix +
                  "_dxy.txt")

        # FST:
        # WEIR AND COCKERHAMS FST
        # This is just a plain wrapper for the scikit-allel fst function

        if (args.populations is not None) and ('fst' in args.stats):

            # open the fst output file for writing
            outfile = open(fst_file, 'a')

            # determine all the possible population pairings
            pop_names = list(popindices.keys())
            fst_pop_list = list(combinations(pop_names, 2))

            #calculate maf
            allele_counts = gt_array.count_alleles()
            allele_freqs = allele_counts.to_frequencies()
            maf_array = allele_freqs[:, 1] > args.fst_maf_filter

            # apply the maf filter to the genotype array
            gt_array_fst = gt_array[maf_array]
            gt_array_fst = allel.GenotypeArray(gt_array_fst)

            # apply the maf filter to the position array
            pos_array_fst = pos_array[maf_array]

            # for each pair, compute fst
            for pop_pair in fst_pop_list:

                # the indices for the individuals in each population
                fst_pop_indicies = [
                    popindices[pop_pair[0]].tolist(),
                    popindices[pop_pair[1]].tolist()
                ]

                # compute FST
                # windowed_weir_cockerham_fst seems to generate (spurious?) warnings about div/0, so suppressing warnings
                # (this assumes that the scikit-allel function is working as intended)
                np.seterr(divide='ignore', invalid='ignore')

                a, b, c = allel.windowed_weir_cockerham_fst(
                    pos_array_fst,
                    gt_array_fst,
                    subpops=fst_pop_indicies,
                    size=args.window_size,
                    start=interval_start,
                    stop=interval_end)

                for fst, wind, snps in zip(a, b, c):
                    outfile.write(
                        str(pop_pair[0]) + "\t" + str(pop_pair[1]) + "\t" +
                        str(chromosome) + "\t" + str(wind[0]) + "\t" +
                        str(wind[1]) + "\t" + str(fst) + "\t" + str(snps) +
                        "\n")
            outfile.close()
            print("[pixy] Fst calculations chromosome " + chromosome +
                  " complete and written to " + args.outfile_prefix +
                  "_fst.txt")

    print("\n[pixy] All calculations complete at " +
          time.strftime("%H:%M:%S", time.localtime(start_time)))
    end_time = (time.time() - start_time)
    print("[pixy] Time elapsed: " +
          time.strftime("%H:%M:%S", time.gmtime(end_time)))