Example #1
def vcf2zarr(vcf_files, pop_file, zarr_path):
    # Two very good tutorials:
    # http://alimanfoo.github.io/2018/04/09/selecting-variants.html
    # http://alimanfoo.github.io/2017/06/14/read-vcf.html
    # TODO: Refactor to work without pysam and allel

    # Get a list of the wanted samples from one population which are found in
    # the VCF files. The files must be numbered, and that number must be
    # substituted in the input path string with {n}.
    first_vcf = pysam.VariantFile(vcf_files.replace("{n}", "1"))
    wanted_samples = samples_from_population(pop_file)
    found_samples = list(
        set(wanted_samples).intersection(list(first_vcf.header.samples))
    )

    # Create one zarr folder for each chromosome
    for chrom in range(1, 23):
        vcf = vcf_files.replace("{n}", str(chrom))
        print(f"Creating zarr object for chromosome {chrom}")
        allel.vcf_to_zarr(
            vcf,
            zarr_path,
            group=str(chrom),
            region=str(chrom),
            fields=["POS", "ALT", "samples", "GT"],
            samples=found_samples,
            overwrite=True,
        )

    print("VCF data transformed into Zarr objects")
Example #2
def vcf_to_zarr(vcf_path, zarr_path, fields=None):
    if fields is None:
        fields = DEF_VCF_FIELDS

    # convert our fields to allele zarr fields
    zarr_fields = [VARIATION_ZARR_FIELD_MAPPING[field] for field in fields]
    if 'samples' not in zarr_fields:
        zarr_fields.append('samples')

    allel.vcf_to_zarr(str(vcf_path), str(zarr_path), fields=zarr_fields)
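The module-level constants referenced above are project-specific; a plausible sketch of their shape (names and values here are assumptions, not the project's actual definitions):

DEF_VCF_FIELDS = ['chrom', 'pos', 'alt', 'gt']
VARIATION_ZARR_FIELD_MAPPING = {
    'chrom': 'variants/CHROM',
    'pos': 'variants/POS',
    'alt': 'variants/ALT',
    'gt': 'calldata/GT',
    'samples': 'samples',
}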
Example #3
def vcf_to_zarr_func(ch):
    allel.vcf_to_zarr(INFN,
                      OUTFN,
                      region=ch,
                      group=ch,
                      log=sys.stderr,
                      fields=FIELDS,
                      exclude_fields=EXCLUDE_FIELDS,
                      tabix=TABIX_EXEC,
                      transformers=transformers)
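# INFN, OUTFN, FIELDS, EXCLUDE_FIELDS, TABIX_EXEC and transformers are
# module-level constants assumed to be defined elsewhere in the source file.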
Example #4
def create_allel_vcfzarr(
    shared_datadir: Path,
    tmpdir: Path,
    *,
    vcf_file: str = "sample.vcf.gz",
    **kwargs: Any,
) -> Path:
    """Create a vcfzarr file using scikit-allel"""
    vcf_path = shared_datadir / vcf_file
    output_path = tmpdir / f"allel_{vcf_file}.zarr"
    allel.vcf_to_zarr(str(vcf_path), str(output_path), **kwargs)
    return output_path
Example #5
def create_vcfzarr(
    shared_datadir, tmpdir, *, fields=None, grouped_by_contig=False, consolidated=False
):
    """Create a vcfzarr file using scikit-allel"""
    vcf_path = shared_datadir / "sample.vcf"
    output_path = tmpdir / "sample.vcf.zarr"
    if grouped_by_contig:
        for contig in ["19", "20", "X"]:
            allel.vcf_to_zarr(
                str(vcf_path),
                str(output_path),
                fields=fields,
                group=contig,
                region=contig,
            )
    else:
        allel.vcf_to_zarr(str(vcf_path), str(output_path), fields=fields)
    if consolidated:
        zarr.consolidate_metadata(str(output_path))
    return output_path
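When consolidated=True, the store written above can be opened with a single metadata read; a small sketch, assuming the path returned by the function:

import zarr

callset = zarr.open_consolidated(str(output_path))
print(callset.tree())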
Example #6
def vcf_to_zarr(vcf_in, tabix_exec, chrom):
    """Convert on-disk VCF to on-disk Zarr database using
    scikit-allele and zarr modules

    Zarr database written to same directory as input VCF
    
    Args:
        vcf_in (str): Path to input VCF on disk
        tabix_exec (str): Full path to tabix executable
        chrom (str): Chromosome for which Zarr database should be created

    Returns:
        None
    """
    vcf_path = os.path.dirname(vcf_in)

    # allel.vcf_to_zarr writes a directory containing the Zarr database
    # Set Zarr database outdir
    zarr_base = os.path.basename(vcf_in).split('.')[0]
    zarr_out = os.path.join(vcf_path, zarr_base + '.zarr')

    # Rename 'numalt' field. Required by Zarr to distinguish `NUMALT` from `numalt`
    # `numalt` is automatically computed by scikit-allel
    rename_dict = {'variants/numalt':'variants/numalt_sci'}

    # Use vcf_to_zarr function from scikit-allel to create zarr database
    # Currently optimized for biallelic SNP VCF but easy to extend functionality
    allel.vcf_to_zarr(
            input=vcf_in,
            output=zarr_out,
            overwrite=True,
            group=chrom,
            rename_fields=rename_dict,
            fields='*',
            alt_number=1,
            tabix=tabix_exec,
            region=chrom,
            compressor=numcodecs.Blosc(cname='zstd', clevel=1, shuffle=False)
            )
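A sketch of reading the renamed field back, to show the effect of rename_fields (it reuses the variables above and assumes the VCF defines an INFO field NUMALT):

import zarr

callset = zarr.open_group(zarr_out, mode='r')
numalt_info = callset[chrom]['variants/NUMALT'][:]     # NUMALT from the VCF INFO column
numalt_sci = callset[chrom]['variants/numalt_sci'][:]  # renamed field computed by scikit-allel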
Example #7
def vcf2zarr(chrom, zarr_path, vcf_path):
    """Convert vcf to zarr.

    Parameters
    ----------
    chroms : TYPE
        DESCRIPTION.
    zarr_path : TYPE
        DESCRIPTION.
    vcf_path : TYPE
        DESCRIPTION.

    Returns
    -------
    None.

    """
    if not path.isdir(path.join(zarr_path, chrom)):
        allel.vcf_to_zarr(vcf_path, zarr_path, group=chrom,
                          fields='*', alt_number=2, log=sys.stdout,
                          compressor=numcodecs.Blosc(cname='zstd', clevel=1, shuffle=False))
    return None
Example #8
import allel, argparse
parser = argparse.ArgumentParser()
parser.add_argument("--vcf", help="path to VCF (or .vcf.gz)")
parser.add_argument("--zarr", help="path for zarr output")
args = parser.parse_args()

allel.vcf_to_zarr(args.vcf, args.zarr)
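
# Example shell invocation (script name and paths are hypothetical):
#   python vcf2zarr_minimal.py --vcf data/sample.vcf.gz --zarr data/sample.zarr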
Example #9
    # How does it work with tri-allelic variants?

############################
## testing scikit-allel
allel.__version__
zarr.__version__
numcodecs.__version__
np.__version__

# !ls -lh {VCFdata_fp} # list the files in the directory

# format conversion
zarr_path = '../Data/match_combined_filtered/'
allel.vcf_to_zarr(VCFdata_fp + 'a_106_filtered.vcf.gz',
                  zarr_path + 'a_106_filtered_genotypes.zarr',
                  group='106',
                  fields='*',
                  log=sys.stdout,
                  overwrite=True)

callset = zarr.open_group(zarr_path + 'a_106_filtered_genotypes.zarr',
                          mode='r')
callset.tree(expand=True)

gt_zarr = callset['106/calldata/GT']
gt_zarr.info

pos = allel.SortedIndex(callset['106/variants/POS'][:])  # SortedIndex provides locate_range

loc_region = pos.locate_range(20000000, 20100000)

gt_region = allel.GenotypeArray(gt_zarr[loc_region])  # materialize only the located slice
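
Regarding the tri-allelic question at the top of this example: scikit-allel pads ALT-dependent arrays out to alt_number columns (3 by default), so tri-allelic variants keep all their alternate alleles unless alt_number is lowered. A quick check using the computed numalt field:

numalt = callset['106/variants/numalt'][:]
print('multi-allelic sites:', (numalt > 1).sum())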
Example #10
from os.path import join
import sys

import zarr
import allel

VCF_FILE, OUT_FILE = sys.argv[1:3]  # argv[0] is the script name
allel.vcf_to_zarr(VCF_FILE, OUT_FILE, fields='*', overwrite=False)
Example #11
"""
    -i:
    -o:
Date:
"""

# Import Modules
import allel
import subprocess
import argparse
import os
import pdb
import zarr
import shutil
os.environ["NUMEXPR_MAX_THREADS"]="272"

# Set up command line execution
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-vcf', type=str, metavar='vcf_file', required=True, help='')
    parser.add_argument('-name', type=str, metavar='vcf_name', required=True, help='')
    parser.add_argument('-o', type=str, metavar='output_path', required=True, help='Output_path')
    args = parser.parse_args()

    outdir = os.path.dirname(args.o)

    if not os.path.exists(outdir): os.mkdir(outdir)

    print("compressing chromosome " + args.name)
    allel.vcf_to_zarr(args.vcf, args.o, group=str(args.name), fields='*', overwrite=True)
    print("done compressing chromosome " + args.name)
Example #12
def main():

    import argparse
    parser = argparse.ArgumentParser(
        description="Convert a single sample VCF to Zarr using "
        "Blosc compression")
    parser.add_argument("--input",
                        required=True,
                        help="path to input VCF file.")
    parser.add_argument("--output",
                        required=True,
                        help="path to output Zarr directory.")
    parser.add_argument("--sample", required=True, help="Sample identifier.")
    parser.add_argument(
        "--contig",
        required=True,
        action='append',
        dest='contigs',
        help="Contig to extract. Multiple values may be provided.")
    parser.add_argument(
        "--field",
        required=True,
        action='append',
        dest='fields',
        help="Field to extract, e.g., 'variants/MQ' or 'calldata/GT'. Multiple "
        "values may be provided.")
    parser.add_argument(
        "--compress-algo",
        help="Blosc compression algorithm.  Choose from [zstd, blosclz, lz4, "
        "lz4hc, zlib, snappy].",
        default="zstd")
    parser.add_argument("--compress-level",
                        help="Compression level. Choose integer from [0, 9].",
                        default=1,
                        type=int)
    parser.add_argument(
        "--compress-shuffle",
        help="Type of data shuffling used to obtain contiguous runs of same "
        "values for improving compression. Choose integer value from "
        "NOSHUFFLE (0), SHUFFLE (1), BITSHUFFLE (2) or AUTOSHUFFLE (-1). "
        "If -1 (default), bit-shuffle will be used for buffers with "
        "itemsize 1, and byte-shuffle will be used otherwise.",
        default=-1,
        type=int)
    parser.add_argument("--alt-number",
                        help="Expected maximum number of alternate alleles.",
                        default=3,
                        type=int)
    parser.add_argument("--tabix",
                        help="Path to tabix executable v0.2.5+.",
                        default="tabix")
    parser.add_argument("--chunk-length",
                        help="Chunk length in number of variants.",
                        default=2**18,
                        type=int)
    parser.add_argument("--chunk-width",
                        help="Chunk width in number of samples.",
                        default=64,
                        type=int)
    parser.add_argument(
        "--log",
        help="Path to logfile, stdout or stderr. Default: stdout.",
        default="stdout")
    parser.add_argument(
        "--zip",
        action="store_true",
        help="If flag exists, entire zarr folder is zipped with no "
        "compression, and the original zarr folder is deleted. "
        "The zip file name is the value given for the --output "
        "argument, appended with '.zip'.")

    args = parser.parse_args()
    input_vcf_path = args.input
    output_zarr_path = args.output
    sample = args.sample
    compress_algo = args.compress_algo
    compress_level = args.compress_level
    compress_shuffle = args.compress_shuffle
    alt_number = args.alt_number
    tabix = args.tabix
    chunk_length = args.chunk_length
    chunk_width = args.chunk_width
    do_zip = args.zip
    contigs = args.contigs
    fields = args.fields
    log = args.log.strip()

    log_file_needs_closing = False
    if log == "stderr":
        log_file = sys.stderr
    elif log == "stdout":
        log_file = sys.stdout
    else:
        log_file = open(log, "w")
        log_file_needs_closing = True

    try:

        for contig in contigs:

            allel.vcf_to_zarr(
                input=input_vcf_path,
                output=output_zarr_path,
                group=f"{sample}/{contig}",
                region=contig,
                samples=[sample],
                compressor=zarr.Blosc(cname=compress_algo,
                                      clevel=compress_level,
                                      shuffle=compress_shuffle),
                overwrite=True,
                tabix=tabix,
                fields=fields,
                alt_number=alt_number,
                chunk_length=chunk_length,
                chunk_width=chunk_width,
                log=log_file,
            )

    finally:
        if log_file_needs_closing:
            log_file.close()

    if do_zip:
        zip_zarr(zarr_path=output_zarr_path, del_orig=True)
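
zip_zarr is not shown in this snippet; a minimal sketch of what it plausibly does (an assumption, not the project's actual helper): store the Zarr folder in an uncompressed zip so zarr.ZipStore can read it back, then delete the original directory, matching the behaviour described in the --zip help text above.

import os
import shutil
import zipfile


def zip_zarr(zarr_path, del_orig=False):
    # ZIP_STORED = no compression; the Blosc compressor already compressed the chunks
    with zipfile.ZipFile(zarr_path + '.zip', 'w', zipfile.ZIP_STORED) as zf:
        for root, _dirs, files in os.walk(zarr_path):
            for fname in files:
                full = os.path.join(root, fname)
                zf.write(full, os.path.relpath(full, zarr_path))
    if del_orig:
        shutil.rmtree(zarr_path)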
Example #13
def main(args=None):

    if args is None:
        args = sys.argv[1:]

    # the ascii help image
    help_image = "█▀▀█ ░▀░ █░█ █░░█\n" "█░░█ ▀█▀ ▄▀▄ █▄▄█\n" "█▀▀▀ ▀▀▀ ▀░▀ ▄▄▄█\n"

    help_text = 'pixy: sensible estimates of pi and dxy from a VCF'
    version_text = 'version 0.95.0'

    # initialize arguments
    parser = argparse.ArgumentParser(
        description=help_image + help_text + '\n' + version_text,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--version', action='version', version=version_text)
    parser.add_argument(
        '--stats',
        nargs='+',
        choices=['pi', 'dxy', 'fst'],
        help=
        'Which statistics to calculate from the VCF (pi, dxy, and/or fst, separated by spaces)',
        required=True)
    parser.add_argument('--vcf',
                        type=str,
                        nargs='?',
                        help='Path to the input VCF',
                        required=True)
    parser.add_argument('--zarr_path',
                        type=str,
                        nargs='?',
                        help='Folder in which to build the Zarr array(s)',
                        required=True)
    parser.add_argument(
        '--reuse_zarr',
        choices=['yes', 'no'],
        default='no',
        help='Use existing Zarr array(s) (saves time if re-running)')
    parser.add_argument('--populations',
                        type=str,
                        nargs='?',
                        help='Path to the populations file',
                        required=True)
    parser.add_argument(
        '--window_size',
        type=int,
        nargs='?',
        help='Window size in base pairs over which to calculate pi/dxy')
    parser.add_argument(
        '--chromosomes',
        type=str,
        nargs='?',
        default='all',
        help=
        'A single-quoted, comma separated list of chromosome(s) (e.g. \'X,1,2\')',
        required=False)
    parser.add_argument(
        '--interval_start',
        type=str,
        nargs='?',
        help=
        'The start of the interval over which to calculate pi/dxy. Only valid when calculating over a single chromosome.'
    )
    parser.add_argument(
        '--interval_end',
        type=str,
        nargs='?',
        help=
        'The end of the interval over which to calculate pi/dxy. Only valid when calculating over a single chromosome.'
    )
    parser.add_argument(
        '--variant_filter_expression',
        type=str,
        nargs='?',
        help=
        'A single-quoted, comma separated list of genotype filters (e.g. \'DP>=10,GQ>=20\') to apply to SNPs',
        required=False)
    parser.add_argument(
        '--invariant_filter_expression',
        type=str,
        nargs='?',
        help=
        'A single-quoted, comma separated list of genotype filters (e.g. \'DP>=10,RGQ>=20\') to apply to invariant sites',
        required=False)
    parser.add_argument(
        '--outfile_prefix',
        type=str,
        nargs='?',
        default='./pixy_output',
        help='Path and prefix for the output file, e.g. path/to/outfile')
    parser.add_argument(
        '--bypass_filtration',
        choices=['yes', 'no'],
        default='no',
        help=
        'Bypass all variant filtration (for data lacking FORMAT fields, use with caution)'
    )
    parser.add_argument(
        '--bypass_invariant_check',
        choices=['yes', 'no'],
        default='no',
        help=
        'Allow computation of stats without invariant sites, will result in wildly incorrect estimates most of the time. Use with extreme caution.'
    )
    parser.add_argument(
        '--fst_maf_filter',
        default=0.05,
        type=float,
        nargs='?',
        help=
        'Minor allele frequency filter for FST calculations, with value 0.0-1.0 (default 0.05).'
    )

    # ag1000g test data
    # args = parser.parse_args('--stats fst --vcf data/vcf/multi_chr.vcf.gz --zarr_path data/vcf/multi --window_size 10000 --populations data/vcf/ag1000/Ag1000_sampleIDs_popfile_3.txt --variant_filter_expression DP>=10,GQ>20 --invariant_filter_expression DP>=10,RGQ>20 --outfile_prefix output/pixy_out'.split())

    # filter test data
    # args = parser.parse_args('--stats pi --vcf data/vcf/filter_test.vcf.gz --zarr_path data/vcf/filter_test --window_size 3 --populations data/vcf/ag1000/Ag1000_sampleIDs_popfile_3.txt --variant_filter_expression DP>=10,GQ>20 --invariant_filter_expression DP>=10,RGQ>20 --fst_maf_filter 0.05 --outfile_prefix output/pixy_out'.split())

    # catch arguments from the command line
    args = parser.parse_args()

    # CHECK FOR TABIX
    # (disabled until we implement site level and BED support)
    #tabix_path = shutil.which("tabix")

    #if tabix_path is None:
    #    warnings.warn('[pixy] WARNING: tabix is not installed (or cannot be located) -- this may reduce performance. install tabix with "conda install -c bioconda tabix"')
    #if not os.path.exists(args.vcf + ".tbi") and tabix_path is not None:
    #    raise Exception('[pixy] ERROR: your vcf is not indexed with tabix, index the bgzipped vcf with "tabix your.vcf.gz"')

    # VALIDATE ARGUMENTS

    print("[pixy] pixy " + version_text)
    print(
        "[pixy] Validating VCF and input parameters (this may take some time)..."
    )

    # expand all file paths
    args.vcf = os.path.expanduser(args.vcf)
    args.zarr_path = os.path.expanduser(args.zarr_path)
    args.populations = os.path.expanduser(args.populations)
    args.outfile_prefix = os.path.expanduser(args.outfile_prefix)

    # CHECK FOR EXISTENCE OF VCF AND POPFILES

    if not os.path.exists(args.vcf):
        raise Exception('[pixy] ERROR: The specified VCF ' + str(args.vcf) +
                        ' does not exist')

    if not os.path.exists(args.populations):
        raise Exception('[pixy] ERROR: The specified populations file ' +
                        str(args.populations) + ' does not exist')

    # VALIDATE FILTER EXPRESSIONS

    # get vcf header info
    vcf_headers = allel.read_vcf_headers(args.vcf)

    # skip invariant check if only asking for FST
    if len(args.stats) == 1 and (args.stats[0] == 'fst'):
        args.bypass_invariant_check = "yes"

    # if we are bypassing the invariant check, spoof in an invariant filter
    if args.bypass_invariant_check == "yes":
        args.invariant_filter_expression = "DP>=0"

    if args.bypass_filtration == 'no' and (
            args.variant_filter_expression is None
            or args.invariant_filter_expression is None):
        raise Exception(
            '[pixy] ERROR: One or more filter expression is missing. Provide two filter expressions, or set --bypass_filtration to \'yes\''
        )

    if args.bypass_filtration == 'no':
        # get the list of format fields and requested filter fields
        format_fields = vcf_headers.formats.keys()
        filter_fields = list()

        for x in args.variant_filter_expression.split(","):
            filter_fields.append(re.sub("[^A-Za-z]+", "", x))

        for x in args.invariant_filter_expression.split(","):
            filter_fields.append(re.sub("[^A-Za-z]+", "", x))

        missing = list(set(filter_fields) - set(format_fields))

        if len(missing) > 0:
            raise Exception(
                '[pixy] ERROR: the following genotype filters were requested but do not occur in the VCF: ',
                missing)
    else:
        print(
            "[pixy] WARNING: --bypass_filtration is set to \'yes\', genotype filtration will be not be performed."
        )

    # VALIDATE THE VCF

    # check if the vcf is zipped
    if args.vcf.endswith(".gz"):
        cat_prog = "gunzip -c "
    else:
        cat_prog = "cat "

    # check if the vcf contains any invariant sites
    # a very basic check: just looks for at least one invariant site in the alt field

    if args.bypass_invariant_check == 'no':
        alt_list = subprocess.check_output(
            cat_prog + args.vcf +
            " | grep -v '#' | head -n 10000 | awk '{print $5}' | sort | uniq",
            shell=True).decode("utf-8").split()
        if "." not in alt_list:
            raise Exception(
                '[pixy] ERROR: the provided VCF appears to contain no invariant sites (ALT = \".\"). This check can be bypassed via --bypass_invariant_check \'yes\'.'
            )
    else:
        if not (len(args.stats) == 1 and (args.stats[0] == 'fst')):
            print(
                "[pixy] EXTREME WARNING: --bypass_invariant_check is set to \'yes\', which assumes that your VCF contains invariant sites. Lack of invariant sites will result in incorrect estimates."
            )

    # check if requested chromosomes exist in vcf
    # defaults to all the chromosomes contained in the VCF (first data column)

    if args.chromosomes == 'all':
        chrom_list = subprocess.check_output(
            cat_prog + args.vcf + " | grep -v '#' | awk '{print $1}' | uniq",
            shell=True).decode("utf-8").split()
        chrom_all = chrom_list
    else:
        chrom_list = list(args.chromosomes.split(","))
        chrom_all = subprocess.check_output(
            cat_prog + args.vcf + " | grep -v '#' | awk '{print $1}' | uniq",
            shell=True).decode("utf-8").split()
        missing = list(set(chrom_list) - set(chrom_all))
        if len(missing) > 0:
            raise Exception(
                '[pixy] ERROR: the following chromosomes were requested but do not occur in the VCF: ',
                missing)

    # INTERVALS
    # check if intervals are correctly specified

    if args.interval_start is not None and args.interval_end is None:
        raise Exception(
            '[pixy] ERROR: Both --interval_start and --interval_end must be specified'
        )

    if args.interval_start is None and args.interval_end is not None:
        raise Exception(
            '[pixy] ERROR: Both --interval_start and --interval_end must be specified'
        )

    if args.interval_start is not None and args.interval_end is not None and len(
            chrom_list) > 1:
        raise Exception(
            '[pixy] ERROR: --interval_start and --interval_end are not valid when calculating over multiple chromosomes. Remove both arguments or specify a single chromosome.'
        )

    # SAMPLES
    # check if requested samples exist in vcf

    # - parse + validate the population file
    # - format is IND POP (tab separated)
    # - throws an error if individuals are missing from VCF

    # read in the list of samples/populations
    poppanel = pandas.read_csv(args.populations,
                               sep='\t',
                               usecols=[0, 1],
                               names=['ID', 'Population'])
    poppanel.head()

    # get a list of samples from the callset
    samples_list = vcf_headers.samples

    # make sure every indiv in the pop file is in the VCF callset
    IDs = list(poppanel['ID'])
    missing = list(set(IDs) - set(samples_list))

    # find the samples in the callset index by matching up the order of samples between the population file and the callset
    # also check if there are invalid samples in the popfile
    try:
        samples_callset_index = [samples_list.index(s) for s in poppanel['ID']]
    except ValueError as e:
        raise Exception(
            '[pixy] ERROR: the following samples are listed in the population file but not in the VCF: ',
            missing) from e
    else:
        poppanel['callset_index'] = samples_callset_index

        # use the popindices dictionary to keep track of the indices for each population
        popindices = {}
        popnames = poppanel.Population.unique()
        for name in popnames:
            popindices[name] = poppanel[poppanel.Population ==
                                        name].callset_index.values

    print("[pixy] Preparing for calculation of summary statistics: " +
          ','.join(map(str, args.stats)))
    print("[pixy] Data set contains " + str(len(popnames)) +
          " population(s), " + str(len(chrom_list)) + " chromosome(s), and " +
          str(len(IDs)) + " sample(s)")

    # initialize and remove any previous output files
    if not os.path.exists(re.sub(r"[^\/]+$", "", args.outfile_prefix)):
        os.mkdir(re.sub(r"[^\/]+$", "", args.outfile_prefix))

    # initialize the output files for writing
    if 'pi' in args.stats:

        pi_file = str(args.outfile_prefix) + "_pi.txt"

        if os.path.exists(pi_file):
            os.remove(pi_file)

        outfile = open(pi_file, 'a')
        outfile.write("pop" + "\t" + "chromosome" + "\t" + "window_pos_1" +
                      "\t" + "window_pos_2" + "\t" + "avg_pi" + "\t" +
                      "no_sites" + "\t" + "count_diffs" + "\t" +
                      "count_comparisons" + "\t" + "count_missing" + "\n")
        outfile.close()

    if 'dxy' in args.stats:

        dxy_file = str(args.outfile_prefix) + "_dxy.txt"

        if os.path.exists(dxy_file):
            os.remove(dxy_file)

        outfile = open(dxy_file, 'a')
        outfile.write("pop1" + "\t" + "pop2" + "\t" + "chromosome" + "\t" +
                      "window_pos_1" + "\t" + "window_pos_2" + "\t" +
                      "avg_dxy" + "\t" + "no_sites" + "\t" + "count_diffs" +
                      "\t" + "count_comparisons" + "\t" + "count_missing" +
                      "\n")
        outfile.close()

    if 'fst' in args.stats:

        fst_file = str(args.outfile_prefix) + "_fst.txt"

        if os.path.exists(fst_file):
            os.remove(fst_file)

        outfile = open(fst_file, 'a')
        outfile.write("pop1" + "\t" + "pop2" + "\t" + "chromosome" + "\t" +
                      "window_pos_1" + "\t" + "window_pos_2" + "\t" +
                      "avg_wc_fst" + "\t" + "no_snps" + "\n")
        outfile.close()

    # initialize the folder structure for the zarr array
    if not os.path.exists(args.zarr_path):
        pathlib.Path(args.zarr_path).mkdir(parents=True, exist_ok=True)

    # main loop for computing summary stats

    # time the calculations
    start_time = time.time()
    print("[pixy] Started calculations at " +
          time.strftime("%H:%M:%S", time.localtime(start_time)))

    for chromosome in chrom_list:

        # Zarr array conversion

        # the chromosome specific zarr path
        zarr_path = args.zarr_path + "/" + chromosome

        # determine the fields that will be included
        # TBD: just reading all fields currently
        # vcf_fields = ['variants/CHROM', 'variants/POS'] + ['calldata/' + s for s in np.unique(filter_fields)]

        # build region string (if using an interval)
        if args.interval_start is not None:
            targ_region = chromosome + ":" + str(
                args.interval_start) + "-" + str(args.interval_end)
        else:
            targ_region = chromosome

        # allow for reuse of previously calculated zarr arrays
        if args.reuse_zarr == 'yes' and os.path.exists(zarr_path):
            print("[pixy] Reusing existing zarr array for chromosome " +
                  chromosome + "...")
        elif args.reuse_zarr == 'no' or not os.path.exists(zarr_path):
            print("[pixy] Building zarr array for chromosome " + chromosome +
                  "...")
            warnings.filterwarnings("ignore")
            allel.vcf_to_zarr(args.vcf,
                              zarr_path,
                              region=targ_region,
                              fields='*',
                              overwrite=True)
            warnings.resetwarnings()

        print("[pixy] Calculating statistics for chromosome " + targ_region +
              "...")

        # open the zarr
        callset = zarr.open_group(zarr_path, mode='r')

        # parse the filtration expression and build the boolean filter array

        # define an operator dictionary for parsing the operator strings
        ops = {
            "<": operator.lt,
            "<=": operator.le,
            ">": operator.gt,
            ">=": operator.ge,
            "==": operator.eq
        }

        # determine the complete list of available calldata fields usable for filtration
        calldata_fields = sorted(callset['/calldata/'].array_keys())

        # check if bypassing filtration, otherwise filter
        if args.bypass_filtration == 'no':

            # VARIANT SITE FILTERS
            var_filters = []

            # iterate over each requested variant filter
            for x in args.variant_filter_expression.split(","):
                stat = re.sub("[^A-Za-z]+", "", x)
                value = int(re.sub("[^0-9]+", "", x))
                compare = re.sub("[A-Za-z0-9]+", "", x)

                # check if the requested filter/format exists in the VCF
                try:
                    stat_index = calldata_fields.index(stat)
                except ValueError as e:
                    raise Exception(
                        "[pixy] ERROR: The requested filter \'" + stat +
                        "\' is not annotated in the input VCF FORMAT field"
                    ) from e
                else:
                    if type(var_filters) is list:
                        var_filters = ops[compare](callset['/calldata/' +
                                                           stat][:], value)
                    elif type(var_filters) is not list:
                        var_filters = np.logical_and(
                            var_filters,
                            ops[compare](callset['/calldata/' + stat][:],
                                         value))

            # create a mask for variants only
            # is_snp is a site-level (1d) array
            # np.tile below creates a column of "is_snp" once for each sample
            # (i.e. makes it the same dimensions as the genotype table)
            is_snp = np.array([callset['/variants/is_snp'][:].flatten()
                               ]).transpose()
            snp_mask = np.tile(is_snp, (1, var_filters.shape[1]))

            # force only variant sites (snps, remember we ignore indels) to be included in the filter
            var_filters = np.logical_and(var_filters, snp_mask)

            # INVARIANT SITE FILTERS
            invar_filters = []

            for x in args.invariant_filter_expression.split(","):
                stat = re.sub("[^A-Za-z]+", "", x)
                value = int(re.sub("[^0-9]+", "", x))
                compare = re.sub("[A-Za-z0-9]+", "", x)

                # check if the requested filter/format exists in the VCF
                try:
                    stat_index = calldata_fields.index(stat)
                except ValueError as e:
                    raise Exception(
                        "[pixy] ERROR: The requested filter \'" + stat +
                        "\' is not annotated in the input VCF") from e
                else:
                    if type(invar_filters) is list:
                        invar_filters = ops[compare](callset['/calldata/' +
                                                             stat][:], value)
                    elif type(invar_filters) is not list:
                        invar_filters = np.logical_and(
                            invar_filters,
                            ops[compare](callset['/calldata/' + stat][:],
                                         value))

            # create a mask for invariant sites by inverting the snp filter
            # join that to the invariant sites filter

            invar_filters = np.logical_and(invar_filters, np.invert(snp_mask))

            # join the variant and invariant filter masks (logical OR)
            filters = np.logical_or(invar_filters, var_filters)

        # applying the filter to the data
        # all the filters are in a boolean array ('filters' above)

        # first, recode the gt matrix as a Dask array (saves memory) -> packed
        # create a packed genotype array
        # this is an array with dims snps x samples
        # genotypes are represented by single byte codes
        # critically, it has the same dims as the filters array below

        gt_array = allel.GenotypeArray(
            allel.GenotypeDaskArray(callset['/calldata/GT'])).to_packed()

        # apply filters
        # only if not bypassing filtration
        if args.bypass_filtration == 'no':
            # set all genotypes that fail filters (the inversion of the array)
            # to 'missing', 239 = -1 (i.e. missing) for packed arrays
            gt_array[np.invert(filters)] = 239

        # convert the packed array back to a GenotypeArray
        gt_array = allel.GenotypeArray.from_packed(gt_array)

        # build the position array
        pos_array = allel.SortedIndex(callset['/variants/POS'])

        # a mask for snps and invariant sites
        snp_invar_mask = np.logical_or(
            np.logical_and(callset['/variants/is_snp'][:] == 1,
                           callset['/variants/numalt'][:] == 1),
            callset['/variants/numalt'][:] == 0)

        # remove rows that are NOT snps or invariant sites from the genotype array
        gt_array = np.delete(gt_array,
                             np.where(np.invert(snp_invar_mask)),
                             axis=0)
        gt_array = allel.GenotypeArray(gt_array)

        # select rows that ARE snps or invariant sites in the position array
        pos_array = pos_array[snp_invar_mask]

        # Basic functions for comparing the genotypes at each site in a region:
        # count differences out of sites with data.

        # For the given region: return average pi, # of differences,
        # # of comparisons, and # missing.
        # These functions loop over every site in a region passed to them.
        def tallyRegion(gt_region):
            total_diffs = 0
            total_comps = 0
            total_missing = 0
            for site in gt_region:
                vec = site.flatten()
                #now we have an individual site as a numpy.ndarray, pass it to the comparison function
                site_diffs, site_comps, missing = compareGTs(vec)
                total_diffs += site_diffs
                total_comps += site_comps
                total_missing += missing
            if total_comps > 0:
                avg_pi = total_diffs / total_comps
            else:
                avg_pi = 0
            return (avg_pi, total_diffs, total_comps, total_missing)

        #For the given region: return average dxy, # of differences, # of comparisons, and # missing.
        # this function loops over every site in a region passed to it
        def dxyTallyRegion(pop1_gt_region, pop2_gt_region):
            total_diffs = 0
            total_comps = 0
            total_missing = 0
            for x in range(0, len(pop1_gt_region)):
                site1 = pop1_gt_region[x]
                site2 = pop2_gt_region[x]
                vec1 = site1.flatten()
                vec2 = site2.flatten()
                #now we have an individual site as 2 numpy.ndarrays, pass them to the comparison function
                site_diffs, site_comps, missing = dxyCompareGTs(vec1, vec2)
                total_diffs += site_diffs
                total_comps += site_comps
                total_missing += missing
            if total_comps > 0:
                avg_pi = total_diffs / total_comps
            else:
                avg_pi = 0
            return (avg_pi, total_diffs, total_comps, total_missing)

        #Return the number of differences, the number of comparisons, and missing data count.
        def compareGTs(vec):  #for pi
            c = Counter(vec)
            diffs = c[1] * c[0]
            gts = c[1] + c[0]
            missing = (
                len(vec)
            ) - gts  #anything that's not 1 or 0 is ignored and counted as missing
            comps = int(special.comb(gts, 2))
            return (diffs, comps, missing)
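        # Worked example (illustrative): for vec = [0, 1, 1, -1] the counter
        # gives c[0] = 1 and c[1] = 2, so diffs = 1 * 2 = 2, gts = 3,
        # comps = C(3, 2) = 3 and missing = 1, i.e. a per-site pi of 2/3.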

        def dxyCompareGTs(vec1, vec2):  #for dxy
            c1 = Counter(vec1)
            c2 = Counter(vec2)
            gt1zeros = c1[0]
            gt1ones = c1[1]
            gts1 = c1[1] + c1[0]
            gt2zeros = c2[0]
            gt2ones = c2[1]
            gts2 = c2[1] + c2[0]
            missing = (len(vec1) + len(vec2)) - (
                gts1 + gts2
            )  #anything that's not 1 or 0 is ignored and counted as missing
            diffs = (gt1zeros * gt2ones) + (gt1ones * gt2zeros)
            comps = gts1 * gts2
            return (diffs, comps, missing)

        # Interval specification check
        # check if computing over specific intervals (otherwise, compute over whole chromosome)

        # window size
        window_size = args.window_size

        # set intervals based on args
        if (args.interval_end is None):
            interval_end = max(pos_array)
        else:
            interval_end = int(args.interval_end)

        if (args.interval_start is None):
            interval_start = min(pos_array)
        else:
            interval_start = int(args.interval_start)

        try:
            if (interval_start > interval_end):
                raise ValueError()
        except ValueError as e:
            raise Exception("[pixy] ERROR: The specified interval start (" +
                            str(interval_start) +
                            ") exceeds the interval end (" +
                            str(interval_end) + ")") from e

        # catch misspecified intervals
        # TBD: harmonize this with the new interval method for the zarr array
        if (interval_end > max(pos_array)):
            print(
                "[pixy] WARNING: The specified interval end (" +
                str(interval_end) +
                ") exceeds the last position of the chromosome and has been substituted with "
                + str(max(pos_array)))
            interval_end = max(pos_array)

        if (interval_start < min(pos_array)):
            print(
                "[pixy] WARNING: The specified interval start (" +
                str(interval_start) +
                ") begins before the first position of the chromosome and has been substituted with "
                + str(min(pos_array)))
            interval_start = min(pos_array)

        if ((interval_end - interval_start + 1) < window_size):
            print(
                "[pixy] WARNING: The requested interval or total number of sites in the VCF ("
                + str(interval_start) + "-" + str(interval_end) +
                ") is smaller than the requested window size (" +
                str(window_size) + ")")

        # PI:
        # AVERAGE NUCLEOTIDE VARIATION WITHIN POPULATIONS

        # Compute pi over a chosen interval and window size

        if (args.populations is not None) and ('pi' in args.stats):

            # open the pi output file for writing
            outfile = open(pi_file, 'a')

            for pop in popnames:

                # window size:
                window_size = args.window_size

                # initialize window_pos_2
                window_pos_2 = (interval_start + window_size) - 1

                # loop over populations and windows, compute stats and write to file
                for window_pos_1 in range(interval_start, interval_end,
                                          window_size):

                    # if the window has no sites, assign all NAs,
                    # otherwise calculate pi
                    if len(pos_array[(pos_array > window_pos_1)
                                     & (pos_array < window_pos_2)]) == 0:
                        avg_pi, total_diffs, total_comps, total_missing, no_sites = "NA", "NA", "NA", "NA", 0
                    else:

                        # pull out the genotypes for the window
                        loc_region = pos_array.locate_range(
                            window_pos_1, window_pos_2)
                        gt_region1 = gt_array[loc_region]
                        no_sites = len(gt_region1)

                        # subset the window for the individuals in each population
                        gt_pop = gt_region1.take(popindices[pop], axis=1)
                        avg_pi, total_diffs, total_comps, total_missing = tallyRegion(
                            gt_pop)

                    outfile.write(
                        str(pop) + "\t" + str(chromosome) + "\t" +
                        str(window_pos_1) + "\t" + str(window_pos_2) + "\t" +
                        str(avg_pi) + "\t" + str(no_sites) + "\t" +
                        str(total_diffs) + "\t" + str(total_comps) + "\t" +
                        str(total_missing) + "\n")
                    window_pos_2 += window_size

                    if window_pos_2 > interval_end:
                        window_pos_2 = interval_end

                # close output file and print complete message
            outfile.close()

            print("[pixy] Pi calculations for chromosome " + chromosome +
                  " complete and written to " + args.outfile_prefix +
                  "_pi.txt")

        # DXY:
        # AVERAGE NUCLEOTIDE VARIATION BETWEEN POPULATIONS

        if (args.populations is not None) and ('dxy' in args.stats):

            # create a list of all pairwise comparisons between populations in the popfile
            dxy_pop_list = list(combinations(popnames, 2))

            # open the dxy output file for writing
            outfile = open(dxy_file, 'a')

            # iterate over all population pairs and compute dxy
            for pop_pair in dxy_pop_list:
                pop1 = pop_pair[0]
                pop2 = pop_pair[1]

                # window size:
                window_size = args.window_size

                # initialize window_pos_2
                window_pos_2 = (interval_start + window_size) - 1

                # perform the dxy calculation for all windows in the range
                for window_pos_1 in range(interval_start, interval_end,
                                          window_size):

                    if len(pos_array[(pos_array > window_pos_1)
                                     & (pos_array < window_pos_2)]) == 0:
                        avg_dxy, total_diffs, total_comps, total_missing, no_sites = "NA", "NA", "NA", "NA", 0
                    else:
                        loc_region = pos_array.locate_range(
                            window_pos_1, window_pos_2)
                        gt_region1 = gt_array[loc_region]
                        no_sites = len(gt_region1)

                        # use the popGTs dictionary to keep track of this region's GTs for each population
                        popGTs = {}
                        for name in pop_pair:
                            gt_pop = gt_region1.take(popindices[name], axis=1)
                            popGTs[name] = gt_pop

                        pop1_gt_region1 = popGTs[pop1]
                        pop2_gt_region1 = popGTs[pop2]
                        avg_dxy, total_diffs, total_comps, total_missing = dxyTallyRegion(
                            pop1_gt_region1, pop2_gt_region1)

                    outfile.write(
                        str(pop1) + "\t" + str(pop2) + "\t" + str(chromosome) +
                        "\t" + str(window_pos_1) + "\t" + str(window_pos_2) +
                        "\t" + str(avg_dxy) + "\t" + str(no_sites) + "\t" +
                        str(total_diffs) + "\t" + str(total_comps) + "\t" +
                        str(total_missing) + "\n")

                    window_pos_2 += window_size

                    if window_pos_2 > interval_end:
                        window_pos_2 = interval_end

            outfile.close()
            print("[pixy] Dxy calculations chromosome " + chromosome +
                  " complete and written to " + args.outfile_prefix +
                  "_dxy.txt")

        # FST:
        # WEIR AND COCKERHAM'S FST
        # This is just a plain wrapper for the scikit-allel fst function

        if (args.populations is not None) and ('fst' in args.stats):

            # open the fst output file for writing
            outfile = open(fst_file, 'a')

            # determine all the possible population pairings
            pop_names = list(popindices.keys())
            fst_pop_list = list(combinations(pop_names, 2))

            #calculate maf
            allele_counts = gt_array.count_alleles()
            allele_freqs = allele_counts.to_frequencies()
            maf_array = allele_freqs[:, 1] > args.fst_maf_filter

            # apply the maf filter to the genotype array
            gt_array_fst = gt_array[maf_array]
            gt_array_fst = allel.GenotypeArray(gt_array_fst)

            # apply the maf filter to the position array
            pos_array_fst = pos_array[maf_array]

            # for each pair, compute fst
            for pop_pair in fst_pop_list:

                # the indices for the individuals in each population
                fst_pop_indicies = [
                    popindices[pop_pair[0]].tolist(),
                    popindices[pop_pair[1]].tolist()
                ]

                # compute FST
                # windowed_weir_cockerham_fst seems to generate (spurious?) warnings about div/0, so suppressing warnings
                # (this assumes that the scikit-allel function is working as intended)
                np.seterr(divide='ignore', invalid='ignore')

                a, b, c = allel.windowed_weir_cockerham_fst(
                    pos_array_fst,
                    gt_array_fst,
                    subpops=fst_pop_indicies,
                    size=args.window_size,
                    start=interval_start,
                    stop=interval_end)

                for fst, wind, snps in zip(a, b, c):
                    outfile.write(
                        str(pop_pair[0]) + "\t" + str(pop_pair[1]) + "\t" +
                        str(chromosome) + "\t" + str(wind[0]) + "\t" +
                        str(wind[1]) + "\t" + str(fst) + "\t" + str(snps) +
                        "\n")
            outfile.close()
            print("[pixy] Fst calculations chromosome " + chromosome +
                  " complete and written to " + args.outfile_prefix +
                  "_fst.txt")

    print("\n[pixy] All calculations complete at " +
          time.strftime("%H:%M:%S", time.localtime(start_time)))
    end_time = (time.time() - start_time)
    print("[pixy] Time elapsed: " +
          time.strftime("%H:%M:%S", time.gmtime(end_time)))
Example #14
#! /usr/bin/python

# This script reads a vcf file and converts it into zarr format, for faster access from scikit-allel.

# Usage: ./vcf2zarr.py <file.vcf>

import os
import numexpr

os.environ["NUMEXPR_MAX_THREADS"] = "272"
#if 'NUMEXPR_MAX_THREADS' in os.environ: os.environ.pop('NUMEXPR_MAX_THREADS')
#import numexpr
#print('NumExpr.nthreads =' + str(numexpr.nthreads))
import allel as al
import zarr
import numpy as np
import sys

vcfPath = sys.argv[1]

variants = al.read_vcf(vcfPath, numbers={
    'GT': 2,
    'ALT': 1
}, fields='*')  # with all (*) fields read
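# NOTE: the read_vcf call above loads the callset into memory, but its result
# is not used by the on-disk conversion below; it looks like a leftover check.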

zarrPath = str(vcfPath.split('Bypop.')[0] + '.zarr')
al.vcf_to_zarr(vcfPath, zarrPath, fields='*', log=sys.stdout, overwrite=True)
Example #15
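# Note: this function assumes module-level imports along the lines of
#   import sys
#   import allel
#   import numpy as np
#   from numcodecs import Blosc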
def convert_to_zarr(input_vcf_path,
                    output_zarr_path,
                    conversion_config,
                    benchmark_profiler=None):
    """ Converts the original data (VCF) to a Zarr format. Only converts a single VCF file.
    If a BenchmarkRunner is provided, the actual VCF to Zarr conversion process will be benchmarked.
    :param input_vcf_path: The input VCF file location
    :param output_zarr_path: The desired Zarr output location
    :param conversion_config: Configuration data for the conversion
    :param benchmark_runner: BenchmarkRunner object to be used for benchmarking process
    :type input_vcf_path: str
    :type output_zarr_path: str
    :type conversion_config: config.VCFtoZarrConfigurationRepresentation
    :type benchmark_runner: core.BenchmarkProfiler
    """
    if conversion_config is not None:
        # Ensure var is string, not pathlib.Path
        output_zarr_path = str(output_zarr_path)

        # Get fields to extract (for unit testing only)
        fields = conversion_config.fields

        # Get alt number
        if conversion_config.alt_number is None:
            print(
                "[VCF-Zarr] Determining maximum number of ALT alleles by scaling all variants in the VCF file."
            )

            if benchmark_profiler is not None:
                benchmark_profiler.start_benchmark(
                    operation_name="Read VCF file into memory for alt number")

            # Scan VCF file to find max number of alleles in any variant
            callset = allel.read_vcf(input_vcf_path,
                                     fields=['numalt'],
                                     log=sys.stdout)

            if benchmark_profiler is not None:
                benchmark_profiler.end_benchmark()

            numalt = callset['variants/numalt']

            if benchmark_profiler is not None:
                benchmark_profiler.start_benchmark(
                    operation_name="Determine maximum alt number")

            alt_number = np.max(numalt)

            if benchmark_profiler is not None:
                benchmark_profiler.end_benchmark()
        else:
            print("[VCF-Zarr] Using alt number provided in configuration.")
            # Use the configuration-provided alt number
            alt_number = conversion_config.alt_number
        print("[VCF-Zarr] Alt number: {}".format(alt_number))

        # Get chunk length
        chunk_length = allel.io.vcf_read.DEFAULT_CHUNK_LENGTH
        if conversion_config.chunk_length is not None:
            chunk_length = conversion_config.chunk_length
        print("[VCF-Zarr] Chunk length: {}".format(chunk_length))

        # Get chunk width
        chunk_width = allel.io.vcf_read.DEFAULT_CHUNK_WIDTH
        if conversion_config.chunk_width is not None:
            chunk_width = conversion_config.chunk_width
        print("[VCF-Zarr] Chunk width: {}".format(chunk_width))

        if conversion_config.compressor == "Blosc":
            compressor = Blosc(
                cname=conversion_config.blosc_compression_algorithm,
                clevel=conversion_config.blosc_compression_level,
                shuffle=conversion_config.blosc_shuffle_mode)
        else:
            raise ValueError("Unexpected compressor type specified.")

        if benchmark_profiler is not None:
            benchmark_profiler.start_benchmark(
                operation_name="Convert VCF to Zarr")

        # Perform the VCF to Zarr conversion
        allel.vcf_to_zarr(input_vcf_path,
                          output_zarr_path,
                          alt_number=alt_number,
                          overwrite=True,
                          fields=fields,
                          log=sys.stdout,
                          compressor=compressor,
                          chunk_length=chunk_length,
                          chunk_width=chunk_width)

        if benchmark_profiler is not None:
            benchmark_profiler.end_benchmark()
Example #16
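# Note: besides allel, zarr and numcodecs, this snippet relies on pandas (pd),
# numpy (np), tqdm, and helpers defined elsewhere in its module
# (stats_dt, headers, PopSumStats).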
def calc_obsStats(vcfpath, chrom, pops, coord_bed, zarrpath, outpath):
    """Calculate stats from a VCF file."""
    # reuse the zarr store if it already exists, otherwise build it
    zarrfile = zarrpath
    if not zarrpath.exists():
        allel.vcf_to_zarr(str(vcfpath),
                          str(zarrpath),
                          group=chrom,
                          fields='*',
                          alt_number=2,
                          log=sys.stdout,
                          compressor=numcodecs.Blosc(cname='zstd',
                                                     clevel=1,
                                                     shuffle=False))

    # load pop info
    panel = pd.read_csv(pops, sep='\t', usecols=['sampleID', 'population'])

    # load zarr
    callset = zarr.open_group(str(zarrfile), mode='r')
    samples = callset[f'{chrom}/samples'][:]
    samples_list = list(samples)
    samples_callset_index = [samples_list.index(s) for s in panel['sampleID']]
    panel['callset_index'] = samples_callset_index
    panel = panel.sort_values(by='callset_index')

    # load gt
    pos = allel.SortedIndex(callset[f'{chrom}/variants/POS'])
    gt = allel.GenotypeArray(callset[f'{chrom}/calldata/GT'])

    # separate gt for each population
    ix_s = 0
    pop_dt = {}
    pop_ix = []
    for i, p in enumerate(panel["population"].unique()):
        p_ix = panel[panel["population"] == p]["callset_index"].values
        ix_e = len(p_ix) * 2 + ix_s
        pop_ix.append(list(range(ix_s, ix_e)))
        pop_dt[p] = gt.take(p_ix, axis=1).to_haplotypes()
        ix_s = ix_e

    # combine and transpose
    haps = np.concatenate(list(pop_dt.values()), axis=1).T

    # prep progress bar
    ln_count = 0
    with open(coord_bed, 'r') as cb:
        for line in cb:
            if not line.startswith("chrom"):
                ln_count += 1

    progressbar = tqdm(total=ln_count, desc="window numb", unit='window')

    # update stats_dt
    stats_dt["num_haps"] = haps.shape[0]
    stats_dt["pop_config"] = pop_ix
    stats_dt["length_bp"] = int(
        line.split()[-1])  # may be shorter than expected due to last window
    stats_dt["reps"] = ln_count

    # write headers
    outfile = outpath.parent / f"{outpath.stem}.Obs.pop_stats.txt"
    pops_outfile = open(outfile, 'w')
    pops_outfile, header_, header_ls = headers(pops_outfile,
                                               stats_dt,
                                               pop_names=list(pop_dt.keys()),
                                               obs=True)

    # calc stats
    # TODO: parallel
    chrom_ls = []
    i = 0
    stat_mat = np.zeros([ln_count, len(header_ls) - 1])
    with open(coord_bed, 'r') as cb:
        for line in cb:
            if not line.startswith("chrom"):
                cb_lin = line.split()
                chrom = cb_lin[0]
                chrom_ls.append(chrom)
                start = int(cb_lin[1])
                stop = int(cb_lin[2])
                len_bp = stop - start
                stats_dt["length_bp"] = len_bp
                sites = int(cb_lin[3])
                try:
                    pos_ix = pos.locate_range(start, stop)
                except KeyError:
                    continue
                pos_t = pos[pos_ix] - start
                haps_t = haps[:, pos_ix]
                counts_t = haps_t.sum(axis=0).astype(int)
                # run stats
                stats_ls = [start, stop, sites]
                popsumstats = PopSumStats(pos_t, haps_t, counts_t, stats_dt)
                for stat in stats_dt["calc_stats"]:
                    stat_fx = getattr(popsumstats, stat)
                    try:
                        ss = stat_fx()
                        # print(f"{stat} =  {len(ss)}")
                    except IndexError:
                        ss = [np.nan] * len(stats_dt["pw_quants"])
                    stats_ls.extend(ss)
                try:
                    stat_mat[i, :] = stats_ls
                    i += 1
                    progressbar.update()
                except ValueError:
                    continue
    # write stats out
    stat_mean = np.round(np.nanmean(stat_mat, axis=0), 5)
    stats_str = "\t".join(map(str, stat_mean[3:]))
    pops_outfile.write(
        f"mean_{chrom}\t{int(stat_mat[0, 0])}\t{stop}\t{np.sum(stat_mat[:, 2])}\t{stats_str}\n"
    )
    for stat in range(stat_mat.shape[0]):
        chrom = chrom_ls[stat]
        start = int(stat_mat[stat, 0])
        stop = int(stat_mat[stat, 1])
        sites = int(stat_mat[stat, 2])
        rd = [round(num, 5) for num in stat_mat[stat, 3:]]
        stats_str = "\t".join(map(str, rd))
        pops_outfile.write(f"{chrom}\t{start}\t{stop}\t{sites}\t{stats_str}\n")
    progressbar.close()
    pops_outfile.close()

    return outfile
Example #17
# 'variants/MEND',
# 'variants/MLEN',
# 'variants/MSTART',
# 'variants/SVLEN',
# 'variants/SVTYPE',
# 'variants/TSD',
# 'variants/AC',
# 'variants/AF',
# 'variants/NS',
# 'variants/AN',
# 'variants/EAS_AF',
# 'variants/EUR_AF',
# 'variants/AFR_AF',
# 'variants/AMR_AF',
# 'variants/SAS_AF',
# 'variants/DP',
# 'variants/AA',
# 'variants/VT',
# 'variants/EX_TARGET',
# 'variants/MULTI_ALLELIC']

# test_fields += ['variants/numalt', 'variants/svlen', 'variants/is_snp']
test_fields += ['variants/numalt', 'variants/is_snp', 'variants/svlen']


# test_fields = ['variants/*']
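# ska below is presumably an alias for scikit-allel, e.g. `import allel as ska`.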

ska.vcf_to_zarr(vcf_file, vcf_file.replace('.vcf.gz', '.zarr'),
                  fields=test_fields, alt_number=8, overwrite=True,
                  compressor=numcodecs.Blosc(cname='zstd', clevel=1, shuffle=False))
Example #18
# Initial configuration; the imports are probably overkill.
import sys, os, re
import numpy as np
import allel
import zarr
import dask
import numcodecs
import warnings
from pathlib import Path
import pandas as pd
import argparse

parser = argparse.ArgumentParser(description='Convert vcf to zarr.')
parser.add_argument('input')
parser.add_argument('output')

args = parser.parse_args()

print("starting", args.input, args.output)
allel.vcf_to_zarr(args.input, args.output, fields='*', overwrite=False)
Example #19
#zarr.__version__
#numcodecs.__version__
#np.__version__

## Data source
ID_table = pd.read_csv('../Data/match_combined_filtered/ID_table.csv')
VCFdata_fp = "../Data/match_combined_filtered/all_chr22.vcf.gz"

zarr_path = '../Data/match_combined_filtered/'

# !ls -lh {VCFdata_fp} # list the files in the directory

# format conversion
allel.vcf_to_zarr(VCFdata_fp,
                  zarr_path + 'all_chr22.zarr',
                  group='chr22',
                  fields='*',
                  log=sys.stdout,
                  overwrite=True)

callset = zarr.open_group(zarr_path + 'all_chr22.zarr', mode='r')
callset.tree(expand=True)

gt_zarr = callset['chr22/calldata/GT']
gt_zarr.info

#pos = callset['chr22/variants/POS']

#loc_region = pos.locate_range(20000, 20100)
samples = callset['chr22/samples']  # columns
gt = allel.GenotypeArray(gt_zarr)  # genotypes
Example #20
# run in parallel
################################################################################
# conda activate vcf
# VCF_DIR="/mnt/md0/malariaGen_genomic/pf3k_v5/5.1"
# OUT_DIR="/mnt/md0/malariaGen_genomic/pf3k_v5/pf3k_zarr"
# parallel "python3 pf3k_vcf2zarr.py $VCF_DIR $OUT_DIR {}" ::: {01..14}

################################################################################
import os, sys
import zarr
import allel

VCF_DIR, OUT_DIR, CHROM = sys.argv[1:4]  # argv[0] is the script name
allel.vcf_to_zarr(
    "/".join([
        VCF_DIR,
        "_".join(["SNP_INDEL_Pf3D7", CHROM, "v3.combined.filtered.vcf.gz"])
    ]),
    "/".join([OUT_DIR, "_".join(["SNP_INDEL_Pf3D7", CHROM, "v3.zarr"])]),
    fields='*',
    overwrite=False)