Beispiel #1
0
def main(vcf_file):
    """Convert a VCF file into an HDF5 file written next to it.

    The output path is ``vcf_file`` with its last extension replaced by
    ``.h5`` (``data/calls.vcf`` -> ``data/calls.h5``). All fields are
    converted and an existing output file is overwritten.

    Args:
        vcf_file: Path to the input VCF file.
    """
    # splitext works on the full path, so a bare filename stays relative.
    # Joining dirname + '/' by hand turned 'calls.vcf' into '/calls.h5'.
    h5_file = os.path.splitext(vcf_file)[0] + '.h5'
    # Convert VCF to hdf5
    allel.vcf_to_hdf5(vcf_file, h5_file, fields='*', overwrite=True)
Beispiel #2
0
def vcf_to_hdf5(vcf_path, zarr_path, fields=None):
    """Convert a VCF file to HDF5, translating field names first.

    Args:
        vcf_path: Input VCF path; passed to scikit-allel as ``str``.
        zarr_path: Output path; passed to scikit-allel as ``str``.
        fields: Iterable of field names that are keys of
            ``VARIATION_ZARR_FIELD_MAPPING``. ``None`` selects
            ``DEF_VCF_FIELDS``.

    Raises:
        KeyError: If a requested field has no entry in the mapping.
    """
    selected = DEF_VCF_FIELDS if fields is None else fields

    # Translate every requested field to the name scikit-allel expects.
    allel_fields = [VARIATION_ZARR_FIELD_MAPPING[name] for name in selected]

    # The sample names are always stored, whether or not they were requested.
    if 'samples' not in allel_fields:
        allel_fields.append('samples')

    allel.vcf_to_hdf5(str(vcf_path), str(zarr_path), fields=allel_fields)
Beispiel #3
0
def create_h5_file(vcf_file):
    """Create the HDF5 counterpart of ``vcf_file`` unless it already exists.

    The output path is everything before the first ``.vcf`` in ``vcf_file``
    followed by ``.h5``. All fields except ``calldata/GQ`` are converted.

    Args:
        vcf_file: Path to the input VCF file, as a string.
    """
    stem, _, _ = vcf_file.partition(".vcf")
    h5_file_path = stem + ".h5"

    # Nothing to do when a previous run already produced the file.
    if check_file_existence(h5_file_path):
        return

    allel.vcf_to_hdf5(
        vcf_file,
        h5_file_path,
        fields="*",
        exclude_fields=["calldata/GQ"],
        overwrite=True,
    )
Beispiel #4
0
def sim_load_vcf_to_h5(vcf_path, h5_path):
    """Convert the VCF at ``vcf_path`` into an HDF5 file at ``h5_path``.

    All fields are converted and an existing ``h5_path`` is overwritten.

    Args:
        vcf_path: Path to the input VCF file.
        h5_path: Path where the HDF5 file is written.
    """
    # Data source:
    #   ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/supporting/hd_genotype_chip/
    # Reference: http://alimanfoo.github.io/2016/06/10/scikit-allel-tour.html
    # Example: http://alimanfoo.github.io/2017/06/14/read-vcf.html
    # Example arguments:
    #   vcf_path = 'C://Users//raque//Documents//GitHub//ParKCa//data_s//ALL.chip.omni_broad_sanger_combined.20140818.snps.genotypes.vcf.gz'
    #   h5_path = 'data_s//ALL.chip.omni_broad_sanger_combined.20140818.snps.genotypes.h5'
    allel.vcf_to_hdf5(
        input=vcf_path,
        output=h5_path,
        fields='*',
        overwrite=True,
    )
Beispiel #5
0
def loadvcf(vcfFile):
    """Load a VCF with scikit-allel, using an HDF5 file next to it as a cache.

    The cache path is ``vcfFile`` with a trailing ``.vcf`` or ``.vcf.gz``
    replaced by ``.h5``; for any other name ``.h5`` is appended.

    Args:
        vcfFile: Path to the VCF file, as a string.

    Returns:
        An open, read-only ``h5py.File`` when the cache already exists.
        Otherwise the value returned by ``allel.read_vcf``; in that case
        the cache file is also written for later calls.
    """
    print("loading vcf file...")
    print("using scitkit allele version:", allel.__version__)

    # Remove the real suffix only. str.strip("vcf") strips the *characters*
    # v, c and f from both ends, which turned "chr1.vcf" into "hr1.h5".
    stem = vcfFile
    for suffix in (".vcf.gz", ".vcf"):
        if vcfFile.endswith(suffix):
            stem = vcfFile[:-len(suffix)]
            break
    h5 = stem + ".h5"

    if os.path.isfile(h5):
        callset = h5py.File(h5, 'r')
    else:
        callset = allel.read_vcf(vcfFile)
        print("creating h5 for faster loading")
        allel.vcf_to_hdf5(vcfFile, h5)
    return callset
Beispiel #6
0
def loadvcf(vcfFile):
    """Load a VCF with scikit-allel, using an HDF5 file next to it as a cache.

    The cache path is ``vcfFile`` with a trailing ``.vcf`` or ``.vcf.gz``
    replaced by ``.h5``; for any other name ``.h5`` is appended.

    Args:
        vcfFile: Path to the VCF file, as a string.

    Returns:
        An open, read-only ``h5py.File`` when the cache already exists.
        Otherwise the value returned by ``allel.read_vcf``; in that case
        the cache file is also written for later calls.
    """
    print("loading vcf file...")
    print("using scitkit allele version:", allel.__version__)

    # Remove the real suffix only. str.strip("vcf") strips the *characters*
    # v, c and f from both ends, which turned "chr1.vcf" into "hr1.h5".
    stem = vcfFile
    for suffix in (".vcf.gz", ".vcf"):
        if vcfFile.endswith(suffix):
            stem = vcfFile[:-len(suffix)]
            break
    h5 = stem + ".h5"

    if os.path.isfile(h5):
        callset = h5py.File(h5, 'r')
    else:
        callset = allel.read_vcf(vcfFile)
        print("creating h5 for faster loading")
        allel.vcf_to_hdf5(vcfFile, h5)
    return callset
Beispiel #7
0
def makeh5fromvcf(vcfin, chromlist):
    """Write one HDF5 file per region of ``chromlist`` from a single VCF.

    For every entry ``c`` the output is named after ``vcfin`` with a
    trailing ``vcf`` replaced by ``<c>.h5`` (``calls.vcf`` and ``chr1``
    give ``calls.chr1.h5``). ``c`` is also passed to scikit-allel as the
    ``region`` to extract.

    Args:
        vcfin: Path to the input VCF file, as a string.
        chromlist: Iterable of region names, one output file each.

    Returns:
        None.
    """
    # Built once: the field selection does not depend on the region.
    fieldsfromvcf = ['samples', 'calldata/GQ', 'variants/ALT',
                     'variants/REF', 'variants/QUAL', 'variants/CHROM',
                     'variants/POS', 'variants/AF', 'variants/AB',
                     'variants/MQM', 'variants/DP', 'calldata/DP',
                     'calldata/AD', 'calldata/GT']

    # Drop only a literal trailing "vcf". str.rstrip("vcf") removes any
    # trailing run of the characters v, c and f, so it can eat part of the
    # file name itself.
    prefix = vcfin[:-len("vcf")] if vcfin.endswith("vcf") else vcfin

    for c in chromlist:
        h5out = "{}{}.h5".format(prefix, c)
        allel.vcf_to_hdf5(vcfin, h5out, fields=fieldsfromvcf,
                          types={'calldata/GQ': 'float32'}, region=c)
    return None
Beispiel #8
0
def convertVCFToH5(vcfFileName):
    """Convert a VCF file to HDF5 unless the HDF5 file already exists.

    The output lives in the same directory as the input and is named after
    the part of the input's *file name* before its first dot
    (``data/sample.vcf.gz`` -> ``data/sample.h5``). The saved data can be
    read back with the h5py library.

    Args:
        vcfFileName: Path to the input VCF file, as a string.
    """
    vcfPath = Path(vcfFileName)

    # Split the file name only, never the directory part. Splitting the
    # whole path on '.' broke inputs such as './data/x.vcf' or any
    # directory whose name contains a dot.
    baseName = vcfPath.name.split('.')[0]
    h5File = vcfPath.parent / (baseName + '.h5')
    h5FileName = str(h5File)

    if h5File.is_file():
        print(' ')
    else:
        print('Convertion')
        allel.vcf_to_hdf5(
            vcfFileName, h5FileName, fields='*', overwrite=True
        )
Beispiel #9
0
def vcf_to_hdf5(vcf_path: str, hdf5_path: str):
    """Convert a VCF or VCF.gz file to a fresh HDF5 file.

    An existing file at ``hdf5_path`` is deleted before the conversion, so
    the result never contains data from an earlier run.

    Parameters:
        vcf_path (str): Input path to the VCF file.
        hdf5_path (str): Output path to the HDF5 file.

    """
    start_msg = "Converting VCF file '{vcf_path}'".format(vcf_path=vcf_path)
    logger.debug(start_msg)

    output_already_there = os.path.exists(hdf5_path)
    if output_already_there:
        removal_msg = "File '{hdf5_path}' already exists. I will remove it.".format(
            hdf5_path=hdf5_path
        )
        logger.debug(removal_msg)
        os.remove(hdf5_path)

    # The stale file was removed above, so overwrite can stay disabled.
    allel.vcf_to_hdf5(vcf_path, hdf5_path, fields='*', overwrite=False)

    done_msg = "HDF5 file stored in '{hdf5_path}'".format(hdf5_path=hdf5_path)
    logger.debug(done_msg)
Beispiel #10
0
def makeh5fromvcf(vcfin, altnum, hf5):
    """Convert ``vcfin`` to ``<vcfin>.h5`` unless ``hf5`` is truthy.

    Args:
        vcfin: Path to the input VCF file; the output name is this path
            with ``.h5`` appended.
        altnum: Not used by this function.
            NOTE(review): the conversion always passes ``alt_number=2``;
            confirm whether ``altnum`` was meant to be forwarded.
        hf5: When truthy, the conversion is skipped.

    Returns:
        None.
    """
    # A truthy flag means there is nothing to convert.
    if hf5:
        return None

    wanted_fields = [
        'samples', 'calldata/GQ', 'variants/ALT',
        'variants/REF', 'variants/QUAL', 'variants/CHROM',
        'variants/POS', 'variants/AF', 'variants/AB',
        'variants/MQM', 'variants/DP', 'calldata/DP',
        'calldata/AD', 'calldata/GT',
    ]
    target = "{}.h5".format(vcfin)
    allel.vcf_to_hdf5(
        vcfin,
        target,
        fields=wanted_fields,
        types={'calldata/GQ': 'float32'},
        alt_number=2,
    )
    return None
Beispiel #11
0
def convert_vcf(vcf_filename, h5_filename):
    """Convert ``vcf_filename`` to an HDF5 file at ``h5_filename``.

    Only CHROM, GT (genotypes) and POS (SNP positions) are saved.
    See: https://scikit-allel.readthedocs.io/en/stable/io.html
    """
    kept_fields = ['CHROM', 'GT', 'POS']
    allel.vcf_to_hdf5(vcf_filename, h5_filename, fields=kept_fields)
Beispiel #12
0
def create_hdf5_file(vcf_path, hdf5_file_name):
    """Convert a VCF to HDF5 and make sure DP and GQ call data are present.

    After the conversion the file is reopened read/write and checked:

    * DP: if ``calldata/DP`` is absent, or its maximum is -1 (which this
      code treats as "no value anywhere"), DP is rebuilt as ``RR + VR``
      and stored in ``calldata/DP``. If the first row of RR has no
      non-zero entry the rebuild is skipped and only a notice is printed.
    * GQ: if the maximum of ``calldata/GQ`` is -1, GQ is rebuilt from PL
      with ``calc_gq``. If that also yields a maximum of -1 the process
      exits with status 1.

    Args:
        vcf_path: Path to the input VCF file.
        hdf5_file_name: Path of the HDF5 file to write; overwritten if it
            already exists.

    Raises:
        SystemExit: With status 1 when GQ cannot be rebuilt from PL.
    """
    print("converting vcf into hdf5 file anc_panel.h5")
    allel.vcf_to_hdf5(vcf_path,
                      hdf5_file_name,
                      fields=[
                          'samples', 'calldata/GT', 'calldata/GQ',
                          'calldata/PL', 'calldata/DP', 'calldata/RR',
                          'calldata/VR', 'variants/POS', 'variants/CHROM'
                      ],
                      overwrite=True,
                      log=sys.stdout)

    print(
        "performing formatting tests of hdf5 file, is all the data required there?"
    )

    # The context manager closes the file on every exit path, including
    # exceptions and the sys.exit() below.
    with h5py.File(hdf5_file_name, mode='r+') as local_callset:
        has_dp = 'DP' in local_callset['calldata']
        dp_unusable = (
            not has_dp or np.amax(local_callset['calldata/DP'][:]) == -1
        )

        if dp_unusable:
            print("Found no DP entries, reconstructing DP from RR and VR\n")
            rr = local_callset['calldata/RR'][:]

            # Only the first row of RR is inspected to decide whether the
            # read counts carry any information.
            if not np.any(rr[0]):
                print("It appears that DP information can not be reconstructed\n")
                print("Discarding the option to filter for sufficient DP\n")
            else:
                vr = local_callset['calldata/VR'][:]
                new_dp = rr + vr
                if np.amax(new_dp) == -1:
                    print("Failed to reconstruct DP entries from RR and VR\n")
                    print("Please reformat your VCF to include DP\n")

                if has_dp:
                    local_callset['calldata/DP'][...] = new_dp
                else:
                    # Indexing a missing dataset raised KeyError here, so
                    # create it when the conversion did not.
                    local_callset.create_dataset('calldata/DP', data=new_dp)

        gq = local_callset['calldata/GQ']

        # Compare the maximum with -1. The previous form,
        # np.amax(gq[:] == -1), was true as soon as a single entry was -1
        # and then replaced every GQ value, valid ones included.
        if np.amax(gq[:]) == -1:
            print("Found no GQ entries, reconstructing GQ from PL\n")
            pl = local_callset['calldata/PL']

            new_gq = calc_gq(pl[:])
            if np.amax(new_gq[:]) == -1:
                print("Failed to reconstruct GQ entries from PL\n")
                print("Please reformat your VCF to include GQ")
                print("exiting\n")
                sys.exit(1)

            gq[...] = new_gq[:]

    print("Data is stored in hdf5 container and verified")
Beispiel #13
0
def vcf_to_hdf5(in_path, out_path, path_vcf100g=""):
    """Convert the full VCF at ``in_path`` into a gzip-compressed HDF5 file.

    Args:
        in_path: Path to the input VCF file.
        out_path: Path where the HDF5 file is written.
        path_vcf100g: Accepted but not used by this function.
    """
    conversion_args = {
        "input": in_path,
        "output": out_path,
        "compression": "gzip",
    }
    allel.vcf_to_hdf5(**conversion_args)
    print("Finished Transforming VCF to HDF5.")