def main(vcf_file):
    """Driver function: convert ``vcf_file`` to an HDF5 file next to it.

    The output path is the input path with its final extension replaced by
    ``.h5`` (``calls.vcf`` -> ``calls.h5``; ``calls.vcf.gz`` ->
    ``calls.vcf.h5``). All fields are requested and ``overwrite=True`` is
    passed to the converter.

    Parameters:
        vcf_file (str): Path to the input VCF file.
    """
    # splitext only inspects the last path component, so the directory part
    # is kept untouched. Joining dirname + '/' by hand would turn a bare
    # filename (empty dirname) into '/name.h5', i.e. the filesystem root.
    h5_file = os.path.splitext(vcf_file)[0] + '.h5'

    # Convert VCF to hdf5
    allel.vcf_to_hdf5(vcf_file, h5_file, fields='*', overwrite=True)
def vcf_to_hdf5(vcf_path, zarr_path, fields=None):
    """Convert a VCF file to HDF5, translating field names first.

    Parameters:
        vcf_path: Input VCF location; passed to the converter as ``str``.
        zarr_path: Output location; passed to the converter as ``str``.
        fields: Iterable of keys of ``VARIATION_ZARR_FIELD_MAPPING``.
            ``None`` selects ``DEF_VCF_FIELDS``.

    Raises:
        KeyError: presumably, if a field has no entry in the mapping
            (assuming the mapping is a plain dict -- confirm).
    """
    requested = DEF_VCF_FIELDS if fields is None else fields

    # Translate our field names into the names the converter expects.
    zarr_fields = []
    for name in requested:
        zarr_fields.append(VARIATION_ZARR_FIELD_MAPPING[name])

    # 'samples' is always requested, whatever the caller asked for.
    if 'samples' not in zarr_fields:
        zarr_fields.append('samples')

    allel.vcf_to_hdf5(str(vcf_path), str(zarr_path), fields=zarr_fields)
def create_h5_file(vcf_file):
    """Create the HDF5 counterpart of ``vcf_file`` unless it already exists.

    The output path is everything before the first occurrence of ``".vcf"``
    in ``vcf_file`` with ``".h5"`` appended. All fields except
    ``calldata/GQ`` are requested.

    Parameters:
        vcf_file (str): Path to the input VCF file.
    """
    # partition() yields the text before the first ".vcf" (or the whole
    # string when ".vcf" does not occur).
    stem = vcf_file.partition(".vcf")[0]
    h5_file_path = stem + ".h5"

    # Nothing to do when the output is already present.
    if check_file_existence(h5_file_path):
        return

    allel.vcf_to_hdf5(
        vcf_file,
        h5_file_path,
        fields="*",
        exclude_fields=["calldata/GQ"],
        overwrite=True,
    )
def sim_load_vcf_to_h5(vcf_path, h5_path):
    """Transform data from VCF to HDF5.

    Parameters:
        vcf_path: Path of the VCF file to read.
        h5_path: Path of the HDF5 file to write.

    Returns:
        None. The result is the new file at ``h5_path``; the converter is
        called with ``overwrite=True``.
    """
    # Data download:
    #   ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/supporting/hd_genotype_chip/
    # Reference: http://alimanfoo.github.io/2016/06/10/scikit-allel-tour.html
    # Example:   http://alimanfoo.github.io/2017/06/14/read-vcf.html
    # Example arguments:
    #   vcf_path = 'C://Users//raque//Documents//GitHub//ParKCa//data_s//ALL.chip.omni_broad_sanger_combined.20140818.snps.genotypes.vcf.gz'
    #   h5_path = 'data_s//ALL.chip.omni_broad_sanger_combined.20140818.snps.genotypes.h5'
    conversion_options = {'fields': '*', 'overwrite': True}
    allel.vcf_to_hdf5(vcf_path, h5_path, **conversion_options)
def loadvcf(vcfFile):
    """Load a VCF callset, using an HDF5 cache file when one exists.

    The cache path is ``vcfFile`` with a trailing ``vcf`` replaced by ``h5``
    (``calls.vcf`` -> ``calls.h5``); for any other ending ``.h5`` is
    appended.

    Parameters:
        vcfFile (str): Path to the VCF file.

    Returns:
        An ``h5py.File`` opened read-only when the cache file exists.
        Otherwise the result of ``allel.read_vcf(vcfFile)``; in that case
        the HDF5 cache is also written for later calls. Note the two
        branches return different object types.
    """
    print("loading vcf file...")
    print("using scitkit allele version:", allel.__version__)

    # Replace only a trailing "vcf". str.strip("vcf") treats its argument as
    # a set of characters and trims BOTH ends, so e.g. "vcf/a.vcf" would
    # become "/a." and "variants.vcf" would become "ariants.".
    if vcfFile.endswith("vcf"):
        h5 = vcfFile[:-len("vcf")] + "h5"
    else:
        h5 = vcfFile + ".h5"

    if os.path.isfile(h5):
        return h5py.File(h5, 'r')

    callset = allel.read_vcf(vcfFile)
    print("creating h5 for faster loading")
    allel.vcf_to_hdf5(vcfFile, h5)
    return callset
def loadvcf(vcfFile):
    """Read a VCF with scikit-allel, preferring an existing HDF5 cache.

    The cache file name is derived from ``vcfFile`` by swapping a trailing
    ``vcf`` for ``h5``; when the name does not end in ``vcf``, ``.h5`` is
    appended instead.

    Parameters:
        vcfFile (str): Path to the VCF file.

    Returns:
        A read-only ``h5py.File`` if the cache file is present, else the
        object returned by ``allel.read_vcf`` (the cache is then created as
        a side effect). The return type therefore depends on the branch.
    """
    print("loading vcf file...")
    print("using scitkit allele version:", allel.__version__)

    # Only the literal suffix is removed. str.strip("vcf") would remove any
    # leading/trailing 'v', 'c' or 'f' characters and corrupt paths such as
    # "vcf/sample.vcf" or "variants.vcf".
    suffix = "vcf"
    if vcfFile.endswith(suffix):
        h5 = vcfFile[:len(vcfFile) - len(suffix)] + "h5"
    else:
        h5 = vcfFile + ".h5"

    if os.path.isfile(h5):
        callset = h5py.File(h5, 'r')
    else:
        callset = allel.read_vcf(vcfFile)
        print("creating h5 for faster loading")
        allel.vcf_to_hdf5(vcfFile, h5)
    return callset
def makeh5fromvcf(vcfin, chromlist):
    """Write one HDF5 file per region of ``chromlist`` from ``vcfin``.

    Each output is named ``<prefix><region>.h5`` where ``<prefix>`` is
    ``vcfin`` without a trailing ``vcf`` (``calls.vcf`` + ``chr1`` ->
    ``calls.chr1.h5``).

    Parameters:
        vcfin (str): Path to the input VCF file.
        chromlist: Iterable of region strings, each passed as ``region=``
            to the converter.

    Returns:
        None.
    """
    # Loop-invariant, so built once.
    fieldsfromvcf = ['samples', 'calldata/GQ', 'variants/ALT',
                     'variants/REF', 'variants/QUAL', 'variants/CHROM',
                     'variants/POS', 'variants/AF', 'variants/AB',
                     'variants/MQM', 'variants/DP', 'calldata/DP',
                     'calldata/AD', 'calldata/GT']

    # Drop only the literal suffix: str.rstrip("vcf") removes any run of
    # trailing 'v'/'c'/'f' characters and can eat into the file name.
    if vcfin.endswith("vcf"):
        prefix = vcfin[:-len("vcf")]
    else:
        prefix = vcfin

    for c in chromlist:
        h5out = "{}{}.h5".format(prefix, c)
        allel.vcf_to_hdf5(vcfin, h5out, fields=fieldsfromvcf,
                          types={'calldata/GQ': 'float32'}, region=c)
    return None
def convertVCFToH5(vcfFileName):
    """Convert a VCF file to HDF5 unless the HDF5 file already exists.

    The output lives in the same directory as the input and is named after
    the part of the *file name* before its first dot, plus ``.h5``
    (``data/x.vcf.gz`` -> ``data/x.h5``).

    Parameters:
        vcfFileName (str): Path to the input VCF file.
    """
    source = Path(vcfFileName)
    # Split only the file name on '.', never the whole path: a dot in a
    # directory component ("./data/x.vcf", "run.1/x.vcf") must not truncate
    # the output path.
    stem = source.name.split('.')[0]
    h5File = source.parent / (stem + '.h5')

    if h5File.is_file():
        print(' ')
    else:
        print('Convertion')
        allel.vcf_to_hdf5(
            vcfFileName, str(h5File), fields='*', overwrite=True
        )
    # The saved data can be accessed via the h5py library, e.g.:
def vcf_to_hdf5(vcf_path: str, hdf5_path: str):
    """
    Convert a VCF (or VCF.gz) file into an HDF5 file.

    Any file already present at ``hdf5_path`` is deleted first; the
    converter is then run with all fields and ``overwrite=False``.

    Parameters:
        vcf_path (str): Input path to the VCF file.
        hdf5_path (str): Output path to the HDF5 file.
    """
    start_msg = "Converting VCF file '{vcf_path}'".format(vcf_path=vcf_path)
    logger.debug(start_msg)

    stale_output = os.path.exists(hdf5_path)
    if stale_output:
        removal_msg = "File '{hdf5_path}' already exists. I will remove it.".format(
            hdf5_path=hdf5_path
        )
        logger.debug(removal_msg)
        os.remove(hdf5_path)

    # The old file was removed above, so overwrite stays disabled.
    allel.vcf_to_hdf5(vcf_path, hdf5_path, fields='*', overwrite=False)

    done_msg = "HDF5 file stored in '{hdf5_path}'".format(hdf5_path=hdf5_path)
    logger.debug(done_msg)
def makeh5fromvcf(vcfin, altnum, hf5):
    """Write ``<vcfin>.h5`` from ``vcfin`` unless ``hf5`` is truthy.

    Parameters:
        vcfin: Path to the input VCF file; the output is ``vcfin`` with
            ``.h5`` appended.
        altnum: Not used by this function; the converter is always called
            with ``alt_number=2``.
            NOTE(review): presumably meant to feed ``alt_number`` -- confirm
            with callers before wiring it in.
        hf5: When truthy, the conversion is skipped entirely.

    Returns:
        None.
    """
    # Guard clause: a truthy flag means there is nothing to convert.
    if hf5:
        return None

    wanted_fields = [
        'samples', 'calldata/GQ', 'variants/ALT', 'variants/REF',
        'variants/QUAL', 'variants/CHROM', 'variants/POS', 'variants/AF',
        'variants/AB', 'variants/MQM', 'variants/DP', 'calldata/DP',
        'calldata/AD', 'calldata/GT',
    ]
    target = "{}.h5".format(vcfin)
    allel.vcf_to_hdf5(
        vcfin,
        target,
        fields=wanted_fields,
        types={'calldata/GQ': 'float32'},
        alt_number=2,
    )
    return None
def convert_vcf(vcf_filename, h5_filename):
    """Convert ``vcf_filename`` to an HDF5 file at ``h5_filename``.

    Only three fields are stored: CHROM, GT (genotypes) and POS (SNP
    positions).
    """
    # see: https://scikit-allel.readthedocs.io/en/stable/io.html
    kept_fields = ['CHROM', 'GT', 'POS']
    allel.vcf_to_hdf5(
        input=vcf_filename,
        output=h5_filename,
        fields=kept_fields,
    )
def create_hdf5_file(vcf_path, hdf5_file_name):
    """Convert a VCF to HDF5 and make sure DP and GQ call data are usable.

    After conversion the HDF5 file is reopened in ``r+`` mode and checked:

    * ``calldata/DP`` is rebuilt as ``RR + VR`` when it is absent or its
      maximum is -1.
    * ``calldata/GQ`` is rebuilt from ``calldata/PL`` via ``calc_gq`` when
      its maximum is -1.

    (-1 is presumably the converter's fill value for missing integers --
    confirm against the scikit-allel documentation.)

    Parameters:
        vcf_path: Path to the input VCF file.
        hdf5_file_name: Path of the HDF5 file to create (overwritten).

    Raises:
        SystemExit: with status 1 when GQ is missing and cannot be rebuilt
            from PL.
    """
    print("converting vcf into hdf5 file anc_panel.h5")
    allel.vcf_to_hdf5(
        vcf_path,
        hdf5_file_name,
        fields=[
            'samples', 'calldata/GT', 'calldata/GQ', 'calldata/PL',
            'calldata/DP', 'calldata/RR', 'calldata/VR', 'variants/POS',
            'variants/CHROM',
        ],
        overwrite=True,
        log=sys.stdout,
    )
    print(
        "performing formatting tests of hdf5 file, is all the data required there?"
    )
    # The context manager closes the file on every exit path, including the
    # SystemExit raised when GQ cannot be reconstructed.
    with h5py.File(hdf5_file_name, mode='r+') as local_callset:
        _reconstruct_dp_if_missing(local_callset)
        _reconstruct_gq_if_missing(local_callset)
    print("Data is stored in hdf5 container and verified")


def _reconstruct_dp_if_missing(callset):
    """Rebuild calldata/DP as RR + VR when DP is absent or entirely -1."""
    has_dp = 'DP' in callset['calldata']
    if has_dp and np.amax(callset['calldata/DP'][:]) != -1:
        return

    print("Found no DP entries, reconstructing DP from RR and VR\n")
    rr = callset['calldata/RR'][:]
    # Only the first record of RR is inspected to decide whether RR carries
    # any information.
    # NOTE(review): confirm that checking a single record is intended.
    if not np.any(rr[0]):
        print("It appears that DP information can not be reconstructed\n")
        print("Discarding the option to filter for sufficient DP\n")
        return

    new_dp = rr + callset['calldata/VR'][:]
    if np.amax(new_dp) == -1:
        print("Failed to reconstruct DP entries from RR and VR\n")
        print("Please reformat your VCF to include DP\n")

    # Assigning into a dataset that does not exist raises KeyError, so the
    # dataset is created when DP was absent from the file.
    if has_dp:
        callset['calldata/DP'][...] = new_dp
    else:
        callset.create_dataset('calldata/DP', data=new_dp)


def _reconstruct_gq_if_missing(callset):
    """Rebuild calldata/GQ from PL when every GQ entry is -1."""
    gq = callset['calldata/GQ']
    # Compare the maximum with -1. Taking the maximum of the element-wise
    # comparison (amax(gq == -1)) would be true as soon as ANY entry is -1
    # and would overwrite valid GQ values.
    if np.amax(gq[:]) != -1:
        return

    print("Found no GQ entries, reconstructing GQ from PL\n")
    new_gq = calc_gq(callset['calldata/PL'][:])
    if np.amax(new_gq[:]) == -1:
        print("Failed to reconstruct GQ entries from PL\n")
        print("Please reformat your VCF to include GQ")
        print("exiting\n")
        sys.exit(1)
    gq[...] = new_gq[:]
def vcf_to_hdf5(in_path, out_path, path_vcf100g=""):
    """Transform a full VCF into a full HDF5 file.

    Parameters:
        in_path: Path of the VCF file to read.
        out_path: Path of the HDF5 file to write (gzip compression).
        path_vcf100g: Accepted but not used by this function.
    """
    conversion_kwargs = dict(input=in_path, output=out_path, compression="gzip")
    allel.vcf_to_hdf5(**conversion_kwargs)
    print("Finished Transforming VCF to HDF5.")