def __init__(self, fname="", in_memory=False): self.in_memory = in_memory self.h5 = None self.caches = set() if fname: self.load(fname) import multiprocessing tbl.set_blosc_max_threads(multiprocessing.cpu_count()) # TODO: figure out why BLOSC is still single threaded?
def _read_hdf5(filepath, branches, partial_load=None):
    import tables
    tables.set_blosc_max_threads(4)
    with tables.open_file(filepath) as f:
        # read each node fully into memory so the arrays stay valid after the file is closed
        outputs = {k: getattr(f.root, k)[:] for k in branches}
        if partial_load is not None and partial_load != (0, 1):
            # np.trunc returns floats; cast to int so the values can be used as slice bounds
            start, stop = np.trunc(
                np.asfarray(partial_load) * len(outputs[branches[0]])).astype(int)
            for k, v in outputs.items():
                outputs[k] = v[start:stop]
    return outputs
def _read_hdf5(filepath, branches, load_range=None):
    import math
    import tables
    tables.set_blosc_max_threads(4)
    with tables.open_file(filepath) as f:
        outputs = {k: getattr(f.root, k)[:] for k in branches}
    if load_range is not None:
        start = math.trunc(load_range[0] * len(outputs[branches[0]]))
        stop = max(start + 1, math.trunc(load_range[1] * len(outputs[branches[0]])))
        for k, v in outputs.items():
            outputs[k] = v[start:stop]
    return outputs
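A brief usage sketch for the corrected variant above. The file name, branch names, and fraction range are hypothetical; the sketch only assumes that the branches are stored as root-level arrays:

import numpy as np
import tables

# Build a tiny file with two root-level arrays so the call below has something to read.
with tables.open_file("demo.h5", "w") as f:          # hypothetical file name
    f.create_array(f.root, "jet_pt", np.arange(100.0))
    f.create_array(f.root, "jet_eta", np.linspace(-2.5, 2.5, 100))

# Read the full nodes into memory, then keep only the middle half of the rows.
out = _read_hdf5("demo.h5", ["jet_pt", "jet_eta"], load_range=(0.25, 0.75))
print(len(out["jet_pt"]))   # -> 50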
    btsettl='bt-settl.lowres.grid.fits'
    # elodie31 = 'Elodie_v3.1.grid.fits'
)

# Make sure the configuration is coherent for the python installation
try:
    import numexpr
    if not __USE_NUMEXPR__:
        numexpr.set_num_threads(1)
        numexpr.set_vml_num_threads(1)
    else:
        numexpr.set_num_threads(__NTHREADS__)
        numexpr.set_vml_num_threads(__NTHREADS__)
except ImportError:
    __USE_NUMEXPR__ = False

try:
    import tables
    tables.parameters.MAX_NUMEXPR_THREADS = __NTHREADS__
    tables.parameters.MAX_BLOSC_THREADS = __NTHREADS__
    tables.set_blosc_max_threads(__NTHREADS__)
except ImportError:
    pass


def printConfig():
    print("""
    ============ BEAST default configuration ===========
    * Including C-code during computations: %s
    * Parallel processing using %d threads
    """ % (__WITH_C_LIBS__, __NTHREADS__))
        image[slice_number] = slope * image[slice_number].astype(np.float64)
        image[slice_number] = image[slice_number].astype(np.int16)
        image[slice_number] += np.int16(intercept)

    return np.array(image, dtype=np.int16)


labelsDF = pandas.read_csv('/data/datasets/lung/stage1_labels.csv', sep=',')
labelsDF.columns = ['uuid', 'cancer']
print labelsDF.columns

if __name__ == '__main__':
    tables.set_blosc_max_threads(4)
    #filters = tables.Filters(complevel=1, complib='blosc:lz4')  # 7.7sec / 1.2 GB (14 sec 1015MB if precision is reduced) 140s 3.7GB
    filters = tables.Filters(complevel=5, complib='blosc:snappy')
    DB = tables.open_file(OUTPUT_FOLDER + 'segmented.h5', mode='w', filters=None)
    #images = DB.create_earray(DB.root, 'resampled', atom=tables.Int16Atom(shape=RESAMPLED_IMG_SHAPE), shape=(0,), expectedrows=len(file_list), filters=filters)
    images = DB.create_carray(DB.root, 'resampled', atom=tables.Int16Atom(shape=RESAMPLED_IMG_SHAPE), shape=(len(patients),), filters=filters)
    imageDF = pandas.DataFrame()
    #first_patient = load_scan(INPUT_FOLDER + patients[0])
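A hedged sketch of how the pre-allocated carray above would typically be filled, one resampled volume per patient row. segment_patient is a placeholder and not a function from the original script; patients, INPUT_FOLDER and RESAMPLED_IMG_SHAPE are assumed to be defined as in the excerpt:

    # Hypothetical fill loop for the 'resampled' carray created above: each row holds one
    # int16 volume of shape RESAMPLED_IMG_SHAPE, written by plain index assignment.
    for i, patient in enumerate(patients):
        volume = segment_patient(INPUT_FOLDER + patient)   # placeholder for the real pipeline
        images[i] = volume.astype(np.int16)                # must match RESAMPLED_IMG_SHAPE
    DB.flush()
    DB.close()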
def create_from_gz(self, path, tree_path="", regular_flushes=10000, blosc_max_threads=8, complevel=0):
    """
    Converts a raw, gzip compressed MAF file to HDF5 format.

    :param path: path to maf.gz file
    :param regular_flushes: number of MAF blocks to process before pushing data to disk by PyTables.table.flush()
    :param blosc_max_threads: number of threads for parallel (de-)compression by BLOSC (currently has no effect?)
    """
    tbl.set_blosc_max_threads(blosc_max_threads)  # TODO: figure out why BLOSC is still single threaded?

    dirname, basename = os.path.split(path)
    name_parts = basename.split('.')[:-1]
    h5path = os.path.join(dirname, ".".join(name_parts + ['hdf5']))

    self.logger = logging.getLogger("MAFBlockDB({0})".format(h5path))
    self.logger.info("creating hdf5 table '{0}' from '{1}'".format(h5path, path))

    filters = tbl.Filters(complevel=complevel, complib='blosc')
    self.h5 = tbl.open_file(h5path, mode="w", title="MAF {0}".format(path), filters=filters)

    names_table = self.h5.create_vlarray(self.h5.root, 'species_names', atom=tbl.VLStringAtom(),
                                         title="all species names mentioned in the MAF file in the correct order")

    if not tree_path:
        tree_path = find_tree(path)
    self.species_names = species_list_from_tree(tree_path)
    n_species = len(self.species_names)

    self.species_index = {}
    for i, species in enumerate(self.species_names):
        self.species_index[species] = i
        names_table.append(species.encode('ascii'))
    names_table.flush()
    self.logger.debug("stored {0} species names".format(n_species))

    seqs = self.h5.create_vlarray(self.h5.root, 'seqs', atom=tbl.VLStringAtom(),
                                  title="all sequences in all MAF blocks", filters=filters)

    scores = self.h5.create_earray(self.h5.root, 'scores', atom=tbl.Float32Atom(), shape=(0,),
                                   title="alignment scores for each block", filters=filters, expectedrows=1000000)

    self.h5.create_group(self.h5.root, 'coords')
    coords_tables = {}
    coords_curs = {}
    for species in self.species_names:
        table = self.h5.create_table(self.h5.root.coords, species, MAFCoords,
                                     "coordinates for species '{0}'".format(species), expectedrows=1000000)
        coords_tables[species] = table
        coords_curs[species] = table.row
        table.flush()

    #pads_tab
    #pads = self.h5.create_table(self.h5.root, 'pads', MAFPads, "block padding info for each species", expectedrows=1000000)

    coord_rows_vector = self.h5.create_earray(self.h5.root, 'coord_rows_vector', atom=tbl.Int32Col(),
                                              shape=(0, n_species),
                                              title="vector with row numbers of coords record for each species",
                                              filters=filters, expectedrows=1000000)

    seq_rows_vector = self.h5.create_earray(self.h5.root, 'seq_rows_vector', atom=tbl.Int32Col(),
                                            shape=(0, n_species),
                                            title="vector with row numbers of sequences for each species",
                                            filters=filters, expectedrows=1000000)

    #pad_rows_vector = self.h5.create_earray(self.h5.root, 'pad_rows_vector', atom=tbl.Int64Col(shape=n_species), title="vector with row numbers of pads record for each species", filters=filters, expectedrows=1000000)

    T0 = time()
    T_last = T0
    curr_coords = {}
    curr_seqs = {}

    from gzip import GzipFile
    for maf_line in GzipFile(path):
        if not maf_line.strip():
            continue

        if maf_line.startswith('a'):
            scores.append([(float(maf_line.split('=')[1]))])
            if curr_coords:
                cvec = np.array([curr_coords.get(species, -1) for species in self.species_names])
                svec = np.array([curr_seqs.get(species, -1) for species in self.species_names])
                coord_rows_vector.append([cvec])
                seq_rows_vector.append([svec])

            curr_coords = {}
            curr_seqs = {}

            if scores.nrows and (scores.nrows % regular_flushes == 0):
                self.flush()
                T = time()
                dT = T - T_last
                T_last = T
                kbps = (regular_flushes / 1000.) / dT
                self.logger.debug("processed {0:.0f}k MAF blocks ({1:.1f}k blocks per second)".format(
                    scores.nrows / 1000., kbps))

        elif maf_line.startswith('s'):
            parts = re.split(r'\s+', maf_line)
            loc, start, size, strand, total, seq = parts[1:7]
            species, chrom = loc.split('.', 1)

            start = int(start)
            size = int(size)
            total = int(total)
            if strand == '+':
                end = start + size
            else:
                start, end = total - (start + size), total - start
                # if you think this is sick, go tell the evil master MAF and his
                # sidekick Dr. minus, the inventor of the minus strand, to their
                # faces. ;)

            curr_coords[species] = coords_tables[species].nrows
            n = coords_tables[species].nrows
            coords_tables[species].append([(coord_rows_vector.nrows, chrom, start, end, (strand == '-'))])
            assert coords_tables[species].nrows == n + 1  # need flush??

            # store the alignment row in the VLStringArray.
            # the link is through keeping the seqs current row number
            curr_seqs[species] = seqs.nrows
            seqs.append(seq.encode('ascii'))

        elif maf_line.startswith('i'):
            continue
            #parts = re.split(r'\s+', maf_line)
            #loc, pre_code, pre_num, post_code, post_num = parts[1:6]
            #species, chrom = loc.split('.', 1)
            #pads = pads_lookup[species]
            #pads.append([(maf_block_id, pre_code, pre_num, post_code, post_num)])

        elif maf_line.startswith('e'):
            continue  # currently ignored

        else:
            print "ignoring unknown MAF line '{0}'".format(maf_line.strip())

    if curr_coords:
        cvec = np.array([curr_coords.get(species, -1) for species in self.species_names])
        svec = np.array([curr_seqs.get(species, -1) for species in self.species_names])
        coord_rows_vector.append([cvec])
        seq_rows_vector.append([svec])

    self.logger.info("done processing {0} MAF blocks in {1:.1f}sec.".format(scores.nrows, (time() - T0)))
    self.build_indices()
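A worked illustration of the minus-strand flip in the 's' branch above, with hypothetical numbers: MAF stores start and size relative to the aligned strand, while the coords table stores forward-strand coordinates.

# Hypothetical values for one 's' line on the minus strand.
start, size, total = 10, 5, 100           # MAF: offset 10, length 5, chromosome length 100
strand = '-'
if strand == '+':
    end = start + size
else:
    start, end = total - (start + size), total - start
print(start, end)                          # -> 85 90, i.e. forward-strand coordinates of the same 5 bases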
numcodecs.blosc.set_nthreads(nthreads)
if persistent:
    z = zarr.open(fname_zarr, mode='w', shape=shape, chunks=chunkshape, dtype=dtype, compressor=compressor)
else:
    z = zarr.empty(shape=shape, chunks=chunkshape, dtype=dtype, compressor=compressor)
z[:] = content
zratio = z.nbytes / z.nbytes_stored
if persistent:
    del z
t1 = time()
print("Time for filling array (zarr): %.3fs ; CRatio: %.1fx" % ((t1 - t0), zratio))

# Create and fill a hdf5 array
t0 = time()
filters = tables.Filters(complevel=clevel, complib="blosc:%s" % cname, shuffle=True)
tables.set_blosc_max_threads(nthreads)
if persistent:
    h5f = tables.open_file(fname_h5, 'w')
else:
    h5f = tables.open_file(fname_h5, 'w', driver='H5FD_CORE', driver_core_backing_store=0)
h5ca = h5f.create_carray(h5f.root, 'carray', filters=filters, chunkshape=chunkshape, obj=content)
h5f.flush()
h5ratio = h5ca.size_in_memory / h5ca.size_on_disk
if persistent:
    h5f.close()
t1 = time()
print("Time for filling array (hdf5): %.3fs ; CRatio: %.1fx" % ((t1 - t0), h5ratio))

# Check that the contents are the same
t0 = time()
if persistent:
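A hedged sketch of the free variables the benchmark excerpt above expects to be defined earlier in the script; the concrete sizes, codec and thread count are illustrative only:

import numpy as np
import tables
import zarr
import numcodecs
from time import time

# Illustrative benchmark parameters; the original script defines its own values elsewhere.
shape = (1000, 1000)
chunkshape = (100, 1000)
dtype = np.float64
content = np.arange(np.prod(shape), dtype=dtype).reshape(shape)
nthreads = 4
clevel, cname = 5, "lz4"
compressor = numcodecs.Blosc(cname=cname, clevel=clevel, shuffle=numcodecs.Blosc.SHUFFLE)
persistent = False
fname_zarr, fname_h5 = "bench.zarr", "bench.h5"
t0 = time()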
import hashlib
import logging
from functools import partial
from os import cpu_count
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union

import numpy as np
import pandas as pd
import tables

Path_s = Union[Path, str]
To_save = Optional[Dict[str, Any]]
List_str = Optional[List[str]]

tables.set_blosc_max_threads(cpu_count() // 2)


def vars_to_dict(obj: Any, vars_: List[str]) -> Dict[str, Any]:
    """Get instance variables using names in `vars_` from `obj`.

    Args:
        obj (Any)
        vars_ (List[str]): List of instance variable names.

    Returns:
        Dict[str, Any]: Instance variable name and its object.
    """
    out = dict()
    for var in vars_:
        try:
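The excerpt cuts off inside vars_to_dict; a hedged usage sketch, assuming the remaining body fetches each named attribute (e.g. via getattr inside the try block) as the docstring describes:

# Hypothetical usage, consistent with the docstring: collect selected instance
# variables into a plain dict (e.g. for hashing or saving alongside an HDF5 file).
class Config:
    def __init__(self):
        self.lr = 1e-3
        self.batch_size = 32

cfg = Config()
print(vars_to_dict(cfg, ["lr", "batch_size"]))   # expected: {'lr': 0.001, 'batch_size': 32}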
def get_training_array(tensor_fn, var_fn, bed_fn, bin_fn, shuffle=True, is_allow_duplicate_chr_pos=True,
                       chunk_id=None, chunk_num=None, platform='ont', pileup=False,
                       maximum_non_variant_ratio=None, candidate_details_fn_prefix=None):
    """
    Generate a training array for training. Pytables with blosc:lz4hc is used here for extremely fast compression
    and decompression, which can meet the requirement of GPU utilization: lz4hc decompression speeds up training
    array decompression 4~5x compared with the tensorflow tfrecord file format, and GPU utilization can reach over
    85% with only 10G memory.
    tensor_fn: string-format tensor acquired from CreateTensorPileup or CreateTensorFullAlign, including contig
        name, position, tensor matrix and alternative information.
    var_fn: simplified variant (vcf) format from GetTruths, which includes contig name, position, reference base,
        alternative base and genotype.
    bin_fn: pytables-format output bin file name.
    shuffle: whether to apply index shuffling when generating training data, default True, which promotes
        robustness.
    is_allow_duplicate_chr_pos: whether to allow duplicate positions when training; if downsampled data exists,
        lower depths add a random prefix character.
    chunk_id: specific chunk id, works with total chunk_num for parallel execution. All tensor files with the same
        prefix will be merged.
    chunk_num: total chunk number for parallel execution. Each chunk refers to a smaller reference region.
    platform: platform for tensor shape; ont gives a larger maximum depth compared with pb and illumina.
    pileup: whether in pileup mode. Defines two calling modes, pileup or full alignment.
    maximum_non_variant_ratio: maximum non-variant ratio for training. We always expect to use more non-variant
        data, but it greatly increases training time, especially on ont data, so 1:1 or 1:2 is usually used for
        variant candidate : non-variant candidate.
    candidate_details_fn_prefix: a counter to calculate total variants and non-variants from the information in
        the alternative file.
    """
    tree = bed_tree_from(bed_file_path=bed_fn)
    is_tree_empty = len(tree.keys()) == 0
    Y_true_var, miss_variant_set, truth_alt_dict = variant_map_from(var_fn, tree, is_tree_empty)
    Y = copy.deepcopy(Y_true_var)

    global param
    float_type = 'int32'
    if pileup:
        import shared.param_p as param
    else:
        import shared.param_f as param
        float_type = 'int8'

    import tables
    FILTERS = tables.Filters(complib='blosc:lz4hc', complevel=5)
    tensor_shape = param.ont_input_shape if platform == 'ont' else param.input_shape

    subprocess_list = []
    if tensor_fn == 'PIPE':
        subprocess_list.append(sys.stdin)
    elif os.path.exists(tensor_fn):
        subprocess_list.append(subprocess_popen(shlex.split("{} -fdc {}".format(param.zstd, tensor_fn))).stdout)
    # select all matching prefixes if the file path does not exist
    else:
        tensor_fn = tensor_fn.split('/')
        directry, file_prefix = '/'.join(tensor_fn[:-1]), tensor_fn[-1]
        all_file_name = []
        for file_name in os.listdir(directry):
            if file_name.startswith(file_prefix + '_') or file_name.startswith(
                    file_prefix + '.'):  # require '_' or '.' after the prefix to avoid matching other prefixes (e.g. chr*)
                all_file_name.append(file_name)
        all_file_name = sorted(all_file_name)
        if chunk_id is not None:
            chunk_size = len(all_file_name) // chunk_num if len(all_file_name) % chunk_num == 0 else len(
                all_file_name) // chunk_num + 1
            chunk_start = chunk_size * chunk_id
            chunk_end = chunk_start + chunk_size
            all_file_name = all_file_name[chunk_start:chunk_end]
        if not len(all_file_name):
            print("[INFO] chunk_id exceed total file number, skip chunk", file=sys.stderr)
            return 0
        for file_name in all_file_name:
            subprocess_list.append(
                subprocess_popen(shlex.split("{} -fdc {}".format(param.zstd, os.path.join(directry, file_name)))).stdout)

    tables.set_blosc_max_threads(64)
    int_atom = tables.Atom.from_dtype(np.dtype(float_type))
    string_atom = tables.StringAtom(itemsize=param.no_of_positions + 50)
    long_string_atom = tables.StringAtom(itemsize=5000)  # max alt_info length
    table_file = tables.open_file(bin_fn, mode='w', filters=FILTERS)
    table_file.create_earray(where='/', name='position_matrix', atom=int_atom, shape=[0] + tensor_shape,
                             filters=FILTERS)
    table_file.create_earray(where='/', name='position', atom=string_atom, shape=(0, 1), filters=FILTERS)
    table_file.create_earray(where='/', name='label', atom=int_atom, shape=(0, param.label_size), filters=FILTERS)
    table_file.create_earray(where='/', name='alt_info', atom=long_string_atom, shape=(0, 1), filters=FILTERS)

    table_dict = update_table_dict()

    # generator to avoid high memory occupation
    bin_reader_generator = partial(bin_reader_generator_from,
                                   Y_true_var=Y_true_var,
                                   Y=Y,
                                   is_tree_empty=is_tree_empty,
                                   tree=tree,
                                   miss_variant_set=miss_variant_set,
                                   truth_alt_dict=truth_alt_dict,
                                   is_allow_duplicate_chr_pos=is_allow_duplicate_chr_pos,
                                   maximum_non_variant_ratio=maximum_non_variant_ratio)

    total_compressed = 0
    for fin in subprocess_list:
        bin_g = bin_reader_generator(tensor_fn=fin)

        completed = False
        while not completed:
            try:
                X, total, completed = next(bin_g)
            except StopIteration:
                completed = True
            if X is None or not len(X):
                break
            all_chr_pos = sorted(X.keys())
            if shuffle == True:
                np.random.shuffle(all_chr_pos)
            for key in all_chr_pos:
                string, alt_info, seq = X[key]
                del X[key]
                label = None
                if key in Y:
                    label = Y[key]
                    pos = key + ':' + seq
                    if not is_allow_duplicate_chr_pos:
                        del Y[key]
                elif is_allow_duplicate_chr_pos:
                    tmp_key = key[1:]
                    label = Y[tmp_key]
                    pos = tmp_key + ':' + seq
                if label is None:
                    print(key)
                    continue
                total_compressed = write_table_dict(table_dict, string, label, pos, total_compressed, alt_info,
                                                    tensor_shape, pileup)

                if total_compressed % 500 == 0 and total_compressed > 0:
                    table_dict = write_table_file(table_file, table_dict, tensor_shape, param.label_size, float_type)

                if total_compressed % 50000 == 0:
                    print("[INFO] Compressed %d tensor" % (total_compressed), file=sys.stderr)
        fin.close()

    if total_compressed % 500 != 0 and total_compressed > 0:
        table_dict = write_table_file(table_file, table_dict, tensor_shape, param.label_size, float_type)

    table_file.close()
    print("[INFO] Compressed %d/%d tensor" % (total_compressed, total), file=sys.stderr)
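A hedged read-back sketch for a bin file produced by get_training_array; the batch size is illustrative, but the node names match the EArrays created above:

import tables

# Illustrative: iterate the written tensors in fixed-size batches for training.
batch_size = 200                                   # hypothetical
with tables.open_file(bin_fn, mode='r') as f:      # bin_fn as written above
    n = len(f.root.label)
    for start in range(0, n, batch_size):
        X = f.root.position_matrix[start:start + batch_size]
        y = f.root.label[start:start + batch_size]
        alt = f.root.alt_info[start:start + batch_size]
        # ...feed (X, y) to the model here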
def Run(args):
    in_fn_list = args.in_fn
    out_fn = args.out_fn
    platform = args.platform
    pileup = args.pileup

    global param
    float_type = 'int32'
    if pileup:
        import shared.param_p as param
    else:
        import shared.param_f as param
        float_type = 'int8'

    tensor_shape = param.ont_input_shape if platform == 'ont' else param.input_shape

    # select all match prefix if file path not exists
    tables.set_blosc_max_threads(64)
    int_atom = tables.Atom.from_dtype(np.dtype(float_type))
    string_atom = tables.StringAtom(itemsize=param.no_of_positions + 50)
    long_string_atom = tables.StringAtom(itemsize=5000)  # max alt_info length
    table_file = tables.open_file(out_fn, mode='w', filters=FILTERS)
    table_file.create_earray(where='/', name='position_matrix', atom=int_atom, shape=[0] + tensor_shape,
                             filters=FILTERS)
    table_file.create_earray(where='/', name='position', atom=string_atom, shape=(0, 1), filters=FILTERS)
    table_file.create_earray(where='/', name='label', atom=int_atom, shape=(0, param.label_size), filters=FILTERS)
    table_file.create_earray(where='/', name='alt_info', atom=long_string_atom, shape=(0, 1), filters=FILTERS)

    table_dict = utils.update_table_dict()
    total_compressed = 0
    for f in in_fn_list:
        print("[INFO] Merging file {}".format(f))
        fi = tables.open_file(f, mode='r')
        assert (len(fi.root.label) == len(fi.root.position) == len(fi.root.position_matrix) == len(fi.root.alt_info))
        for index in range(len(fi.root.label)):
            table_dict['label'].append(fi.root.label[index])
            table_dict['position'].append(fi.root.position[index])
            table_dict['position_matrix'].append(fi.root.position_matrix[index])
            table_dict['alt_info'].append(fi.root.alt_info[index])
            total_compressed += 1

            if total_compressed % 500 == 0 and total_compressed > 0:
                table_dict = utils.write_table_file(table_file, table_dict, tensor_shape, param.label_size, float_type)

            if total_compressed % 50000 == 0:
                print("[INFO] Compressed %d tensor" % (total_compressed), file=sys.stderr)
        fi.close()

    if total_compressed % 500 != 0 and total_compressed > 0:
        table_dict = utils.write_table_file(table_file, table_dict, tensor_shape, param.label_size, float_type)
        print("[INFO] Compressed %d tensor" % (total_compressed), file=sys.stderr)

    table_file.close()
import logging
import random
import numpy as np
from argparse import ArgumentParser, SUPPRESS
import tensorflow_addons as tfa
import tensorflow as tf
import tables
import os
import sys
from itertools import accumulate

import clair3.model as model_path
from shared.utils import str2bool

logging.basicConfig(format='%(message)s', level=logging.INFO)
tables.set_blosc_max_threads(512)
os.environ['NUMEXPR_MAX_THREADS'] = '64'
os.environ['NUMEXPR_NUM_THREADS'] = '8'


def get_label_task(label, label_shape_cum, task):
    if task == 0:
        return label[:label_shape_cum[task]]
    elif task == len(label_shape_cum) - 1:
        return label[label_shape_cum[task - 1]:]
    else:
        return label[label_shape_cum[task - 1]:label_shape_cum[task]]


def cal_class_weight(samples_per_cls, no_of_classes, beta=0.999):
    effective_num = 1.0 - np.power(beta, samples_per_cls)
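A small worked example of get_label_task above, with a hypothetical per-task label layout; the real label_shape used in training differs, so the numbers are illustrative only:

from itertools import accumulate
import numpy as np

label_shape = [21, 3, 3, 3]                        # hypothetical sizes of 4 output tasks
label_shape_cum = list(accumulate(label_shape))    # -> [21, 24, 27, 30]
label = np.arange(sum(label_shape))                # one concatenated label vector

print(get_label_task(label, label_shape_cum, 0).shape)   # (21,) first task
print(get_label_task(label, label_shape_cum, 3).shape)   # (3,)  last task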