Example #1
    def __init__(self, fname="", in_memory=False):
        self.in_memory = in_memory
        self.h5 = None
        self.caches = set()

        if fname:
            self.load(fname)
    
        import multiprocessing
        tbl.set_blosc_max_threads(multiprocessing.cpu_count()) # TODO: figure out why BLOSC is still single threaded?
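This excerpt refers to PyTables as tbl, presumably imported elsewhere in the module as import tables as tbl. A self-contained sketch of the same thread setup, for reference:

import multiprocessing
import tables as tbl  # assumed alias; the excerpt above relies on it being defined elsewhere

# let Blosc (de)compression inside PyTables use all available cores
tbl.set_blosc_max_threads(multiprocessing.cpu_count())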
Example #2
def _read_hdf5(filepath, branches, partial_load=None):
    import numpy as np
    import tables
    tables.set_blosc_max_threads(4)
    with tables.open_file(filepath) as f:
        # copy the data out while the file is still open
        outputs = {k: getattr(f.root, k)[:] for k in branches}
    if partial_load is not None and partial_load != (0, 1):
        # turn the fractional (start, stop) range into integer row indices
        start, stop = (int(i) for i in np.trunc(
            np.asarray(partial_load, dtype=float) * len(outputs[branches[0]])))
        for k, v in outputs.items():
            outputs[k] = v[start:stop]
    return outputs
Example #3
def _read_hdf5(filepath, branches, load_range=None):
    import math
    import tables
    tables.set_blosc_max_threads(4)
    with tables.open_file(filepath) as f:
        outputs = {k: getattr(f.root, k)[:] for k in branches}
    if load_range is not None:
        start = math.trunc(load_range[0] * len(outputs[branches[0]]))
        stop = max(start + 1,
                   math.trunc(load_range[1] * len(outputs[branches[0]])))
        for k, v in outputs.items():
            outputs[k] = v[start:stop]
    return outputs
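A brief usage sketch for the loader above; the file name, branch names and values are made up for illustration:

import numpy as np
import tables

# build a tiny HDF5 file so the loader has something to read (placeholder data)
with tables.open_file("demo.h5", mode="w") as f:
    f.create_array(f.root, "energy", np.arange(100, dtype=np.float32))
    f.create_array(f.root, "pt", np.linspace(0.0, 1.0, 100))

# load only the middle 50% of each branch
data = _read_hdf5("demo.h5", branches=["energy", "pt"], load_range=(0.25, 0.75))
print({k: v.shape for k, v in data.items()})  # {'energy': (50,), 'pt': (50,)}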
Example #4
    btsettl='bt-settl.lowres.grid.fits'
    # elodie31 = 'Elodie_v3.1.grid.fits'
)

# Make sure the configuration is coherent for the python installation
try:
    import numexpr
    if not __USE_NUMEXPR__:
        numexpr.set_num_threads(1)
        numexpr.set_vml_num_threads(1)
    else:
        numexpr.set_num_threads(__NTHREADS__)
        numexpr.set_vml_num_threads(__NTHREADS__)
except ImportError:
    __USE_NUMEXPR__ = False

try:
    import tables
    tables.parameters.MAX_NUMEXPR_THREADS = __NTHREADS__
    tables.parameters.MAX_BLOSC_THREADS = __NTHREADS__
    tables.set_blosc_max_threads(__NTHREADS__)
except ImportError:
    pass


def printConfig():
    print(""" ============ BEAST defaut configuration ===========
    * Including C-code during computations: %s
    * Parallel processing using %d threads
    """ % (__WITH_C_LIBS__, __NTHREADS__))
Example #5
            image[slice_number] = slope * image[slice_number].astype(
                np.float64)
            image[slice_number] = image[slice_number].astype(np.int16)

        image[slice_number] += np.int16(intercept)

    return np.array(image, dtype=np.int16)


labelsDF = pandas.read_csv('/data/datasets/lung/stage1_labels.csv', sep=',')
labelsDF.columns = ['uuid', 'cancer']
print(labelsDF.columns)

if __name__ == '__main__':

    tables.set_blosc_max_threads(4)
    #filters = tables.Filters(complevel=1, complib='blosc:lz4')      # 7.7sec / 1.2 GB   (14 sec 1015MB if precision is reduced)           140s 3.7GB
    filters = tables.Filters(complevel=5, complib='blosc:snappy')
    DB = tables.open_file(OUTPUT_FOLDER + 'segmented.h5',
                          mode='w',
                          filters=None)
    #images = DB.create_earray(DB.root, 'resampled', atom=tables.Int16Atom(shape=RESAMPLED_IMG_SHAPE), shape=(0,), expectedrows=len(file_list), filters=filters)
    images = DB.create_carray(DB.root,
                              'resampled',
                              atom=tables.Int16Atom(shape=RESAMPLED_IMG_SHAPE),
                              shape=(len(patients), ),
                              filters=filters)

    imageDF = pandas.DataFrame()

    #first_patient = load_scan(INPUT_FOLDER + patients[0])
Example #6
    def create_from_gz(self, path, tree_path="", regular_flushes=10000, blosc_max_threads=8, complevel=0):
        """
        Converts a raw, gzip compressed MAF file to HDF5 format.
        
        :param path: path to maf.gz file
        :param regular_flushes: number of MAF blocks to process before pushing
            data to disk by PyTables.table.flush()
        :param blosc_max_threads: number of threads for parallel 
            (de-)compression by BLOSC (currently has no effect?)
        """
        
        tbl.set_blosc_max_threads(blosc_max_threads) # TODO: figure out why BLOSC is still single threaded?
        
        dirname, basename = os.path.split(path)
        name_parts = basename.split('.')[:-1]
        h5path = os.path.join(dirname, ".".join( name_parts + ['hdf5'] ))
        self.logger = logging.getLogger("MAFBlockDB({0})".format(h5path))
        self.logger.info("creating hdf5 table '{0}' from '{1}'".format(h5path, path) )
        
        filters = tbl.Filters(complevel=complevel, complib='blosc')
        self.h5 = tbl.open_file(h5path, mode = "w", title = "MAF {0}".format(path), filters=filters)
        names_table = self.h5.create_vlarray(self.h5.root, 'species_names', atom = tbl.VLStringAtom(), title = "all species names mentioned in the MAF file in the correct order")

        if not tree_path:
            tree_path = find_tree(path)

        self.species_names = species_list_from_tree(tree_path)
        n_species = len(self.species_names)
        self.species_index = {}
        for i,species in enumerate(self.species_names):
            self.species_index[species] = i
            names_table.append(species.encode('ascii'))

        names_table.flush()
        self.logger.debug("stored {0} species names".format(n_species))
        
        seqs   = self.h5.create_vlarray(self.h5.root, 'seqs', atom = tbl.VLStringAtom(), title = "all sequences in all MAF blocks", filters=filters)
        scores = self.h5.create_earray(self.h5.root, 'scores', atom = tbl.Float32Atom(), shape=(0,), title = "alignment scores for each block", filters=filters, expectedrows=1000000)
        self.h5.create_group(self.h5.root, 'coords' )
        
        coords_tables = {}
        coords_curs = {}
        for species in self.species_names:
            table = self.h5.create_table(self.h5.root.coords, species, MAFCoords, "coordinates for species '{0}'".format(species), expectedrows=1000000)
            coords_tables[species] = table
            coords_curs[species] = table.row
            table.flush()

        #pads_tab
        #pads        = self.h5.create_table(self.h5.root, 'pads', MAFPads, "block padding info for each species", expectedrows=1000000)
        
        coord_rows_vector = self.h5.create_earray(self.h5.root, 'coord_rows_vector', atom = tbl.Int32Col(), shape=(0,n_species), title = "vector with row numbers of coords record for each species", filters=filters, expectedrows=1000000)
        seq_rows_vector   = self.h5.create_earray(self.h5.root, 'seq_rows_vector', atom = tbl.Int32Col(), shape=(0,n_species), title = "vector with row numbers of sequences for each species", filters=filters, expectedrows=1000000)
        #pad_rows_vector   = self.h5.create_earray(self.h5.root, 'pad_rows_vector', atom = tbl.Int64Col(shape=n_species), title = "vector with row numbers of pads record for each species", filters=filters, expectedrows=1000000)        
        T0 = time()
        T_last = T0

        curr_coords = {}
        curr_seqs = {}
        
        from gzip import GzipFile
        for maf_line in GzipFile(path):
            maf_line = maf_line.decode('ascii')  # GzipFile yields bytes under Python 3
            if not maf_line.strip():
                continue
            
            if maf_line.startswith('a'):
                scores.append([(float(maf_line.split('=')[1]))])
                if curr_coords:
                    cvec = np.array([curr_coords.get(species,-1) for species in self.species_names])
                    svec = np.array([curr_seqs.get(species,-1) for species in self.species_names])
                    coord_rows_vector.append([cvec])
                    seq_rows_vector.append([svec])
                    
                    curr_coords = {}
                    curr_seqs = {}

                if scores.nrows and (scores.nrows % regular_flushes == 0):
                    self.flush()

                    T = time()
                    dT = T - T_last
                    T_last = T
                    kbps = (regular_flushes / 1000.) / dT
                    
                    self.logger.debug("processed {0:.0f}k MAF blocks ({1:.1f}k blocks per second)".format(scores.nrows/1000., kbps) )
                
            elif maf_line.startswith('s'):

                parts = re.split(r'\s+',maf_line)
                loc,start,size,strand,total,seq = parts[1:7]
            
                species,chrom = loc.split('.',1)
                start = int(start)
                size = int(size)
                total = int(total)
                if strand == '+':
                    end = start + size
                else:
                    start, end = total - (start + size), total - start
                    # if you think this is sick, go tell the evil master MAF and his
                    # sidekick Dr. minus, the inventor of the minus strand, to their 
                    # faces. ;)

                curr_coords[species] = coords_tables[species].nrows
                n = coords_tables[species].nrows
                coords_tables[species].append( [(coord_rows_vector.nrows, chrom, start, end, (strand == '-'))] )
                assert coords_tables[species].nrows == n + 1 # need flush??
                
                                
                # store the alignment row in the VLStringArray. 
                # the link is through keeping the seqs current row number
                curr_seqs[species] = seqs.nrows
                seqs.append(seq.encode('ascii'))

            elif maf_line.startswith('i'):
                continue
                #parts = re.split(r'\s+',maf_line)
                #loc,pre_code,pre_num,post_code,post_num = parts[1:6]
                #species,chrom = loc.split('.',1)
                #pads = pads_lookup[species]
                #pads.append( [(maf_block_id, pre_code, pre_num, post_code, post_num)] )
            elif maf_line.startswith('e'):
                continue # currently ignored
            else:
                print "ignoring unknown MAF line '{0}'".format(maf_line.strip())
                
        if curr_coords:
            cvec = np.array([curr_coords.get(species,-1) for species in self.species_names])
            svec = np.array([curr_seqs.get(species,-1) for species in self.species_names])
            coord_rows_vector.append([cvec])
            seq_rows_vector.append([svec])

        self.logger.info("done processing {0} MAF blocks in {1:.1f}sec.".format(scores.nrows, (time() - T0)) )
        self.build_indices()
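Judging by the logger name, the method above belongs to a class called MAFBlockDB (its __init__ is shown in Example #1). A hedged usage sketch with placeholder paths:

# Placeholder paths; tree_path is only needed if the tree cannot be found next to the MAF file.
db = MAFBlockDB()
db.create_from_gz("multiz.chr1.maf.gz", tree_path="species_tree.nh", regular_flushes=10000)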
Example #7
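This benchmark excerpt is truncated at the top; a hedged sketch of the setup it appears to assume (all names match those used below, all values are placeholders):

import numpy as np
import numcodecs
import tables
import zarr
from time import time

# Placeholder benchmark parameters; the original script defines these earlier.
persistent = True
nthreads = 4
clevel, cname = 5, "lz4"
shape, chunkshape, dtype = (1000, 1000), (100, 1000), np.float64
fname_zarr, fname_h5 = "bench.zarr", "bench.h5"
compressor = numcodecs.Blosc(cname=cname, clevel=clevel, shuffle=numcodecs.Blosc.SHUFFLE)
content = np.random.random_sample(shape).astype(dtype)

t0 = time()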
numcodecs.blosc.set_nthreads(nthreads)
if persistent:
    z = zarr.open(fname_zarr, mode='w', shape=shape, chunks=chunkshape, dtype=dtype, compressor=compressor)
else:
    z = zarr.empty(shape=shape, chunks=chunkshape, dtype=dtype, compressor=compressor)
z[:] = content
zratio = z.nbytes / z.nbytes_stored
if persistent:
    del z
t1 = time()
print("Time for filling array (zarr): %.3fs ; CRatio: %.1fx" % ((t1 - t0), zratio))

# Create and fill a hdf5 array
t0 = time()
filters = tables.Filters(complevel=clevel, complib="blosc:%s" % cname, shuffle=True)
tables.set_blosc_max_threads(nthreads)
if persistent:
    h5f = tables.open_file(fname_h5, 'w')
else:
    h5f = tables.open_file(fname_h5, 'w', driver='H5FD_CORE', driver_core_backing_store=0)
h5ca = h5f.create_carray(h5f.root, 'carray', filters=filters, chunkshape=chunkshape, obj=content)
h5f.flush()
h5ratio = h5ca.size_in_memory / h5ca.size_on_disk
if persistent:
    h5f.close()
t1 = time()
print("Time for filling array (hdf5): %.3fs ; CRatio: %.1fx" % ((t1 - t0), h5ratio))

# Check that the contents are the same
t0 = time()
if persistent:
Example #8
import hashlib
import logging
from functools import partial
from os import cpu_count
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union

import numpy as np
import pandas as pd
import tables

Path_s = Union[Path, str]
To_save = Optional[Dict[str, Any]]
List_str = Optional[List[str]]

tables.set_blosc_max_threads(cpu_count() // 2)


def vars_to_dict(obj: Any, vars_: List[str]) -> Dict[str, Any]:
    """Get instance variables using names in `vars` from `obj`.

    Args:
        obj (Any)
        vars (List[str]): List of instance variable names.

    Returns:
        Dict[str, Any]: Instance variable name and its object.
    """
    out = dict()
    for var in vars_:
        try:
Example #9
def get_training_array(tensor_fn, var_fn, bed_fn, bin_fn, shuffle=True, is_allow_duplicate_chr_pos=True, chunk_id=None,
                       chunk_num=None, platform='ont', pileup=False, maximum_non_variant_ratio=None, candidate_details_fn_prefix=None):

    """
    Generate training array for training. here pytables with blosc:lz4hc are used for extreme fast compression and decompression,
    which can meet the requirement of gpu utilization. lz4hc decompression allows speed up training array decompression 4~5x compared
    with tensorflow tfrecord file format, current gpu utilization could reach over 85% with only 10G memory.
    tensor_fn: string format tensor acquired from CreateTensorPileup or CreateTensorFullAlign, include contig name position, tensor matrix, alternative information.
    var_fn: simplified variant(vcf) format from GetTruths, which include contig name, position, reference base, alternative base, genotype.
    bin_fn: pytables format output bin file name.
    shuffle: whether apply index shuffling when generating training data, default True, which would promote robustness.
    is_allow_duplicate_chr_pos: whether allow duplicate positions when training, if there exists downsampled data, lower depth will add a random prefix character.
    chunk_id: specific chunk id works with total chunk_num for parallel execution. Here will merge all tensor file with sampe prefix.
    chunk_num: total chunk number for parallel execution. Each chunk refer to a smaller reference regions.
    platform: platform for tensor shape, ont give a larger maximum depth compared with pb and illumina.
    pileup: whether in pileup mode. Define two calling mode, pileup or full alignment.
    maximum_non_variant_ratio: define a maximum non variant ratio for training, we always expect use more non variant data, while it would greatly increase training
    time, especially in ont data, here we usually use 1:1 or 1:2 for variant candidate: non variant candidate.
    candidate_details_fn_prefix: a counter to calculate total variant and non variant from the information in alternative file.
    """

    tree = bed_tree_from(bed_file_path=bed_fn)
    is_tree_empty = len(tree.keys()) == 0
    Y_true_var, miss_variant_set, truth_alt_dict = variant_map_from(var_fn, tree, is_tree_empty)
    Y = copy.deepcopy(Y_true_var)

    global param
    float_type = 'int32'
    if pileup:
        import shared.param_p as param
    else:
        import shared.param_f as param
        float_type = 'int8'

    import tables
    FILTERS = tables.Filters(complib='blosc:lz4hc', complevel=5)
    tensor_shape = param.ont_input_shape if platform == 'ont' else param.input_shape

    subprocess_list = []
    if tensor_fn == 'PIPE':
        subprocess_list.append(sys.stdin)
    elif os.path.exists(tensor_fn):
        subprocess_list.append(subprocess_popen(shlex.split("{} -fdc {}".format(param.zstd, tensor_fn))).stdout)
    # select all match prefix if file path not exists
    else:
        tensor_fn = tensor_fn.split('/')
        directry, file_prefix = '/'.join(tensor_fn[:-1]), tensor_fn[-1]
        all_file_name = []
        for file_name in os.listdir(directry):
            if file_name.startswith(file_prefix + '_') or file_name.startswith(
                    file_prefix + '.'):  # require '_' or '.' right after the prefix so that e.g. 'chr1' does not also match 'chr10'
                all_file_name.append(file_name)
        all_file_name = sorted(all_file_name)
        if chunk_id is not None:
            chunk_size = len(all_file_name) // chunk_num if len(all_file_name) % chunk_num == 0 else len(
                all_file_name) // chunk_num + 1
            chunk_start = chunk_size * chunk_id
            chunk_end = chunk_start + chunk_size
            all_file_name = all_file_name[chunk_start:chunk_end]
        if not len(all_file_name):
            print("[INFO] chunk_id exceed total file number, skip chunk", file=sys.stderr)
            return 0
        for file_name in all_file_name:
            subprocess_list.append(
                subprocess_popen(shlex.split("{} -fdc {}".format(param.zstd, os.path.join(directry, file_name)))).stdout)

    tables.set_blosc_max_threads(64)
    int_atom = tables.Atom.from_dtype(np.dtype(float_type))
    string_atom = tables.StringAtom(itemsize=param.no_of_positions + 50)
    long_string_atom = tables.StringAtom(itemsize=5000)  # max alt_info length
    table_file = tables.open_file(bin_fn, mode='w', filters=FILTERS)
    table_file.create_earray(where='/', name='position_matrix', atom=int_atom, shape=[0] + tensor_shape,
                             filters=FILTERS)
    table_file.create_earray(where='/', name='position', atom=string_atom, shape=(0, 1), filters=FILTERS)
    table_file.create_earray(where='/', name='label', atom=int_atom, shape=(0, param.label_size), filters=FILTERS)
    table_file.create_earray(where='/', name='alt_info', atom=long_string_atom, shape=(0, 1), filters=FILTERS)

    table_dict = update_table_dict()

    # use a generator to avoid holding everything in memory
    bin_reader_generator = partial(bin_reader_generator_from, 
                                   Y_true_var=Y_true_var,
                                   Y=Y,
                                   is_tree_empty=is_tree_empty,
                                   tree=tree,
                                   miss_variant_set=miss_variant_set,
                                   truth_alt_dict=truth_alt_dict,
                                   is_allow_duplicate_chr_pos=is_allow_duplicate_chr_pos,
                                   maximum_non_variant_ratio=maximum_non_variant_ratio)

    total_compressed = 0
    total = 0
    for fin in subprocess_list:
        bin_g = bin_reader_generator(tensor_fn=fin)
        completed = False
        X = None  # guard against a generator that is exhausted immediately
        while not completed:
            try:
                X, total, completed = next(bin_g)
            except StopIteration:
                completed = True
        
            if X is None or not len(X):
                break
            all_chr_pos = sorted(X.keys())
            if shuffle:
                np.random.shuffle(all_chr_pos)
            for key in all_chr_pos:

                string, alt_info, seq = X[key]
                del X[key]
                label = None
                if key in Y:
                    label = Y[key]
                    pos = key + ':' + seq
                    if not is_allow_duplicate_chr_pos:
                        del Y[key]
                elif is_allow_duplicate_chr_pos:
                    tmp_key = key[1:]
                    label = Y[tmp_key]
                    pos = tmp_key + ':' + seq
                if label is None:
                    print(key)
                    continue
                total_compressed = write_table_dict(table_dict, string, label, pos, total_compressed, alt_info,
                                                    tensor_shape, pileup)

                if total_compressed % 500 == 0 and total_compressed > 0:
                    table_dict = write_table_file(table_file, table_dict, tensor_shape, param.label_size, float_type)

                if total_compressed % 50000 == 0:
                    print("[INFO] Compressed %d tensor" % (total_compressed), file=sys.stderr)
        fin.close()

    if total_compressed % 500 != 0 and total_compressed > 0:
        table_dict = write_table_file(table_file, table_dict, tensor_shape, param.label_size, float_type)

    table_file.close()
    print("[INFO] Compressed %d/%d tensor" % (total_compressed, total), file=sys.stderr)
Example #10
def Run(args):
    in_fn_list = args.in_fn
    out_fn = args.out_fn
    platform = args.platform
    pileup = args.pileup

    global param
    float_type = 'int32'
    if pileup:
        import shared.param_p as param
    else:
        import shared.param_f as param
        float_type = 'int8'

    tensor_shape = param.ont_input_shape if platform == 'ont' else param.input_shape

    # select all match prefix if file path not exists
    tables.set_blosc_max_threads(64)
    int_atom = tables.Atom.from_dtype(np.dtype(float_type))
    string_atom = tables.StringAtom(itemsize=param.no_of_positions + 50)
    long_string_atom = tables.StringAtom(itemsize=5000)  # max alt_info length
    table_file = tables.open_file(out_fn, mode='w', filters=FILTERS)
    table_file.create_earray(where='/',
                             name='position_matrix',
                             atom=int_atom,
                             shape=[0] + tensor_shape,
                             filters=FILTERS)
    table_file.create_earray(where='/',
                             name='position',
                             atom=string_atom,
                             shape=(0, 1),
                             filters=FILTERS)
    table_file.create_earray(where='/',
                             name='label',
                             atom=int_atom,
                             shape=(0, param.label_size),
                             filters=FILTERS)
    table_file.create_earray(where='/',
                             name='alt_info',
                             atom=long_string_atom,
                             shape=(0, 1),
                             filters=FILTERS)

    table_dict = utils.update_table_dict()
    total_compressed = 0

    for f in in_fn_list:
        print("[INFO] Merging file {}".format(f))
        fi = tables.open_file(f, mode='r')
        assert (len(fi.root.label) == len(fi.root.position) == len(
            fi.root.position_matrix) == len(fi.root.alt_info))
        for index in range(len(fi.root.label)):
            table_dict['label'].append(fi.root.label[index])
            table_dict['position'].append(fi.root.position[index])
            table_dict['position_matrix'].append(
                fi.root.position_matrix[index])
            table_dict['alt_info'].append(fi.root.alt_info[index])

            total_compressed += 1

            if total_compressed % 500 == 0 and total_compressed > 0:
                table_dict = utils.write_table_file(table_file, table_dict,
                                                    tensor_shape,
                                                    param.label_size,
                                                    float_type)

            if total_compressed % 50000 == 0:
                print("[INFO] Compressed %d tensor" % (total_compressed),
                      file=sys.stderr)
        fi.close()

    if total_compressed % 500 != 0 and total_compressed > 0:
        table_dict = utils.write_table_file(table_file, table_dict,
                                            tensor_shape, param.label_size,
                                            float_type)
        print("[INFO] Compressed %d tensor" % (total_compressed),
              file=sys.stderr)

    table_file.close()
Example #11
import logging
import random
import numpy as np
from argparse import ArgumentParser, SUPPRESS
import tensorflow_addons as tfa
import tensorflow as tf
import tables
import os
import sys
from itertools import accumulate

import clair3.model as model_path
from shared.utils import str2bool

logging.basicConfig(format='%(message)s', level=logging.INFO)
tables.set_blosc_max_threads(512)
os.environ['NUMEXPR_MAX_THREADS'] = '64'
os.environ['NUMEXPR_NUM_THREADS'] = '8'


def get_label_task(label, label_shape_cum, task):
    if task == 0:
        return label[:label_shape_cum[task]]
    elif task == len(label_shape_cum) - 1:
        return label[label_shape_cum[task - 1]:]
    else:
        return label[label_shape_cum[task - 1]:label_shape_cum[task]]
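
A small illustration of how get_label_task slices a concatenated label vector; the per-task sizes below are placeholders, not Clair3's real label shape:

import numpy as np
from itertools import accumulate

label_shape = [21, 3, 3, 3]                      # placeholder per-task sizes
label_shape_cum = list(accumulate(label_shape))  # [21, 24, 27, 30]
label = np.arange(sum(label_shape))

print(get_label_task(label, label_shape_cum, 0).shape)  # (21,) -> first task
print(get_label_task(label, label_shape_cum, 2).shape)  # (3,)  -> a middle task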


def cal_class_weight(samples_per_cls, no_of_classes, beta=0.999):
    effective_num = 1.0 - np.power(beta, samples_per_cls)