Example #1
np.random.seed(args.seed)

# directories for checkpoint, images and log files
save_models_folder = wd + '/output/saved_models/'
os.makedirs(save_models_folder, exist_ok=True)

save_logs_folder = wd + '/output/saved_logs/'
os.makedirs(save_logs_folder, exist_ok=True)

###########################################################################################################
# Data
###########################################################################################################
# data loader
data_filename = args.data_path + '/RC-49_' + str(args.img_size) + 'x' + str(
    args.img_size) + '.h5'
hf = h5py.File(data_filename, 'r')
labels = hf['labels'][:]
labels = labels.astype(float)
images = hf['images'][:]
hf.close()
N_all = len(images)
assert len(images) == len(labels)

q1 = args.min_label
q2 = args.max_label
indx = np.where((labels > q1) & (labels < q2))[0]
labels = labels[indx]
images = images[indx]
assert len(labels) == len(images)

# normalize to [0, 1]
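# The normalization code itself is missing from this snippet; a minimal sketch,
# assuming the images are stored as 8-bit unsigned integers (an assumption, not
# something stated above):
images = images.astype(np.float32) / 255.0  # scale pixel values into [0, 1]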
###
###           In this section we will create a new HDF5 file with a dataset and attributes.

  # Import the h5py and NumPy packages

import h5py
import numpy as np

  # Set the array length we use for our dataset.

array_len = 1500

  # Let's create a new HDF5 file called myfile.h5. Open it in write mode so that
  # we will be able to write data into it.

fid = h5py.File('myfile.h5', 'w')


###
### PART 2 - Dataset Creation

  # Let's add our first dataset: a 1D array consisting of array_len random floats.
  # Start by creating a NumPy array of random floats.

input_data = np.random.rand(array_len)

  # Let's define a dataset. We start by constructing an empty dataset with enough
  # space to hold our input_data array.

dset = fid.create_dataset( "RandomData", (array_len,), dtype='f' )
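
  # The tutorial text is cut off here. As an illustrative next step (a sketch,
  # not part of the original), write input_data into the empty dataset and
  # flush the file so the data reaches disk; attributes could then be added to
  # fid or dset before closing.

dset[...] = input_data
fid.flush()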
Example #3
def create(
    cool_uri,
    bins,
    pixels,
    columns=None,
    dtypes=None,
    metadata=None,
    assembly=None,
    symmetric_upper=True,
    mode=None,
    h5opts=None,
    boundscheck=True,
    triucheck=True,
    dupcheck=True,
    ensure_sorted=False,
    lock=None,
    append=False,
    append_scool=False,
    scool_root_uri=None,
    **kwargs
):
    """
    Create a new Cooler.

    Deprecated parameters
    ---------------------
    chromsizes : Series
        Chromsizes are now inferred from ``bins``.
    append : bool, optional
        Append new Cooler to the file if it exists. If False, an existing file
        with the same name will be truncated. Default is False.
        Use the ``mode`` argument instead.
    dtype : dict, optional
        Dictionary mapping column names in the pixel table to dtypes.
        Use the ``dtypes`` argument instead.

    """
    file_path, group_path = parse_cooler_uri(cool_uri)

    if mode is None:
        mode = "a" if append else "w"

    h5opts = _set_h5opts(h5opts)

    if not isinstance(bins, pd.DataFrame):
        raise ValueError(
            "Second positional argument must be a pandas DataFrame. "
            "Note that the `chromsizes` argument is now deprecated: "
            "see documentation for `create`."
        )
    if append_scool and scool_root_uri is None:
        raise ValueError(
            "If the parameter `append_scool` is set, the parameter `scool_root_uri` must be defined."
        )
    dtypes = _get_dtypes_arg(dtypes, kwargs)

    for col in ["chrom", "start", "end"]:
        if col not in bins.columns:
            raise ValueError("Missing column from bin table: '{}'.".format(col))

    # Populate expected pixel column names. Include user-provided value
    # columns.
    if columns is None:
        columns = ["bin1_id", "bin2_id", "count"]
    else:
        columns = list(columns)
        for col in ["bin1_id", "bin2_id"]:  # don't include count!
            if col not in columns:
                columns.insert(0, col)

    # Populate dtypes for expected pixel columns, and apply user overrides.
    if dtypes is None:
        dtypes = dict(PIXEL_DTYPES)
    else:
        dtypes_ = dict(dtypes)
        dtypes = dict(PIXEL_DTYPES)
        dtypes.update(dtypes_)

    # Get empty "meta" header frame (assigns the undeclared dtypes).
    # Any columns from the input not in meta will be ignored.
    meta = get_meta(columns, dtypes, default_dtype=float)

    # Determine the appropriate iterable
    try:
        from dask.dataframe import DataFrame as dask_df
    except (ImportError, AttributeError):  # pragma: no cover
        dask_df = ()

    if isinstance(pixels, dask_df):
        iterable = map(lambda x: x.compute(), pixels.to_delayed())
        input_columns = infer_meta(pixels).columns
    elif isinstance(pixels, pd.DataFrame):
        iterable = (pixels,)
        input_columns = infer_meta(pixels).columns
    elif isinstance(pixels, dict):
        iterable = (pixels,)
        input_columns = infer_meta([(k, v.dtype) for (k, v) in pixels.items()]).columns
    else:
        iterable = pixels
        input_columns = None

    # If possible, ensure all expected columns are available
    if input_columns is not None:
        for col in columns:
            if col not in input_columns:
                col_type = "Standard" if col in PIXEL_FIELDS else "User"
                raise ValueError(
                    "{} column not found in input: '{}'".format(col_type, col)
                )

    # Prepare chroms and bins
    bins = bins.copy()
    bins["chrom"] = bins["chrom"].astype(object)
    chromsizes = get_chromsizes(bins)
    try:
        chromsizes = six.iteritems(chromsizes)
    except AttributeError:
        pass
    chromnames, lengths = zip(*chromsizes)
    chroms = pd.DataFrame(
        {"name": chromnames, "length": lengths}, columns=["name", "length"]
    )
    binsize = get_binsize(bins)
    n_chroms = len(chroms)
    n_bins = len(bins)

    if not symmetric_upper and triucheck:
        warnings.warn(
            "Creating a non-symmetric matrix, but `triucheck` was set to "
            "True. Changing to False."
        )
        triucheck = False

    # Chain input validation to the end of the pipeline
    if boundscheck or triucheck or dupcheck or ensure_sorted:
        validator = validate_pixels(
            n_bins, boundscheck, triucheck, dupcheck, ensure_sorted
        )
        iterable = map(validator, iterable)

    # Create root group
    with h5py.File(file_path, mode) as f:
        logger.info('Creating cooler at "{}::{}"'.format(file_path, group_path))
        if group_path == "/":
            for name in ["chroms", "bins", "pixels", "indexes"]:
                if name in f:
                    del f[name]
        else:
            try:
                f.create_group(group_path)
            except ValueError:
                del f[group_path]
                f.create_group(group_path)

    # Write chroms, bins and pixels
    if append_scool:
        src_path, src_group = parse_cooler_uri(scool_root_uri)
        dst_path, dst_group = parse_cooler_uri(cool_uri)

        with h5py.File(src_path, "r+") as src, h5py.File(dst_path, "r+") as dst:

            dst[dst_group]["chroms"] = src["chroms"]

            # hard link to root bins table, but only the three main datasets
            dst[dst_group]["bins/chrom"] = src["bins/chrom"]
            dst[dst_group]["bins/start"]= src["bins/start"]
            dst[dst_group]["bins/end"]= src["bins/end"]

            # create per cell the additional columns e.g. 'weight'
            # these columns are individual for each cell
            columns = list(bins.keys())
            for col in ["chrom", "start", "end"]:
                columns.remove(col)
            if columns:
                put(dst[dst_group]['bins'], bins[columns])
        with h5py.File(file_path, "r+") as f:
            h5 = f[group_path]
            grp = h5.create_group("pixels")
            if symmetric_upper:
                max_size = n_bins * (n_bins - 1) // 2 + n_bins
            else:
                max_size = n_bins * n_bins
            prepare_pixels(grp, n_bins, max_size, meta.columns, dict(meta.dtypes), h5opts)
    else:
        with h5py.File(file_path, "r+") as f:
            h5 = f[group_path]

            logger.info("Writing chroms")
            grp = h5.create_group("chroms")
            write_chroms(grp, chroms, h5opts)

            logger.info("Writing bins")
            grp = h5.create_group("bins")
            write_bins(grp, bins, chroms["name"], h5opts)

            grp = h5.create_group("pixels")
            if symmetric_upper:
                max_size = n_bins * (n_bins - 1) // 2 + n_bins
            else:
                max_size = n_bins * n_bins
            prepare_pixels(grp, n_bins, max_size, meta.columns, dict(meta.dtypes), h5opts)

    # Multiprocess HDF5 reading is supported only if the same HDF5 file is not
    # open in write mode anywhere. To read and write to the same file, pass a
    # lock shared with the HDF5 reading processes. `write_pixels` will acquire
    # it and open the file for writing for the duration of each write step
    # only. After it closes the file and releases the lock, the reading
    # processes will have to re-acquire the lock and re-open the file to obtain
    # the updated file state for reading.
    logger.info("Writing pixels")
    target = posixpath.join(group_path, "pixels")
    nnz, ncontacts = write_pixels(
        file_path, target, meta.columns, iterable, h5opts, lock
    )

    # Write indexes
    with h5py.File(file_path, "r+") as f:
        h5 = f[group_path]

        logger.info("Writing indexes")
        grp = h5.create_group("indexes")

        chrom_offset = index_bins(h5["bins"], n_chroms, n_bins)
        bin1_offset = index_pixels(h5["pixels"], n_bins, nnz)
        write_indexes(grp, chrom_offset, bin1_offset, h5opts)

        logger.info("Writing info")
        info = {}
        info["bin-type"] = u"fixed" if binsize is not None else u"variable"
        info["bin-size"] = binsize if binsize is not None else u"null"
        info["storage-mode"] = u"symmetric-upper" if symmetric_upper else u"square"
        info["nchroms"] = n_chroms
        info["nbins"] = n_bins
        info["sum"] = ncontacts
        info["nnz"] = nnz
        if assembly is not None:
            info["genome-assembly"] = assembly
        if metadata is not None:
            info["metadata"] = metadata
        write_info(h5, info)
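
# A minimal usage sketch for the create() function above (illustrative only:
# the chromosome table, bin size and pixel counts are made up, and in practice
# the higher-level cooler API is normally used instead of calling create()
# directly):
import pandas as pd

toy_bins = pd.DataFrame({
    "chrom": ["chr1"] * 4,
    "start": [0, 1000, 2000, 3000],
    "end": [1000, 2000, 3000, 4000],
})
toy_pixels = pd.DataFrame({
    "bin1_id": [0, 0, 1],
    "bin2_id": [0, 2, 3],
    "count": [5, 2, 1],
})
create("toy.cool", toy_bins, toy_pixels, assembly="toy")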
Example #4
(This script is not generally useful for most ilastik users or developers.)

Input: hdf5 volume
Output: directory of .png tiles representing the volume.  
"""
if __name__ == "__main__":
    import sys
    import h5py

    import logging
    import argparse
    from lazyflow.utility import PathComponents, export_to_tiles

    logger = logging.getLogger()
    logger.addHandler(logging.StreamHandler(sys.stdout))
    logger.setLevel(logging.INFO)

    # Usage: python make_tiles.py --tile_size=250 /path/to/my_vol.h5/some/dataset /path/to/output_dir
    parser = argparse.ArgumentParser()
    parser.add_argument('--tile_size', type=int)
    parser.add_argument('hdf5_dataset_path')
    parser.add_argument('output_dir')

    parsed_args = parser.parse_args(sys.argv[1:])

    path_comp = PathComponents(parsed_args.hdf5_dataset_path)
    with h5py.File(path_comp.externalPath, 'r') as input_file:
        vol_dset = input_file[path_comp.internalPath]
        export_to_tiles(vol_dset, parsed_args.tile_size,
                        parsed_args.output_dir)
Example #5
>>>>> Dependencies: <<<<<
1. ASTRA toolbox: conda install -c astra-toolbox astra-toolbox
2. tomobar: conda install -c dkazanc tomobar
or install from https://github.com/dkazanc/ToMoBAR

@author: Daniil Kazantsev, e-mail [email protected]
GPLv3 license (ASTRA toolbox)
"""
#import timeit
import matplotlib.pyplot as plt
import numpy as np
import h5py
from ccpi.supp.qualitymetrics import QualityTools

# loading the data 
h5f = h5py.File('data/TomoSim_data1550671417.h5','r')
phantom = h5f['phantom'][:]
projdata_norm = h5f['projdata_norm'][:]
proj_angles = h5f['proj_angles'][:]
h5f.close()

[Vert_det, AnglesNum, Horiz_det] = np.shape(projdata_norm)
N_size = Vert_det

sliceSel = 128
#plt.gray()
plt.figure() 
plt.subplot(131)
plt.imshow(phantom[sliceSel,:,:],vmin=0, vmax=1)
plt.title('3D Phantom, axial view')
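
# A plausible continuation (not part of the scraped snippet): the remaining two
# panels of the 1x3 figure would typically show the coronal and sagittal views
# of the same phantom.
plt.subplot(132)
plt.imshow(phantom[:, sliceSel, :], vmin=0, vmax=1)
plt.title('3D Phantom, coronal view')
plt.subplot(133)
plt.imshow(phantom[:, :, sliceSel], vmin=0, vmax=1)
plt.title('3D Phantom, sagittal view')
plt.show()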
Example #6
    def __init__(self, 
        path,
        filename,
        idenselect=[],
        train=True,
        transform=None,
        ):
          
        
        if not os.path.isdir(path):
            raise ValueError('Path {} is not a directory'.format(path))

        self.path = path
        self.filename = filename
        matfile = os.path.join(path, filename + '.mat')
        f = h5py.File(matfile, 'r')

        self.data   = np.array(f["data"])
        self.points = np.array(f["points"])
        self.imsize = np.array(f["imsize"])[:,0].astype(int)
        self.iactor = np.array(f["iactor"])[0,:].astype(int) 
        self.labels = np.array(f["iclass"])[0,:].astype(int) - 1
        self.name   = np.array(f["name"])
        self.num    = np.array(f["num"])[0,0].astype(int)
        
        # Emotions class
        if filename == 'ck' or filename == 'ckp':
            classes = ['Neutral - NE', 'Anger - AN', 'Contempt - CO', 'Disgust - DI', 'Fear - FR', 'Happiness - HA', 'Sadness - SA', 'Surprise - SU']
            toferp = [0, 4, 7, 5, 6, 1, 3, 2]
        elif filename == 'bu3dfe' or filename == 'jaffe':
            classes = ['Neutral - NE', 'Anger - AN', 'Disgust - DI', 'Fear - FR', 'Happiness - HA', 'Sadness - SA', 'Surprise - SU', 'Contempt - CO']
            toferp = [0, 4, 5, 6, 1, 3, 2, 7]
        else:
            assert False, 'Unsupported dataset: {}'.format(filename)

        self.toferp = toferp
        self.classes = classes  # used below when the class list is re-derived from the labels
        self.class_to_idx = {_class: i for i, _class in enumerate(classes)}

        self.labels = np.array([ toferp[l] for l in self.labels ])
        self.numclass = len(np.unique(self.labels))

        index = np.ones( (self.num,1) )
        actors = np.unique(self.iactor)
        for i in idenselect:
            index[self.iactor == actors[i]] = 0       
        self.indexs = np.where(index == train)[0]        
        self.transform = transform
        
        # ######
        # index_nne = []
        # for idx in self.indexs:
        #     if( self.labels[ idx ] != 0 ):
        #         index_nne.append( idx )
        # self.indexs = np.array(index_nne)
        # #######
        
        self.labels_org = self.labels
        self.labels = self.labels[ self.indexs ]
        self.classes = [self.classes[ i ] for i in  np.unique( self.labels ) ] 
        self.labels = self.labels ### - 1 ############ 
        self.numclass = len(self.classes)   
        self.index = 0
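
    # The snippet ends here. A minimal sketch of the accessors such a Dataset
    # class usually provides (illustrative only; the real class may index and
    # decode self.data differently):
    def __len__(self):
        return len(self.indexs)

    def __getitem__(self, i):
        idx = self.indexs[i]
        image = self.data[idx]   # raw sample as stored in the .mat file
        label = self.labels[i]   # labels were already restricted to self.indexs
        if self.transform is not None:
            image = self.transform(image)
        return image, label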
Example #7
print(args.restore_model,args.restore_predictor)
print()
print('Save locations')
print(args.save_model,args.save_predictor)
print()

# Which array to convert from categorical to residue letter
if args.encoding == 'categorical':
    ORDER = cst.ORDER_CATEGORICAL
    CATEGORIES = cst.CATEGORIES
elif args.encoding == 'blosum':
    ORDER = cst.ORDER_BLOSUM
    CATEGORIES = cst.BLOSUM

### Collect data
f = h5py.File('/projects/ml/flu/fludb_data/processed_data_525916981168.h5','r')

train_labels_dataset = f['train_labels']
train_labels = train_labels_dataset[()]

valid_labels_dataset = f['valid_labels']
valid_labels = valid_labels_dataset[()]

test_labels_dataset = f['test_labels']
test_labels = test_labels_dataset[()]

if args.encoding == 'categorical':
    train_sequences_dataset = f['train_sequences_categorical']
    train_sequences = train_sequences_dataset[()]

    valid_sequences_dataset = f['valid_sequences_categorical']
def createVSB100(db_settings, logger):
    '''
    This method creates the database needed for caffe.
    '''
    action = 'vw_commercial'
    database_path = db_settings['database_path']
    features_path = db_settings['features_path'] #'/cs/vml3/mkhodaba/cvpr16/Graph_construction/Features/{action_name}_features.mat'
    video_info_path = db_settings['video_info_path'] #'/cs/vml3/mkhodaba/cvpr16/Graph_construction/Features/{action_name}_vidinfo.mat'
    #database_path = '/cs/vml2/mkhodaba/cvpr16/datasets/VSB100/databases/{action_name}.h5'
    #features_path = '/cs/vml3/mkhodaba/cvpr16/Graph_construction/Features/{action_name}_features.mat'
    #video_info_path = '/cs/vml3/mkhodaba/cvpr16/Graph_construction/Features/{action_name}_vidinfo.mat'
    features_path = features_path.format(action_name=action)
    video_info_path = video_info_path.format(action_name=action)
    database_path = database_path.format(action_name=action)
    neighbors_num = db_settings['number_of_neighbors']
    neighbor_frames_num = db_settings['neighbor_frames_num']

    from scipy.io import loadmat
    import numpy as np
    from scipy.spatial import cKDTree
    from random import randint
    from sklearn.preprocessing import StandardScaler
    try:
        features = loadmat(features_path)['features'] #number_of_frames x number_of_supervoxels_per_frame x feature_length
    except:
        import h5py
        features = h5py.File(features_path, 'r')
        print(features.keys())
    video_info = loadmat(video_info_path) #video_info = [mapped, labelledlevelvideo, numberofsuperpixelsperframe]
                        #mapped -> #number_of_frames x number_of_supervoxels_per_frame
                        #labelledlevelvideo -> height x width x number_of_frames
                        #framebelong -> total_number_of_super_pixels x 1
                        #labelsatframe -> total_number_of_super_pixels x 1
    kdtrees = []
    labelledlevelvideo = video_info['labelledlevelvideo']
    numberofsuperpixelsperframe = video_info['numberofsuperpixelsperframe']
    numberofsuperpixelsperframe = numberofsuperpixelsperframe[0]
    print(features.shape)
    frames_num = len(features)
    superpixels_num = len(features[0]) #per frame
    feature_len = len(features[0][0])
    print(features[0][0][1:50])
    normalize_data = False
    if normalize_data:
        features_normalized = np.zeros((np.sum(numberofsuperpixelsperframe), feature_len))
        print(features_normalized.shape)
        idx = 0
        for f in range(frames_num):
            for s in range(numberofsuperpixelsperframe[f]):
                features_normalized[idx][...] = features[f][s][...]
                idx += 1
        clf = StandardScaler()
        features_normalized_2 = clf.fit_transform(features_normalized)
        idx = 0
        for f in range(frames_num):
            for s in range(numberofsuperpixelsperframe[f]):
                features[f][s][...] = features_normalized_2[idx][...]
                idx += 1

    print(features[0][0][1:50])
    print(features.shape)
    print(frames_num, superpixels_num, feature_len)
    print(numberofsuperpixelsperframe)
    #centers[f][i] -> h,w of center
    centers = np.zeros((frames_num, superpixels_num, 2)) #[[[0.0,0.0] for i in xrange(superpixels_num)] for j in xrange(frames_num)] #frames_num x superpixels_num x 2
    pixels_count = [[0 for i in range(superpixels_num)] for j in range(frames_num)] #frames_num x superpixels_num
    height = len(labelledlevelvideo)
    width = len(labelledlevelvideo[0])
    logger.log('Computing centers of superpixels ...')
    for f in range(frames_num):
        logger.log('Frame %d' % f)
        for h in range(height):
            for w in range(width):
                try:
                    idx = labelledlevelvideo[h][w][f]-1
                except:
                    print(h, w, f)
                    raise
                centers[f][idx][0] += h
                centers[f][idx][1] += w
                pixels_count[f][idx] += 1
        for i in range(numberofsuperpixelsperframe[f]):
            centers[f][i][0] /= pixels_count[f][i]
            centers[f][i][1] /= pixels_count[f][i]
        logger.log('Building kdtree')
        kdtree = cKDTree(np.array(centers[f][:numberofsuperpixelsperframe[f]]))
        kdtrees.append(kdtree)
    framebelong = video_info['framebelong']
    print(framebelong.shape)
    labelsatframe = video_info['labelsatframe']
    target_superpixel_num = 0
    for f in range(neighbor_frames_num, frames_num-neighbor_frames_num):
        target_superpixel_num += numberofsuperpixelsperframe[f]
    n = target_superpixel_num
    #len(framebelong)
    superpixel_skip_num = 0
    n_neg = 10
    for f in range(neighbor_frames_num):
        superpixel_skip_num += numberofsuperpixelsperframe[f]
    data = {'target':np.zeros((n*n_neg, feature_len)), 'negative':np.zeros((n*n_neg, feature_len))}
    #data = {'target':np.zeros((n, feature_len)), 'negative':np.zeros((n, feature_len))}
    #Tracer()()
    total_number_of_neighbors = neighbors_num * (2*neighbor_frames_num+1)
    for i in range(total_number_of_neighbors):
        data['neighbor{0}'.format(i)] = np.zeros((n*n_neg, feature_len))
        #data['neighbor{0}'.format(i)] = np.zeros((n, feature_len))
    superpixel_idx = -1
    logger.log('Creating the database of superpixels:features')
    for f in range(neighbor_frames_num, frames_num-neighbor_frames_num): #TODO: start from a frame that has at least neighbor_frames_num number of frames before it
        logger.log('Frame %d' % f)
        logger.log('There are %d superpixels in this frame' % numberofsuperpixelsperframe[f])
        for i in range(numberofsuperpixelsperframe[f]):
            superpixel_idx += 1
            assert f == framebelong[superpixel_idx+superpixel_skip_num]-1, 'Something went wrong in mapping superpixel index to frames/label at frame (1)'
            assert i == labelsatframe[superpixel_idx+superpixel_skip_num]-1, 'Something went wrong in mapping superpixel index to frames/label at frame (2)'
            data['target'][superpixel_idx*n_neg:(superpixel_idx + 1)*n_neg][...] = features[f][i][...]
            #data['target'][superpixel_idx][...] = features[f][i][...]

            center = centers[f][i]
            frame_start = max(0, f-neighbor_frames_num)
            frame_end = min(frames_num, f+neighbor_frames_num)
            neighbor_idx = 0
            #print frame_start, frame_end
            for target_frame in range(frame_start, frame_end+1):
                if f == target_frame:
                    nearest_neighbors = kdtrees[target_frame].query(center, neighbors_num+1)[1] # Added one to the neighbors because the target itself is included
                    nearest_neighbors = nearest_neighbors[1:]
                else:
                    nearest_neighbors = kdtrees[target_frame].query(center, neighbors_num)[1]
                for idx in nearest_neighbors:
                    #data['neighbor{0}'.format(neighbor_idx)][superpixel_idx*n_neg:(superpixel_idx + 1)*n_neg][...] = features[target_frame][idx][...]
                    data['neighbor{0}'.format(neighbor_idx)][superpixel_idx][...] = features[target_frame][idx][...]
                    neighbor_idx += 1
            assert neighbor_idx == total_number_of_neighbors, "Number of neighbors doesn't match ( %d != %d )" % (neighbor_idx, total_number_of_neighbors)
            #TODO: print "Random frame ... (Warning: if it's taknig too long stop it! \n Apparantly, the number of neighboring frames are relatively large \n with respect to the number of video frames)"
            # frame_random = randint(0, frames_num-1)
            # while frame_end-frame_start < 0.5*frames_num and frame_start <= frame_random <= frame_end:
                # frame_random = randint(0, frames_num-1)
            # idx_random = randint(0, numberofsuperpixelsperframe[ frame_random]-1)
            # data['negative'][superpixel_idx][...] = features[frame_random][idx_random][...]
            nearest_neighbors = kdtrees[f].query(center, 5*neighbors_num+n_neg)[1]
            #nearest_neighbors = kdtrees[f].query(center, 5*neighbors_num)[1]
            #It's the nearest of farthest superpixels to this one
            idx_random = nearest_neighbors[-1]
            if i == 10:
                print('f, i, superpixel_idx, idx_random', f, i, superpixel_idx, idx_random)
            #data['negative'][superpixel_idx][...] = features[f][idx_random][...]
            for j in range(n_neg):
                idx_random = nearest_neighbors[-j]
                data['negative'][superpixel_idx*n_neg + j][...] = features[f][idx_random][...]

    assert superpixel_idx+1 == target_superpixel_num, "Total number of superpixels doesn't match (%d != %d)" % (superpixel_idx, target_superpixel_num)
    db_path = database_path.format(action_name=action)
    print(db_path)
    database = DB(db_path)
    for name, datum in data.items():
        database.save(datum, name)
    database.close()
    #Creating the database for extracting the final representations. It just needs to have the targets nothing else.

#    n = len(framebelong)
#    print 'n', n
#    data = {'target':np.zeros((n*n_neg, feature_len)), 'negative':np.zeros((n*n_neg, feature_len))}
#    total_number_of_neighbors = neighbors_num  * (2*neighbor_frames_num+1)
#    for i in range(total_number_of_neighbors):
#        data['neighbor{0}'.format(i)] = np.zeros((n*n_neg, feature_len))
#    superpixel_idx = 0
#    for f in xrange(1,frames_num-1):
#        for i in xrange(numberofsuperpixelsperframe[f]):
#            try:
#                data['target'][superpixel_idx*n_neg:(superpixel_idx + 1)*n_neg][...] = features[f][i][...]
#            except:
#                print superpixel_idx, f, i
#                raise
#            superpixel_idx +=1
#    database_path = db_settings['database_path']
#    db_path = database_path.format(action_name=(action+'_test'))
#    print 'test db path', db_path
#    database = DB(db_path)
#    for name, datum in data.iteritems():
#        database.save(datum, name)
#    database.close()
#
    write_db_list(db_settings, logger)

    logger.log('Creating database Done!')
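
# The neighbor search above is built on scipy's cKDTree. A tiny self-contained
# illustration of the query pattern used in createVSB100 (the points and k are
# made-up values):
import numpy as np
from scipy.spatial import cKDTree

points = np.random.rand(100, 2)            # e.g. superpixel centers of one frame
tree = cKDTree(points)
dists, idx = tree.query(points[0], k=4)    # nearest neighbors, including the query point itself
neighbor_idx = idx[1:]                     # drop the query point, as done above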
Example #9
def slice_templates(params, to_remove=[], to_merge=[], extension=''):

    import shutil, h5py
    file_out_suff = params.get('data', 'file_out_suff')

    data_file = params.data_file
    N_e = params.getint('data', 'N_e')
    N_total = params.nb_channels
    N_t = params.getint('detection', 'N_t')
    template_shift = params.getint('detection', 'template_shift')

    if comm.rank == 0:
        print_and_log(['Node 0 is slicing templates'], 'debug', logger)
        old_templates = load_data(params, 'templates')
        old_limits = load_data(params, 'limits')
        x, N_tm = old_templates.shape
        norm_templates = load_data(params, 'norm-templates')

        if to_merge != []:
            for count in range(len(to_merge)):
                remove = to_merge[count][1]
                to_remove += [remove]

        all_templates = set(numpy.arange(N_tm // 2))
        to_keep = numpy.array(list(all_templates.difference(to_remove)))

        positions = numpy.arange(len(to_keep))

        local_keep = to_keep[positions]
        templates = scipy.sparse.lil_matrix((N_e * N_t, 2 * len(to_keep)),
                                            dtype=numpy.float32)
        hfile = h5py.File(file_out_suff + '.templates-new.hdf5',
                          'w',
                          libver='latest')
        norms = hfile.create_dataset('norms',
                                     shape=(2 * len(to_keep), ),
                                     dtype=numpy.float32,
                                     chunks=True)
        limits = hfile.create_dataset('limits',
                                      shape=(len(to_keep), 2),
                                      dtype=numpy.float32,
                                      chunks=True)
        for count, keep in zip(positions, local_keep):

            templates[:, count] = old_templates[:, keep]
            templates[:,
                      count + len(to_keep)] = old_templates[:,
                                                            keep + N_tm // 2]
            norms[count] = norm_templates[keep]
            norms[count + len(to_keep)] = norm_templates[keep + N_tm // 2]
            if to_merge == []:
                new_limits = old_limits[keep]
            else:
                subset = numpy.where(to_merge[:, 0] == keep)[0]
                if len(subset) > 0:
                    idx = numpy.unique(to_merge[subset].flatten())
                    ratios = norm_templates[idx] / norm_templates[keep]
                    new_limits = [
                        numpy.min(ratios * old_limits[idx][:, 0]),
                        numpy.max(ratios * old_limits[idx][:, 1])
                    ]
                else:
                    new_limits = old_limits[keep]
            limits[count] = new_limits

        templates = templates.tocoo()
        hfile.create_dataset('temp_x', data=templates.row)
        hfile.create_dataset('temp_y', data=templates.col)
        hfile.create_dataset('temp_data', data=templates.data)
        hfile.create_dataset('temp_shape',
                             data=numpy.array([N_e, N_t, 2 * len(to_keep)],
                                              dtype=numpy.int32))
        hfile.close()

        if os.path.exists(file_out_suff + '.templates%s.hdf5' % extension):
            os.remove(file_out_suff + '.templates%s.hdf5' % extension)
        shutil.move(file_out_suff + '.templates-new.hdf5',
                    file_out_suff + '.templates%s.hdf5' % extension)

    comm.Barrier()
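
# Sketch of how the templates written by slice_templates can be read back into
# a sparse matrix. The dataset names mirror exactly what is written above; the
# file name is illustrative:
import h5py
import scipy.sparse

with h5py.File('mydata.templates.hdf5', 'r') as hfile:
    N_e, N_t, nb_templates = hfile['temp_shape'][:]
    templates = scipy.sparse.csc_matrix(
        (hfile['temp_data'][:], (hfile['temp_x'][:], hfile['temp_y'][:])),
        shape=(N_e * N_t, nb_templates))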
Example #10
# positionArray = [[0, 0]*T]*len(theta_1)
# for i in range(0, len(theta_1)):
#     positionArray[i] = ForwardModel(timeArray, [theta_1[i], theta_2[i]], state0)
#     print(i)
# positionArray = np.array(positionArray)
# xData = np.hstack(positionArray[:,:,0])
# yData = np.hstack(positionArray[:,:,1])
# hdf5file2 = h5py.File('predictiveValues.h5', 'a')
# WriteData(hdf5file2, 'data/predictiveX', xData)
# WriteData(hdf5file2, 'data/predictiveY', yData)
# fig = MakeFigure(450, 1)
# ax = plt.gca()
# #ax.set_title('Iceberg Predictive Model', fontsize = 12)
# ax.set_xlabel('Latitude (deg)', fontsize = 30)
# ax.set_ylabel('Longitude (deg)', fontsize = 30)
# hist = ax.hist2d(xData, yData, normed=True, bins = (500,500), cmap = plt.cm.viridis)
# #plt.colorbar(hist[3], ax=ax)
# plt.show()
filename = 'predictiveValues.h5'
hdf5file = h5py.File(filename, 'r')
timeMat = hdf5file['data/predictiveX'][()]
Data = hdf5file['data/predictiveY'][()]
fig = MakeFigure(450, 1)
ax = plt.gca()
#ax.set_title('Harmonic Predictive Model', fontsize = 30)
ax.set_xlabel('Longitude', fontsize = 30)
ax.set_ylabel('Latitude', fontsize = 30)
hist = ax.hist2d(timeMat, Data, density=True, bins = (1000,1000), cmax= 0.15, cmap = plt.cm.viridis)
#plt.colorbar(hist[3], ax=ax)
plt.show()
Example #11
output_shape_path = "./output_shape/{}".format(cat_id)
output_color_path = "./output_color/{}".format(cat_id)

ids = glob.glob(os.path.join(output_shape_path, "*"))
ids = [os.path.basename(i) for i in ids]

# # #
avg_psnr_rgb = 0.0
avg_psnr_ycc = 0.0
count = 0

for id_ in ids:
    # Load ground truth volume
    gt_path = os.path.join(gt_output_path, id_,
                           "models/model_normalized_{}.h5".format(vol_dim))
    f_gt = h5py.File(gt_path, 'r')
    data_gt = f_gt['data'][:]
    indices_gt = np.where(data_gt[:, :, :, 0] > -0.5)

    # Load views
    views_path = os.path.join(gt_output_path, id_, "views/*.png")
    views_paths = glob.glob(views_path)
    views_paths.sort()

    # prediction
    pred_color_path_id = os.path.join(output_color_path, id_)
    pred_colors_paths = glob.glob(os.path.join(pred_color_path_id, "*.h5"))
    pred_colors_paths.sort()

    pred_shape_path_id = os.path.join(output_shape_path, id_)
    pred_shapes_paths = glob.glob(os.path.join(pred_shape_path_id, "*.h5"))
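
# The evaluation loop is truncated here. A minimal sketch of the PSNR measure it
# is presumably accumulating into avg_psnr_rgb / avg_psnr_ycc (illustrative only,
# assuming color values in [0, 1]):
def psnr(pred, gt, peak=1.0):
    mse = np.mean((pred.astype(np.float64) - gt.astype(np.float64)) ** 2)
    return 10.0 * np.log10(peak ** 2 / mse)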
Example #12
def main():

    # Parse command-line arguments.
    args = parser.parse_args()

    # Validate arguments
    if not args.paths:
        log.error("No ROOT files were specified.")
        return

    if args.max_processes > 20:
        log.error(
            "The requested number of processes ({}) is excessive. Exiting.".
            format(args.max_processes))
        return

    if args.stop is not None:
        args.stop = int(args.stop)
        pass

    if not args.outdir.endswith('/'):
        args.outdir += '/'
        pass

    if args.shuffle:
        raise NotImplementedError()

    args.paths = sorted(args.paths)

    for path in args.paths:

        # Base candidate selection
        selection = None  # "(p_truth_eta > -1.5 && p_truth_eta < 1.5)"

        # Read numpy array from file.
        f = ROOT.TFile(path, 'READ')
        tree = f.Get('tree')

        # Split indices into batches
        N = min(1000000, tree.GetEntries())  # @TEMP
        index_edges = list(map(
            int, np.linspace(0, N, args.max_processes + 1, endpoint=True)))
        index_ranges = list(zip(index_edges[:-1], index_edges[1:]))

        # Start conversion process(es)
        pool = multiprocessing.Pool(processes=args.max_processes)
        results = pool.map(converter, [(path, start, stop, selection)
                                       for (start, stop) in index_ranges])

        # Concatenate data
        data = np.concatenate(results)
        print(data.shape)

        # Save as gzipped HDF5
        mkdir(args.outdir)
        filename = 'cells_{}.h5'.format(args.tag)
        log.debug("  Saving to {}".format(args.outdir + filename))
        with h5py.File(args.outdir + filename, 'w') as hf:
            hf.create_dataset('egamma',
                              data=data,
                              chunks=(min(1024, data.shape[0]), ),
                              compression='gzip')
            pass
        call(['gzip', '-f', args.outdir + filename])
        pass

    return
Example #13
def load_stdata(fname):
    f = h5py.File(fname, 'r')
    data = f['data'][()]
    timestamps = f['date'][()]
    f.close()
    return data, timestamps
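
# Example usage (the file name is hypothetical; any HDF5 file containing 'data'
# and 'date' datasets works):
data, timestamps = load_stdata('BJ_flow.h5')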
Example #14
ext = os.path.splitext(demFile)[1]

if ext == '.hgt':
    amp, dem, demRsc = readfile.read_float32(demFile)
elif ext == '.dem':
    dem, demRsc = readfile.read_dem(demFile)

try:
    outName = sys.argv[2]
except IndexError:
    outName = 'dem.h5'

h5 = h5py.File(outName, 'w')
group = h5.create_group('dem')

dset = group.create_dataset('dem', data=dem, compression='gzip')

for key, value in demRsc.items():
    group.attrs[key] = value

group.attrs['ref_y'] = 0
group.attrs['ref_x'] = 0
h5.close()
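
# Sketch of reading the file back; the layout mirrors exactly what was written above.
with h5py.File(outName, 'r') as h5in:
    dem_read = h5in['dem/dem'][:]
    dem_attrs = dict(h5in['dem'].attrs)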


Example #15
def get_group_mass_fractions(file, group):

    # Read the HDF5 file and return the 'Mass Fractions' dataset of the requested
    # group. Note that the dataset stays backed by the open file handle.

    h5file = h5py.File(file, 'r')
    return h5file['/' + group + '/Mass Fractions']
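
# Example usage (file and group names are hypothetical). The dataset returned
# above is backed by the still-open file, so copy it into memory with [:] if it
# is needed independently of the file handle:
mass_fractions = get_group_mass_fractions('plt_00000.h5', 'State_0001')[:]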
Example #16
def slice_clusters(params,
                   result,
                   to_remove=[],
                   to_merge=[],
                   extension='',
                   light=False):

    import h5py, shutil
    file_out_suff = params.get('data', 'file_out_suff')
    data_file = params.data_file
    N_e = params.getint('data', 'N_e')
    N_total = params.nb_channels
    N_t = params.getint('detection', 'N_t')
    template_shift = params.getint('detection', 'template_shift')

    if comm.rank == 0:

        print_and_log(['Node 0 is slicing clusters'], 'debug', logger)

        if to_merge != []:
            for count in range(len(to_merge)):
                remove = to_merge[count][1]
                to_remove += [remove]

        all_elements = [[] for i in range(N_e)]
        for target in numpy.unique(to_remove):
            elec = result['electrodes'][target]
            nic = target - numpy.where(result['electrodes'] == elec)[0][0]
            mask = result['clusters_' + str(elec)] > -1
            tmp = numpy.unique(result['clusters_' + str(elec)][mask])
            all_elements[elec] += list(
                numpy.where(result['clusters_' + str(elec)] == tmp[nic])[0])

        for elec in range(N_e):
            if not light:
                result['data_' + str(elec)] = numpy.delete(result['data_' +
                                                                  str(elec)],
                                                           all_elements[elec],
                                                           axis=0)
                result['clusters_' + str(elec)] = numpy.delete(
                    result['clusters_' + str(elec)], all_elements[elec])
                result['times_' + str(elec)] = numpy.delete(
                    result['times_' + str(elec)], all_elements[elec])
                result['peaks_' + str(elec)] = numpy.delete(
                    result['peaks_' + str(elec)], all_elements[elec])
            else:

                result['clusters_' + str(elec)] = numpy.delete(
                    result['clusters_' + str(elec)], all_elements[elec])
                myfile = h5py.File(file_out_suff + '.clusters.hdf5',
                                   'r',
                                   libver='latest')
                data = myfile.get('data_' + str(elec))[:]
                result['data_' + str(elec)] = numpy.delete(data,
                                                           all_elements[elec],
                                                           axis=0)
                data = myfile.get('times_' + str(elec))[:]
                result['times_' + str(elec)] = numpy.delete(
                    data, all_elements[elec])
                data = myfile.get('peaks_' + str(elec))[:]
                result['peaks_' + str(elec)] = numpy.delete(
                    data, all_elements[elec])
                myfile.close()

        result['electrodes'] = numpy.delete(result['electrodes'],
                                            numpy.unique(to_remove))

        cfile = h5py.File(file_out_suff + '.clusters-new.hdf5',
                          'w',
                          libver='latest')
        to_write = ['data_', 'clusters_', 'times_', 'peaks_']
        for ielec in range(N_e):
            write_datasets(cfile, to_write, result, ielec)

        write_datasets(cfile, ['electrodes'], result)
        cfile.close()
        if os.path.exists(file_out_suff + '.clusters%s.hdf5' % extension):
            os.remove(file_out_suff + '.clusters%s.hdf5' % extension)
        shutil.move(file_out_suff + '.clusters-new.hdf5',
                    file_out_suff + '.clusters%s.hdf5' % extension)

    comm.Barrier()
Example #17
    n_a = 1
    n_b = 1
    n_k = 1

    #    var_w = 4.0  # need only to define a reasonable integration interval
    #    var_e = 1.0

    #    std_w = np.sqrt(var_w)
    #    std_e = np.sqrt(var_e)
    model_name_load = 'NLS_noise'  # start from NLS fit
    model_name_save = 'ML_noise'  # Refine with ML fit
    dataset_name = 'train_noise'

    # In[Load data]
    filename = os.path.join('data', 'dataset.h5')
    h5_data = h5py.File(filename, 'r')
    u = np.array(h5_data[dataset_name]['u'])
    y = np.array(h5_data[dataset_name]['y'])
    y0 = np.array(h5_data[dataset_name]['y0'])

    # Train on a single example
    u = u[0:1, ...]
    y = y[0:1, ...]

    batch_size = u.shape[0]
    seq_len = u.shape[1]
    n_u = u.shape[2]
    n_y = y.shape[2]

    # In[To tensors]
    u_torch = torch.tensor(u, dtype=torch.float32)
def main(args):
    dset = h5py.File(args.filename, 'r')
    if not dset:
        print("Not a valid dataset: %s" % (args.filename))
        return

    dsetNames = dset.keys()
    print("File %s contains %d groups:" % (args.filename, len(dset.keys())))
    print(" ", "\n  ".join(dsetNames))

    if not args.in_group:
        if len(dset.keys()) > 1:
            print("Input group not specified -- selecting most recent")
        args.in_group = list(dset.keys())[-1]

    if not args.out_folder:
        args.out_folder = re.sub('.h5$', '', args.filename)
        print("Output folder not specified -- using %s" % args.out_folder)

    if args.in_group not in dset:
        print("Could not find group %s" % (args.in_group))
        return

    if not os.path.exists(args.out_folder):
        os.makedirs(args.out_folder)

    group = dset.get(args.in_group)
    print("Reading data from group '%s' in file '%s'" % (args.in_group, args.filename))

    # mrdImg data is stored as:
    #   /group/config              text of recon config parameters (optional)
    #   /group/xml                 text of ISMRMRD flexible data header (optional)
    #   /group/image_0/data        array of IsmrmrdImage data
    #   /group/image_0/header      array of ImageHeader
    #   /group/image_0/attributes  text of mrdImg MetaAttributes

    isImage = True
    imageNames = group.keys()
    print("Found %d mrdImg sub-groups: %s" % (len(imageNames), ", ".join(imageNames)))

    for imageName in imageNames:
        if ((imageName == 'xml') or (imageName == 'config') or (imageName == 'config_file')):
            continue

        mrdImg = group[imageName]
        if not (('data' in mrdImg) and ('header' in mrdImg) and ('attributes' in mrdImg)):
            isImage = False

    dset.close()

    if (isImage is False):
        print("File does not contain properly formatted MRD raw or mrdImg data")
        return

    dset = ismrmrd.Dataset(args.filename, args.in_group, False)

    groups = dset.list()

    if ('xml' in groups):
        xml_header = dset.read_xml_header()
        xml_header = xml_header.decode("utf-8")
        mrdHead = ismrmrd.xsd.CreateFromDocument(xml_header)

    for group in groups:
        if ( (group == 'config') or (group == 'config_file') or (group == 'xml') ):
            continue

        print("Reading images from '/" + args.in_group + "/" + group + "'")

        for imgNum in range(0, dset.number_of_images(group)):
            mrdImg = dset.read_image(group, imgNum)
            meta = ismrmrd.Meta.deserialize(mrdImg.attribute_string)

            if ((mrdImg.data.shape[0] == 3) and (mrdImg.getHead().image_type == 6)):
                # RGB images
                print("RGB data not yet supported")
                continue
            else:
                if (mrdImg.data.shape[1] != 1):
                    print("Multi-slice data not yet supported")
                    continue

                if (mrdImg.data.shape[0] != 1):
                    print("Multi-channel data not yet supported")
                    continue

                # Use previously JSON serialized header as a starting point, if available
                if meta.get('DicomJson') is not None:
                    dicomDset = pydicom.dataset.Dataset.from_json(base64.b64decode(meta['DicomJson']))
                else:
                    dicomDset = pydicom.dataset.Dataset()

                # Enforce explicit little endian for written DICOM files
                dicomDset.file_meta                            = pydicom.dataset.FileMetaDataset()
                dicomDset.file_meta.TransferSyntaxUID          = pydicom.uid.ExplicitVRLittleEndian
                dicomDset.file_meta.MediaStorageSOPClassUID    = pynetdicom.sop_class.MRImageStorage
                dicomDset.file_meta.MediaStorageSOPInstanceUID = pydicom.uid.generate_uid()
                pydicom.dataset.validate_file_meta(dicomDset.file_meta)
                # FileMetaInformationGroupLength is still missing?
                dicomDset.is_little_endian                     = True
                dicomDset.is_implicit_VR                       = False

                # ----- Update DICOM header from MRD header -----
                try:
                    if mrdHead.measurementInformation is None:
                        pass
                        # print("  MRD header does not contain measurementInformation section")
                    else:
                        # print("---------- Old -------------------------")
                        # print("SeriesInstanceUID  : %s" % dicomDset.SeriesInstanceUID   )
                        # print("PatientPosition    : %s" % dicomDset.PatientPosition     )
                        # print("SeriesDescription  : %s" % dicomDset.SeriesDescription   )
                        # print("FrameOfReferenceUID: %s" % dicomDset.FrameOfReferenceUID )

                        if mrdHead.measurementInformation.measurementID       is not None: dicomDset.SeriesInstanceUID   = mrdHead.measurementInformation.measurementID
                        if mrdHead.measurementInformation.patientPosition     is not None: dicomDset.PatientPosition     = mrdHead.measurementInformation.patientPosition.name
                        if mrdHead.measurementInformation.protocolName        is not None: dicomDset.SeriesDescription   = mrdHead.measurementInformation.protocolName
                        if mrdHead.measurementInformation.frameOfReferenceUID is not None: dicomDset.FrameOfReferenceUID = mrdHead.measurementInformation.frameOfReferenceUID

                        # print("---------- New -------------------------")
                        # print("SeriesInstanceUID  : %s" % dicomDset.SeriesInstanceUID   )
                        # print("PatientPosition    : %s" % dicomDset.PatientPosition     )
                        # print("SeriesDescription  : %s" % dicomDset.SeriesDescription   )
                        # print("FrameOfReferenceUID: %s" % dicomDset.FrameOfReferenceUID )
                except:
                    print("Error setting header information from MRD header's measurementInformation section")

                try:
                    # print("---------- Old -------------------------")
                    # print("mrdHead.acquisitionSystemInformation.systemVendor         : %s" % mrdHead.acquisitionSystemInformation.systemVendor          )
                    # print("mrdHead.acquisitionSystemInformation.systemModel          : %s" % mrdHead.acquisitionSystemInformation.systemModel           )
                    # print("mrdHead.acquisitionSystemInformation.systemFieldStrength_T: %s" % mrdHead.acquisitionSystemInformation.systemFieldStrength_T )
                    # print("mrdHead.acquisitionSystemInformation.institutionName      : %s" % mrdHead.acquisitionSystemInformation.institutionName       )
                    # print("mrdHead.acquisitionSystemInformation.stationName          : %s" % mrdHead.acquisitionSystemInformation.stationName           )

                    if mrdHead.acquisitionSystemInformation.systemVendor          is not None: dicomDset.Manufacturer          = mrdHead.acquisitionSystemInformation.systemVendor         
                    if mrdHead.acquisitionSystemInformation.systemModel           is not None: dicomDset.ManufacturerModelName = mrdHead.acquisitionSystemInformation.systemModel          
                    if mrdHead.acquisitionSystemInformation.systemFieldStrength_T is not None: dicomDset.MagneticFieldStrength = mrdHead.acquisitionSystemInformation.systemFieldStrength_T
                    if mrdHead.acquisitionSystemInformation.institutionName       is not None: dicomDset.InstitutionName       = mrdHead.acquisitionSystemInformation.institutionName      
                    if mrdHead.acquisitionSystemInformation.stationName           is not None: dicomDset.StationName           = mrdHead.acquisitionSystemInformation.stationName

                    # print("---------- New -------------------------")
                    # print("mrdHead.acquisitionSystemInformation.systemVendor         : %s" % mrdHead.acquisitionSystemInformation.systemVendor          )
                    # print("mrdHead.acquisitionSystemInformation.systemModel          : %s" % mrdHead.acquisitionSystemInformation.systemModel           )
                    # print("mrdHead.acquisitionSystemInformation.systemFieldStrength_T: %s" % mrdHead.acquisitionSystemInformation.systemFieldStrength_T )
                    # print("mrdHead.acquisitionSystemInformation.institutionName      : %s" % mrdHead.acquisitionSystemInformation.institutionName       )
                    # print("mrdHead.acquisitionSystemInformation.stationName          : %s" % mrdHead.acquisitionSystemInformation.stationName           )
                except:
                    print("Error setting header information from MRD header's acquisitionSystemInformation section")

                # Set mrdImg pixel data from MRD mrdImg
                dicomDset.PixelData = np.squeeze(mrdImg.data).tobytes() # mrdImg.data is [cha z y x] -- squeeze to [y x] for [row col]
                dicomDset.Rows      = mrdImg.data.shape[2]
                dicomDset.Columns   = mrdImg.data.shape[3]

                if (mrdImg.data.dtype == 'uint16') or (mrdImg.data.dtype == 'int16'):
                    dicomDset.BitsAllocated = 16
                    dicomDset.BitsStored    = 16
                    dicomDset.HighBit       = 15
                elif (mrdImg.data.dtype == 'uint32') or (mrdImg.data.dtype == 'int') or (mrdImg.data.dtype == 'float32'):
                    dicomDset.BitsAllocated = 32
                    dicomDset.BitsStored    = 32
                    dicomDset.HighBit       = 31
                elif (mrdImg.data.dtype == 'float64'):
                    dicomDset.BitsAllocated = 64
                    dicomDset.BitsStored    = 64
                    dicomDset.HighBit       = 63
                else:
                    print("Unsupported data type: ", mrdImg.data.dtype)

                dicomDset.SeriesNumber               = mrdImg.image_series_index
                dicomDset.InstanceNumber             = mrdImg.image_index

                # ----- Set some mandatory default values -----
                if not 'SamplesPerPixel' in dicomDset:
                    dicomDset.SamplesPerPixel = 1

                if not 'PhotometricInterpretation' in dicomDset:
                    dicomDset.PhotometricInterpretation = 'MONOCHROME2'

                if not 'PixelRepresentation' in dicomDset:
                    dicomDset.PixelRepresentation = 0  # Unsigned integer

                if not 'ImageType' in dicomDset:
                    dicomDset.ImageType = ['ORIGINAL', 'PRIMARY', 'M']

                if not 'SeriesNumber' in dicomDset:
                    dicomDset.SeriesNumber = 1

                if not 'SeriesDescription' in dicomDset:
                    dicomDset.SeriesDescription = ''

                if not 'InstanceNumber' in dicomDset:
                    dicomDset.InstanceNumber = 1

                # ----- Update DICOM header from MRD ImageHeader -----
                dicomDset.ImageType[2]               = imtype_map[mrdImg.image_type]
                dicomDset.PixelSpacing               = [float(mrdImg.field_of_view[0]) / mrdImg.data.shape[2], float(mrdImg.field_of_view[1]) / mrdImg.data.shape[3]]
                dicomDset.SliceThickness             = mrdImg.field_of_view[2]
                dicomDset.ImagePositionPatient       = [mrdImg.position[0], mrdImg.position[1], mrdImg.position[2]]
                dicomDset.ImageOrientationPatient    = [mrdImg.read_dir[0], mrdImg.read_dir[1], mrdImg.read_dir[2], mrdImg.phase_dir[0], mrdImg.phase_dir[1], mrdImg.phase_dir[2]]

                time_sec = mrdImg.acquisition_time_stamp/1000/2.5
                hour = int(np.floor(time_sec/3600))
                min  = int(np.floor((time_sec - hour*3600)/60))
                sec  = time_sec - hour*3600 - min*60
                dicomDset.AcquisitionTime            = "%02.0f%02.0f%02.6f" % (hour, min, sec)
                dicomDset.TriggerTime                = mrdImg.physiology_time_stamp[0] / 2.5

                # ----- Update DICOM header from MRD Image MetaAttributes -----
                if meta.get('SeriesDescription') is not None:
                    dicomDset.SeriesDescription = meta['SeriesDescription']

                if meta.get('SeriesDescriptionAdditional') is not None:
                    dicomDset.SeriesDescription = dicomDset.SeriesDescription + meta['SeriesDescriptionAdditional']

                if meta.get('ImageComment') is not None:
                    dicomDset.ImageComment = "_".join(meta['ImageComment'])

                if meta.get('ImageType') is not None:
                    dicomDset.ImageType = meta['ImageType']

                if (meta.get('ImageRowDir') is not None) and (meta.get('ImageColumnDir') is not None):
                    dicomDset.ImageOrientationPatient = [float(meta['ImageRowDir'][0]), float(meta['ImageRowDir'][1]), float(meta['ImageRowDir'][2]), float(meta['ImageColumnDir'][0]), float(meta['ImageColumnDir'][1]), float(meta['ImageColumnDir'][2])]

                if meta.get('RescaleIntercept') is not None:
                    dicomDset.RescaleIntercept = meta['RescaleIntercept']

                if meta.get('RescaleSlope') is not None:
                    dicomDset.RescaleSlope = meta['RescaleSlope']

                if meta.get('WindowCenter') is not None:
                    dicomDset.WindowCenter = meta['WindowCenter']

                if meta.get('WindowWidth') is not None:
                    dicomDset.WindowWidth = meta['WindowWidth']

                if meta.get('EchoTime') is not None:
                    dicomDset.EchoTime = meta['EchoTime']

                if meta.get('InversionTime') is not None:
                    dicomDset.InversionTime = meta['InversionTime']

                # Unhandled fields:
                # LUTFileName
                # ROI

                # Write DICOM files
                fileName = "%02.0f_%s_%03.0f.dcm" % (dicomDset.SeriesNumber, dicomDset.SeriesDescription, dicomDset.InstanceNumber)
                print("  Writing file %s" % fileName)
                dicomDset.save_as(os.path.join(args.out_folder, fileName))
    return
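
# A minimal sketch of the argument parser this main() expects; the argument
# names are inferred from the attribute accesses above and the defaults are
# illustrative:
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Convert MRD images in an HDF5 file to DICOM files')
    parser.add_argument('filename', help='input MRD .h5 file')
    parser.add_argument('-g', '--in-group', dest='in_group', help='group within the HDF5 file (default: most recent)')
    parser.add_argument('-o', '--out-folder', dest='out_folder', help='output folder for DICOM files')
    main(parser.parse_args())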
Example #19
def HDF5_ATL06_tide_write(IS2_atl06_tide, IS2_atl06_attrs, INPUT=None,
    FILENAME='', FILL_VALUE=None, DIMENSIONS=None, CLOBBER=False):
    #-- setting HDF5 clobber attribute
    if CLOBBER:
        clobber = 'w'
    else:
        clobber = 'w-'

    #-- open output HDF5 file
    fileID = h5py.File(os.path.expanduser(FILENAME), clobber)

    #-- create HDF5 records
    h5 = {}

    #-- number of GPS seconds between the GPS epoch (1980-01-06T00:00:00Z UTC)
    #-- and ATLAS Standard Data Product (SDP) epoch (2018-01-01T00:00:00Z UTC)
    h5['ancillary_data'] = {}
    for k,v in IS2_atl06_tide['ancillary_data'].items():
        #-- Defining the HDF5 dataset variables
        val = 'ancillary_data/{0}'.format(k)
        h5['ancillary_data'][k] = fileID.create_dataset(val, np.shape(v), data=v,
            dtype=v.dtype, compression='gzip')
        #-- add HDF5 variable attributes
        for att_name,att_val in IS2_atl06_attrs['ancillary_data'][k].items():
            h5['ancillary_data'][k].attrs[att_name] = att_val

    #-- write each output beam
    beams = [k for k in IS2_atl06_tide.keys() if bool(re.match(r'gt\d[lr]',k))]
    for gtx in beams:
        fileID.create_group(gtx)
        #-- add HDF5 group attributes for beam
        for att_name in ['Description','atlas_pce','atlas_beam_type',
            'groundtrack_id','atmosphere_profile','atlas_spot_number',
            'sc_orientation']:
            fileID[gtx].attrs[att_name] = IS2_atl06_attrs[gtx][att_name]
        #-- create land_ice_segments group
        fileID[gtx].create_group('land_ice_segments')
        h5[gtx] = dict(land_ice_segments={})
        for att_name in ['Description','data_rate']:
            att_val = IS2_atl06_attrs[gtx]['land_ice_segments'][att_name]
            fileID[gtx]['land_ice_segments'].attrs[att_name] = att_val

        #-- delta_time, geolocation and segment_id variables
        for k in ['delta_time','latitude','longitude','segment_id']:
            #-- values and attributes
            v = IS2_atl06_tide[gtx]['land_ice_segments'][k]
            attrs = IS2_atl06_attrs[gtx]['land_ice_segments'][k]
            fillvalue = FILL_VALUE[gtx]['land_ice_segments'][k]
            #-- Defining the HDF5 dataset variables
            val = '{0}/{1}/{2}'.format(gtx,'land_ice_segments',k)
            if fillvalue:
                h5[gtx]['land_ice_segments'][k] = fileID.create_dataset(val,
                    np.shape(v), data=v, dtype=v.dtype, fillvalue=fillvalue,
                    compression='gzip')
            else:
                h5[gtx]['land_ice_segments'][k] = fileID.create_dataset(val,
                    np.shape(v), data=v, dtype=v.dtype, compression='gzip')
            #-- create or attach dimensions for HDF5 variable
            if DIMENSIONS[gtx]['land_ice_segments'][k]:
                #-- attach dimensions
                for i,dim in enumerate(DIMENSIONS[gtx]['land_ice_segments'][k]):
                    h5[gtx]['land_ice_segments'][k].dims[i].attach_scale(
                        h5[gtx]['land_ice_segments'][dim])
            else:
                #-- make dimension
                h5[gtx]['land_ice_segments'][k].make_scale(k)
            #-- add HDF5 variable attributes
            for att_name,att_val in attrs.items():
                h5[gtx]['land_ice_segments'][k].attrs[att_name] = att_val

        #-- add to geophysical corrections
        key = 'geophysical'
        fileID[gtx]['land_ice_segments'].create_group(key)
        h5[gtx]['land_ice_segments'][key] = {}
        for att_name in ['Description','data_rate']:
            att_val=IS2_atl06_attrs[gtx]['land_ice_segments'][key][att_name]
            fileID[gtx]['land_ice_segments'][key].attrs[att_name] = att_val
        for k,v in IS2_atl06_tide[gtx]['land_ice_segments'][key].items():
            #-- attributes
            attrs = IS2_atl06_attrs[gtx]['land_ice_segments'][key][k]
            fillvalue = FILL_VALUE[gtx]['land_ice_segments'][key][k]
            #-- Defining the HDF5 dataset variables
            val = '{0}/{1}/{2}/{3}'.format(gtx,'land_ice_segments',key,k)
            if fillvalue:
                h5[gtx]['land_ice_segments'][key][k] = \
                    fileID.create_dataset(val, np.shape(v), data=v,
                    dtype=v.dtype, fillvalue=fillvalue, compression='gzip')
            else:
                h5[gtx]['land_ice_segments'][key][k] = \
                    fileID.create_dataset(val, np.shape(v), data=v,
                    dtype=v.dtype, compression='gzip')
            #-- attach dimensions
            for i,dim in enumerate(DIMENSIONS[gtx]['land_ice_segments'][key][k]):
                h5[gtx]['land_ice_segments'][key][k].dims[i].attach_scale(
                    h5[gtx]['land_ice_segments'][dim])
            #-- add HDF5 variable attributes
            for att_name,att_val in attrs.items():
                h5[gtx]['land_ice_segments'][key][k].attrs[att_name] = att_val

    #-- HDF5 file title
    fileID.attrs['featureType'] = 'trajectory'
    fileID.attrs['title'] = 'ATLAS/ICESat-2 L3A Land Ice Height'
    fileID.attrs['summary'] = ('Estimates of the ice-sheet tidal parameters '
        'needed to interpret and assess the quality of land height estimates.')
    fileID.attrs['description'] = ('Land ice parameters for each beam.  All '
        'parameters are calculated for the same along-track increments for '
        'each beam and repeat.')
    date_created = datetime.datetime.today()
    fileID.attrs['date_created'] = date_created.isoformat()
    project = 'ICESat-2 > Ice, Cloud, and land Elevation Satellite-2'
    fileID.attrs['project'] = project
    platform = 'ICESat-2 > Ice, Cloud, and land Elevation Satellite-2'
    fileID.attrs['platform'] = platform
    #-- add attribute for elevation instrument and designated processing level
    instrument = 'ATLAS > Advanced Topographic Laser Altimeter System'
    fileID.attrs['instrument'] = instrument
    fileID.attrs['source'] = 'Spacecraft'
    fileID.attrs['references'] = 'https://nsidc.org/data/icesat-2'
    fileID.attrs['processing_level'] = '4'
    #-- add attributes for input ATL06 file
    fileID.attrs['input_files'] = os.path.basename(INPUT)
    #-- find geospatial and temporal ranges
    lnmn,lnmx,ltmn,ltmx,tmn,tmx = (np.inf,-np.inf,np.inf,-np.inf,np.inf,-np.inf)
    for gtx in beams:
        lon = IS2_atl06_tide[gtx]['land_ice_segments']['longitude']
        lat = IS2_atl06_tide[gtx]['land_ice_segments']['latitude']
        delta_time = IS2_atl06_tide[gtx]['land_ice_segments']['delta_time']
        #-- setting the geospatial and temporal ranges
        lnmn = lon.min() if (lon.min() < lnmn) else lnmn
        lnmx = lon.max() if (lon.max() > lnmx) else lnmx
        ltmn = lat.min() if (lat.min() < ltmn) else ltmn
        ltmx = lat.max() if (lat.max() > ltmx) else ltmx
        tmn = delta_time.min() if (delta_time.min() < tmn) else tmn
        tmx = delta_time.max() if (delta_time.max() > tmx) else tmx
    #-- add geospatial and temporal attributes
    fileID.attrs['geospatial_lat_min'] = ltmn
    fileID.attrs['geospatial_lat_max'] = ltmx
    fileID.attrs['geospatial_lon_min'] = lnmn
    fileID.attrs['geospatial_lon_max'] = lnmx
    fileID.attrs['geospatial_lat_units'] = "degrees_north"
    fileID.attrs['geospatial_lon_units'] = "degrees_east"
    fileID.attrs['geospatial_ellipsoid'] = "WGS84"
    fileID.attrs['date_type'] = 'UTC'
    fileID.attrs['time_type'] = 'CCSDS UTC-A'
    #-- convert start and end time from ATLAS SDP seconds into GPS seconds
    atlas_sdp_gps_epoch=IS2_atl06_tide['ancillary_data']['atlas_sdp_gps_epoch']
    gps_seconds = atlas_sdp_gps_epoch + np.array([tmn,tmx])
    #-- calculate leap seconds
    leaps = pyTMD.time.count_leap_seconds(gps_seconds)
    #-- convert from seconds since 1980-01-06T00:00:00 to Julian days
    time_julian = 2400000.5 + pyTMD.time.convert_delta_time(gps_seconds - leaps,
        epoch1=(1980,1,6,0,0,0), epoch2=(1858,11,17,0,0,0), scale=1.0/86400.0)
    #-- convert to calendar date
    YY,MM,DD,HH,MN,SS = pyTMD.time.convert_julian(time_julian,FORMAT='tuple')
    #-- add attributes with measurement date start, end and duration
    tcs = datetime.datetime(int(YY[0]), int(MM[0]), int(DD[0]),
        int(HH[0]), int(MN[0]), int(SS[0]), int(1e6*(SS[0] % 1)))
    fileID.attrs['time_coverage_start'] = tcs.isoformat()
    tce = datetime.datetime(int(YY[1]), int(MM[1]), int(DD[1]),
        int(HH[1]), int(MN[1]), int(SS[1]), int(1e6*(SS[1] % 1)))
    fileID.attrs['time_coverage_end'] = tce.isoformat()
    fileID.attrs['time_coverage_duration'] = '{0:0.0f}'.format(tmx-tmn)
    #-- Closing the HDF5 file
    fileID.close()
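For reference, the dimension-scale bookkeeping used above (make_scale/attach_scale) is easy to miss inside the nested loops. A minimal, self-contained sketch of the same h5py pattern, with made-up dataset names and a throwaway file, might look like this:

import numpy as np
import h5py

# Hypothetical example file; the names below are illustrative, not from the ATL06 writer.
with h5py.File('scales_demo.h5', 'w') as f:
    delta_time = np.linspace(0.0, 10.0, 100)
    height = np.random.randn(100)

    # Coordinate variable: create it, then register it as a dimension scale.
    d = f.create_dataset('gt1l/land_ice_segments/delta_time', data=delta_time,
                         compression='gzip')
    d.make_scale('delta_time')

    # Data variable: create it, then attach the scale to its first dimension,
    # mirroring the DIMENSIONS handling in HDF5_ATL06_tide_write above.
    h = f.create_dataset('gt1l/land_ice_segments/h_li', data=height,
                         fillvalue=np.nan, compression='gzip')
    h.dims[0].attach_scale(d)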
Beispiel #20
0
def batchGen(filenames, batch_size=16, maxlen=None, classification=True):
    """
    Generator function for batches of documents and labels
    from a list of HDF5 files
    Args:
        filenames: list of HDF5 filenames
        batch_size: size of each batch to yield from generator
        maxlen: maximum length of each example document
    Yields:
        padded_docs, padded_labels: A tuple of padded documents and
                                    corresponding labels.
    """
    while True:
        for fname in filenames:
            with h5py.File(fname, 'r') as hf:
                # Get a list of all examples in the file
                groups = [item[1] for item in hf.items()]

                # Get lists of all sentences and all labels
                docs = [grp['sents'][()] for grp in groups]
                docs = [doc.tolist() for doc in docs]
                labels = np.array([grp['labels'][()] for grp in groups])

                # Only get examples longer than 0 and less than maxlen
                if maxlen:
                    docs = [x for x in docs if len(x) < maxlen and len(x) > 0]
                    labels = [
                        x for x in labels if len(x) < maxlen and len(x) > 0
                    ]

                # Only get examples longer than 0
                else:
                    docs = [x for x in docs if len(x) > 0]
                    labels = [x for x in labels if len(x) > 0]

                # Shuffle documents and labels
                docs, labels = shuffle(docs, labels)

                n = len(docs)
                assert n == len(
                    labels)  # Ensure docs and labels are same length
                num_batches = np.floor(n / batch_size).astype(np.int16)

                for idx in range(num_batches):
                    # Get each batch of documents and labels
                    batch_docs = docs[idx * batch_size:(idx + 1) * batch_size]
                    batch_labels = labels[idx * batch_size:(idx + 1) *
                                          batch_size]

                    # Pad docs and labels to the length of the longest sample in the batch
                    padded_docs = pad_sequences(batch_docs,
                                                dtype=object,
                                                value=' ',
                                                maxlen=maxlen,
                                                padding='pre',
                                                truncating='post')

                    # Pad the labels as well; one-hot encode them for classification
                    if classification:
                        padded_labels = pad_sequences(batch_labels,
                                                      dtype=int,
                                                      value=2,
                                                      maxlen=maxlen,
                                                      padding='pre',
                                                      truncating='post')
                        padded_labels = to_categorical(padded_labels,
                                                       num_classes=3,
                                                       dtype='int32')
                    else:
                        padded_labels = pad_sequences(batch_labels,
                                                      dtype=int,
                                                      value=0,
                                                      maxlen=maxlen,
                                                      padding='pre',
                                                      truncating='post')

                        padded_labels = np.expand_dims(padded_labels, axis=-1)

                    yield (padded_docs, padded_labels)
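A hypothetical driver for batchGen (file names, shapes and the Keras call are placeholders; the generator expects HDF5 files with one group per example containing 'sents' and 'labels' datasets, and relies on pad_sequences, to_categorical and shuffle being imported):

# Hypothetical usage; 'train_00.h5' and 'train_01.h5' are placeholder file names.
train_files = ['train_00.h5', 'train_01.h5']
train_gen = batchGen(train_files, batch_size=32, maxlen=200, classification=True)

# Pull one batch to inspect shapes; with classification=True the labels are
# one-hot encoded over 3 classes.
padded_docs, padded_labels = next(train_gen)
print(padded_docs.shape, padded_labels.shape)

# The generator would typically be handed to Keras, e.g.
# model.fit(train_gen, steps_per_epoch=100, epochs=5)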
Beispiel #21
0
def genMPO_SiSjRpt(nsite, ig, jg, fname, xts, ifQt, debug=False):
    if debug: print('\n[mpo_dmrg_propsMPORpt.genMPO_SiSjRpt] fname=', fname)
    t0 = time.time()
    fop = h5py.File(fname, 'w')
    npt = len(xts)
    nop = 3*npt
    fop['nop'] = nop
    for isite in range(nsite):
        ti = time.time()
        gname = 'site'+str(isite)
        grp = fop.create_group(gname)
        # 0.5*(Si+*Sj- + Si-*Sj+) + Szi*Szj
        if not ifQt:
            # (a) 0.5*Si+*Sj-
            if isite == 0:
                cop = mpo_dmrg_spinopers.genLocal2Spatial(nsite, isite, ig, jg, 'Sp', 'Sm', 0.5)
            else:
                cop = mpo_dmrg_spinopers.genLocal2Spatial(nsite, isite, ig, jg, 'Sp', 'Sm', 1.0)
            for ipt in range(npt):
                rop = mpo_dmrg_opers.genExpISyPhi(xts[ipt])
                wop = mpo_dmrg_opers.prodTwoOpers(cop, rop)
                grp['op'+str(0*npt+ipt)] = wop
            # (b) 0.5*Si-*Sj+
            if isite == 0:
                cop = mpo_dmrg_spinopers.genLocal2Spatial(nsite, isite, ig, jg, 'Sm', 'Sp', 0.5)
            else:
                cop = mpo_dmrg_spinopers.genLocal2Spatial(nsite, isite, ig, jg, 'Sm', 'Sp', 1.0)
            for ipt in range(npt):
                rop = mpo_dmrg_opers.genExpISyPhi(xts[ipt])
                wop = mpo_dmrg_opers.prodTwoOpers(cop, rop)
                grp['op'+str(1*npt+ipt)] = wop
            # (c) Szi*Szj
            cop = mpo_dmrg_spinopers.genLocal2Spatial(nsite, isite, ig, jg, 'Sz', 'Sz', 1.0)
            for ipt in range(npt):
                rop = mpo_dmrg_opers.genExpISyPhi(xts[ipt])
                wop = mpo_dmrg_opers.prodTwoOpers(cop, rop)
                grp['op'+str(2*npt+ipt)] = wop
        else:
            # Si+*Sj-
            for ipt in range(npt):
                if isite == 0:
                    cop = qtensor_spinopers.genLocal2RSpatialQt(nsite, isite, ig, jg, 'Sp', 'Sm', 0.5, xts[ipt])
                else:
                    cop = qtensor_spinopers.genLocal2RSpatialQt(nsite, isite, ig, jg, 'Sp', 'Sm', 1.0, xts[ipt])
                cop.dump(grp, 'op'+str(0*npt+ipt))
            # Si-*Sj+
            for ipt in range(npt):
                if isite == 0:
                    cop = qtensor_spinopers.genLocal2RSpatialQt(nsite, isite, ig, jg, 'Sm', 'Sp', 0.5, xts[ipt])
                else:
                    cop = qtensor_spinopers.genLocal2RSpatialQt(nsite, isite, ig, jg, 'Sm', 'Sp', 1.0, xts[ipt])
                cop.dump(grp, 'op'+str(1*npt+ipt))
            # Siz*Sjz
            for ipt in range(npt):
                cop = qtensor_spinopers.genLocal2RSpatialQt(nsite, isite, ig, jg, 'Sz', 'Sz', 1.0, xts[ipt])
                cop.dump(grp, 'op'+str(2*npt+ipt))
        tf = time.time()
        if debug: print(' isite =', isite, ' time = %.2f s' % (tf-ti))
    t1 = time.time()
    if debug: print(' time for genMPO_SiSjRpt = %.2f s' % (t1-t0))
    return fop
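The file written by genMPO_SiSjRpt has a flat layout: a scalar 'nop' plus one 'site{i}' group per site holding datasets 'op0' ... 'op{nop-1}' (in the non-Qt branch). A hypothetical read-back sketch, with a placeholder file name:

import h5py

# Inspect a file produced by genMPO_SiSjRpt (non-Qt branch); 'sisj.h5' is a placeholder.
with h5py.File('sisj.h5', 'r') as fop:
    nop = fop['nop'][()]
    print('operators per site:', nop)
    site0 = fop['site0']
    for key in sorted(site0.keys()):
        print(key, site0[key].shape)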
Beispiel #22
0
import os, h5py
from time import time
import numpy as np
from scipy import sparse
from sklearn.utils import extmath


"""Import data:
"""

def importData(sub):
    f = h5py.File(('/scr/litauen1/%s.hcp.lh.mat' % sub),'r')
    data = np.array(f.get('connData'))
    cortex = np.array(f.get('cortex')) - 1
    return data, cortex

data, cortex = importData('')

print("Computing the principal singular vectors using randomized_svd")
t0 = time()
U, s, V = extmath.randomized_svd(data, 5, n_iter=3)
print("done in %0.3fs" % (time() - t0))

def centrality_scores(X, alpha=0.85, max_iter=100, tol=1e-10):
    """Power iteration computation of the principal eigenvector

    This method is also known as Google PageRank and the implementation
    is based on the one from the NetworkX project (BSD licensed too)
    with copyrights by:

      Aric Hagberg <*****@*****.**>
Beispiel #23
0
# Calculates the average of the orientation for atom i with other atom at distance j,
# over all particles.

# Load libraries
import h5py
import matplotlib as mpl
mpl.use('agg')
import matplotlib.pyplot as plt
import numpy as np
import math as ma

figx = 4.
figy = 4.

# k2
in_file = h5py.File("k2/averaged_data.h5","r")
mean_1_sep_0_k2 = in_file["mean_1_sep_0"][:,:]
mean_2_sep_0_k2 = in_file["mean_2_sep_0"][:,:]
mean_1_sep_1_k2 = in_file["mean_1_sep_1"][:,:]
mean_2_sep_1_k2 = in_file["mean_2_sep_1"][:,:]
mean_1_sep_3_k2 = in_file["mean_1_sep_3"][:,:]
mean_2_sep_3_k2 = in_file["mean_2_sep_3"][:,:]
mean_1_sep_6_k2 = in_file["mean_1_sep_6"][:,:]
mean_2_sep_6_k2 = in_file["mean_2_sep_6"][:,:]
# k3
in_file = h5py.File("k3/averaged_data.h5","r")
mean_1_sep_0_k3 = in_file["mean_1_sep_0"][:,:]
mean_2_sep_0_k3 = in_file["mean_2_sep_0"][:,:]
mean_1_sep_1_k3 = in_file["mean_1_sep_1"][:,:]
mean_2_sep_1_k3 = in_file["mean_2_sep_1"][:,:]
mean_1_sep_3_k3 = in_file["mean_1_sep_3"][:,:]
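The plotting part of this example is cut off above. A hypothetical continuation, reusing plt, figx, figy and one of the loaded arrays, might render a single panel like this:

# Hypothetical continuation of the truncated script above.
fig, ax = plt.subplots(figsize=(figx, figy))
im = ax.imshow(mean_1_sep_0_k2, origin='lower', aspect='auto')
fig.colorbar(im, ax=ax)
ax.set_title('mean_1_sep_0 (k2)')
fig.savefig('mean_1_sep_0_k2.png', dpi=200)
plt.close(fig)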
Beispiel #24
0
    order = OutOfOrder(botnet)
    target = TargetInfo(order)
    target_left = BorderTargetInfo(order, direction='left')
    target_right = BorderTargetInfo(order, direction='right')

    move_side = MoveSidewards(botnet, target_left, target_right)

if __name__ == '__main__':

    plot_data_file = '../plot/plot_data/out_of_order_robot1.h5'
    import os
    import h5py
    if os.path.isfile(plot_data_file):
        # data file exists, so simulation ran before
        # no need for running simulation again, so just get data for plotting
        with h5py.File(plot_data_file, 'r') as hf:
            print('List of arrays in this file: \n', hf.keys())
            trange = np.array(hf.get('trange'))
            p_x = np.array(hf.get('p_x'))
            p_forget = np.array(hf.get('p_forget'))
            p_diff = np.array(hf.get('p_diff'))
            p_evidence = np.array(hf.get('p_evidence'))
            p_neg_min = np.array(hf.get('p_neg_min'))
            p_evidence_left = np.array(hf.get('p_evidence_left'))
            p_odd = np.array(hf.get('p_odd'))
            p_evidence_right = np.array(hf.get('p_evidence_right'))
    else:
        # data file does not exist, so run the simulation
        sim = nengo.Simulator(model)
        sim.run(3)
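The write side of this example is not shown. Presumably the else branch ends by saving the probe data under the same dataset names that the read branch loads; a self-contained sketch of that write pattern, with dummy arrays standing in for the simulator output, might be:

import numpy as np
import h5py

# Dummy arrays standing in for sim.data; the dataset names are the ones
# actually read back by the branch above.
results = {name: np.zeros(3000) for name in
           ['trange', 'p_x', 'p_forget', 'p_diff', 'p_evidence',
            'p_neg_min', 'p_evidence_left', 'p_odd', 'p_evidence_right']}

with h5py.File(plot_data_file, 'w') as hf:
    for name, arr in results.items():
        hf.create_dataset(name, data=arr)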
Beispiel #25
0
            images_name = [tmp.strip() for tmp in f.readlines()]

        with open(os.path.join(scene_path, "calibration.txt")) as f:
            calib_name = [tmp.strip() for tmp in f.readlines()]

        r_list = list()
        t_list = list()
        geoms = list()
        resized_shapes = list()
        org_imsizes = list()
        K_list = list()

        # Read image infos
        for im, calib in zip(images_name, calib_name):

            calib_h5 = h5py.File(os.path.join(scene_path, calib))      
            r_list.append(np.array(calib_h5["R"]))
            t_list.append(np.array(calib_h5["T"]).T)
            geoms.append(calib_h5)
            org_imsizes.append(np.array(calib_h5['imsize'][0]).tolist())
            
            K_list.append(np.array(calib_h5['K']))


            resized_shapes.append(getResizedSize(minSize, Image.open(os.path.join(scene_path, im)).size, strideNet))
            
            
        #for i, (idA, idB) in tqdm(enumerate(pairs_ids)):
        for i, (idA, idB) in enumerate(pairs_ids):
            
            if i % 50 == 49 : 
def extractPatch4OneSubject(matFA, matSeg, matMask, fileID, d, step, rate):
    eps = 5e-2
    [row, col, leng] = matFA.shape
    cubicCnt = 0
    estNum = 40000
    trainFA = np.zeros([estNum, 1, dFA[0], dFA[1], dFA[2]], dtype=np.float16)
    trainSeg = np.zeros([estNum, 1, dSeg[0], dSeg[1], dSeg[2]],
                        dtype=np.float16)

    print('trainFA shape, ', trainFA.shape)
    # to padding for input
    margin1 = int((dFA[0] - dSeg[0]) / 2)
    margin2 = int((dFA[1] - dSeg[1]) / 2)
    margin3 = int((dFA[2] - dSeg[2]) / 2)
    two_margin1 = dFA[0] - dSeg[0]
    two_margin2 = dFA[1] - dSeg[1]
    two_margin3 = dFA[2] - dSeg[2]
    cubicCnt = 0
    marginD = [margin1, margin2, margin3]
    print('matFA shape is ', matFA.shape)
    matFAOut = np.zeros(
        [row + two_margin1, col + two_margin2, leng + two_margin3],
        dtype=np.float16)
    print('matFAOut shape is ', matFAOut.shape)
    matFAOut[marginD[0]:row + marginD[0], marginD[1]:col + marginD[1],
             marginD[2]:leng + marginD[2]] = matFA

    matSegOut = np.zeros(
        [row + two_margin1, col + two_margin2, leng + two_margin3],
        dtype=np.float16)
    matSegOut[marginD[0]:row + marginD[0], marginD[1]:col + marginD[1],
              marginD[2]:leng + marginD[2]] = matSeg

    matMaskOut = np.zeros(
        [row + two_margin1, col + two_margin2, leng + two_margin3],
        dtype=np.float16)
    matMaskOut[marginD[0]:row + marginD[0], marginD[1]:col + marginD[1],
               marginD[2]:leng + marginD[2]] = matMask

    # for mageFA, enlarge it by padding
    if margin1 != 0:
        matFAOut[0:marginD[0], marginD[1]:col + marginD[1], marginD[2]:leng +
                 marginD[2]] = matFA[marginD[0] -
                                     1::-1, :, :]  # reverse 0:marginD[0]
        matFAOut[row + marginD[0]:matFAOut.shape[0],
                 marginD[1]:col + marginD[1],
                 marginD[2]:leng + marginD[2]] = matFA[
                     matFA.shape[0] - 1:row - marginD[0] -
                     1:-1, :, :]  # we'd better flip it along the 1st dimension
    if margin2 != 0:
        matFAOut[
            marginD[0]:row + marginD[0], 0:marginD[1], marginD[2]:leng +
            marginD[2]] = matFA[:, marginD[1] - 1::
                                -1, :]  # we'd flip it along the 2nd dimension
        matFAOut[
            marginD[0]:row + marginD[0], col + marginD[1]:matFAOut.shape[1],
            marginD[2]:leng +
            marginD[2]] = matFA[:, matFA.shape[1] - 1:col - marginD[1] - 1:
                                -1, :]  # we'd flip it along the 2nd dimension
    if margin3 != 0:
        matFAOut[marginD[0]:row + marginD[0], marginD[1]:col + marginD[1],
                 0:marginD[2]] = matFA[:, :, marginD[
                     2] - 1::-1]  # we'd better flip it along the 3rd dimension
        matFAOut[marginD[0]:row + marginD[0], marginD[1]:col + marginD[1],
                 marginD[2] +
                 leng:matFAOut.shape[2]] = matFA[:, :, matFA.shape[2] -
                                                 1:leng - marginD[2] - 1:-1]
        # for matseg, enlarge it by padding
    if margin1 != 0:
        matSegOut[0:marginD[0], marginD[1]:col + marginD[1], marginD[2]:leng +
                  marginD[2]] = matSeg[marginD[0] -
                                       1::-1, :, :]  # reverse 0:marginD[0]
        matSegOut[row + marginD[0]:matSegOut.shape[0],
                  marginD[1]:col + marginD[1],
                  marginD[2]:leng + marginD[2]] = matSeg[
                      matSeg.shape[0] - 1:row - marginD[0] - 1:
                      -1, :, :]  # we'd better flip it along the 1st dimension
    if margin2 != 0:
        matSegOut[
            marginD[0]:row + marginD[0], 0:marginD[1], marginD[2]:leng +
            marginD[2]] = matSeg[:, marginD[1] - 1::
                                 -1, :]  # we'd flip it along the 2nd dimension
        matSegOut[
            marginD[0]:row + marginD[0], col + marginD[1]:matSegOut.shape[1],
            marginD[2]:leng +
            marginD[2]] = matSeg[:, matSeg.shape[1] - 1:col - marginD[1] - 1:
                                 -1, :]  # we'd flip it along the 2nd dimension
    if margin3 != 0:
        matSegOut[
            marginD[0]:row + marginD[0], marginD[1]:col + marginD[1],
            0:marginD[2]] = matSeg[:, :, marginD[
                2] - 1::-1]  # we'd better flip it along the 3rd dimension
        matSegOut[marginD[0]:row + marginD[0], marginD[1]:col + marginD[1],
                  marginD[2] +
                  leng:matSegOut.shape[2]] = matSeg[:, :, matSeg.shape[2] -
                                                    1:leng - marginD[2] - 1:-1]

    # for matseg, enlarge it by padding
    if margin1 != 0:
        matMaskOut[0:marginD[0], marginD[1]:col + marginD[1], marginD[2]:leng +
                   marginD[2]] = matMask[marginD[0] -
                                         1::-1, :, :]  # reverse 0:marginD[0]
        matMaskOut[row + marginD[0]:matMaskOut.shape[0],
                   marginD[1]:col + marginD[1],
                   marginD[2]:leng + marginD[2]] = matMask[
                       matMask.shape[0] - 1:row - marginD[0] - 1:
                       -1, :, :]  # we'd better flip it along the 1st dimension
    if margin2 != 0:
        matMaskOut[marginD[0]:row + marginD[0], 0:marginD[1],
                   marginD[2]:leng + marginD[2]] = matMask[:, marginD[
                       1] - 1::-1, :]  # we'd flip it along the 2nd dimension
        matMaskOut[marginD[0]:row + marginD[0],
                   col + marginD[1]:matMaskOut.shape[1], marginD[2]:leng +
                   marginD[2]] = matMask[:, matMask.shape[1] - 1:col - marginD[
                       1] - 1:-1, :]  # we'd flip it along the 2nd dimension
    if margin3 != 0:
        matMaskOut[
            marginD[0]:row + marginD[0], marginD[1]:col + marginD[1],
            0:marginD[2]] = matMask[:, :, marginD[
                2] - 1::-1]  # we'd better flip it along the 3rd dimension
        matMaskOut[marginD[0]:row + marginD[0], marginD[1]:col + marginD[1],
                   marginD[2] +
                   leng:matMaskOut.shape[2]] = matMask[:, :, matMask.shape[2] -
                                                       1:leng - marginD[2] -
                                                       1:-1]

    dsfactor = rate

    for i in range(1):
        for j in range(0, col - dSeg[1], step[1]):
            for k in range(0, leng - dSeg[2], step[2]):
                volMask = matMaskOut[i:i + dSeg[0], j:j + dSeg[1],
                                     k:k + dSeg[2]]
                if np.sum(volMask) < eps:
                    continue
                # index at scale 1
                volSeg = matSeg[i:i + dSeg[0], j:j + dSeg[1], k:k + dSeg[2]]
                volFA = matFAOut[i:i + dFA[0], j:j + dFA[1], k:k + dFA[2]]

                trainFA[cubicCnt, 0, :, :, :] = volFA  # 32*32*32
                trainSeg[cubicCnt, 0, :, :, :] = volSeg  # 24*24*24

                # increment after writing so index 0 is used and the final
                # slice [0:cubicCnt] keeps every extracted patch
                cubicCnt = cubicCnt + 1

    trainFA = trainFA[0:cubicCnt, :, :, :, :]
    trainSeg = trainSeg[0:cubicCnt, :, :, :, :]

    save_folder = ''
    if opt.split in ["train", "dev", "test"]:
        save_folder = os.path.join(opt.save_folder, opt.split)
    else:
        print("Specify correct split type!")
        raise FileNotFoundError
    out_path = os.path.join(save_folder, 'train_%s.h5' % fileID)
    with h5py.File(out_path, 'w') as f:
        f['noisy'] = trainFA
        f['clear'] = trainSeg

    with open('./train_list.txt', 'a') as f:
        f.write(out_path + '\n')
    return cubicCnt
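extractPatch4OneSubject reads the module-level globals dFA, dSeg and opt, which are not part of this excerpt. A hypothetical driver with made-up sizes, paths and volumes:

import os
import numpy as np

# Stand-ins for the globals the function expects (not shown in the excerpt).
dFA = [32, 32, 32]    # padded input patch size
dSeg = [24, 24, 24]   # label patch size

class _Opt:           # stand-in for the script's argparse namespace
    split = 'train'
    save_folder = './patches'
opt = _Opt()
os.makedirs(os.path.join(opt.save_folder, opt.split), exist_ok=True)

# Placeholder volumes; in practice these come from the subject's image files.
shape = (144, 192, 160)
matFA = np.random.rand(*shape).astype(np.float16)
matSeg = (np.random.rand(*shape) > 0.5).astype(np.float16)
matMask = np.ones(shape, dtype=np.float16)

n_patches = extractPatch4OneSubject(matFA, matSeg, matMask, fileID='sub01',
                                    d=dSeg, step=[1, 8, 8], rate=1)
print('extracted patches:', n_patches)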
Beispiel #27
0
def create_scool(
    cool_uri,
    bins,
    cell_name_pixels_dict,
    columns=None,
    dtypes=None,
    metadata=None,
    assembly=None,
    ordered=False,
    symmetric_upper=True,
    mode="w",
    mergebuf=int(20e6),
    delete_temp=True,
    temp_dir=None,
    max_merge=200,
    boundscheck=True,
    dupcheck=True,
    triucheck=True,
    ensure_sorted=False,
    h5opts=None,
    lock=None,
    **kwargs):
    r"""
    Create a single-cell (scool) file.

    For each cell store a cooler matrix under **/cells**, where all matrices
    have the same dimensions.

    Each cell is a regular cooler data collection, so the input must be a
    bin table and pixel table for each cell. The pixel tables are provided as
    a dictionary where the key is a unique cell name. The bin tables can be
    provided as a dict with the same keys or a single common bin table can be
    given.

    .. versionadded:: 0.8.9

    Parameters
    ----------
    cool_uri : str
        Path to scool file or URI string. If the file does not exist,
        it will be created.
    bins : :class:`pandas.DataFrame` or Dict[str, DataFrame]
        A single bin table or dictionary of cell names to bins tables. A bin
        table is a dataframe with columns ``chrom``, ``start`` and ``end``.
        May contain additional columns.
    cell_name_pixels_dict : Dict[str, DataFrame]
        Cell name as key and pixel table DataFrame as value.
        A table, given as a dataframe or a column-oriented dict, containing
        columns labeled ``bin1_id``, ``bin2_id`` and ``count``, sorted by
        (``bin1_id``, ``bin2_id``). If additional columns are included in the
        pixel table, their names and dtypes must be specified using the
        ``columns`` and ``dtypes`` arguments. For larger input data, an
        **iterable** can be provided that yields the pixel data as a sequence
        of chunks. If the input is a dask DataFrame, it will also be processed
        one chunk at a time.
    {other_parameters}

    See also
    --------
    cooler.create_cooler
    cooler.zoomify_cooler

    {notes}

    """
    file_path, group_path = parse_cooler_uri(cool_uri)
    h5opts = _set_h5opts(h5opts)

    if isinstance(bins, pd.DataFrame):
        bins_dict = {cell_name: bins for cell_name in cell_name_pixels_dict}
        cell_names = sorted(cell_name_pixels_dict)
    else:
        # Assume bins is a dict of cell name -> dataframe
        bins_dict = bins
        if len(bins_dict) == 0:
            raise ValueError("At least one bin must be given.")
        else:
            bins = bins_dict[next(iter(bins_dict))][["chrom", "start", "end"]]

        # Sort bins_dict and cell_name_pixels_dict to guarantee matching keys
        bins_keys = sorted(bins_dict)
        cell_names = sorted(cell_name_pixels_dict)
        for key_bins, key_pixels in zip(bins_keys, cell_names):
            if key_bins != key_pixels:
                raise ValueError('Bins and pixel dicts do not have matching keys')

    dtypes = _get_dtypes_arg(dtypes, kwargs)

    for col in ["chrom", "start", "end"]:
        if col not in bins.columns:
            raise ValueError("Missing column from bin table: '{}'.".format(col))

    # Populate dtypes for expected pixel columns, and apply user overrides.
    if dtypes is None:
        dtypes = dict(PIXEL_DTYPES)
    else:
        dtypes_ = dict(dtypes)
        dtypes = dict(PIXEL_DTYPES)
        dtypes.update(dtypes_)

    # Determine the appropriate iterable
    try:
        from dask.dataframe import DataFrame as dask_df
    except (ImportError, AttributeError):  # pragma: no cover
        dask_df = ()

    # Prepare chroms and bins
    bins = bins.copy()
    bins["chrom"] = bins["chrom"].astype(object)
    chromsizes = get_chromsizes(bins)
    try:
        chromsizes = six.iteritems(chromsizes)
    except AttributeError:
        pass
    chromnames, lengths = zip(*chromsizes)
    chroms = pd.DataFrame(
        {"name": chromnames, "length": lengths}, columns=["name", "length"]
    )
    binsize = get_binsize(bins)
    n_chroms = len(chroms)
    n_bins = len(bins)

    # Create root group
    with h5py.File(file_path, mode) as f:
        logger.info('Creating cooler at "{}::{}"'.format(file_path, group_path))
        if group_path == "/":
            for name in ["chroms", "bins"]:
                if name in f:
                    del f[name]
        else:
            try:
                f.create_group(group_path)
            except ValueError:
                del f[group_path]
                f.create_group(group_path)

    with h5py.File(file_path, "r+") as f:
        h5 = f[group_path]

        logger.info("Writing chroms")
        grp = h5.create_group("chroms")
        write_chroms(grp, chroms, h5opts)

        logger.info("Writing bins")
        grp = h5.create_group("bins")
        write_bins(grp, bins, chroms["name"], h5opts)

    with h5py.File(file_path, "r+") as f:
        h5 = f[group_path]

        logger.info("Writing info")
        info = {}
        info["bin-type"] = u"fixed" if binsize is not None else u"variable"
        info["bin-size"] = binsize if binsize is not None else u"null"
        info["nchroms"] = n_chroms
        info["ncells"] = len(cell_name_pixels_dict)
        info["nbins"] = n_bins
        if assembly is not None:
            info["genome-assembly"] = assembly
        if metadata is not None:
            info["metadata"] = metadata
        write_info(h5, info, True)

    # Append single cells
    for key in cell_names:
        if '/' in key:
            cell_name = key.split('/')[-1]
        else:
            cell_name = key

        create(
            cool_uri + '::/cells/' + cell_name,
            bins_dict[key],
            cell_name_pixels_dict[key],
            columns=columns,
            dtypes=dtypes,
            metadata=metadata,
            assembly=assembly,
            ordered=ordered,
            symmetric_upper=symmetric_upper,
            mode='a',
            boundscheck=boundscheck,
            dupcheck=dupcheck,
            triucheck=triucheck,
            ensure_sorted=ensure_sorted,
            h5opts=h5opts,
            lock=lock,
            mergebuf=mergebuf,
            delete_temp=delete_temp,
            temp_dir=temp_dir,
            max_merge=max_merge,
            append_scool=True,
            scool_root_uri=cool_uri
        )
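A minimal usage sketch for this function via the public cooler API, with a toy bin table and two synthetic cells (file name and values are made up):

import pandas as pd
import cooler  # assumes the cooler package is installed

# Toy bin table shared by all cells: three 1 kb bins on one chromosome.
bins = pd.DataFrame({
    'chrom': ['chr1', 'chr1', 'chr1'],
    'start': [0, 1000, 2000],
    'end':   [1000, 2000, 3000],
})

# One upper-triangular pixel table per cell, keyed by cell name.
cell_pixels = {
    'cell_A': pd.DataFrame({'bin1_id': [0, 0], 'bin2_id': [0, 2], 'count': [4, 1]}),
    'cell_B': pd.DataFrame({'bin1_id': [1], 'bin2_id': [2], 'count': [7]}),
}

cooler.create_scool('toy.scool', bins, cell_pixels)

# Each cell is then addressable as a regular cooler URI.
clr = cooler.Cooler('toy.scool::/cells/cell_A')
print(clr.matrix(balance=False)[:])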
Beispiel #28
0
# encode the target labels
targetNames = np.unique(labels)
le = LabelEncoder()
target = le.fit_transform(labels)
print("[STATUS] training labels encoded...")

# normalize the feature vector in the range (0-1)
scaler = MinMaxScaler(feature_range=(0, 1))
rescaled_features = scaler.fit_transform(global_features)
print("[STATUS] feature vector normalized...")

print("[STATUS] target labels: {}".format(target))
print("[STATUS] target labels shape: {}".format(target.shape))

# save the feature vector using HDF5
h5f_data = h5py.File('output/data.h5', 'w')
h5f_data.create_dataset('dataset_1', data=np.array(rescaled_features))

h5f_label = h5py.File('output/labels.h5', 'w')
h5f_label.create_dataset('dataset_1', data=np.array(target))

h5f_data.close()
h5f_label.close()

print("[STATUS] end of training..")

# import the necessary packages
import h5py
import numpy as np
import os
import glob
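For completeness, the saved features and labels can be read back with the mirror-image pattern (same file paths as above):

import h5py
import numpy as np

# Read the arrays written above back into memory.
with h5py.File('output/data.h5', 'r') as h5f_data, \
     h5py.File('output/labels.h5', 'r') as h5f_label:
    global_features = np.array(h5f_data['dataset_1'])
    global_labels = np.array(h5f_label['dataset_1'])

print(global_features.shape, global_labels.shape)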
Beispiel #29
0
def append(cool_uri, table, data, chunked=False, force=False, h5opts=None, lock=None):  # pragma: no cover
    """
    Append one or more data columns to an existing table.

    Parameters
    ----------
    cool_uri : str
        Path to Cooler file or URI to Cooler group.
    table : str
        Name of table (HDF5 group).
    data : dict-like
        DataFrame, Series or mapping of column names to data. If the input is a
        dask DataFrame or Series, the data is written in chunks.
    chunked : bool, optional
        If True, the values of the data dict are treated as separate chunk
        iterators of column data.
    force : bool, optional
        If True, replace existing columns with the same name as the input.
    h5opts : dict, optional
        HDF5 dataset filter options to use (compression, shuffling,
        checksumming, etc.). Default is to use autochunking and GZIP
        compression, level 6.
    lock : multiprocessing.Lock, optional
        Optional lock to synchronize concurrent HDF5 file access.

    """
    h5opts = _set_h5opts(h5opts)

    file_path, group_path = parse_cooler_uri(cool_uri)

    try:
        from dask.dataframe import DataFrame as dask_df, Series as dask_series
    except (ImportError, AttributeError):
        dask_df = ()
        dask_series = ()

    if isinstance(data, dask_series):
        data = data.to_frame()

    try:
        names = data.keys()
    except AttributeError:
        names = data.columns

    with h5py.File(file_path, "r+") as f:
        h5 = f[group_path]
        for name in names:
            if name in h5[table]:
                if not force:
                    raise ValueError(
                        "'{}' column already exists. ".format(name)
                        + "Use --force option to overwrite."
                    )
                else:
                    del h5[table][name]

        if isinstance(data, dask_df):
            # iterate over dataframe chunks
            i = 0
            for chunk in data.to_delayed():
                chunk = chunk.compute()
                try:
                    if lock is not None:
                        lock.acquire()
                    put(h5[table], chunk, lo=i, h5opts=h5opts)
                finally:
                    if lock is not None:
                        lock.release()
                i += len(chunk)
        elif chunked:
            # iterate over chunks from each column
            for key in data.keys():
                i = 0
                for chunk in data[key]:
                    try:
                        if lock is not None:
                            lock.acquire()
                        put(h5[table], {key: chunk}, lo=i, h5opts=h5opts)
                    finally:
                        if lock is not None:
                            lock.release()
                    i += len(chunk)
        else:
            # write all the data
            try:
                if lock is not None:
                    lock.acquire()
                put(h5[table], data, lo=0, h5opts=h5opts)
            finally:
                if lock is not None:
                    lock.release()
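A hypothetical usage sketch of append as defined above, adding a per-bin column to an existing cooler (the file path and values are placeholders):

import numpy as np
import cooler

# Attach a dummy 'weight' column to the bins table of an existing cooler file.
clr = cooler.Cooler('example.cool')      # placeholder path
weights = np.ones(clr.info['nbins'])     # dummy values, one per bin

append('example.cool', 'bins', {'weight': weights}, force=True)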
Beispiel #30
0
def prepare_data(argv=None):
    '''Aggregate sequence data from GTDB using a file-of-files'''
    from io import BytesIO
    import tempfile
    import h5py

    from datetime import datetime

    from tqdm import tqdm

    from skbio import TreeNode
    from skbio.sequence import DNA, Protein

    from hdmf.common import get_hdf5io
    from hdmf.data_utils import DataChunkIterator

    from ..utils import get_faa_path, get_fna_path, get_genomic_path
    from deep_taxon.sequence.convert import AASeqIterator, DNASeqIterator, DNAVocabIterator, DNAVocabGeneIterator
    from deep_taxon.sequence.dna_table import AATable, DNATable, SequenceTable, TaxaTable, DeepIndexFile, NewickString, CondensedDistanceMatrix, GenomeTable, TreeGraph

    parser = argparse.ArgumentParser()
    parser.add_argument('fadir',
                        type=str,
                        help='directory with NCBI sequence files')
    parser.add_argument('metadata', type=str, help='metadata file from GTDB')
    parser.add_argument('out', type=str, help='output HDF5')
    parser.add_argument(
        '-T',
        '--tree',
        type=str,
        help='a Newick file with a tree of representative taxa',
        default=None)
    parser.add_argument(
        '-A',
        '--accessions',
        type=str,
        default=None,
        help='file of the NCBI accessions of the genomes to convert')
    parser.add_argument(
        '-d',
        '--max_deg',
        type=float,
        default=None,
        help='max number of degenerate characters in protein sequences')
    parser.add_argument('-l',
                        '--min_len',
                        type=float,
                        default=None,
                        help='min length of sequences')
    parser.add_argument('--iter',
                        action='store_true',
                        default=False,
                        help='convert using iterators')
    parser.add_argument(
        '-p',
        '--num_procs',
        type=int,
        default=1,
        help='the number of processes to use for counting total sequence size')
    parser.add_argument('-L',
                        '--total_seq_len',
                        type=int,
                        default=None,
                        help='the total sequence length')
    parser.add_argument('-t',
                        '--tmpdir',
                        type=str,
                        default=None,
                        help='a temporary directory to store sequences')
    parser.add_argument('-N',
                        '--n_seqs',
                        type=int,
                        default=None,
                        help='the total number of sequences')
    rep_grp = parser.add_mutually_exclusive_group()
    rep_grp.add_argument(
        '-n',
        '--nonrep',
        action='store_true',
        default=False,
        help='keep non-representative genomes only. keep both by default')
    rep_grp.add_argument(
        '-r',
        '--rep',
        action='store_true',
        default=False,
        help='keep representative genomes only. keep both by default')
    parser.add_argument(
        '-a',
        '--all',
        action='store_true',
        default=False,
        help=
        'keep all non-representative genomes. By default, only non-reps with the highest and lowest contig count are kept'
    )
    grp = parser.add_mutually_exclusive_group()
    grp.add_argument('-P',
                     '--protein',
                     action='store_true',
                     default=False,
                     help='get paths for protein files')
    grp.add_argument('-C',
                     '--cds',
                     action='store_true',
                     default=False,
                     help='get paths for CDS files')
    grp.add_argument('-G',
                     '--genomic',
                     action='store_true',
                     default=False,
                     help='get paths for genomic files (default)')
    parser.add_argument('-z',
                        '--gzip',
                        action='store_true',
                        default=False,
                        help='GZip sequence table')
    dep_grp = parser.add_argument_group(
        title="Legacy options you probably do not need")
    dep_grp.add_argument('-e',
                         '--emb',
                         type=str,
                         help='embedding file',
                         default=None)

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args(args=argv)

    if args.total_seq_len is not None:
        if args.n_seqs is None:
            sys.stderr.write(
                "If using --total_seq_len, you must also use --n_seqs\n")
    if args.n_seqs is not None:
        if args.total_seq_len is None:
            sys.stderr.write(
                "If using --n_seqs, you must also use --total_seq_len\n")

    if not any([args.protein, args.cds, args.genomic]):
        args.genomic = True

    logging.basicConfig(stream=sys.stderr,
                        level=logging.INFO,
                        format='%(asctime)s - %(message)s')
    logger = logging.getLogger()

    #############################
    # read and filter taxonomies
    #############################
    logger.info('Reading taxonomies from %s' % args.metadata)
    taxlevels = [
        'domain', 'phylum', 'class', 'order', 'family', 'genus', 'species'
    ]
    extra_cols = ['contig_count', 'checkm_completeness']

    def func(row):
        dat = dict(zip(taxlevels, row['gtdb_taxonomy'].split(';')))
        dat['species'] = dat['species']  # .split(' ')[1]
        dat['gtdb_genome_representative'] = row['gtdb_genome_representative'][
            3:]
        dat['accession'] = row['accession'][3:]
        for k in extra_cols:
            dat[k] = row[k]
        return pd.Series(data=dat)

    taxdf = pd.read_csv(args.metadata, header=0, sep='\t')[['accession', 'gtdb_taxonomy', 'gtdb_genome_representative', 'contig_count', 'checkm_completeness']]\
                        .apply(func, axis=1)

    taxdf = taxdf.set_index('accession')
    dflen = len(taxdf)
    logger.info('Found %d total genomes' % dflen)
    taxdf = taxdf[taxdf['gtdb_genome_representative'].str.contains(
        'GC[A,F]_', regex=True)]  # get rid of genomes that are not at NCBI
    taxdf = taxdf[taxdf.index.str.contains(
        'GC[A,F]_', regex=True)]  # get rid of genomes that are not at NCBI
    logger.info('Discarded %d non-NCBI genomes' % (dflen - len(taxdf)))

    rep_taxdf = taxdf[taxdf.index == taxdf['gtdb_genome_representative']]

    if args.accessions is not None:
        logger.info('reading accessions %s' % args.accessions)
        with open(args.accessions, 'r') as f:
            accessions = [l[:-1] for l in f.readlines()]
        dflen = len(taxdf)
        taxdf = taxdf[taxdf.index.isin(accessions)]
        logger.info('Discarded %d genomes not found in %s' %
                    (dflen - len(taxdf), args.accessions))

    dflen = len(taxdf)
    if args.nonrep:
        taxdf = taxdf[taxdf.index != taxdf['gtdb_genome_representative']]
        logger.info('Discarded %d representative genomes' %
                    (dflen - len(taxdf)))
        dflen = len(taxdf)
        if not args.all:
            groups = taxdf[['gtdb_genome_representative', 'contig_count'
                            ]].groupby('gtdb_genome_representative')
            min_ctgs = groups.idxmin()['contig_count']
            max_ctgs = groups.idxmax()['contig_count']
            accessions = np.unique(np.concatenate([min_ctgs, max_ctgs]))
            taxdf = taxdf.filter(accessions, axis=0)
            logger.info('Discarded %d extra non-representative genomes' %
                        (dflen - len(taxdf)))
    elif args.rep:
        taxdf = taxdf[taxdf.index == taxdf['gtdb_genome_representative']]
        logger.info('Discarded %d non-representative genomes' %
                    (dflen - len(taxdf)))

    dflen = len(taxdf)
    logger.info('%d remaining genomes' % dflen)

    ###############################
    # Arguments for constructing the DeepIndexFile object
    ###############################
    di_kwargs = dict()

    taxa_ids = taxdf.index.values

    # get paths to Fasta Files
    fa_path_func = partial(get_genomic_path, directory=args.fadir)
    if args.cds:
        fa_path_func = partial(get_fna_path, directory=args.fadir)
    elif args.protein:
        fa_path_func = partial(get_faa_path, directory=args.fadir)

    map_func = map
    if args.num_procs > 1:
        logger.info(f'using {args.num_procs} processes to locate Fasta files')
        import multiprocessing as mp
        map_func = mp.Pool(processes=args.num_procs).imap

    logger.info('Locating Fasta files for each taxa')
    fapaths = list(tqdm(map_func(fa_path_func, taxa_ids), total=len(taxa_ids)))

    logger.info('Found Fasta files for all accessions')

    #############################
    # read and filter embeddings
    #############################
    emb = None
    if args.emb is not None:
        logger.info('reading embeddings from %s' % args.emb)
        with h5py.File(args.emb, 'r') as f:
            emb = f['embedding'][:]
            emb_taxa = f['leaf_names'][:]
        logger.info('selecting embeddings for taxa found in %s' %
                    args.accessions)
        emb = select_embeddings(taxa_ids, emb_taxa, emb)

    logger.info(f'Writing {len(rep_taxdf)} taxa to taxa table')
    tt_args = [
        'taxa_table', 'a table for storing taxa data', rep_taxdf.index.values
    ]
    tt_kwargs = dict()
    for t in taxlevels[:-1]:
        enc = LabelEncoder().fit(rep_taxdf[t].values)
        _data = enc.transform(rep_taxdf[t].values).astype(np.uint32)
        _vocab = enc.classes_.astype('U')
        logger.info(f'{t} - {len(_vocab)} classes')
        tt_args.append(
            EnumData(name=t,
                     description=f'label encoded {t}',
                     data=_data,
                     elements=_vocab))
    # we have too many species to store this as VocabData, and it would not save any space anyway
    tt_args.append(
        VectorData(name='species',
                   description=f'Microbial species in the form Genus species',
                   data=rep_taxdf['species'].values))

    if emb is not None:
        tt_kwargs['embedding'] = emb
    #tt_kwargs['rep_taxon_id'] = rep_taxdf['gtdb_genome_representative'].values

    taxa_table = TaxaTable(*tt_args, **tt_kwargs)

    h5path = args.out

    logger.info("reading %d Fasta files" % len(fapaths))
    logger.info("Total size: %d", sum(list(map_func(os.path.getsize,
                                                    fapaths))))

    tmp_h5_file = None
    if args.protein:
        vocab_it = AAVocabIterator
        SeqTable = SequenceTable
        skbio_cls = Protein
    else:
        vocab_it = DNAVocabIterator
        SeqTable = DNATable
        skbio_cls = DNA

    vocab = np.array(list(vocab_it.characters()))
    if not args.protein:
        np.testing.assert_array_equal(vocab, list('ACYWSKDVNTGRMHB'))

    if args.total_seq_len is None:
        logger.info('counting total number of sequences')
        n_seqs, total_seq_len = np.array(
            list(zip(
                *tqdm(map_func(seqlen, fapaths), total=len(fapaths))))).sum(
                    axis=1)
        logger.info(f'found {total_seq_len} bases across {n_seqs} sequences')
    else:
        n_seqs, total_seq_len = args.n_seqs, args.total_seq_len
        logger.info(
            f'As specified, there are {total_seq_len} bases across {n_seqs} sequences'
        )

    logger.info(
        f'allocating uint8 array of length {total_seq_len} for sequences')

    if args.tmpdir is not None:
        if not os.path.exists(args.tmpdir):
            os.mkdir(args.tmpdir)
        tmpdir = tempfile.mkdtemp(dir=args.tmpdir)
    else:
        tmpdir = tempfile.mkdtemp()

    comp = 'gzip' if args.gzip else None
    tmp_h5_filename = os.path.join(tmpdir, 'sequences.h5')
    logger.info(f'writing temporary sequence data to {tmp_h5_filename}')
    tmp_h5_file = h5py.File(tmp_h5_filename, 'w')
    sequence = tmp_h5_file.create_dataset('sequences',
                                          shape=(total_seq_len, ),
                                          dtype=np.uint8,
                                          compression=comp)
    seqindex = tmp_h5_file.create_dataset('sequences_index',
                                          shape=(n_seqs, ),
                                          dtype=np.uint64,
                                          compression=comp)
    genomes = tmp_h5_file.create_dataset('genomes',
                                         shape=(n_seqs, ),
                                         dtype=np.uint64,
                                         compression=comp)
    seqlens = tmp_h5_file.create_dataset('seqlens',
                                         shape=(n_seqs, ),
                                         dtype=np.uint64,
                                         compression=comp)
    names = tmp_h5_file.create_dataset('seqnames',
                                       shape=(n_seqs, ),
                                       dtype=h5py.special_dtype(vlen=str),
                                       compression=comp)

    taxa = np.zeros(len(fapaths), dtype=int)

    seq_i = 0
    b = 0
    for genome_i, fa in tqdm(enumerate(fapaths), total=len(fapaths)):
        kwargs = {
            'format': 'fasta',
            'constructor': skbio_cls,
            'validate': False
        }
        taxid = taxa_ids[genome_i]
        rep_taxid = taxdf['gtdb_genome_representative'][genome_i]
        taxa[genome_i] = np.where(rep_taxdf.index == rep_taxid)[0][0]
        for seq in skbio.io.read(fa, **kwargs):
            enc_seq = vocab_it.encode(seq)
            e = b + len(enc_seq)
            sequence[b:e] = enc_seq
            seqindex[seq_i] = e
            genomes[seq_i] = genome_i
            seqlens[seq_i] = len(enc_seq)
            names[seq_i] = vocab_it.get_seqname(seq)
            b = e
            seq_i += 1
    ids = tmp_h5_file.create_dataset('ids', data=np.arange(n_seqs), dtype=int)
    tmp_h5_file.flush()

    io = get_hdf5io(h5path, 'w')

    print([a['name'] for a in GenomeTable.__init__.__docval__['args']])

    genome_table = GenomeTable(
        'genome_table',
        'information about the genome each sequence comes from',
        taxa_ids,
        taxa,
        taxa_table=taxa_table)

    #############################
    # read and trim tree
    #############################
    if args.tree:
        logger.info('Reading tree from %s' % args.tree)
        root = TreeNode.read(args.tree, format='newick')

        logger.info('Found %d tips' % len(list(root.tips())))

        logger.info('Transforming leaf names for shearing')
        for tip in root.tips():
            tip.name = tip.name[3:].replace(' ', '_')

        logger.info('converting tree to Newick string')
        bytes_io = BytesIO()
        root.write(bytes_io, format='newick')
        tree_str = bytes_io.getvalue()
        di_kwargs['tree'] = NewickString('tree', data=tree_str)

        # get distances from tree if they are not provided
        tt_dmat = root.tip_tip_distances().filter(rep_taxdf.index)
        di_kwargs['distances'] = CondensedDistanceMatrix('distances',
                                                         data=tt_dmat.data)

        adj, gt_indices = get_tree_graph(root, rep_taxdf)
        di_kwargs['tree_graph'] = TreeGraph(data=adj,
                                            leaves=gt_indices,
                                            table=genome_table,
                                            name='tree_graph')

    if args.gzip:
        names = io.set_dataio(names, compression='gzip', chunks=True)
        sequence = io.set_dataio(sequence,
                                 compression='gzip',
                                 maxshape=(None, ),
                                 chunks=True)
        seqindex = io.set_dataio(seqindex,
                                 compression='gzip',
                                 maxshape=(None, ),
                                 chunks=True)
        seqlens = io.set_dataio(seqlens,
                                compression='gzip',
                                maxshape=(None, ),
                                chunks=True)
        genomes = io.set_dataio(genomes,
                                compression='gzip',
                                maxshape=(None, ),
                                chunks=True)
        ids = io.set_dataio(ids,
                            compression='gzip',
                            maxshape=(None, ),
                            chunks=True)

    seq_table = SeqTable(
        'seq_table',
        'a table storing sequences for computing sequence embedding',
        names,
        sequence,
        seqindex,
        seqlens,
        genomes,
        genome_table=genome_table,
        id=ids,
        vocab=vocab)

    difile = DeepIndexFile(seq_table, taxa_table, genome_table, **di_kwargs)

    before = datetime.now()
    io.write(difile, exhaust_dci=False, link_data=False)
    io.close()
    after = datetime.now()
    delta = (after - before).total_seconds()

    logger.info(
        f'Sequence totals {sequence.dtype.itemsize * sequence.size} bytes')
    logger.info(f'Took {delta} seconds to write after read')

    if tmp_h5_file is not None:
        tmp_h5_file.close()

    logger.info("reading %s" % (h5path))
    h5size = os.path.getsize(h5path)
    logger.info("HDF5 size: %d", h5size)