Code example #1
File: converter.py Project: reneevdw/GenNet
    def convert_genotypes(self):

        chunk_size = self.split_size
        if chunk_size is None:
            raise ValueError(
                'CONVERTER_SPLIT_SIZE is not defined in the config file!')
        G = np.array([])
        # self.reader.folder.processed=0
        while True:
            with Timer() as t:
                G = self.reader.folder.get_bed(chunk_size)
                if G is None:
                    break

            print(('Time to read {} SNPs is {} s'.format(G.shape[0], t.secs)))

            self.write_data('gen')
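            # Allocate an int8 CArray matching this chunk's shape in the
            # genotype HDF5 file (presumably opened by write_data('gen')),
            # then copy the chunk in and close the file.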
            atom = tables.Int8Atom()
            self.genotype = self.h5_gen_file.create_carray(
                self.h5_gen_file.root,
                'genotype',
                atom, (G.shape),
                title='Genotype',
                filters=self.pytable_filters)
            with Timer() as t:
                self.genotype[:] = G

            print(('Time to write {} SNPs is {} s'.format(G.shape[0], t.secs)))

            self.h5_gen_file.close()
            G = None
            gc.collect()
Code example #2
def contact_maps_from_traj(pdb_file, traj_file, savefile, contact_cutoff=8.0):
    """
    Get contact map from trajectory.
    """

    mda_traj = mda.Universe(pdb_file, traj_file)
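    # Note: comm, rank and size come from the script's module-level MPI setup,
    # and best_loop / loop_range are helper globals defined elsewhere.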
    traj_length = len(mda_traj.trajectory)
    nloops = int(
        brute(best_loop, (loop_range, ), args=(traj_length, size),
              finish=None))
    print("traj_length: %d  nloop: %d" % (traj_length, nloops))
    write_freq = max(nloops // 5, 1)  # avoid modulo-by-zero when nloops < 5
    ca = mda_traj.select_atoms('name CA')
    dist_shape = distances.self_distance_array(ca.positions).shape[0]

    if rank == 0:
        savefile = os.path.abspath(savefile)
        outfile = tables.open_file(savefile, 'w')
        atom = tables.Int8Atom()
        cm_table = outfile.create_earray(outfile.root,
                                         'contact_maps',
                                         atom,
                                         shape=(0, dist_shape))
        print("dist_shape ", dist_shape)
    contact_matrices = []
    # workaround mpi4py 2^32 limit on number of objects
    # and ib memory size limit
    for loop in range(nloops):
        contact_matrices_loop = []

        nframes = traj_length // (size * nloops)
        start = (rank + loop * size) * nframes
        end = (rank + 1 + loop * size) * nframes
        if loop == nloops - 1 and rank == size - 1:
            end = traj_length
        print("loop %d rank %d start %d end %d" % (loop, rank, start, end))
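        # Threshold the condensed CA-CA distance vector at contact_cutoff to
        # get a flattened binary contact map for every frame in this slice.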
        for frame in mda_traj.trajectory[start:end]:
            cm_matrix = (distances.self_distance_array(ca.positions) <
                         contact_cutoff) * 1.0
            contact_matrices_loop.append(cm_matrix.astype('int8'))
        print("rank %d cm size %d" % (rank, len(contact_matrices_loop)))
        contact_matrices_loop = comm.gather(contact_matrices_loop, root=0)
        if rank == 0:
            contact_matrices.append(
                list(chain.from_iterable(contact_matrices_loop)))
            print("loop %d " % loop, len(contact_matrices_loop),
                  len(contact_matrices_loop[0]))
            if (loop + 1) % write_freq == 0:
                contact_matrices = list(chain.from_iterable(contact_matrices))
                cm_table.append(contact_matrices)
                contact_matrices = []
        comm.Barrier()
    if rank == 0:
        if len(contact_matrices) > 0:
            contact_matrices = list(chain.from_iterable(contact_matrices))
            cm_table.append(contact_matrices)
        outfile.close()
Code example #3
File: io_hdf5.py Project: IlanaL1/opengwas
 def init_genotypes(self):
     """Create the array on file (which is empty) to store genotypes."""
     atom = tables.Int8Atom()
     self.genotype = self.h5_file.createCArray(
         self.h5_file.root,
         'genotype',
         atom, (self.num_probes, self.num_individuals),
         title='Genotype',
         filters=self.pytable_filters)
Code example #4
 def copy_data(targeth, name, source, idx):
     a = targeth.createCArray(targeth.root,
                              name,
                              tb.Int8Atom(),
                              shape=[len(idx)] + list(source.shape[1:]))
     print("Copying {} images to {}".format(len(idx), name))
     t0 = time()
     for i, ind in enumerate(idx):
         if i > 0 and i % printfreq == 0:
             progress(i, len(idx), time() - t0, printfreq)
             t0 = time()
         a[i, :] = source[ind, :]
Code example #5
def create_data_file(out_file,
                     n_channels,
                     n_samples,
                     image_shape,
                     storage_names=('data', 'truth', 'affine'),
                     affine_shape=(0, 4, 4),
                     normalize=True,
                     affine_dtype=tables.Float32Atom()):
    hdf5_file = tables.open_file(out_file, mode='w')
    filters = tables.Filters(
        complevel=5
    )  #, complib='blosc')  # suggested remove in https://github.com/ellisdg/3DUnetCNN/issues/58
    data_shape = tuple([0, n_channels] + list(image_shape))
    truth_shape = tuple([0, 1] + list(image_shape))
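    # Image data is stored as int8 when normalization is skipped and as float32
    # otherwise; ground truth is uint8 and the affine matrices keep their own
    # dtype. All arrays are extendable (first dimension 0) so cases can be
    # appended one at a time.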

    if not normalize:
        data_storage = hdf5_file.create_earray(hdf5_file.root,
                                               storage_names[0],
                                               tables.Int8Atom(),
                                               shape=data_shape,
                                               filters=filters,
                                               expectedrows=n_samples)
    else:
        data_storage = hdf5_file.create_earray(hdf5_file.root,
                                               storage_names[0],
                                               tables.Float32Atom(),
                                               shape=data_shape,
                                               filters=filters,
                                               expectedrows=n_samples)
    truth_storage = hdf5_file.create_earray(hdf5_file.root,
                                            storage_names[1],
                                            tables.UInt8Atom(),
                                            shape=truth_shape,
                                            filters=filters,
                                            expectedrows=n_samples)
    affine_storage = hdf5_file.create_earray(hdf5_file.root,
                                             storage_names[2],
                                             affine_dtype,
                                             shape=affine_shape,
                                             filters=filters,
                                             expectedrows=n_samples)
    if len(storage_names) == 4:
        normalization_storage = hdf5_file.create_earray(hdf5_file.root,
                                                        storage_names[3],
                                                        tables.Float32Atom(),
                                                        shape=(0, 2),
                                                        filters=filters,
                                                        expectedrows=n_samples)
        # will hold mean and std of this case for later normalization
        return hdf5_file, data_storage, truth_storage, affine_storage, normalization_storage
    return hdf5_file, data_storage, truth_storage, affine_storage
Code example #6
    def _create_table(self, name, example):
        """
        Create a new table within the HDF file, where the tables shape and its
        datatype are determined by *example*.
        """
        type_map = {
            np.dtype(np.float64): tables.Float64Atom(),
            np.dtype(np.float32): tables.Float32Atom(),
            np.dtype(np.int): tables.Int64Atom(),
            np.dtype(np.int8): tables.Int8Atom(),
            np.dtype(np.uint8): tables.UInt8Atom(),
            np.dtype(np.int16): tables.Int16Atom(),
            np.dtype(np.uint16): tables.UInt16Atom(),
            np.dtype(np.int32): tables.Int32Atom(),
            np.dtype(np.uint32): tables.UInt32Atom(),
            np.dtype(np.bool): tables.BoolAtom(),
        }
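        # Pick the PyTables atom matching the example's dtype; plain Python
        # strings go into a variable-length string array instead.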

        try:
            if type(example) == np.ndarray:
                h5type = type_map[example.dtype]
            elif type(example) == str:
                h5type = tables.VLStringAtom()
        except KeyError:
            raise TypeError(
                "Could not create table %s because of unknown dtype '%s'" %
                (name, example.dtype))  #+ ", of name: " % example.shape)

        if type(example) == np.ndarray:
            h5dim = (0, ) + example.shape

            h5 = self.h5
            filters = tables.Filters(complevel=self.compression_level,
                                     complib='zlib',
                                     shuffle=True)
            self.tables[name] = h5.create_earray(h5.root,
                                                 name,
                                                 h5type,
                                                 h5dim,
                                                 filters=filters)
        elif type(example) == str:
            h5 = self.h5
            filters = tables.Filters(complevel=self.compression_level,
                                     complib='zlib',
                                     shuffle=True)
            self.tables[name] = h5.create_vlarray(h5.root,
                                                  name,
                                                  h5type,
                                                  filters=filters)
        self.types[name] = type(example)
Code example #7
    def _create_table_list(self, name, example):
        """
        Create a new table within the HDF file, where the tables shape and its
        datatype are determined by *example*.
        The modified version for creating table with appendList
        """
        type_map = {
            np.dtype(np.float64): tables.Float64Atom(),
            np.dtype(np.float32): tables.Float32Atom(),
            np.dtype(np.int): tables.Int64Atom(),
            np.dtype(np.int8): tables.Int8Atom(),
            np.dtype(np.uint8): tables.UInt8Atom(),
            np.dtype(np.int16): tables.Int16Atom(),
            np.dtype(np.uint16): tables.UInt16Atom(),
            np.dtype(np.int32): tables.Int32Atom(),
            np.dtype(np.uint32): tables.UInt32Atom(),
            np.dtype(np.bool): tables.BoolAtom(),
        }

        try:
            if type(example) == np.ndarray:
                h5type = type_map[example.dtype]
            elif type(example) == list and type(example[0]) == str:
                h5type = tables.VLStringAtom()
        except KeyError:
            raise TypeError("Don't know how to handle dtype '%s'" %
                            example.dtype)

        if type(example) == np.ndarray:
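            # Unlike _create_table, the example's leading axis is treated as a
            # batch dimension, so only shape[1:] defines the row shape.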
            h5dim = (0, ) + example.shape[1:]

            h5 = self.h5
            filters = tables.Filters(complevel=self.compression_level,
                                     complib='zlib',
                                     shuffle=True)
            self.tables[name] = h5.create_earray(h5.root,
                                                 name,
                                                 h5type,
                                                 h5dim,
                                                 filters=filters)
        elif type(example) == list and type(example[0]) == str:
            h5 = self.h5
            filters = tables.Filters(complevel=self.compression_level,
                                     complib='zlib',
                                     shuffle=True)
            self.tables[name] = h5.create_vlarray(h5.root,
                                                  name,
                                                  h5type,
                                                  filters=filters)
        self.types[name] = type(example)
Code example #8
def main(options):
    gdb = genome.db.GenomeDB(assembly=options.assembly)

    chrom_dict = gdb.get_chromosome_dict()

    track = gdb.create_track(options.track_name[0])
    
    if options.dtype == "float32":
        atom = tables.Float32Atom()
    elif options.dtype == "int8":
        atom = tables.Int8Atom()
    elif options.dtype == "uint8":
        atom = tables.UInt8Atom()
    elif options.dtype == "int16":
        atom = tables.Int16Atom()
    else:
        raise NotImplementedError("datatype %s not implemented" % options.dtype)

    for path in options.filename:
        filename = path.split("/")[-1]

        if options.format in ("xb", "xbf"):
            # all of the chromosomes are in a single file...
            chrom_names = [chrom.name for chrom in gdb.get_chromosomes()]
        else:
            chrom_names = [extract_chrom_name(filename)]
            
        for chrom_name in chrom_names:
            if chrom_name not in chrom_dict:
                raise ValueError("unknown chromosome '%s'" % chrom_name)

            chrom = chrom_dict[chrom_name]
            sys.stderr.write(chrom_name + "\n")

            # create a chunked array with one dimension the length
            # of the chromosome
            shape = [chrom.length]
            carray = track.h5f.createCArray(track.h5f.root, chrom_name,
                                            atom, shape, filters=ZLIB_FILTER)

            # populate the array with data read from a file
            carray[:] = trackreader.read_file(path, chrom,
                                              dtype=options.dtype,
                                              format=options.format,
                                              pos_idx=options.pos_idx,
                                              val_idx=options.val_idx,
                                              strand=options.strand)

    track.close()
Code example #9
def write_image_annotation_pairs_to_h5(filename_pairs, h5_filename):
    atom = tables.Int8Atom()
    h5_file = tables.open_file(h5_filename, mode='a')
    array_x = h5_file.create_earray(h5_file.root, 'X', atom, (0, 512, 1024, 3))
    array_y = h5_file.create_earray(h5_file.root, 'Y', atom, (0, 512, 1024))
    h = 512
    w = 1024
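    # Each image/annotation pair is resized to 512x1024 and appended as one row
    # of the extendable arrays (note: pixel values above 127 wrap around when
    # cast to the int8 atom).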
    for img_path, annotation_path in tqdm(filename_pairs):
        img = misc.imread(img_path)
        img = misc.imresize(img, (h, w))
        annotation = misc.imread(annotation_path)
        annotation = custom_ignore_labels(annotation)
        annotation = misc.imresize(annotation, (h, w), 'nearest')
        array_x.append(np.expand_dims(img, 0))
        array_y.append(np.expand_dims(annotation, 0))
    h5_file.close()
Code example #10
    def _create_table(self, name, example, parent=None):
        """
        Create a new table within the HDF file, where the tables shape and its
        datatype are determined by *example*.
        """
        h5 = self.h5
        filters = tables.Filters(complevel=self.compression_level,
                                 complib='zlib',
                                 shuffle=True)
        if parent is None:
            parent = h5.root

        if type(example) == str:
            h5type = tables.VLStringAtom()
            h5.createVLArray(parent, name, h5type, filters=filters)
            return
        if type(example) == dict:
            self.h5.createGroup(parent, name)
            return
        #If we get here then we're dealing with numpy arrays
        example = np.asarray(example)

        #MODIFICATION: appended name everywhere and introduced string
        type_map = {
            np.dtype(np.float64).name: tables.Float64Atom(),
            np.dtype(np.float32).name: tables.Float32Atom(),
            np.dtype(np.int).name: tables.Int64Atom(),
            np.dtype(np.int8).name: tables.Int8Atom(),
            np.dtype(np.uint8).name: tables.UInt8Atom(),
            np.dtype(np.int16).name: tables.Int16Atom(),
            np.dtype(np.uint16).name: tables.UInt16Atom(),
            np.dtype(np.int32).name: tables.Int32Atom(),
            np.dtype(np.uint32).name: tables.UInt32Atom(),
            np.dtype(np.bool).name: tables.BoolAtom(),
            # Maximal string length of 128 per string - change if needed
            'string32': tables.StringAtom(128)
        }

        try:
            h5type = type_map[example.dtype.name]
            h5dim = (0, ) + example.shape
            h5.createEArray(parent, name, h5type, h5dim, filters=filters)
        except KeyError:
            raise TypeError("Don't know how to handle dtype '%s'" %
                            example.dtype)
Code example #11
File: io_hdf5.py Project: IlanaL1/opengwas
    def init_phenotypes(self):
        """Extract the phenotype from the individuals"""
        #phenotypes = numpy.loadtxt(self.fam_name, usecols=[5], dtype=int)
        phenotypes = self.individuals['phenotype']
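        # PLINK .fam phenotypes code controls as 1 and cases as 2 (0 = missing);
        # recode them to -1/+1.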
        phenotypes[phenotypes == 1] = -1
        phenotypes[phenotypes == 2] = 1
        phenotypes.shape = (len(phenotypes), 1)
        #check for undefined phenotype
        undefined_phenotype = flatnonzero(phenotypes == 0)
        if len(undefined_phenotype) > 0:
            print('Some phenotypes were undefined')
            print(undefined_phenotype)

        atom = tables.Int8Atom()
        self.h5_file.createCArray(self.h5_file.root,
                                  'phenotype',
                                  atom, (1, self.num_individuals),
                                  title='Phenotype',
                                  filters=self.pytable_filters)
        self.h5_file.root.phenotype[:] = phenotypes.flatten()
        self.phenotypes = self.h5_file.root.phenotype
Code example #12
def main(args):
    usage = """python %s <plink root> <h5 file>
    Convert binary PLINK files into h5 file.
    E.g.: py plink2h5.py mydata_final_clean mydata_final_clean.h5\n""" % args[0]
    if len(args) != 3:
        sys.stderr.write(usage)
        sys.exit(0)

    plinkRoot = args[1]
    h5fname = args[2]

    plinkTitle = plinkRoot.split("/")[-1]

    # Read binary PLINK files
    plinkF = pf.open(plinkRoot)
    numSnps = len(plinkF.get_loci())
    numSamples = len(plinkF.get_samples())
    print("%d SNPs x %d samples" % (numSnps, numSamples))

    # Create the empty array to store genotypes
    atom = tables.Int8Atom()
    h5F = tables.openFile(h5fname, 'w', title=plinkTitle)
    genotype = h5F.createCArray(h5F.root,
                                'genotype',
                                atom, (numSnps, numSamples),
                                title='Genotype',
                                filters=tables.Filters(complevel=5,
                                                       complib='blosc'))

    # populate
    for counter, row in enumerate(plinkF):
        genotype[counter, :] = list(row)
        if counter % 10000 == 9999:
            print('%d SNPs read' % (counter + 1))
    plinkF.close()
    h5F.close()
Code example #13
File: h5files.py Project: yigefanrenabc/acoular
                                  precision,
                                  group=None):
        pass


if is_tables:

    precision_to_atom = {
        'float32': tables.Float32Atom(),
        'complex64': tables.ComplexAtom(8),
        'float64': tables.Float64Atom(),
        'complex128': tables.ComplexAtom(16),
        'bool': tables.BoolAtom(),
        'int32': tables.Int32Atom(),
        'int16': tables.Int16Atom(),
        'int8': tables.Int8Atom(),
    }
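    # Maps precision strings to the PyTables atoms used when creating
    # extendable arrays below.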

    class H5FileTables(H5FileBase, tables.File):
        def create_extendable_array(self,
                                    nodename,
                                    shape,
                                    precision,
                                    group=None):
            if not group: group = self.root
            atom = precision_to_atom[precision]
            self.create_earray(group, nodename, atom, shape)

        def get_data_by_reference(self, nodename, group=None):
            if not group: group = self.root
            return self.get_node(group, nodename)
Code example #14
File: parse_pssm.py Project: semccomas/DL_proj16
name = 'group_' + sys.argv[1][-17:-12]

feature = np.loadtxt(
    sys.argv[2])  #this is the features for secondary structure
h5 = tb.open_file(
    'pssm_test_table_jhE0', 'a'
)  ##########!!!!!!!!!! THIS IS THE ONLY THING YOU HAVE TO CHANGE !!!!!!!!!!!!!!!!
group = h5.create_group('/', name, 'individual group')
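# Two extendable arrays under this group: (21, 15) PSSM windows and 3-state
# secondary-structure labels, one row appended per residue window.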

pssm = h5.create_earray(
    group, name='one_hot', shape=(0, 21, 15), atom=tb.Float32Atom(
    ))  #would be 0, 21, 15 if you want it to be the shape of the old one
ss = h5.create_earray(group,
                      name='ss',
                      shape=(0, 3),
                      atom=tb.Int8Atom(),
                      filters=tables.Filters(complevel=9,
                                             complib='blosc:snappy'))

index = []
#### splitting the sliding table into bits of 21 sized timesteps ##
for num, line in enumerate(window):
    if num != 0 and num % 21 == 0:
        index.append(num)
window = np.vsplit(window, index)
#print np.shape(window)

for feat, line in zip(feature, window):
    ss.append(feat[np.newaxis, :])
    pssm.append(line[np.newaxis, :])
Code example #15
File: parse_all.py Project: semccomas/DL_proj16
##############################################################################################################################
################################################### TO PYTABLE #############################################################
##############################################################################################################################


name= 'group_' + sys.argv[1][-8:-3] 
print(name)

#feature= np.loadtxt(sys.argv[2])    #this is the features for secondary structure  
h5= tb.open_file(sys.argv[4], 'a')
group= h5.create_group('/', name, 'individual group')

seq_tab = h5.create_earray(group, name='seq_tab', shape=(0, 20, 15), atom=tb.Float32Atom())   #would be 0, 21, 15 if you want it to be the shape of the old one
pssm_tab = h5.create_earray(group, name='pssm_tab', shape=(0, 21, 15), atom=tb.Float32Atom())   #would be 0, 21, 15 if you want it to be the shape of the old one
ss3_feat = h5.create_earray(group, name='ss3_feat', shape=(0, 3), atom=tb.Int8Atom())
ss8_feat = h5.create_earray(group, name='ss8_feat', shape=(0, 8), atom=tb.Int8Atom())
rsa_feat = h5.create_earray(group, name='rsa_feat', shape=(0, 1), atom=tb.Float32Atom())
rsa=np.reshape(rsa,(-1,1))


############## might need to add filter!!!!! ################# 
### also there were 3 difficult files, something went wrong in comp. In the future we can maybe take these away with an if loop just to check that they match




#### splitting the sliding table into bits of 21 (pssm) or 20 (seq) sized timesteps ## 
def timesteps (array, size):
	index= []
	for num, line in enumerate(array):
Code example #16
def load_color(random_seed=123522):
    # Check if dataset is in the data directory.
    data_path = os.path.join(os.path.split(__file__)[0], "data")
    if not os.path.exists(data_path):
        os.makedirs(data_path)

    dataset = 'train.zip'
    data_file = os.path.join(data_path, dataset)
    if os.path.isfile(data_file):
        dataset = data_file

    if (not os.path.isfile(data_file)):
        try:
            import urllib
            urllib.urlretrieve('http://google.com')
        except AttributeError:
            import urllib.request as urllib
        url = 'https://dl.dropboxusercontent.com/u/15378192/train.zip'
        print('Downloading data from %s' % url)
        urllib.urlretrieve(url, data_file)

    data_dir = os.path.join(data_path, 'cvd')
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
        zf = zipfile.ZipFile(data_file)
        zf.extractall(data_dir)

    data_file = os.path.join(data_path, 'cvd_color.hdf5')
    label_file = os.path.join(data_path, 'cvd_color_labels.npy')
    if not os.path.exists(data_file):
        print('... loading data')
        cat_matches = []
        dog_matches = []
        for root, dirname, filenames in os.walk(data_dir):
            for filename in fnmatch.filter(filenames, 'cat*'):
                cat_matches.append(os.path.join(root, filename))
            for filename in fnmatch.filter(filenames, 'dog*'):
                dog_matches.append(os.path.join(root, filename))

        sort_key = lambda x: int(x.split('.')[-2])
        cat_matches = sorted(cat_matches, key=sort_key)
        dog_matches = sorted(dog_matches, key=sort_key)

        def square(x):
            resize_shape = (260, 260)
            slice_size = (256, 256)
            slice_left = (resize_shape[0] - slice_size[0]) / 2
            slice_upper = (resize_shape[1] - slice_size[1]) / 2
            return imresize(
                x, resize_shape,
                interp='nearest')[slice_left:slice_left + slice_size[0],
                                  slice_upper:slice_upper +
                                  slice_size[1]].transpose(2, 0,
                                                           1).astype('uint8')

        matches = cat_matches + dog_matches
        matches = np.array(matches)
        random_state = np.random.RandomState(random_seed)
        idx = random_state.permutation(len(matches))
        c = [0] * len(cat_matches)
        d = [1] * len(dog_matches)
        y = np.array(c + d).astype('uint8')
        matches = matches[idx]
        y = y[idx]

        compression_filter = tables.Filters(complevel=5, complib='blosc')
        h5_file = tables.openFile(data_file, mode='w')
        example = square(mpimg.imread(matches[0]))
        image_storage = h5_file.createEArray(h5_file.root,
                                             'images',
                                             tables.Int8Atom(),
                                             shape=(0, ) + example.shape,
                                             filters=compression_filter)
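        # Each image is resized and center-cropped to 3x256x256 by square() and
        # appended as one row; the labels are saved separately as a .npy file.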
        for n, f in enumerate(matches):
            print("Processing image %i of %i" % (n, len(matches)))
            x = square(mpimg.imread(f)).astype('uint8')
            image_storage.append(x[None])
        h5_file.close()
        np.save(label_file, y)
    h5_file = tables.openFile(data_file, mode='r')
    x_s = h5_file.root.images
    y_s = np.load(label_file)
    return (x_s, y_s)
Code example #17
tmesh_id = h5file.createGroup(trianglesMesh_id, 'tmesh')
h5file.setNodeAttr(tmesh_id, 'type', 'unstructured')
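# Copy element connectivity, element types and node coordinates element by
# element into chunked CArrays under the 'tmesh' group (eltnodes, elttypes,
# nodes and filters are defined earlier in the script).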

eltnodes_arr = h5file.createCArray(tmesh_id,
                                   "elementNodes",
                                   tables.Int32Atom(),
                                   numpy.shape(eltnodes),
                                   filters=filters)
i = 0
for array in eltnodes_arr:
    eltnodes_arr[i] = eltnodes[i]
    i += 1

elttypes_arr = h5file.createCArray(tmesh_id,
                                   "elementTypes",
                                   tables.Int8Atom(),
                                   numpy.shape(elttypes),
                                   filters=filters)

nodes_arr = h5file.createCArray(tmesh_id,
                                "nodes",
                                tables.Float32Atom(),
                                numpy.shape(nodes),
                                filters=filters)
i = 0
for array in nodes_arr:
    nodes_arr[i] = nodes[i]
    i += 1

i = 0
for array in elttypes_arr:
Code example #18
################################################### PSSM TO PYTABLE ####################################################
##############################################################################################################################

name = 'group_' + sys.argv[1][-8:-3]
print(name)

#feature= np.loadtxt(sys.argv[2])    #this is the features for secondary structure
h5 = tb.open_file(
    '8state_table', 'a'
)  ##########!!!!!!!!!! THIS IS THE ONLY THING YOU HAVE TO CHANGE !!!!!!!!!!!!!!!!
group = h5.create_group('/', name, 'individual group')

one_hot = h5.create_earray(
    group, name='one_hot', shape=(0, 20, 15), atom=tb.Float32Atom(
    ))  #would be 0, 21, 15 if you want it to be the shape of the old one
ss = h5.create_earray(group, name='ss', shape=(0, 8), atom=tb.Int8Atom())

index = []
#### splitting the sliding table into bits of 21 sized timesteps ##
for num, line in enumerate(final):
    if num != 0 and num % 20 == 0:
        index.append(num)
final = np.vsplit(final, index)
#print np.shape(final)

for feat, line in zip(encoded, final):
    ss.append(feat[np.newaxis, :])
    one_hot.append(line[np.newaxis, :])

print(ss)
print(one_hot)
Code example #19
def preprocess_all_data(dataset_directory,
                        input_size=1024,
                        alphabet='אבגדהוזחטיכךלמםנןסעפףצץקרשת "',
                        output_filename='./sample_dataset/sample_dataset'):
    """ 
    Gets a dataset directory path (one sub-directory per author, each holding JSON
    book files) and writes the data as numeric NumPy ndarrays to an HDF5 file and
    a TFRecord file.

    If the output files already exist, the preprocessed data is *overwritten*.
    """
    #initialize variables
    preprocessed_samples = np.array([], dtype=np.int8)
    preprocessed_labels = np.array([], dtype=np.int8)

    h5_fn = output_filename + '.h5'
    tfr_fn = output_filename + '.tfrecords'

    #initialize files dataset will be stored in
    with tables.open_file(
            h5_fn,
            mode='w') as h5file, tf.io.TFRecordWriter(tfr_fn) as tfwriter:
        typeAtom = tables.Int8Atom()
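        # A single Int8 atom is reused for every author's EArray; each appended
        # sample is a (len(alphabet), input_size) one-hot matrix.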
        print('Processing...')
        #iterate over authors
        ds_path = pathlib.Path(dataset_directory)
        for author_label, author_dir in enumerate(ds_path.iterdir()):
            #validate
            print('Processing ' + str(author_dir) + '...')
            if not author_dir.is_dir():
                print('File ' + str(author_dir) +
                      ' ignored (invalid location).')
                continue

            #create h5 group and table
            gauthor = h5file.create_group(h5file.root,
                                          'author' + str(author_label),
                                          author_dir.name)
            array_c = h5file.create_earray(gauthor, 'samples', typeAtom,
                                           (0, len(alphabet), input_size),
                                           author_dir.name + " Samples")

            # author_dict[author_label] = author_dir.name
            for book_path in author_dir.iterdir():
                # validation check
                if not book_path.is_file():
                    print('Directory ' + str(author_dir) +
                          ' ignored (invalid location).')
                    continue
                if book_path.suffix != '.json':
                    print('File ' + str(author_dir) +
                          ' ignored (type should be JSON).')
                    continue

                # load JSON data
                with book_path.open(mode='r', encoding='utf8') as book_file:
                    try:
                        book_raw_text = json.load(book_file)['text']
                        # book_raw_text = book_raw_data
                    except:
                        print('File ' + str(author_dir) +
                              ' ignored (impossible to read JSON).')
                        continue

                # flatten
                if isinstance(book_raw_text,
                              list):  # no internal separation of text
                    flattened_raw_lst = list(flatten(book_raw_text))
                elif isinstance(
                        book_raw_text,
                        dict):  # internal separation of text - dict of dicts
                    tmp = []
                    for d in book_raw_text.values():
                        if isinstance(d, dict):
                            tmp.extend(list(d.values()))
                        elif isinstance(d, list):
                            tmp.extend(d)
                    flattened_raw_lst = list(flatten(tmp))
                else:
                    raise ValueError(str(book_path) + ': Could not parse.')

                # ensure file does not have different structure from expected
                assert (all(isinstance(x, str) for x in flattened_raw_lst))
                # TODO: check manually all is well

                # concatenate
                flattened_raw_str = ''.join(flattened_raw_lst)

                # TODO: handle single quote characters

                # keep only letters in alphabet and remove multiple spaces
                filtered = re.sub('[^' + alphabet + ']', ' ',
                                  flattened_raw_str)
                filtered = re.sub(' +', ' ', filtered)
                # TODO: is it always correct to replace out-of-alphabet characters by spaces?

                # split to samples
                #TODO: prevent cutting in the middle of words
                n = input_size
                samples = [
                    filtered[i:i + n] for i in range(0, len(filtered), n)
                ]

                #convert to numerical one-hot
                samples_onehot_minus1 = np.stack(
                    [str2onehot(sample, alphabet) for sample in samples[0:-1]],
                    axis=0)
                #pad last sample and add it to 3d array
                lastsample_onehot = str2onehot(samples[-1], alphabet)
                lastsample_onehot_padded = np.zeros_like(
                    samples_onehot_minus1[-1, :, :], dtype=np.int8)
                lastsample_onehot_padded[
                    0:lastsample_onehot.shape[0],
                    0:lastsample_onehot.shape[1]] = lastsample_onehot
                samples_onehot = np.concatenate(
                    (samples_onehot_minus1,
                     lastsample_onehot_padded[np.newaxis, :, :]))

                ## write to file
                #write to h5
                array_c.append(samples_onehot)
                #write to tfrecord
                for text_arr in samples_onehot:
                    tf_example = text_example(text_arr, author_label)
                    tfwriter.write(tf_example.SerializeToString())
            h5file.flush()
            tfwriter.flush()
Code example #20
image_params = config.image_params
stream = config.image_prep_function(args.data_dir, labels, **image_params)

one_patient = next(stream)
idd_len = len(one_patient[1])

c, h, w = one_patient[2].shape
print("Image channels: {} rows: {} cols: {}".format(c, h, w))

stream = chain([one_patient], stream)

tables_file = os.path.join(args.output_dir, 'image_data.h5')

image_data_table = tables.open_file(tables_file, mode='w')
image_atom = tables.Float32Atom()
int_atom = tables.Int8Atom()
id_atom = tables.StringAtom(len(one_patient[1]))
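# Extendable arrays for the float32 images, two-column int8 labels and
# fixed-length string sample IDs; rows are appended as the stream is consumed.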

train_data = image_data_table.create_earray(image_data_table.root, 'train_data', atom=image_atom, shape=(0, 1, h, w))
train_labels = image_data_table.create_earray(image_data_table.root, 'train_labels', atom=int_atom, shape=(0, 2))
test_data = image_data_table.create_earray(image_data_table.root, 'test_data', atom=image_atom, shape=(0, 1, h, w))
test_ids = image_data_table.create_earray(image_data_table.root, 'test_ids', atom=id_atom, shape=(0, idd_len))

test_sample = 1
train_sample = 1

for item in stream:
    dataset, idd, image = item
    id_chars = np.chararray(shape=(1, len(idd)))
    id_chars[:] = idd
Code example #21
    targetfile = arguments['<target.h5>']
    printfreq = int(arguments['--progress-freq'])

    FILTERS = tb.Filters(complevel=5, complib='zlib')
    sourceh = tb.open_file(source, mode='r', filters=FILTERS)
    likeh = tb.open_file(like, mode='r', filters=FILTERS)
    targeth = tb.open_file(targetfile, mode='w', filters=FILTERS)

    source_mask = sourceh.root.datamask
    source_data = sourceh.root.data
    like_mask = likeh.root.datamask
    like_data = likeh.root.data

    target_data = targeth.createCArray(targeth.root,
                                       'data',
                                       tb.Int8Atom(),
                                       shape=[source_data.shape[0]] +
                                       list(like_data.shape[1:]))

    # copy data
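    # Each row of source_data holds only the pixels selected by source_mask;
    # rebuild the full-size image, then keep the pixels selected by like_mask.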
    n_images = source_data.shape[0]
    print("Remasking {} images...".format(n_images))
    t0 = time()
    for i in range(n_images):
        if i > 0 and i % printfreq == 0:
            progress(i, n_images, time() - t0, printfreq)
            t0 = time()
        reconstituted = np.zeros(shape=source_mask.shape)
        reconstituted[np.array(source_mask)] = source_data[i, :]
        target_data[i, :] = reconstituted[np.array(like_mask)]
Code example #22
		t = f.root.targets[:]
	
		pos = np.sum(t)
	
		batches = [any(t[i*batch_size:(i+1)*batch_size]) for i in range(number)]
	
		n = np.sum(batches) * batch_size
	
		j = 0
		while (n < pos * 2 - 300):
			batches[j] = True
			n = np.sum(batches) * batch_size
			j += 1
	
	
		a = f.create_earray(f.root,'balance_targets',tables.Int8Atom(),(0,),expectedrows=n)
				
		for i in range(number):
			if batches[i]:
				a.append(t[i*batch_size:(i+1)*batch_size])
	
	
		
	
		for name in total:
			data = f.root[name][:]
			if name.startswith('hist'):
				features = (6272,)
			else:
				features = (64,64)
	
Code example #23
index = []
for num, line in enumerate(OH):
    if num != 0 and num % 20 == 0:
        index.append(num)
OH = np.vsplit(OH, index)

#h5 = tables.open_file(sys.argv[3], 'w')
#### here i make one big table, just change out for the one above if you want lots of individual tables and for gods sake dont forget to delete the file if you run again

h5 = tables.open_file('big_table', 'a')
group = h5.create_group('/', name, 'individual group')

one_hot = h5.create_earray(group,
                           name='one_hot',
                           shape=(0, 20, 15),
                           atom=tables.Int8Atom())
#pssm = h5.create_earray(h5.root, name='pssm', shape=(0, 15, 21), atom=tables.Float32Atom())

d = feat.ndim
if d == 1:
    ss = h5.create_earray(group,
                          name='ss',
                          shape=(0, d),
                          atom=tables.Int8Atom())
    feat = np.reshape(feat, (-1, 1))
else:
    if len(feat[0]) == 4:
        ss = h5.create_earray(group,
                              name='ss',
                              shape=(0, 4),
                              atom=tables.Int8Atom())
Code example #24
File: generate_data.py Project: Nouf-Barakati/sfan
    def generate_modular(self):
        """
        Generate synthetic data with a modular network and a genotype matrix
        made of random {0, 1, 2}.
        
        Generated files
        ---------------
        <root_dir>/<simu_id>.readme:
            README file describing the simulation parameters.
        <root_dir>/<simu_id>.task_similarities.txt:
            args.num_tasks x args.num_tasks matrix \Omega
            of task covariance.
        <root_dir>/<simu_id>.causal_features:
            args.num_tasks lists of NUM_CAUSAL_EACH causal features,
            chosen from the first NUM_CAUSAL_TOTAL features.
            One list per task. Indices start at 0.
        <root_dir>/<simu_id>.causal_weights:
            Lists of the weights given to the causal features,
            generated so as to respect the covariance structure given by Omega.
            One list per task, in the order of <simu_id>.causal_features.
        <root_dir>/<simu_id>.genotypes.txt:
            num_features x num_samples matrix of {0, 1, 2} (representing SNPs).
        <root_dir>/<simu_id>.network.dimacs:
            A modular network over the self.num_features features,
            with fully connected modules of size MOD_SIZE.
        For task_id in 0, ..., args.num_tasks:
            <root_dir>/<simu_id>.phenotype_<task_id>.txt:
                Phenotype vector (of size args.num_samples) for task <task_id>.
            <root_dir>/<simu_id>.scores_<task_id>.txt
                Node weights (of size args.num_features) for task <task_id>.
                Computed as Pearson correlation.
        """
        # Writing readme
        readme_f = '%s/%s.readme' % (self.root_dir, self.simu_id)
        with open(readme_f, 'w') as f:
            f.write("# Features generated by generate_data.generate_modular\n")
            f.write("%d\tfeatures\n" % self.num_features)
            f.write("%d\tsamples\n" % self.num_samples)
            f.write("%d\ttasks\n" % self.num_tasks)
            f.close()
        logging.info("README file created under %s\n" % readme_f)

        # Generate a matrix of similarities between tasks
        omega = np.random.uniform(size = (self.num_tasks, self.num_tasks))
        omega = omega.transpose().dot(omega)
        d = np.diag(omega)
        d.shape = (self.num_tasks, 1)
        omega = omega / np.sqrt(d.dot(d.transpose()))
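        # omega now has unit diagonal and is positive semi-definite, i.e. a
        # valid correlation (similarity) matrix between tasks.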

        # Save omega to file
        fname = "%s/%s.task_similarities.txt" % (self.root_dir, self.simu_id)
        np.savetxt(fname, omega, fmt='%.3f')
        logging.info("Covariance matrix saved under %s\n" % fname)

        # Generate beta vectors that are correlated according to omega
        # Trick: cov(Ax) = Acov(x)A'
        L = np.linalg.cholesky(omega) # i.e. LL' = omega
        b = np.random.normal(size=(self.num_tasks, NUM_CAUSAL_TOTAL))
        beta = L.dot(b)

        # For each task, keep the NUM_CAUSAL_EACH features with highest weight
        # as causal;
        # drop the weight of the others to 0.
        causal_features = []
        for k in range(self.num_tasks):
            b = [x for x in beta[k, :]]
            b.sort()
            causal_features.append(np.where(beta[k, :] >= b[-NUM_CAUSAL_EACH])[0])
            beta[k, np.where(beta[k, :] < b[-NUM_CAUSAL_EACH])[0]] = 0.

        # Save causal features to file
        fname = "%s/%s.causal_features.txt"  % (self.root_dir, self.simu_id)
        np.savetxt(fname, causal_features, fmt='%d')
        logging.info("Causal features saved under %s\n" % fname)

        # Save beta to file
        fname = "%s/%s.causal_weights.txt"  % (self.root_dir, self.simu_id)
        np.savetxt(fname, beta)
        logging.info("Causal weights saved under %s\n" % fname)

        # Generate genotypes
        # Create PyTables structure for X transposed (so as to access rows, not cols)
        fname = "%s/%s.genotypes.txt"  % (self.root_dir, self.simu_id)
        with tb.open_file(fname, 'w') as h5f:
            filters = tb.Filters(complevel=5, complib='blosc')
            Xtr = h5f.create_carray(h5f.root, 'Xtr', tb.Int8Atom(),
                                    shape=(self.num_features, self.num_samples))
            for row in xrange(self.num_features):
                Xtr[row, :] = np.random.random_integers(0, high=2, size=self.num_samples)
            h5f.close()
        logging.info("Genotypes saved under %s\n" % fname)

        # generate phenotypes and Pearson scores, and save to file
        with tb.open_file(fname, 'r') as h5f:
            Xtr = h5f.root.Xtr
            for task_idx in range(self.num_tasks):
                y = Xtr[:NUM_CAUSAL_TOTAL,:].transpose().dot(beta.transpose()[:,
                                                                              task_idx])
                y += np.random.normal(scale=0.1, size=(self.num_samples, ))
                fname = "%s/%s.phenotype_%d.txt" % (self.root_dir,
                                                    self.simu_id, task_idx)
                np.savetxt(fname, y, fmt='%.3f')
                logging.info("Phenotype for task %d saved under %s\n" % (task_idx,
                                                                         fname))

                # compute feature-phenotype correlations
                r2 = [st.pearsonr(Xtr[feat_idx, :].transpose(), y)[0]**2 \
                      for feat_idx in range(self.num_features)]
                fname = "%s/%s.scores_%d.txt" % (self.root_dir, self.simu_id, task_idx)
                np.savetxt(fname, r2, fmt='%.3e')
                logging.info("Node weights for task %d saved under %s\n" % (task_idx,
                                                                            fname))


        # Generate network in dimacs format
        # Careful: node indices must start at 1
        num_modules = self.num_features / MOD_SIZE
        num_edges = MOD_SIZE * (MOD_SIZE - 1) * num_modules + \
                    2 * (num_modules - 1) + 2 * (self.num_features - \
                                                 MOD_SIZE * num_modules)
        dimacs_f = '%s/%s.network.dimacs' % (self.root_dir, self.simu_id)
        with open(dimacs_f, 'w') as g:
            g.write("p max %d %d\n" % ((self.num_features), num_edges))

            # create fully connected modules of size MOD_SIZE
            # connect each to the next one
            for mod_idx in range(num_modules):
                x = mod_idx * MOD_SIZE
                if mod_idx > 0:
                    g.write("a %d %d 1\n" % ((x+1), (x))) # connect to previous module
                for x_idx1 in range(MOD_SIZE):
                    for x_idx2 in range(MOD_SIZE):
                        if x_idx1 != x_idx2:
                            g.write("a %d %d 1\n" % ((x+x_idx1+1), (x+x_idx2+1)))
                if (x+MOD_SIZE) < self.num_features: # connect to next module
                    g.write("a %d %d 1\n" % ((x+MOD_SIZE), (x+MOD_SIZE+1)))
            # connect each of the remaining nodes to its neighbor
            for x_idx in range(x+MOD_SIZE+1, self.num_features):
                g.write("a %d %d 1\n" % ((x_idx), (x_idx-1)))
                g.write("a %d %d 1\n" % ((x_idx), (x_idx+1)))
            # last connection (mirror from the previous one)
            g.write("a %d %d 1\n" % ((x_idx+1), (x_idx)))

            g.close()
        logging.info("Network saved under %s\n" % dimacs_f)
Code example #25
def collect_skeletons(experiments_df,
                      main_file,
                      file_ext='_featuresN.hdf5',
                      gap_to_interp_seconds=3,
                      sample_size_frames_s=10):

    assert all(x in experiments_df
               for x in ('directory', 'base_name', 'fps', 'id', 'strain'))
    with tables.File(main_file, 'w') as tab_fid:
        r_dtype = []
        for col in experiments_df:
            dat = experiments_df[col]
            if dat.dtype == np.dtype('O'):
                n_s = dat.str.len().max()
                dt = np.dtype('S%i' % n_s)
            else:
                dt = dat.dtype
            r_dtype.append((col, dt))

        #save the experiments table. I do it after the loop to store the fps information

        tab_recarray = experiments_df.to_records(index=False).astype(
            np.dtype(r_dtype))

        tab_fid.create_table('/',
                             'experiments_data',
                             obj=tab_recarray,
                             filters=TABLE_FILTERS)

        table_type = np.dtype([('experiment_id', np.int32),
                               ('worm_index', np.int32), ('strain', 'S10'),
                               ('ini_time_aprox', np.float32),
                               ('ini', np.int32), ('fin', np.int32)])

        data_table = tab_fid.create_table('/',
                                          "skeletons_groups",
                                          table_type,
                                          "Worm feature List",
                                          filters=TABLE_FILTERS)

        skeletons_data = tab_fid.create_earray(
            '/',
            'skeletons_data',
            atom=tables.Float32Atom(),
            shape=(0, 49, 2),
            expectedrows=experiments_df.shape[0] * 22500,
            filters=TABLE_FILTERS)

        is_bad_skeleton_data = tab_fid.create_earray(
            '/',
            'is_bad_skeleton',
            atom=tables.Int8Atom(),
            shape=(0, ),
            expectedrows=experiments_df.shape[0] * 22500,
            filters=TABLE_FILTERS)
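        # Skeletons from every experiment are concatenated into one big
        # (n, 49, 2) array; each row of skeletons_groups stores the [ini, fin]
        # index range of one worm fragment within that array.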

        #timer = TimeCounter(tot_frames = len(experiments_df))
        tot_skels = 0
        for irow, row in tqdm.tqdm(experiments_df.iterrows(),
                                   total=len(experiments_df)):
            try:
                features_file = os.path.join(row['directory'],
                                             row['base_name'] + file_ext)
                with pd.HDFStore(features_file, 'r') as fid:
                    assert '/timeseries_data' in fid
            except AssertionError:
                continue

            for output in _process_file(features_file, row['fps'],
                                        gap_to_interp_seconds,
                                        sample_size_frames_s):
                worm_index, worm_data, skeletons, is_bad_skeleton, borders = output

                if not borders:
                    continue

                for bb in borders:
                    skels = skeletons[bb[0]:bb[1]]
                    assert not np.any(np.isnan(skels))
                    is_bad = is_bad_skeleton[bb[0]:bb[1]]

                    ini_t = worm_data['timestamp'].values[bb[0]] / row['fps']
                    rr = (row['id'], int(worm_index), np.array(row['strain']),
                          ini_t, tot_skels, tot_skels + skels.shape[0] - 1)
                    data_table.append([rr])
                    skeletons_data.append(skels)
                    is_bad_skeleton_data.append(is_bad)

                    tot_skels += skels.shape[0]

                    #print(rr[3:], tot_skels, skeletons_data.shape)

                data_table.flush()
                skeletons_data.flush()

            #print(timer.get_str(irow+1))

    #SAVE STRAIN CODES
    #I am reading the skeletons_group instead of the experiment data, to ignore strains without a valid skeleton
    with pd.HDFStore(main_file, 'r') as fid:
        skeletons_groups = fid['/skeletons_groups']
    #get strain data
    ss = skeletons_groups['strain'].unique()
    n_c = max(len(x) for x in ss)
    strains_dict = {x: ii for ii, x in enumerate(np.sort(ss))}
    strains_codes = np.array(
        list(strains_dict.items()),
        np.dtype([('strain', 'S' + str(n_c)), ('strain_id', np.int)]))

    with tables.File(main_file, 'r+') as fid:
        if '/strains_codes' in fid:
            fid.remove_node('/strains_codes')
        fid.create_table('/',
                         'strains_codes',
                         obj=strains_codes,
                         filters=TABLE_FILTERS)
Code example #26
    def _create_table_list(self, name, example):
        """
        Create a new table within the HDF file, where the tables shape and its
        datatype are determined by *example*.
        The modified version for creating table with appendList
        """
        type_map = {
            np.dtype(np.float64): tables.Float64Atom(),
            np.dtype(np.float32): tables.Float32Atom(),
            np.dtype(np.int): tables.Int64Atom(),
            np.dtype(np.int8): tables.Int8Atom(),
            np.dtype(np.uint8): tables.UInt8Atom(),
            np.dtype(np.int16): tables.Int16Atom(),
            np.dtype(np.uint16): tables.UInt16Atom(),
            np.dtype(np.int32): tables.Int32Atom(),
            np.dtype(np.uint32): tables.UInt32Atom(),
            np.dtype(np.bool): tables.BoolAtom(),
        }

        try:
            if type(example) == np.ndarray:
                h5type = type_map[example.dtype]
            elif type(example) == list and type(example[0]) == str:
                h5type = tables.VLStringAtom()
        except KeyError:
            raise TypeError("Don't know how to handle dtype '%s'" %
                            example.dtype)

        if type(example) == np.ndarray:
            h5dim = (0, ) + example.shape[1:]

            h5 = self.h5
            filters = tables.Filters(complevel=self.compression_level,
                                     complib='zlib',
                                     shuffle=True)

            nodes = h5.list_nodes(h5.root)

            nmpt = name.replace('.', '/\n')
            nmpt = nmpt.split('\n')
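            # A dotted *name* such as "a.b.c" maps to nested groups /a/b, with
            # the array created as leaf "c"; missing groups are created on the fly.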

            path = '/'
            for kay in range(len(nmpt) - 1):
                #if not path+nmpt[kay][:-1] in str(nodes): h5.create_group(path,nmpt[kay][:-1])
                try:
                    h5.is_visible_node(path + nmpt[kay][:-1])
                except:
                    h5.create_group(path, nmpt[kay][:-1])
                path += nmpt[kay]

            self.tables[name] = h5.create_earray(path,
                                                 nmpt[-1],
                                                 h5type,
                                                 h5dim,
                                                 filters=filters)

        elif type(example) == list and type(example[0]) == str:
            h5 = self.h5
            filters = tables.Filters(complevel=self.compression_level,
                                     complib='zlib',
                                     shuffle=True)

            nodes = h5.list_nodes(h5.root)

            nmpt = name.replace('.', '/\n')
            nmpt = nmpt.split('\n')

            path = '/'
            for kay in range(len(nmpt) - 1):
                #if not path+nmpt[kay][:-1] in str(nodes): h5.create_group(path,nmpt[kay][:-1])
                try:
                    h5.is_visible_node(path + nmpt[kay][:-1])
                except:
                    h5.create_group(path, nmpt[kay][:-1])
                path += nmpt[kay]

            self.tables[name] = h5.create_vlarray(path,
                                                  nmpt[-1],
                                                  h5type,
                                                  filters=filters)

        self.types[name] = type(example)
Code example #27
targets_location = base + 'data/6464/targets/' 
dst = base + 'data/6464/h5/{}.h5'


files = []

for filename in os.listdir(targets_location):
	files += [filename]

	
	

for filename in files:
	f = tables.open_file(dst.format(filename[:12]),'w')
	targets = f.create_earray(f.root,'targets',tables.Int8Atom(),(0,),expectedrows=7476)
	print('{}'.format(filename))

	t = open(targets_location + filename)
	targets_csv = csv.reader(t)
	targets_single = []

	for row in targets_csv:
		targets_single += [row[1]]

	t.close()

	targets_single = targets_single[:7476]
	targets.append(np.array(targets_single, dtype=np.int8))  # cast CSV strings to int8

Code example #28
def main(fname,
         concurrent_edges=None,
         concurrent_bin=None,
         exlude_ranges=None):
    """
    creates table to store artifact information
    """
    for sign in SIGNS:

        # why is this here?
        if READONLY:
            mode = 'r'
        else:
            mode = 'r+'
        h5fid = tables.open_file(fname, mode)

        try:
            node = h5fid.get_node('/' + sign + '/times')
        except tables.NoSuchNodeError:
            print('{} has no {} spikes'.format(fname, sign))
            continue

        if len(node.shape) == 0:
            continue

        elif node.shape[0] == 0:
            continue

        times = node[:]
        num_spk = times.shape[0]

        spikes = h5fid.get_node('/' + sign, 'spikes')[:, :]

        assert num_spk == spikes.shape[0]

        try:
            artifacts = h5fid.get_node('/' + sign + '/artifacts')
        except tables.NoSuchNodeError:
            h5fid.create_array('/' + sign,
                               'artifacts',
                               atom=tables.Int8Atom(),
                               shape=(num_spk, ))
            artifacts = h5fid.get_node('/' + sign + '/artifacts')
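        # 'artifacts' holds one int8 code per spike; each marking pass below
        # records its reason code through add_id() (defined elsewhere).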
        if RESET:
            artifacts[:] = 0

        arti_by_diff, arti_by_diff_id = mark_by_diff(times)
        add_id(artifacts, arti_by_diff, arti_by_diff_id, sign)
        # artifacts[arti_by_diff != 0] = arti_by_diff_id

        # if DEBUG:
        #    print('Marked {} {} spikes by diff'.
        #          format(arti_by_diff.sum(), sign))

        arti_by_height, arti_by_height_id = mark_by_height(spikes, sign)
        add_id(artifacts, arti_by_height, arti_by_height_id, sign)

        # artifacts[arti_by_height != 0] = arti_by_height_id
        # if DEBUG:
        #    print('Marked {} {} spikes by height'.
        #          format(arti_by_height.sum(), sign))

        arti_by_double, double_id = mark_double_detection(times, spikes, sign)
        add_id(artifacts, arti_by_double, double_id, sign)
        # artifacts[arti_by_double != 0] = double_id
        # if DEBUG:
        #    print('Marked {} {} spikes as detected twice'.
        #          format(arti_by_double.sum(), sign))

        if concurrent_edges is not None:
            arti_by_conc, arti_by_conc_id = mark_by_bincount(
                times, concurrent_edges, concurrent_bin)
            add_id(artifacts, arti_by_conc, arti_by_conc_id, sign)

            # artifacts[arti_by_conc != 0] = arti_by_conc_id
            # if DEBUG:
            #    print('Marked {} {} spikes by concurrent occurence'.
            #          format(arti_by_conc.sum(), sign))

        if exlude_ranges is not None:
            arti_by_ranges, range_id = mark_range_detection(
                times, exlude_ranges)
            add_id(artifacts, arti_by_ranges, range_id, sign)
            # artifacts[arti_by_ranges != 0] = range_id

            # if DEBUG:
            #    print('Marked {} {} spikes within supplied range '.
            #          format(arti_by_ranges.sum(), sign))

        h5fid.close()
Code example #29
File: new_approach_merge_2.py Project: SMLMS/ALEX
    # file 2, apd2
    f2 = tables.open_file('tempAPD2_copy.hdf', 'r')
    ts_2 = f2.root.timestamps

    # lengths
    f1_num = f1.root.timestamps.nrows
    f2_num = f2.root.timestamps.nrows
    row_num = (f1_num + f2_num)

    # file 3, outfile
    f3 = tables.open_file('sortedFile.hdf', mode='w')
    f3.create_group(f3.root, name='photon_data')
    filters = tables.Filters(complevel=6, complib='zlib')
    atom1 = tables.UInt32Atom()
    atom2 = tables.Int8Atom()
    ts = f3.create_carray('/photon_data',
                          name='timestamps',
                          atom=atom1,
                          shape=(row_num, 1),
                          filters=filters)
    det = f3.create_carray('/photon_data',
                           name='detectors',
                           atom=atom2,
                           shape=(row_num, 1),
                           filters=filters)
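    # Two parallel CArrays sized for the merged photon stream: uint32
    # timestamps and int8 detector ids, filled below by merge_files().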

    # Calculations
    start = time.time()
    merge_files(ts_1, ts_2, ts, det, f1_num, f2_num)
    print("Merging took %f seconds." % (time.time() - start))