Code example #1
import os

import dask.array as da


def save_arr(arr,
             storage_type,
             file_path,
             key='/data',
             axis=0,
             chunks_shape=None,
             compression=None):
    """ Save dask array to hdf5 dataset or numpy file stack.
    """

    if storage_type == "hdf5":
        if chunks_shape:
            print(f'Using chunk shape {chunks_shape}')
            da.to_hdf5(file_path, key, arr, chunks=chunks_shape)
        else:
            if compression == "gzip":
                print('Using gzip compression')
                da.to_hdf5(file_path,
                           key,
                           arr,
                           chunks=None,
                           compression="gzip")
            else:
                print('Without compression')
                da.to_hdf5(file_path, key, arr, chunks=None)
    elif storage_type == "numpy":
        da.to_npy_stack(os.path.join(file_path, "npy/"), arr, axis=axis)
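A minimal round trip with this helper might look like the following sketch; the array, chunk shape, and output paths here are illustrative assumptions, not part of the original example:

import os
import dask.array as da

x = da.random.random((1000, 1000), chunks=(250, 1000))

# HDF5 target: one dataset stored under the given key.
save_arr(x, "hdf5", "example.h5", key="/data", chunks_shape=(250, 1000))

# npy-stack target: save_arr writes into <file_path>/npy/, one .npy file per
# chunk along `axis`. The parent directory must already exist.
os.makedirs("example_dir", exist_ok=True)
save_arr(x, "numpy", "example_dir", axis=0)
restored = da.from_npy_stack("example_dir/npy/")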
Code example #2
import os
from time import time

import h5py
import dask.array as da


def to_npy_stack(source_h5_path,
                 dest_path,
                 verbose=False,
                 channel_len=1048576):
    """
    Convert original h5 file to npy stack

    :param source_h5_path:
    :param dest_path:
    :param channel_len:
    :return:
    """
    if verbose:
        start = time()
        print("Converting to npy stack")
    h5_file = h5py.File(source_h5_path, "r")
    # num_chans_per_block is assumed to be a module-level constant in the
    # original file; this prints the chunk length used along the last axis.
    print(channel_len * num_chans_per_block)
    arr = da.from_array(h5_file["data"],
                        chunks=(2, 1, channel_len * num_chans_per_block))
    if not os.path.isdir(dest_path):
        os.mkdir(dest_path)
    if not os.path.isdir(dest_path + "/original"):
        os.mkdir(dest_path + "/original")
    da.to_npy_stack(dest_path + "/original", arr, axis=2)
    if verbose:
        end = time()
        print("Converted to npy stack in %.4f seconds." % (end - start))
Code example #3
File: utils.py Project: GTimothee/dask_utils_perso
def save_arr(arr, storage_type, file_path, key='/data', axis=0, chunks_shape=None):
    """ Save array to hdf5 dataset or numpy file stack.
    """
    if storage_type == "hdf5":
        if chunks_shape:
            da.to_hdf5(file_path, key, arr, chunks=chunks_shape)
        else:
            da.to_hdf5(file_path, key, arr)
    elif storage_type == "numpy":
        da.to_npy_stack(os.path.join(file_path, "npy/"), arr, axis=axis)
    return
Code example #4
def test_to_npy_stack():
    x = np.arange(5 * 10 * 10).reshape((5, 10, 10))
    d = da.from_array(x, chunks=(2, 4, 4))

    dirname = mkdtemp()
    try:
        da.to_npy_stack(dirname, d, axis=0)
        assert os.path.exists(os.path.join(dirname, '0.npy'))
        assert (np.load(os.path.join(dirname, '1.npy')) == x[2:4]).all()

        e = da.from_npy_stack(dirname)
        assert eq(d, e)
    finally:
        shutil.rmtree(dirname)
Code example #5
File: test_array_core.py Project: hc10024/dask
def test_to_npy_stack():
    x = np.arange(5*10*10).reshape((5, 10, 10))
    d = da.from_array(x, chunks=(2, 4, 4))

    dirname = mkdtemp()
    try:
        da.to_npy_stack(dirname, d, axis=0)
        assert os.path.exists(os.path.join(dirname, '0.npy'))
        assert (np.load(os.path.join(dirname, '1.npy')) == x[2:4]).all()

        e = da.from_npy_stack(dirname)
        assert eq(d, e)
    finally:
        shutil.rmtree(dirname)
Code example #6
def split_array(arr, f, nb_blocks=None):
    """ Split an array given its chunk shape and store the blocks.

    The active code path writes the blocks as a numpy file stack; the
    disabled path below would store them as one hdf5 dataset per block
    in the open hdf5 file `f`.

    Arguments:
    ----------
        arr: dask array to split
        f: open hdf5 file (used only by the hdf5 code path)
        nb_blocks: number of blocks we want to extract
    """
    # arr_list = get_arr_list(arr, nb_blocks)
    datasets = list()

    # for hdf5 (disabled):
    # for i, a in enumerate(arr_list):
    #     # print("creating dataset in split file -> dataset path: ", '/data' + str(i))
    #     # print("storing data of shape", a.shape)
    #     datasets.append(f.create_dataset('/data' + str(i), shape=a.shape))
    # return da.store(arr_list, datasets, compute=False)

    # for numpy storage
    return da.to_npy_stack('data/numpy_data', arr, axis=0)
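For reference, a runnable sketch of the disabled hdf5 path, assuming get_arr_list returns a list of dask array blocks and f is an open, writable h5py.File (the helper name split_to_hdf5 is hypothetical):

import h5py
import dask.array as da


def split_to_hdf5(arr_list, f):
    # One hdf5 dataset per block; da.store with compute=False returns a
    # delayed object that writes every block when computed.
    datasets = [
        f.create_dataset('/data' + str(i), shape=a.shape, dtype=a.dtype)
        for i, a in enumerate(arr_list)
    ]
    return da.store(arr_list, datasets, compute=False)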
Code example #7
    X = np.zeros((len(corpus), maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i].split()[:maxlen][::-1]):
            X[i, -1 - no] = dic.get(k, UNK)
    return X


sentiment_label = ['negative', 'positive']
g_sentiment = load_graph('sentiment.pb')
x_sentiment = g_sentiment.get_tensor_by_name('import/Placeholder:0')
logits_sentiment = g_sentiment.get_tensor_by_name('import/logits:0')
sess_sentiment = tf.InteractiveSession(graph=g_sentiment)

with open('fast-text-sentiment.json') as fopen:
    dict_sentiment = json.load(fopen)


def classify(texts):
    batch_x_text = [clearstring(t) for t in texts]
    batch_x = str_idx(batch_x_text, dict_sentiment['dictionary'], 100)
    output_sentiment = sess_sentiment.run(logits_sentiment,
                                          feed_dict={x_sentiment: batch_x})
    labels = [sentiment_label[l] for l in np.argmax(output_sentiment, 1)]
    # Return a NumPy array of label strings so this function can be used with
    # map_blocks (da.stack expects dask arrays, not plain Python strings).
    return np.array(labels, dtype=object)


# Materialize the text lines into a dask array of strings, classify them
# block-wise, merge the result into a single chunk and write it out.
b = db.read_text('big-text.txt')
texts = np.array(b.compute(), dtype=object)
stacked = da.from_array(texts, chunks=20)
result = stacked.map_blocks(classify, dtype=object).rechunk(-1)
da.to_npy_stack('./', result)
Code example #8
def to_stack(p, d):

    x = da.stack(d, axis=0)
    da.to_npy_stack(p, x, axis=0)

    return x
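A hypothetical call, assuming d is a list of equally shaped dask arrays; the directory name and block contents are illustrative:

import numpy as np
import dask.array as da

# Three 4x4 blocks stacked into a (3, 4, 4) array, then written as
# stack_dir/0.npy ... stack_dir/2.npy plus an 'info' metadata file.
parts = [da.from_array(np.full((4, 4), i), chunks=(4, 4)) for i in range(3)]
stacked = to_stack('stack_dir', parts)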
Code example #9
File: DistanceDaskMPI.py Project: zbsun/DiffusionMap
            tmp_data_holder = h5file_holder[file_name][data_name]
            tmp_dask_data_holder = da.from_array(
                tmp_data_holder[data_ends_list[data_idx][0]:data_ends_list[data_idx][1]],
                chunks='auto')
            dataset_holder.append(tmp_dask_data_holder)

    # Create dask arrays based on these h5 files
    dataset = da.concatenate(dataset_holder, axis=0)

    # Calculate the correlation matrix.
    num_dim = len(dataset.shape)
    inner_prod_matrix = da.tensordot(dataset, dataset, axes=(list(range(1, num_dim)), list(range(1, num_dim))))

    # Save the distance patch. Note that to_npy_stack treats name_to_save as a
    # directory of .npy chunk files, despite the ".npy" suffix in the name.
    name_to_save = address_output + "/distances/patch_{}_{}.npy".format(comm_rank - 1, comm_rank - 1)
    da.to_npy_stack(name_to_save, inner_prod_matrix)

    # comm.Barrier()  # There is no need to synchronize here

    """
    Step Four: Calculate the off-diagonal patch
    """
    # Construct the data for off-diagonal patch
    patch_number = len(job_list[comm_rank - 1]) - 1

    for _local_idx in range(1, patch_number):  # The first patch calculated for each row is the diagonal patch.

        # Get to know which patch is to process
        job_idx = job_list[comm_rank - 1][_local_idx]
        col_info_holder = data_source.batch_ends_local[job_idx[1]]  # For different horizontal patches
Code example #10
import dask.array as da
import numpy as np
dirname = '../data/'
filename = dirname + 'gyroidUniform.npy'

# Memory-map the .npy file, a float64 array of shape (200, 200, 200)
# (np.load with mmap_mode parses the .npy header, unlike a raw np.memmap)
np_map = np.load(filename, mmap_mode='r')

# Wrap the memory-mapped array in a Dask array
# (chunks=25 gives blocks of 25 elements along every axis)
dask_arr = da.from_array(np_map, chunks=25)

# Split into Blocks
da.to_npy_stack(dirname, dask_arr, axis=0)

# Load the blocks that to_npy_stack wrote (0.npy ... 7.npy, 25 slices each)
blocks = [np.load(dirname + f'{i}.npy') for i in range(8)]
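Instead of loading each block by hand, the stack can also be reassembled lazily, reusing the dirname and dask.array import from the example above:

# from_npy_stack reads the 'info' metadata file plus the numbered .npy chunks
# and returns a lazy (200, 200, 200) array chunked along axis 0.
restored = da.from_npy_stack(dirname)
print(restored.chunks)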
Code example #11
File: ecog_emotion.py Project: valentina-s/OpenMind
def find_filename_data(au_emote_dict_loc,
                       classifier_loc,
                       real_time_file_loc,
                       out_loc,
                       out_q,
                       filename,
                       return_plot_data=False,
                       event_delta_seconds=1):
    """
    Find PSD data for given file and dump it

    :param au_emote_dict_loc: location of au_emote_dict, in json format
    :param classifier_loc: location of the pickled classifier
    :param real_time_file_loc: location of real times
    :param out_loc: Where to dump PSD data
    :param out_q: Output queue for multiprocessing
    :param filename: File to find data of
    :param return_plot_data: Whether this is used for plotting
    :param event_delta_seconds: Seconds around an event to do PSD of
    """

    tqdm_num, filename = filename
    tqdm_num = (tqdm_num % 5) + 1
    au_emote_dict = json.load(open(au_emote_dict_loc))
    raw = map_raw(filename)

    if raw is None:
        return

    events, times, corr = get_events(filename, au_emote_dict, classifier_loc,
                                     real_time_file_loc)

    if times is None:
        times = []
    # print("{0} number of times: {1}".format(filename, len(times)))

    if times:
        predicDic = {time: predic for time, predic in zip(times, corr)}
        eventTimes = set(x[0] for x in events)
        picks = mne.pick_types(raw.info, ecog=True, ecg=True)

        if return_plot_data:
            return get_window_data(raw, times, corr, picks, eventTimes,
                                   tqdm_num, filename, return_plot_data,
                                   event_delta_seconds)

        freqs, temp_all_data, temp_labels, temp_times, test_data, test_times = get_window_data(
            raw, times, corr, picks, eventTimes, tqdm_num, filename,
            return_plot_data, event_delta_seconds)

        if freqs is not None:
            freqs = da.from_array(freqs, chunks=(100, ))

        if freqs is not None:
            filename_out_dir = os.path.join(
                out_loc, 'classifier_data',
                os.path.basename(filename).replace('.edf', ''))

            if not os.path.exists(filename_out_dir):
                os.makedirs(filename_out_dir)

            da.to_npy_stack(os.path.join(filename_out_dir, 'freqs'), freqs)

            conditional_dump(temp_all_data,
                             os.path.join(filename_out_dir, 'data'))
            conditional_dump(temp_labels,
                             os.path.join(filename_out_dir, 'labels'))
            conditional_dump(temp_times,
                             os.path.join(filename_out_dir, 'times'))
            conditional_dump(test_data,
                             os.path.join(filename_out_dir, 'test_data'))
            conditional_dump(test_times,
                             os.path.join(filename_out_dir, 'test_times'))

            # if temp_all_data is not None:
            # da.to_npy_stack(
            # os.path.join(filename_out_dir, 'data'),
            # da.from_array(temp_all_data, chunks=(10000, -1, -1)))

            # if temp_labels is not None:
            # da.to_npy_stack(
            # os.path.join(filename_out_dir, 'labels'),
            # da.from_array(temp_labels, chunks=(10000, )))

            # if temp_times is not None:
            # da.to_npy_stack(
            # os.path.join(filename_out_dir, 'times'),
            # da.from_array(temp_times, chunks=(10000, )))
    out_q.put((filename, len(times)))
Code example #12
File: ecog_emotion.py Project: valentina-s/OpenMind
def conditional_dump(array, location):
    if array is not None:
        da.to_npy_stack(location, array)
Code example #13
File: ecog_emotion.py Project: emilaz/OpenMind
def find_filename_data(au_emote_dict_loc,
                       classifier_loc,
                       real_time_file_loc,
                       out_loc,
                       out_q,
                       filename,
                       return_plot_data=False,
                       event_delta_seconds=1):
    tqdm_num, filename = filename
    tqdm_num = (tqdm_num % 5) + 1
    au_emote_dict = json.load(open(au_emote_dict_loc))
    raw = map_raw(filename)

    if raw is None:
        return

    events, times, corr = get_events(filename, au_emote_dict, classifier_loc,
                                     real_time_file_loc)

    if times is None:
        times = []
    # print("{0} number of times: {1}".format(filename, len(times)))

    if times:
        predicDic = {time: predic for time, predic in zip(times, corr)}
        eventTimes = set(x[0] for x in events)
        picks = mne.pick_types(raw.info, ecog=True, ecg=True)

        if return_plot_data:
            return get_window_data(raw, times, corr, picks, eventTimes,
                                   tqdm_num, filename, return_plot_data,
                                   event_delta_seconds)

        freqs, temp_all_data, temp_labels, temp_times, test_data, test_times = get_window_data(
            raw, times, corr, picks, eventTimes, tqdm_num, filename,
            return_plot_data, event_delta_seconds)

        if freqs is not None:
            freqs = da.from_array(freqs, chunks=(100, ))

        if freqs is not None:
            filename_out_dir = os.path.join(out_loc, 'classifier_data',
                                            os.path.basename(filename).replace(
                                                '.edf', ''))

            if not os.path.exists(filename_out_dir):
                os.makedirs(filename_out_dir)

            da.to_npy_stack(os.path.join(filename_out_dir, 'freqs'), freqs)

            conditional_dump(temp_all_data,
                             os.path.join(filename_out_dir, 'data'))
            conditional_dump(temp_labels,
                             os.path.join(filename_out_dir, 'labels'))
            conditional_dump(temp_times,
                             os.path.join(filename_out_dir, 'times'))
            conditional_dump(test_data,
                             os.path.join(filename_out_dir, 'test_data'))
            conditional_dump(test_times,
                             os.path.join(filename_out_dir, 'test_times'))

            # if temp_all_data is not None:
            # da.to_npy_stack(
            # os.path.join(filename_out_dir, 'data'),
            # da.from_array(temp_all_data, chunks=(10000, -1, -1)))

            # if temp_labels is not None:
            # da.to_npy_stack(
            # os.path.join(filename_out_dir, 'labels'),
            # da.from_array(temp_labels, chunks=(10000, )))

            # if temp_times is not None:
            # da.to_npy_stack(
            # os.path.join(filename_out_dir, 'times'),
            # da.from_array(temp_times, chunks=(10000, )))
    out_q.put((filename, len(times)))
Code example #14
        pattern=pattern,
    ).generator():
        b = block_array
        print("Done:", b.shape)



if __name__ == '__main__':
    if len(list(Path('.').glob('fail.zarr'))) == 0:
        from numcodecs import Blosc
        compressor = Blosc()
        x = da.random.random((100, 100, 3), chunks=(100, 100,3))
        x.to_zarr('fail.zarr', compressor=compressor, overwrite=True)
    if len(list(Path('.').glob('fail.npystack'))) == 0:
        x = da.random.random((100, 100, 3), chunks=(100, 100,3))
        da.to_npy_stack('fail.npystack', x)


    from dask.distributed import LocalCluster, Client
    # cluster = LocalCluster(host='0.0.0.0', n_workers=1, threads_per_worker=1)
    # client = Client(cluster)              # 'fail'
    client = Client(threads_per_worker=1)   # 'fail'
    # client = Client(processes=False, threads_per_worker=1)   # 'not fail'

    load_zarr() # 'fail' with zarr

    #load_zarr('fail.npystack') # 'not fail' with npystack



Code example #15
    tst_predictions = []
    for data in tqdm(test):
        with torch.no_grad():
            #load data onto gpu
            #then forward pass
            images = data['image'].cuda(non_blocking=True)

            output = model.eval()(images)
            pred = nn.Sigmoid()(output)

        tst_predictions.append(pred.detach().cpu().numpy())

    tst_predictions = np.concatenate(tst_predictions, axis=0)

    #create an array of labels that are all zeros and fill in the values from a combination
    #of the ground truth labels from training and validation sets and the predicted
    #labels for unlabeled indices
    #convert gt_labels from strings to integers
    predicted_labels = (gt_labels == 'informative').astype(np.uint8)
    predicted_labels[unlabeled_indices] = (tst_predictions[:, 0] > 0.5).astype(
        np.uint8)

    print(f'Saving predictions...')
    np.save(os.path.join(savedir, "nn_predictions.npy"), predicted_labels)

    print(f'Saving filepaths...')
    filtered_fpaths = da.from_array(impaths[predicted_labels == 1].compute())
    # to_npy_stack writes a directory of .npy chunk files here; the '.npz'
    # suffix only names that directory.
    da.to_npy_stack(os.path.join(savedir, 'nn_filtered_fpaths.npz'),
                    filtered_fpaths)

    print('Finished.')
Code example #16
            #remove all the matched images from both hashes and impaths
            hashes = np.delete(hashes, matches, axis=0)
            impaths = np.delete(impaths, matches, axis=0)

        #because this script can take a long time to complete, let's save checkpoint
        #results for each dataset when it's finished with deduplication, then we have
        #the option to resume later on
        np.save(exemplar_fpath, np.array(exemplars))

    #run the dataset level deduplication on multiple groups at once
    #results for each group are saved in separate .npy files, if the
    #.npy file already exists, then it will be skipped. This makes it
    #easier to add new datasets to the existing directory structure
    with Pool(processes) as pool:
        pool.map(group_dedupe, list(zip(unq_datasets, groups_impaths)))

    #now that all the patches from individual datasets are deduplicated,
    #we'll combine all the separate .npy arrays into a single dask array and save it
    exemplar_fpaths = glob(os.path.join(savedir, '*_exemplars.npy'))
    deduplicated_fpaths = np.concatenate(
        [np.load(fp) for fp in exemplar_fpaths])

    #convert to dask and save
    deduplicated_fpaths = da.from_array(deduplicated_fpaths)
    da.to_npy_stack(os.path.join(savedir, 'deduplicated_fpaths.npz'),
                    deduplicated_fpaths)

    #print the total number of deduplicated patches
    print(f'{len(deduplicated_fpaths)} patches remaining after deduplication.')
Code example #17
def write_to_npy_stack(out_dir, arr):
    a__, b__, c__ = da.to_npy_stack(out_dir, arr, axis=0)
    _ = dask.base.compute_as_if_collection(a__, b__, c__)
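For comparison, in recent dask releases da.to_npy_stack writes the stack eagerly and returns None, so a minimal version of this helper (a sketch, not the project's code) needs no explicit compute step:

import dask.array as da


def write_to_npy_stack(out_dir, arr):
    # Writes out_dir/info plus one numbered .npy file per chunk along axis 0;
    # the call itself triggers the computation.
    da.to_npy_stack(out_dir, arr, axis=0)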