def test_export(self):
    """Export the test dataset both directly through the Dataset API and
    through the GUI export dialog, then verify the two files hold the
    same columns and values.

    Sweeps the cross product of: active fraction, exported column subset,
    byteorder, shuffle, and selection.  FITS export is only exercised for
    the big-endian byteorder ">" (presumably because FITS is inherently
    big-endian -- TODO confirm).
    """
    # Scratch output paths: one direct/UI pair per file format.
    path_hdf5 = tempfile.mktemp(".hdf5")
    path_hdf5_ui = tempfile.mktemp(".hdf5")
    path_fits = tempfile.mktemp(".fits")
    path_fits_ui = tempfile.mktemp(".fits")
    for dataset in [self.dataset]:
        self.app.dataset_selector.add(dataset)
        for fraction in [1, 0.5]:
            dataset.set_active_fraction(fraction)
            dataset.select("x > 3")
            length = len(dataset)  # NOTE(review): unused -- kept, possibly for debugging
            # TODO: gui doesn't export virtual columns, add "z" to this list
            for column_names in [["x", "y"], ["x"], ["y"]]:
                for byteorder in "=<>":
                    for shuffle in [False, True]:
                        for selection in [False, True]:
                            # HDF5 is exported for every byteorder; FITS only when byteorder is ">".
                            for export in [dataset.export_fits, dataset.export_hdf5] if byteorder == ">" else [dataset.export_hdf5]:
                                type = "hdf5" if export == dataset.export_hdf5 else "fits"
                                if shuffle and selection:
                                    continue  # TODO: export should fail on this combination
                                #print column_names, byteorder, shuffle, selection, type
                                # 1) Export directly through the dataset API.
                                if export == dataset.export_hdf5:
                                    path = path_hdf5
                                    path_ui = path_hdf5_ui
                                    export(path, column_names=column_names, byteorder=byteorder, shuffle=shuffle, selection=selection)
                                else:
                                    # export_fits takes no byteorder argument.
                                    path = path_fits
                                    path_ui = path_fits_ui
                                    export(path, column_names=column_names, shuffle=shuffle, selection=selection)
                                compare_direct = vx.open(path)
                                # 2) Export through the GUI, scripting the dialogs:
                                # first choose-dialog answers the selection question,
                                # the second picks the byteorder by index.
                                dialogs.set_choose(1 if selection else 0).then("=<>".index(byteorder))
                                # select columns
                                dialogs.set_select_many(True, [name in column_names for name in dataset.get_column_names()])
                                # Confirm dialog answers with `shuffle` -- presumably the shuffle prompt; TODO confirm.
                                counter_confirm = CallCounter(return_value=shuffle)
                                counter_info = CallCounter()
                                dialogs.dialog_confirm = counter_confirm
                                dialogs.dialog_info = counter_info
                                dialogs.get_path_save = lambda *args: path_ui
                                dialogs.ProgressExecution = dialogs.FakeProgressExecution
                                import sys
                                sys.stdout.flush()
                                self.app.export(type=type)
                                compare_ui = vx.open(path_ui)
                                column_names = column_names or ["x", "y", "z"]
                                # Both exports must contain the same set of columns ...
                                self.assertEqual(compare_direct.get_column_names(), compare_ui.get_column_names())
                                for column_name in column_names:
                                    values_ui = compare_ui.evaluate(column_name)
                                    values = compare_direct.evaluate(column_name)
                                    # ... and the same values; compared sorted, since
                                    # a shuffled export may reorder rows.
                                    self.assertEqual(sorted(values), sorted(values_ui))
def test_export(self):
    """Export the test dataset both directly through the Dataset API and
    through the GUI export dialog, then verify the two files hold the
    same columns and values.

    NOTE(review): this is a duplicate definition of ``test_export``; if both
    appear in one class, the later one shadows the earlier -- confirm intent.

    Sweeps the cross product of: active fraction, exported column subset,
    byteorder, shuffle, and selection.  FITS export is only exercised for
    the big-endian byteorder ">" (presumably because FITS is inherently
    big-endian -- TODO confirm).
    """
    # Scratch output paths: one direct/UI pair per file format.
    path_hdf5 = tempfile.mktemp(".hdf5")
    path_hdf5_ui = tempfile.mktemp(".hdf5")
    path_fits = tempfile.mktemp(".fits")
    path_fits_ui = tempfile.mktemp(".fits")
    for dataset in [self.dataset]:
        self.app.dataset_selector.add(dataset)
        for fraction in [1, 0.5]:
            dataset.set_active_fraction(fraction)
            dataset.select("x > 3")
            length = len(dataset)  # NOTE(review): unused -- kept, possibly for debugging
            # TODO: gui doesn't export virtual columns, add "z" to this list
            for column_names in [["x", "y"], ["x"], ["y"]]:
                for byteorder in "=<>":
                    for shuffle in [False, True]:
                        for selection in [False, True]:
                            # HDF5 is exported for every byteorder; FITS only when byteorder is ">".
                            for export in [dataset.export_fits, dataset.export_hdf5] if byteorder == ">" else [dataset.export_hdf5]:
                                type = "hdf5" if export == dataset.export_hdf5 else "fits"
                                if shuffle and selection:
                                    continue  # TODO: export should fail on this combination
                                #print column_names, byteorder, shuffle, selection, type
                                # 1) Export directly through the dataset API.
                                if export == dataset.export_hdf5:
                                    path = path_hdf5
                                    path_ui = path_hdf5_ui
                                    export(path, column_names=column_names, byteorder=byteorder, shuffle=shuffle, selection=selection)
                                else:
                                    # export_fits takes no byteorder argument.
                                    path = path_fits
                                    path_ui = path_fits_ui
                                    export(path, column_names=column_names, shuffle=shuffle, selection=selection)
                                compare_direct = vx.open(path)
                                # 2) Export through the GUI, scripting the dialogs:
                                # first choose-dialog answers the selection question,
                                # the second picks the byteorder by index.
                                dialogs.set_choose(1 if selection else 0).then("=<>".index(byteorder))
                                # select columns
                                dialogs.set_select_many(True, [name in column_names for name in dataset.get_column_names()])
                                # Confirm dialog answers with `shuffle` -- presumably the shuffle prompt; TODO confirm.
                                counter_confirm = CallCounter(return_value=shuffle)
                                counter_info = CallCounter()
                                dialogs.dialog_confirm = counter_confirm
                                dialogs.dialog_info = counter_info
                                dialogs.get_path_save = lambda *args: path_ui
                                dialogs.ProgressExecution = dialogs.FakeProgressExecution
                                import sys
                                sys.stdout.flush()
                                self.app.export(type=type)
                                compare_ui = vx.open(path_ui)
                                column_names = column_names or ["x", "y", "z"]
                                # Both exports must contain the same set of columns ...
                                self.assertEqual(compare_direct.get_column_names(), compare_ui.get_column_names())
                                for column_name in column_names:
                                    values_ui = compare_ui.evaluate(column_name)
                                    values = compare_direct.evaluate(column_name)
                                    # ... and the same values; compared sorted, since
                                    # a shuffled export may reorder rows.
                                    self.assertEqual(sorted(values), sorted(values_ui))
def export_fits(dataset, path, column_names=None, shuffle=False, selection=False, progress=None, virtual=True, sort=None, ascending=True):
    """Export (a selection of) *dataset* to a FITS binary table at *path*.

    :param DatasetLocal dataset: dataset to export
    :param str path: path for file
    :param list[str] column_names: list of column names to export or None for all columns
    :param bool shuffle: export rows in random order
    :param bool selection: export selection or not
    :param progress: progress callback that gets a progress fraction as argument and should return True to continue,
        or a default progress bar when progress=True
    :param bool virtual: When True, export virtual columns
    :param sort: passed through to vaex.export._export -- presumably an expression to sort rows by; TODO confirm
    :param bool ascending: sort direction, passed through to vaex.export._export
    :return:
    """
    if shuffle:
        # Pick a name for the temporary shuffle-index column that does not
        # collide with any existing column name.
        random_index_name = "random_index"
        while random_index_name in dataset.get_column_names():
            random_index_name += "_new"
    column_names = column_names or dataset.get_column_names(virtual=virtual, strings=True)
    logger.debug("exporting columns(fits): %r" % column_names)
    # Number of rows to write: full (active) length, or the selection count.
    N = len(dataset) if not selection else dataset.selected_length(selection)
    data_types = []
    data_shapes = []
    ucds = []
    units = []
    for column_name in column_names:
        if column_name in dataset.get_column_names(strings=True, virtual=False):
            # Real (materialized) column: keep its dtype and trailing shape.
            column = dataset.columns[column_name]
            shape = (N,) + column.shape[1:]
            dtype = column.dtype
            if dataset.is_string(column_name):
                # FITS stores fixed-width strings: size to the longest value.
                max_length = dataset[column_name].apply(lambda x: len(x)).max(selection=selection)
                dtype = np.dtype('S'+str(int(max_length)))
        else:
            # Virtual column (expression): exported as plain float64 scalars.
            dtype = np.float64().dtype
            shape = (N,)
        ucds.append(dataset.ucds.get(column_name))
        units.append(dataset.units.get(column_name))
        data_types.append(dtype)
        data_shapes.append(shape)
    if shuffle:
        # Reserve an extra int64 column in the output layout for the shuffle
        # index.  NOTE(review): this temporarily appends to the (possibly
        # caller-supplied) column_names list; it is removed again below.
        column_names.append(random_index_name)
        data_types.append(np.int64().dtype)
        data_shapes.append((N,))
        ucds.append(None)
        units.append(None)
    else:
        random_index_name = None
    # TODO: all expressions can have missing values.. how to support that?
    # Collect fill values of masked non-float columns so nulls round-trip.
    null_values = {key: dataset.columns[key].fill_value for key in dataset.get_column_names() if dataset.is_masked(key) and dataset.data_type(key).kind != "f"}
    # Pre-allocate the FITS file with the final column layout.
    empty(path, N, column_names, data_types, data_shapes, ucds, units, null_values=null_values)
    if shuffle:
        # Undo the temporary bookkeeping entries added above.
        del column_names[-1]
        del data_types[-1]
        del data_shapes[-1]
    # Reopen the pre-allocated file writable and stream the data into it.
    dataset_output = vaex.astro.fits.FitsBinTable(path, write=True)
    df_output = vaex.dataframe.DataFrameLocal(dataset_output)
    vaex.export._export(dataset_input=dataset, dataset_output=df_output, path=path, random_index_column=random_index_name, column_names=column_names, selection=selection, shuffle=shuffle, progress=progress, sort=sort, ascending=ascending)
    dataset_output.close()
def unique_column_names(dataset):
    """Return all column names of *dataset*, virtual columns included.

    The old implementation unioned ``dataset.column_names`` with
    ``dataset.virtual_columns.keys()``; ``get_column_names(virtual=True)``
    yields the same collection through the supported API.
    """
    names = dataset.get_column_names(virtual=True)
    return names