def __init__(self, path_to_db_files, path_to_output_db_file=None):
    """Initialize V1BaseDBHandler.

    Args:
        path_to_db_files (list or str): db-files with extension '.arrow'
        path_to_output_db_file (str): output database for saving

    """
    if isinstance(path_to_db_files, str):
        path_to_db_files = [path_to_db_files]

    # Check file extensions
    for file in path_to_db_files:
        if not file.endswith('.arrow'):
            raise ValueError('DB file path must end with ".arrow"')

    self.path_to_db_file = path_to_db_files[0]
    if path_to_output_db_file is not None:
        self.path_to_db_file = path_to_output_db_file
    self._cursor = 0
    self.logger = logging.getLogger(__name__)

    # Load config
    self._config = load_config(self.__version__)
    self.columns = self._config[self.df_name]['columns']

    # Initialize dataframe
    try:
        self._df = vaex.open_many(path_to_db_files)
    except IOError:
        self._df = None
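# A minimal, standalone sketch of the normalization pattern used above:
# accept either a single path or a list, validate the extension, then fall
# back to None when vaex cannot open the files. The helper name and paths
# are hypothetical, not part of the original class.
import logging
import vaex

def open_arrow_files(paths):
    if isinstance(paths, str):
        paths = [paths]
    for path in paths:
        if not path.endswith('.arrow'):
            raise ValueError('DB file path must end with ".arrow"')
    try:
        return vaex.open_many(paths)
    except IOError:
        logging.getLogger(__name__).warning('could not open %s', paths)
        return None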
def _consolidate_folder(self, folder):
    abs_path = os.path.abspath(os.path.join(self.folder_path, folder))
    contents = os.listdir(abs_path)
    concatenated_df = vaex.open_many(
        [os.path.join(abs_path, content) for content in contents])
    # Export first, then delete: the dataframe is memory-mapped from the
    # source files, so they must outlive the export.
    concatenated_df.export_hdf5(os.path.join(abs_path, folder.lower() + '.hdf5'))
    for content in contents:
        os.remove(os.path.join(abs_path, content))
def h5pandas_to_vaex_combine(tmp_search_pattern: str,
                             path_out_str: str,
                             check_files_number: int = None,
                             del_found_tmp_files: bool = False,
                             **export_hdf5_args) -> int:
    """
    Combine many vaex.hdf5 files into one.

    :param tmp_search_pattern: glob pattern used to find the temporary files
    :param path_out_str: path argument of vaex.dataframe.export_hdf5()
    :param check_files_number: if not None, must be equal to the number of found files
    :param del_found_tmp_files: delete the found temporary files after combining
        (partially implemented, see todo below)
    :param export_hdf5_args: dict, optional. Note that here the default of
        'progress' is set to True
    :return: number of tmp files found
    """
    # Find files
    hdf5_list = glob.glob(tmp_search_pattern)
    hdf5_list.sort()

    # Check file existence
    if Path(path_out_str).is_file():
        lf.warning('Overwriting {:s}!', path_out_str)
    if check_files_number:
        assert len(hdf5_list) == check_files_number, "Incorrect number of files"
        lf.info('Combining {:d} found {:s} files to {:s}',
                check_files_number, tmp_search_pattern, PurePath(path_out_str).name)
    else:
        check_files_number = len(hdf5_list)
        lf.info('Combining {:s} to {:s}', tmp_search_pattern, PurePath(path_out_str).name)

    master_df = vaex.open_many(hdf5_list)
    try:
        master_df.export_hdf5(**{'path': path_out_str, 'progress': True, **export_hdf5_args})
    except AttributeError:
        # progress=True can raise AttributeError:
        # 'ProgressBar' object has no attribute 'stime0'
        lf.debug('Try installing progressbar2')

    # Delete the tmp files found
    if del_found_tmp_files:
        # will not work; todo: do only when export finished (use custom progress func?)
        i = 0
        try:
            for i, path_tmp in enumerate(hdf5_list):
                Path(path_tmp).unlink()  # remove file
        except Exception:
            lf.exception('Combined {0:d} but removed {i:d} temporary vaex.hdf5 files:',
                         check_files_number, i=i)
        else:
            lf.info('Combined and removed {0:d} files.', check_files_number)
    else:
        lf.info('Combined {:d} files ({:s}); they remain on disk',
                check_files_number, tmp_search_pattern)
    return check_files_number
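# A hedged usage sketch for h5pandas_to_vaex_combine(); the glob pattern and
# output path are hypothetical, not taken from the original source.
n_found = h5pandas_to_vaex_combine(
    tmp_search_pattern='out/tmp_*.hdf5',  # hypothetical temporary-chunk pattern
    path_out_str='out/combined.hdf5',     # final combined file
    check_files_number=None,              # skip the exact-count assertion
    del_found_tmp_files=False,            # keep the chunks on disk
)
print(f'{n_found} chunk files were combined')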
def read_dataframes_from_file(
        path_list: List[str]) -> Optional[vaex.dataframe.DataFrame]:
    """Read only dataframes already present in data/processed.

    Args:
        path_list: list of paths relative to data/processed.

    Returns:
        vaex dataframe.
    """
    base_path = os.path.join(get_base_data_path(), "processed")
    file_path_list: List[str] = []
    for path in path_list:
        file_path = os.path.join(base_path, f"{path}.arrow")
        if not os.path.exists(file_path):
            raise OSError(f"File not found: {file_path}")
        file_path_list.append(file_path)
    return vaex.open_many(file_path_list)
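# A minimal usage sketch, assuming the data/processed layout described in the
# docstring; the file stems below are hypothetical.
df = read_dataframes_from_file(['trips_2019', 'trips_2020'])
if df is not None:
    print(df.column_names)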
def main(argv):
    import argparse
    parser = argparse.ArgumentParser(argv[0])
    parser.add_argument('--verbose', '-v', action='count', default=0)
    parser.add_argument('--quiet', '-q', default=False, action='store_true',
                        help="do not output anything")
    parser.add_argument('--list', '-l', default=False, action='store_true',
                        help="list columns of input")
    parser.add_argument('--progress', help="show progress (default: %(default)s)",
                        default=True, action='store_true')
    parser.add_argument('--no-progress', dest="progress", action='store_false')
    parser.add_argument('--shuffle', "-s", dest="shuffle", action='store_true', default=False)
    parser.add_argument('--sort', dest="sort", default=None)
    parser.add_argument('--virtual', dest="virtual", action='store_true', default=False,
                        help="Also export virtual columns")
    parser.add_argument('--fraction', "-f", dest="fraction", type=float, default=1.0,
                        help="fraction of input dataset to export")
    parser.add_argument('--filter', dest="filter", default=None,
                        help="filter to apply before exporting")

    subparsers = parser.add_subparsers(help='type of input source', dest="task")

    parser_soneira = subparsers.add_parser('soneira', help='create soneira peebles dataset')
    parser_soneira.add_argument('output', help='output file')
    parser_soneira.add_argument("columns", help="list of columns to export (or all when empty)",
                                nargs="*")
    parser_soneira.add_argument('--dimension', '-d', type=int, help='dimensions', default=4)
    # parser_soneira.add_argument('--eta', '-e', type=int, help='dimensions', default=3)
    parser_soneira.add_argument('--max-level', '-m', type=int, help='dimensions', default=28)
    parser_soneira.add_argument('--lambdas', '-l', type=int,
                                help='lambda values for fractal', default=[1.1, 1.3, 1.6, 2.])

    parser_tap = subparsers.add_parser('tap', help='use TAP (Table Access Protocol) as source')
    parser_tap.add_argument("tap_url", help="input source or file")
    parser_tap.add_argument("table_name", help="input source or file")
    parser_tap.add_argument("output", help="output file (ends in .fits or .hdf5)")
    parser_tap.add_argument("columns", help="list of columns to export (or all when empty)",
                            nargs="*")

    parser_file = subparsers.add_parser(
        'file', help='use a file as source (e.g. .hdf5, .fits, .vot (VO table), .asc (ascii))')
    parser_file.add_argument(
        "input",
        help="input source or file, when prefixed with @ it is assumed to be a text file "
             "with a file list (one file per line)")
    parser_file.add_argument("output", help="output file (ends in .fits or .hdf5)")
    parser_file.add_argument("columns", help="list of columns to export (or all when empty)",
                             nargs="*")

    parser_csv = subparsers.add_parser('csv', help='use a csv file as source')
    parser_csv.add_argument(
        "input",
        help="input source or file, when prefixed with @ it is assumed to be a text file "
             "with a file list (one file per line)")
    parser_csv.add_argument("output", help="output file (ends in .hdf5)")
    parser_csv.add_argument("columns", help="list of columns to export (or all when empty)",
                            nargs="*")

    args = parser.parse_args(argv[1:])

    verbosity = ["ERROR", "WARNING", "INFO", "DEBUG"]
    logging.getLogger("vaex").setLevel(verbosity[min(3, args.verbose)])

    dataset = None
    if args.task == "soneira":
        if vaex.utils.check_memory_usage(4 * 8 * 2**args.max_level,
                                         vaex.utils.confirm_on_console):
            if not args.quiet:
                print("generating soneira peebles dataset...")
            dataset = vaex.dataset_misc.SoneiraPeebles(args.dimension, 2,
                                                       args.max_level, args.lambdas)
            dataset = vaex.dataframe.DataFrameLocal(dataset)
        else:
            return 1
    if args.task == "tap":
        dataset = vaex.dataset.DatasetTap(args.tap_url, args.table_name)
        if not args.quiet:
            print("exporting from {tap_url} table name {table_name} to {output}".format(
                tap_url=args.tap_url, table_name=args.table_name, output=args.output))
    if args.task == "csv":
        if not args.quiet:
            print("exporting from {input} to {output}".format(
                input=args.input, output=args.output))
    if args.task == "file":
        if args.input[0] == "@":
            inputs = open(args.input[1:]).readlines()
            dataset = vaex.open_many(inputs)
        else:
            dataset = vaex.open(args.input)
        if not args.quiet:
            print("exporting from {input} to {output}".format(
                input=args.input, output=args.output))

    if dataset is None and args.task not in ["csv"]:
        if not args.quiet:
            print("Cannot open input")
        return 1
    if dataset:
        dataset.set_active_fraction(args.fraction)
    if args.list:
        if not args.quiet:
            print("column names: " + " ".join(dataset.get_column_names()))
    else:
        if args.task == "csv":
            row_count = -1  # the header does not count
            with open(args.input) as lines:  # Python 2's file() replaced by open()
                for line in lines:
                    row_count += 1
            logger.debug("row_count: %d", row_count)
            with open(args.input) as lines:
                line = next(lines).strip()
                names = line.strip().split(",")
                line = next(lines).strip()
                values = line.strip().split(",")
                numerics = []
                for value in values:
                    try:
                        float(value)
                        numerics.append(True)
                    except ValueError:
                        numerics.append(False)
                names_numeric = [name for name, numeric in zip(names, numerics) if numeric]
                print(names_numeric)
                output = vaex.dataset_misc.Hdf5MemoryMapped.create(
                    args.output, row_count, names_numeric)
                Ncols = len(names)
                cols = [output.columns[name] if numeric else None
                        for name, numeric in zip(names, numerics)]

                def copy(line, row_index):
                    values = line.strip().split(",")
                    for column_index in range(Ncols):
                        if numerics[column_index]:
                            value = float(values[column_index])
                            cols[column_index][row_index] = value

                row = 0
                copy(line, row)
                row += 1
                progressbar = vaex.utils.progressbar(title="exporting") if args.progress else None
                for line in lines:
                    copy(line, row)
                    row += 1
                    if progressbar and row % 1000 == 0:  # update every 1000 rows
                        progressbar.update(row / float(row_count))
                if progressbar:
                    progressbar.finish()
        else:
            if args.columns:
                columns = args.columns
            else:
                columns = None
            if columns is None:
                columns = dataset.get_column_names(strings=True, virtual=args.virtual)
            for column in columns:
                if column not in dataset.get_column_names(strings=True, virtual=True):
                    if not args.quiet:
                        print("column %r does not exist, run with --list or -l "
                              "to list all columns" % column)
                    return 1
            base, output_ext = os.path.splitext(args.output)
            if output_ext not in [".hdf5", ".fits", ".arrow"]:
                if not args.quiet:
                    print("extension %s not supported, only .hdf5, .arrow and .fits are"
                          % output_ext)
                return 1
            if not args.quiet:
                print("exporting %d rows and %d columns" % (len(dataset), len(columns)))
                print("columns: " + " ".join(columns))

            progressbar = vaex.utils.progressbar(title="exporting") if args.progress else None

            def update(p):
                if progressbar:
                    progressbar.update(p)
                return True

            if args.filter:
                dataset.select(args.filter, name='export')
                selection = 'export'
            else:
                selection = None
            if output_ext == ".hdf5":
                export_hdf5(dataset, args.output, column_names=columns, progress=update,
                            shuffle=args.shuffle, sort=args.sort, selection=selection)
            elif output_ext == ".arrow":
                from vaex.arrow.export import export as export_arrow
                export_arrow(dataset, args.output, column_names=columns, progress=update,
                             shuffle=args.shuffle, sort=args.sort, selection=selection)
            elif output_ext == ".fits":
                export_fits(dataset, args.output, column_names=columns, progress=update,
                            shuffle=args.shuffle, sort=args.sort, selection=selection)
            if progressbar:
                progressbar.finish()
            if not args.quiet:
                print("\noutput to %s" % os.path.abspath(args.output))
            dataset.close()
    return 0
def open(self):
    # Open a single file directly; otherwise concatenate via open_many
    if len(self.filenames_vaex) == 1:
        return vx.open(self.filenames_vaex[0])
    return vx.open_many(self.filenames_vaex)
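# The same dispatch as a standalone helper, a sketch assuming
# `import vaex as vx`: vaex.open_many() returns a concatenated dataframe,
# so a single file is opened directly to avoid a needless concatenation layer.
import vaex as vx

def open_vaex(filenames):
    if len(filenames) == 1:
        return vx.open(filenames[0])
    return vx.open_many(filenames)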
def main(argv):
    import argparse
    parser = argparse.ArgumentParser(argv[0])
    parser.add_argument('--verbose', '-v', action='count', default=0)
    parser.add_argument('--quiet', '-q', default=False, action='store_true',
                        help="do not output anything")
    parser.add_argument('--list', '-l', default=False, action='store_true',
                        help="list columns of input")
    parser.add_argument('--progress', help="show progress (default: %(default)s)",
                        default=True, action='store_true')
    parser.add_argument('--no-progress', dest="progress", action='store_false')
    parser.add_argument('--shuffle', "-s", dest="shuffle", action='store_true', default=False)
    parser.add_argument('--sort', dest="sort", default=None)
    parser.add_argument('--virtual', dest="virtual", action='store_true', default=False,
                        help="Also export virtual columns")
    parser.add_argument('--fraction', "-f", dest="fraction", type=float, default=1.0,
                        help="fraction of input dataset to export")
    parser.add_argument('--filter', dest="filter", default=None,
                        help="filter to apply before exporting")

    subparsers = parser.add_subparsers(help='type of input source', dest="task")

    parser_soneira = subparsers.add_parser('soneira', help='create soneira peebles dataset')
    parser_soneira.add_argument('output', help='output file')
    parser_soneira.add_argument("columns", help="list of columns to export (or all when empty)",
                                nargs="*")
    parser_soneira.add_argument('--dimension', '-d', type=int, help='dimensions', default=4)
    # parser_soneira.add_argument('--eta', '-e', type=int, help='dimensions', default=3)
    parser_soneira.add_argument('--max-level', '-m', type=int, help='dimensions', default=28)
    parser_soneira.add_argument('--lambdas', '-l', type=int,
                                help='lambda values for fractal', default=[1.1, 1.3, 1.6, 2.])

    parser_tap = subparsers.add_parser('tap', help='use TAP (Table Access Protocol) as source')
    parser_tap.add_argument("tap_url", help="input source or file")
    parser_tap.add_argument("table_name", help="input source or file")
    parser_tap.add_argument("output", help="output file (ends in .fits or .hdf5)")
    parser_tap.add_argument("columns", help="list of columns to export (or all when empty)",
                            nargs="*")

    parser_file = subparsers.add_parser(
        'file', help='use a file as source (e.g. .hdf5, .fits, .vot (VO table), .asc (ascii))')
    parser_file.add_argument(
        "input",
        help="input source or file, when prefixed with @ it is assumed to be a text file "
             "with a file list (one file per line)")
    parser_file.add_argument("output", help="output file (ends in .fits or .hdf5)")
    parser_file.add_argument("columns", help="list of columns to export (or all when empty)",
                             nargs="*")

    parser_csv = subparsers.add_parser('csv', help='use a csv file as source')
    parser_csv.add_argument(
        "input",
        help="input source or file, when prefixed with @ it is assumed to be a text file "
             "with a file list (one file per line)")
    parser_csv.add_argument("output", help="output file (ends in .hdf5)")
    parser_csv.add_argument("columns", help="list of columns to export (or all when empty)",
                            nargs="*")

    args = parser.parse_args(argv[1:])

    verbosity = ["ERROR", "WARNING", "INFO", "DEBUG"]
    logging.getLogger("vaex").setLevel(verbosity[min(3, args.verbose)])

    dataset = None
    if args.task == "soneira":
        if vaex.utils.check_memory_usage(4 * 8 * 2**args.max_level,
                                         vaex.utils.confirm_on_console):
            if not args.quiet:
                print("generating soneira peebles dataset...")
            dataset = vaex.file.other.SoneiraPeebles(args.dimension, 2,
                                                     args.max_level, args.lambdas)
        else:
            return 1
    if args.task == "tap":
        dataset = vaex.dataset.DatasetTap(args.tap_url, args.table_name)
        if not args.quiet:
            print("exporting from {tap_url} table name {table_name} to {output}".format(
                tap_url=args.tap_url, table_name=args.table_name, output=args.output))
    if args.task == "csv":
        if not args.quiet:
            print("exporting from {input} to {output}".format(
                input=args.input, output=args.output))
    if args.task == "file":
        if args.input[0] == "@":
            inputs = open(args.input[1:]).readlines()
            dataset = vaex.open_many(inputs)
        else:
            dataset = vaex.open(args.input)
        if not args.quiet:
            print("exporting from {input} to {output}".format(
                input=args.input, output=args.output))

    if dataset is None and args.task not in ["csv"]:
        if not args.quiet:
            print("Cannot open input")
        return 1
    if dataset:
        dataset.set_active_fraction(args.fraction)
    if args.list:
        if not args.quiet:
            print("column names: " + " ".join(dataset.get_column_names()))
    else:
        if args.task == "csv":
            row_count = -1  # the header does not count
            with open(args.input) as lines:  # Python 2's file() replaced by open()
                for line in lines:
                    row_count += 1
            logger.debug("row_count: %d", row_count)
            with open(args.input) as lines:
                line = next(lines).strip()
                names = line.strip().split(",")
                line = next(lines).strip()
                values = line.strip().split(",")
                numerics = []
                for value in values:
                    try:
                        float(value)
                        numerics.append(True)
                    except ValueError:
                        numerics.append(False)
                names_numeric = [name for name, numeric in zip(names, numerics) if numeric]
                print(names_numeric)
                output = vaex.file.other.Hdf5MemoryMapped.create(
                    args.output, row_count, names_numeric)
                Ncols = len(names)
                cols = [output.columns[name] if numeric else None
                        for name, numeric in zip(names, numerics)]

                def copy(line, row_index):
                    values = line.strip().split(",")
                    for column_index in range(Ncols):
                        if numerics[column_index]:
                            value = float(values[column_index])
                            cols[column_index][row_index] = value

                row = 0
                copy(line, row)
                row += 1
                progressbar = vaex.utils.progressbar(title="exporting") if args.progress else None
                for line in lines:
                    copy(line, row)
                    row += 1
                    if progressbar and row % 1000 == 0:  # update every 1000 rows
                        progressbar.update(row / float(row_count))
                if progressbar:
                    progressbar.finish()
        else:
            if args.columns:
                columns = args.columns
            else:
                columns = None
            if columns is None:
                columns = dataset.get_column_names(strings=True, virtual=args.virtual)
            for column in columns:
                if column not in dataset.get_column_names(strings=True, virtual=True):
                    if not args.quiet:
                        print("column %r does not exist, run with --list or -l "
                              "to list all columns" % column)
                    return 1
            base, output_ext = os.path.splitext(args.output)
            if output_ext not in [".hdf5", ".fits", ".arrow"]:
                if not args.quiet:
                    print("extension %s not supported, only .hdf5, .arrow and .fits are"
                          % output_ext)
                return 1
            if not args.quiet:
                print("exporting %d rows and %d columns" % (len(dataset), len(columns)))
                print("columns: " + " ".join(columns))

            progressbar = vaex.utils.progressbar(title="exporting") if args.progress else None

            def update(p):
                if progressbar:
                    progressbar.update(p)
                return True

            if args.filter:
                dataset.select(args.filter, name='export')
                selection = 'export'
            else:
                selection = None
            if output_ext == ".hdf5":
                export_hdf5(dataset, args.output, column_names=columns, progress=update,
                            shuffle=args.shuffle, sort=args.sort, selection=selection)
            elif output_ext == ".arrow":
                from vaex_arrow.export import export as export_arrow
                export_arrow(dataset, args.output, column_names=columns, progress=update,
                             shuffle=args.shuffle, sort=args.sort, selection=selection)
            elif output_ext == ".fits":
                export_fits(dataset, args.output, column_names=columns, progress=update,
                            shuffle=args.shuffle, sort=args.sort, selection=selection)
            if progressbar:
                progressbar.finish()
            if not args.quiet:
                print("\noutput to %s" % os.path.abspath(args.output))
            dataset.close_files()
    return 0
def main(argv):
    import argparse
    parser = argparse.ArgumentParser(argv[0])
    parser.add_argument('--verbose', '-v', action='count', default=0)
    parser.add_argument('--quiet', '-q', default=False, action='store_true',
                        help="do not output anything")
    parser.add_argument('--list', '-l', default=False, action='store_true',
                        help="list columns of input")
    parser.add_argument('--progress', help="show progress (default: %(default)s)",
                        default=True, action='store_true')
    parser.add_argument('--no-progress', dest="progress", action='store_false')
    parser.add_argument('--no-delete', help="Delete file on failure (default: %(default)s)",
                        dest='delete', default=True, action='store_false')
    parser.add_argument('--shuffle', "-s", dest="shuffle", action='store_true', default=False)
    parser.add_argument('--sort', dest="sort", default=None)
    parser.add_argument('--fraction', "-f", dest="fraction", type=float, default=1.0,
                        help="fraction of input dataset to export")
    parser.add_argument('--filter', dest="filter", default=None,
                        help="filter to apply before exporting")
    parser.add_argument(
        "input",
        help="input source or file, when prefixed with @ it is assumed to be a text file "
             "with a file list (one file per line)")
    parser.add_argument("output", help="output file (ends in .hdf5)")
    parser.add_argument("columns", help="list of columns to export (or all when empty)",
                        nargs="*")
    args = parser.parse_args(argv[1:])

    verbosity = ["ERROR", "WARNING", "INFO", "DEBUG"]
    logging.getLogger("vaex").setLevel(verbosity[min(3, args.verbose)])

    if args.input[0] == "@":
        inputs = open(args.input[1:]).readlines()
        df = vaex.open_many(inputs)
    else:
        df = vaex.open(args.input)

    if df:
        df.set_active_fraction(args.fraction)
    if args.list:
        print("\n".join(df.get_column_names()))
    else:
        if args.columns:
            all_columns = df.get_column_names()
            columns = args.columns
            for column in columns:
                if column not in all_columns:
                    # if not args.quiet:
                    print("column %r does not exist, run with --list or -l "
                          "to list all columns" % column)
                    return 1
            df = df[columns]
        else:
            columns = df.get_column_names()
        if not args.quiet:
            print("exporting %d rows and %d columns" % (len(df), len(columns)))
            print("columns: " + " ".join(columns))
        if args.filter:
            df = df.filter(args.filter)
        if args.sort:
            df = df.sort(args.sort)
        try:
            df.export(args.output, progress=args.progress)
            if not args.quiet:
                print("\noutput to %s" % os.path.abspath(args.output))
            df.close()
        except Exception:
            if not args.quiet:
                print("\nfailed to write to %s" % os.path.abspath(args.output))
            if args.delete:
                os.remove(args.output)
                print("\ndeleted output %s (pass --no-delete to avoid that)"
                      % os.path.abspath(args.output))
            raise
    return 0
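# A hedged invocation sketch for the simpler CLI above; the file names are
# hypothetical. With the "@" prefix, files.txt is read as a list of inputs
# (one per line), which are concatenated via vaex.open_many() and exported:
import sys
sys.exit(main(['vaex-convert', '@files.txt', 'combined.hdf5']))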
# To convert mz_idx to m/z there are two possibilities (S, plot_spectrum and D
# come from the surrounding analysis session).
from pathlib import Path

import matplotlib.pyplot as plt
import vaex as vx

from timspy.timspy import TimsDIA

# 0. use the built-in method
MZ = D.mzIdx2mz(S.mz_idx)
I = S.i
plot_spectrum(MZ, I)

# or use a fitted model
MS2 = D.mzIdx2mz_model(S.mz_idx)
D.plot_models()

# making it all faster
hdf5 = Path("/mnt/samsung/bruker/testHDF5/prec_prec_100ms")
hdf5_files = [str(f) for f in hdf5.glob('*.hdf5')]
p = Path('/home/matteo/Projects/bruker/BrukerMIDIA/MIDIA_CE10_precursor/20190912_HeLa_Bruker_TEN_MIDIA_200ng_CE10_100ms_Slot1-9_1_488.d')
D = TimsDIA(p)
R = vx.open_many(hdf5_files)
R['rt'] = D.frame2rt_model(R.frame)
R['im'] = D.scan2im_model(R.scan)
R['mz'] = D.tof2mz_model(R.tof)
R.plot(R.mz, R.im, shape=(1000, 919))
plt.show()
D.plot_models()

frames = range(1, 100)
list(frames)
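# A generic sketch of the column-assignment pattern above: expressions
# assigned to a vaex dataframe become virtual columns, evaluated lazily
# rather than materialized. The dataframe and the linear model standing in
# for a fitted tof -> m/z calibration are synthetic.
import numpy as np
import vaex

df = vaex.from_arrays(tof=np.arange(5, dtype=float))
df['mz'] = df.tof * 0.1 + 2.0  # hypothetical stand-in for a fitted model
print(df)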
from pathlib import Path
from time import time

import matplotlib.pyplot as plt
import numpy as np
import vaex as vx

from timspy.timspy import TimsDIA
from timspy.iterators import ranges

p = Path('/home/matteo/Projects/bruker/BrukerMIDIA/MIDIA_CE10_precursor/20190912_HeLa_Bruker_TEN_MIDIA_200ng_CE10_100ms_Slot1-9_1_488.d')
D = TimsDIA(p)
# output_folder = Path("/home/matteo/Projects/bruker/data_dumps/prec_prec_100ms")
output_folder = Path('/mnt/samsung/bruker/testHDF5/prec_prec_100ms')
# D.to_hdf5(output_folder)

D.tof2mz_model.plot()
D.tof2mz_model.params

df = vx.open_many([str(p) for p in output_folder.glob("*.hdf5")])
df.plot(df.tof, df.scan, what=vx.stat.sum(df.i))
plt.tight_layout()
plt.show()

# check if all stats are there
x = df.count(df.i, binby=[df.scan], shape=1000)
S = df.groupby(df.scan).agg({'i': 'sum'})
np.sort(S.scan.values)
import glob
import re

import numpy as np
import vaex

def tryint(s):
    try:
        return int(s)
    except ValueError:
        return s

def alphanum_key(s):
    """Turn a string into a list of string and number chunks.

    "z23a" -> ["z", 23, "a"]
    """
    return [tryint(c) for c in re.split('([0-9]+)', s)]

hdf5_list = glob.glob('./data_output/*.hdf5')
hdf5_list.sort(key=alphanum_key)  # natural sort, so chunk order is preserved
hdf5_list = np.array(hdf5_list)
print(hdf5_list)

# This is an important step: open all chunks as one concatenated dataframe
master_df = vaex.open_many(hdf5_list)

# exporting
master_df.export_hdf5(path='./green_tripdata.hdf5')

df = vaex.open('./green_tripdata.hdf5')
print(df)
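# Why the natural sort matters, shown on hypothetical chunk names: a plain
# lexicographic sort would place chunk_10 before chunk_2, so rows would be
# concatenated out of order by vaex.open_many().
names = ['chunk_10.hdf5', 'chunk_2.hdf5', 'chunk_1.hdf5']
print(sorted(names))                    # ['chunk_1.hdf5', 'chunk_10.hdf5', 'chunk_2.hdf5']
print(sorted(names, key=alphanum_key))  # ['chunk_1.hdf5', 'chunk_2.hdf5', 'chunk_10.hdf5']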
def perform_update():
    # Print that the database is being updated
    print(f"Updating micro-lensing database in {ARGS.dir!r}.")

    # Open the master HDF5-file, creating it if it does not exist yet
    with h5py.File(ARGS.master_file, mode='a') as m_file:
        # Set the version of MLDatabase
        m_file.attrs['version'] = __version__

        # Obtain what exposures the database knows about
        n_expnums_known = m_file.attrs.setdefault('n_expnums', 0)
        expnums_dset = m_file.require_dataset(
            'expnums',
            shape=(n_expnums_known,),
            dtype=[*list(XTR_HEADER.items())[:-1], ('last_modified', int)],
            maxshape=(None,))
        expnums_known = expnums_dset[:]

    # Obtain a sorted string of all files available
    filenames = str(sorted(next(os.walk(ARGS.dir))[2]))

    # Create a regex iterator
    re_iter = re.finditer(EXP_REGEX, filenames)

    # Create dict with up to ARGS.n_expnums exposure files
    exp_dict = {int(m['expnum']): (path.join(ARGS.dir, m['exp_file']),
                                   path.join(ARGS.dir, m['xtr_file']))
                for m in islice(re_iter, ARGS.n_expnums)}

    # Add the required flat exposure files (REGEX above explicitly ignores it)
    # exp_dict[0] = (path.join(ARGS.dir, REQ_FILES[0]),
    #                path.join(ARGS.dir, REQ_FILES[1]))

    # Initialize the number of exposures found and their types
    n_expnums = len(exp_dict)
    expnums_outdated = []

    # Create empty list of temporary HDF5-files
    temp_files = []

    # Determine which ones require updating
    for expnum, *_, mtime in expnums_known:
        # Try to obtain the exp_files of this expnum
        exp_files = exp_dict.get(expnum)

        # If this is not None, it is already known
        if exp_files is not None:
            # Check if it requires updating by comparing last-modified times
            if path.getmtime(exp_files[0]) > mtime:
                # If so, add to expnums_outdated
                expnums_outdated.append(expnum)
                continue
            else:
                # If not, remove from dict
                exp_dict.pop(expnum)

        # Determine path to temporary HDF5-file of exposure
        temp_hdf5 = path.join(ARGS.mld, TEMP_EXP_FILE.format(expnum))

        # If it already exists, add it to temp_files
        if path.exists(temp_hdf5):
            temp_files.append(temp_hdf5)

    # Print the number of exposure files found
    n_expnums_outdated = len(expnums_outdated)
    n_expnums_new = len(exp_dict) - n_expnums_outdated
    n_expnums_temp = len(temp_files)
    print(f"\nFound {n_expnums:,} exposure files, of which {n_expnums_new:,} "
          f"are new and {n_expnums_outdated:,} are outdated. Also found "
          f"{n_expnums_temp:,} processed exposure files that require merging.")

    # If exp_dict contains at least 1 item
    if exp_dict:
        # Create tqdm iterator for processing
        exp_iter = tqdm(exp_dict.items(), desc="Processing exposure files",
                        dynamic_ncols=True)

        # Process all exposure files
        try:
            for expnum, exp_files in exp_iter:
                # Set which exposure is being processed in exp_iter
                exp_iter.set_postfix_str(path.basename(exp_files[0]))

                # Process this exposure
                temp_files.append(process_exp_files(expnum, exp_files))

        # If a KeyboardInterrupt is raised, update database with progress
        except KeyboardInterrupt:
            print("WARNING: Processing has been interrupted. Updating "
                  "database with currently processed exposures.")

        # Open master file
        with h5py.File(ARGS.master_file, 'r+') as m_file:
            # Obtain the total number of exposures now
            n_expnums_known = m_file.attrs['n_expnums']

        # If temp_files contains at least 1 item
        if temp_files:
            # Import vaex
            import vaex

            # Update database
            print("\nUpdating database with processed exposures (NOTE: This "
                  "may take a while for large databases).")

            # Divide temp_files up into lists of length 100 with last of length 150
            n_temp = len(temp_files)
            temp_files = [temp_files[slc] for slc in dyn_range(len(temp_files))]

            # If the master exposure file exists and there are outdated exposures
            if path.exists(ARGS.master_exp_file) and expnums_outdated:
                # Wrap in try-statement to ensure file is closed
                try:
                    # Open the master exposure file
                    master_df = vaex.open(ARGS.master_exp_file)

                    # Solely select the exposures that were not outdated
                    for expnum in expnums_outdated:
                        master_df = master_df.filter(master_df.expnum != expnum, 'and')

                    # Extract the master DataFrame
                    master_df = master_df.extract()

                    # Export to HDF5
                    master_temp_file = path.join(ARGS.mld, 'temp.hdf5')
                    master_df.export_hdf5(master_temp_file)

                # Close master exposure file
                finally:
                    master_df.close()

                # Remove original master file
                os.remove(ARGS.master_exp_file)

                # Rename master_temp_file to master exposure file name
                os.rename(master_temp_file, ARGS.master_exp_file)

            # Create tqdm iterator for merging
            temp_iter = tqdm(desc="Merging processed exposure files", total=n_temp,
                             dynamic_ncols=True)

            # Loop over all temporary exposure HDF5-files
            # TODO: Figure out how to avoid copying over all the data every time
            for temp_files_list in temp_files:
                # Determine number of files in this list
                n_temp_list = len(temp_files_list)

                # Wrap in try-statement to ensure files are closed
                try:
                    # Open all temporary exposure HDF5-files in this list
                    temp_df = vaex.open_many(temp_files_list)

                    # Add to master_df if it exists
                    if path.exists(ARGS.master_exp_file):
                        # Open the master exposure file
                        master_df = vaex.open(ARGS.master_exp_file)
                        master_df = master_df.concat(temp_df)
                        temp_files_list.append(ARGS.master_exp_file)
                    else:
                        master_df = temp_df

                    # Export to HDF5
                    master_temp_file = path.join(ARGS.mld, 'temp.hdf5')
                    master_df.export_hdf5(master_temp_file)

                # Close all temporary HDF5-files
                finally:
                    master_df.close_files()

                # Remove all temporary files
                for temp_file in temp_files_list:
                    os.remove(temp_file)

                # Rename master_temp_file to master exposure file name
                os.rename(master_temp_file, ARGS.master_exp_file)

                # Update tqdm iterator
                temp_iter.update(n_temp_list)

            # Close the tqdm iterator
            temp_iter.close()

            # Determine all objids that are known
            print("\nDetermining all objects in the database.")
            try:
                master_df = vaex.open(ARGS.master_exp_file)
                objids, counts = np.unique(master_df['objid'].values,
                                           return_counts=True)
            finally:
                master_df.close()

            # Open master file
            with h5py.File(ARGS.master_file, 'r+') as m_file:
                # Obtain previously known objids
                n_objids_known = m_file.attrs.setdefault('n_objids', 0)
                objids_dset = m_file.require_dataset(
                    'objids',
                    shape=(n_objids_known,),
                    dtype=[('objid', int), ('count', int)],
                    maxshape=(None,))

                # Save currently known objids
                n_objids = len(objids)
                objids_dset.resize(n_objids, axis=0)
                objids_dset['objid'] = objids
                objids_dset['count'] = counts
                m_file.attrs['n_objids'] = n_objids

                # Obtain the total number of exposures now
                n_expnums = m_file.attrs['n_expnums']

            # Print that processing is finished
            print(f"The database now contains {n_expnums:,} exposures with "
                  f"{n_objids:,} objects.")

    # If no new exposure files are found, database is already up-to-date
    else:
        print("Database is already up-to-date.")
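# A minimal, self-contained sketch of the merge strategy used above: open the
# new chunks with vaex.open_many(), concatenate with the existing master file,
# export to a temporary file, then swap it in. The function name and paths
# are hypothetical, not part of the original source.
import os
import vaex

def merge_into_master(chunk_paths, master_path, temp_path='temp.hdf5'):
    df = vaex.open_many(chunk_paths)
    try:
        if os.path.exists(master_path):
            master = vaex.open(master_path)
            df = master.concat(df)
        df.export_hdf5(temp_path)  # all data is copied here, as in the TODO above
    finally:
        df.close()
    if os.path.exists(master_path):
        os.remove(master_path)
    os.rename(temp_path, master_path)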