def main():
    """Parse CLI options, build the appropriate file reader and run it.

    Reads options from OptionParser (fin/fout, event counts, branch
    selections, etc.), constructs a reader matching the input file type
    (root/csv/json/parquet/avro) and either prints reader info or hands
    the reader to `parse`.

    Raises:
        Exception: if the input file type is not supported.
    """
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()
    fin = opts.fin
    fout = opts.fout
    verbose = int(opts.verbose)
    nevts = int(opts.nevts)
    chunk_size = int(opts.chunk_size)
    nan = float(opts.nan)
    preproc = None
    if opts.preproc:
        # load user-supplied preprocessing function from external module
        preproc = load_code(opts.preproc, 'preprocessing')
    specs = opts.specs
    branch = opts.branch
    branches = opts.branches.split(',') if opts.branches else []
    exclude_branches = []
    if opts.exclude_branches:
        if os.path.isfile(opts.exclude_branches):
            # one branch name per line; use a context manager so the
            # file handle is closed (original leaked it)
            with open(opts.exclude_branches) as istream:
                exclude_branches = \
                    [r.replace('\n', '') for r in istream.readlines()]
        else:
            # otherwise treat the option value as a comma-separated list
            exclude_branches = opts.exclude_branches.split(',')
    hists = opts.hists
    identifier = [k.strip() for k in opts.identifier.split(',')]
    label = None
    # determine file type once instead of calling file_type per branch
    ftype = file_type(fin)
    if ftype == 'root':
        reader = RootDataReader(fin, branch=branch, selected_branches=branches, \
            identifier=identifier, exclude_branches=exclude_branches, \
            histograms=hists, nan=nan, chunk_size=chunk_size, \
            nevts=nevts, specs=specs, redirector=opts.redirector, verbose=verbose)
    elif ftype == 'csv':
        reader = CsvReader(fin, label, chunk_size, nevts, preproc, verbose)
    elif ftype == 'json':
        reader = JsonReader(fin, label, chunk_size, nevts, preproc, verbose)
    elif ftype == 'parquet':
        reader = ParquetReader(fin, label, chunk_size, nevts, preproc, verbose)
    elif ftype == 'avro':
        reader = AvroReader(fin, label, chunk_size, nevts, preproc, verbose)
    else:
        # original code left `reader` unbound here and failed later with
        # an opaque NameError; fail fast with a clear message instead
        raise Exception("Unsupported file type '%s' for input '%s'" % (ftype, fin))
    if opts.info:
        reader.info()
    else:
        parse(reader, nevts, fout, hists)
def __init__(self, fin, labels, params=None, preproc=None, dtype=None):
    """Initialization function for Data Generator.

    Args:
        fin: input file name (str) or list of file names.
        labels: label (str, replicated per file) or list of labels,
            one per file.
        params: optional dict of parameters; recognized keys are
            batch_size (default 256), verbose (0), chunk_size (1000),
            nevts (-1), shuffle (False).
        preproc: optional preprocessing callable passed to each reader.
        dtype: optional explicit data format ('json', 'csv', 'avro',
            'parquet'); when not matching, the format is inferred per
            file via file_type().

    Raises:
        Exception: for unsupported fin/labels types or unsupported
            file formats.
    """
    time0 = time.time()
    self.dtype = str(dtype).lower()
    self.preproc = preproc
    if not params:
        params = {}
    # parse given parameters
    batch_size = params.get('batch_size', 256)
    self.verbose = params.get('verbose', 0)
    chunk_size = params.get('chunk_size', 1000)
    self.evts = params.get('nevts', -1)
    self.shuffle = params.get('shuffle', False)
    # convert input fin parameter into file list if necessary
    if isinstance(fin, str):
        self.files = [fin]
    elif isinstance(fin, list):
        self.files = fin
    else:
        raise Exception("Unsupported data-type '%s' for fin parameter" % type(fin))
    # replicate a single string label across all files, or take list as-is
    if isinstance(labels, str):
        self.labels = [labels for _ in range(len(self.files))]
    elif isinstance(labels, list):
        self.labels = labels
    else:
        raise Exception("Unsupported data-type '%s' for labels parameter" % type(labels))
    # NOTE(review): zip silently truncates if len(labels) != len(files);
    # verify callers always pass matching lengths
    self.file_label_dict = dict(zip(self.files, self.labels))
    self.reader = {}  # global reader will handle all files readers
    self.reader_counter = {}  # keeps track of nevts read by each reader
    if self.verbose:
        print(timestamp('Generator: {}'.format(self)))
        print("model parameters: {}".format(json.dumps(params)))
    self.start_idx = 0
    self.chunk_size = chunk_size
    self.stop_idx = chunk_size
    self.batch_size = batch_size
    # loop over files and create individual readers for them,
    # then put them in a global reader
    for fname, label in self.file_label_dict.items():
        if self.dtype == 'json' or file_type(fname) == 'json':
            reader = JsonReader(fname, label, chunk_size=chunk_size, nevts=self.evts, \
                preproc=self.preproc, verbose=self.verbose)
        elif self.dtype == 'csv' or file_type(fname) == 'csv':
            reader = CsvReader(fname, label, chunk_size=chunk_size, nevts=self.evts, \
                preproc=self.preproc, verbose=self.verbose)
        elif self.dtype == 'avro' or file_type(fname) == 'avro':
            reader = AvroReader(fname, label, chunk_size=chunk_size, nevts=self.evts, \
                preproc=self.preproc, verbose=self.verbose)
        elif self.dtype == 'parquet' or file_type(fname) == 'parquet':
            reader = ParquetReader(fname, label, chunk_size=chunk_size, nevts=self.evts, \
                preproc=self.preproc, verbose=self.verbose)
        else:
            # original code left `reader` unbound (or stale from the
            # previous iteration) for unknown formats; fail fast instead
            raise Exception("Unsupported data format for file '%s'" % fname)
        self.reader[fname] = reader
        self.reader_counter[fname] = 0
    self.current_file = self.files[0]
    print("init MetaDataGenerator in {} sec".format(time.time() - time0))
    print("available readers")
    for fname, reader in self.reader.items():
        print("{} {}".format(fname, reader))