コード例 #1
0
ファイル: reader.py プロジェクト: Panos512/MLaaS4HEP
def main():
    """Command-line entry point: build a reader for the input file and run it.

    Options are taken from ``OptionParser().parser.parse_args()``. Depending on
    the type reported by ``file_type`` for the input file, a matching reader
    is constructed; the reader then either reports its info (``opts.info``
    set) or is handed to ``parse`` together with the number of events, the
    output file name and the histograms option.

    Raises:
        Exception: if the input file type is not one of root, csv, json,
            parquet or avro.
    """
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()
    fin = opts.fin
    fout = opts.fout
    verbose = int(opts.verbose)
    nevts = int(opts.nevts)
    chunk_size = int(opts.chunk_size)
    nan = float(opts.nan)
    preproc = None
    if opts.preproc:
        preproc = load_code(opts.preproc, 'preprocessing')
    specs = opts.specs
    branch = opts.branch
    branches = opts.branches.split(',') if opts.branches else []

    # exclude_branches is either a path to a file with one branch per line
    # or a comma-separated list given directly on the command line
    exclude_branches = []
    if opts.exclude_branches:
        if os.path.isfile(opts.exclude_branches):
            # context manager guarantees the file handle is released
            with open(opts.exclude_branches) as istream:
                exclude_branches = [r.replace('\n', '') for r in istream]
        else:
            exclude_branches = opts.exclude_branches.split(',')
    hists = opts.hists
    identifier = [k.strip() for k in opts.identifier.split(',')]
    label = None

    # determine the input type once and pick the corresponding reader
    ftype = file_type(fin)
    if ftype == 'root':
        reader = RootDataReader(fin, branch=branch, selected_branches=branches, \
                identifier=identifier, exclude_branches=exclude_branches, \
                histograms=hists, nan=nan, chunk_size=chunk_size, \
                nevts=nevts, specs=specs, redirector=opts.redirector, verbose=verbose)
    elif ftype == 'csv':
        reader = CsvReader(fin, label, chunk_size, nevts, preproc, verbose)
    elif ftype == 'json':
        reader = JsonReader(fin, label, chunk_size, nevts, preproc, verbose)
    elif ftype == 'parquet':
        reader = ParquetReader(fin, label, chunk_size, nevts, preproc, verbose)
    elif ftype == 'avro':
        reader = AvroReader(fin, label, chunk_size, nevts, preproc, verbose)
    else:
        # fail with a clear message instead of an unbound 'reader' further down
        raise Exception("Unsupported file type '%s' for input file '%s'" %
                        (ftype, fin))

    if opts.info:
        reader.info()
    else:
        parse(reader, nevts, fout, hists)
コード例 #2
0
    def __init__(self, fin, labels, params=None, preproc=None, dtype=None):
        """Initialize the data generator and create one reader per input file.

        Args:
            fin: input file name (str) or list of file names.
            labels: a single label (str) applied to every file, or a list of
                labels paired positionally with the files.
            params: optional dict of settings; the keys read here are
                ``batch_size`` (default 256), ``verbose`` (default 0),
                ``chunk_size`` (default 1000), ``nevts`` (default -1) and
                ``shuffle`` (default False).
            preproc: preprocessing object passed unchanged to every reader.
            dtype: optional data-type name; it is lower-cased and compared
                against 'json', 'csv', 'avro' and 'parquet' in addition to
                the type reported by ``file_type`` for each file.

        Raises:
            Exception: if ``fin`` or ``labels`` is neither a str nor a list,
                or if no reader matches one of the input files.
        """
        time0 = time.time()
        self.dtype = str(dtype).lower()
        self.preproc = preproc
        if not params:
            params = {}
        # parse given parameters
        batch_size = params.get('batch_size', 256)
        self.verbose = params.get('verbose', 0)
        chunk_size = params.get('chunk_size', 1000)
        self.evts = params.get('nevts', -1)
        self.shuffle = params.get('shuffle', False)

        # convert input fin parameter into file list if necessary
        if isinstance(fin, str):
            self.files = [fin]
        elif isinstance(fin, list):
            self.files = fin
        else:
            raise Exception("Unsupported data-type '%s' for fin parameter" %
                            type(fin))
        if isinstance(labels, str):
            self.labels = [labels for _ in range(len(self.files))]
        elif isinstance(labels, list):
            self.labels = labels
        else:
            raise Exception("Unsupported data-type '%s' for labels parameter" %
                            type(labels))
        # NOTE(review): zip() silently drops trailing files or labels when the
        # two lists differ in length, and duplicate file names collapse into a
        # single dict entry -- confirm callers always pass matching lists.
        self.file_label_dict = dict(zip(self.files, self.labels))

        self.reader = {}  # global reader will handle all files readers
        self.reader_counter = {
        }  # reader counter keeps track of nevts read by readers

        if self.verbose:
            print(timestamp('Generator: {}'.format(self)))
            print("model parameters: {}".format(json.dumps(params)))

        self.start_idx = 0
        self.chunk_size = chunk_size
        self.stop_idx = chunk_size
        self.batch_size = batch_size

        # loop over files and create individual readers for them, then put them in a global reader
        for fname, label in self.file_label_dict.items():
            # the order of the checks below is significant: the first type
            # matching either the dtype override or the file itself wins
            if self.dtype == 'json' or file_type(fname) == 'json':
                reader_cls = JsonReader
            elif self.dtype == 'csv' or file_type(fname) == 'csv':
                reader_cls = CsvReader
            elif self.dtype == 'avro' or file_type(fname) == 'avro':
                reader_cls = AvroReader
            elif self.dtype == 'parquet' or file_type(fname) == 'parquet':
                reader_cls = ParquetReader
            else:
                # without this branch an unsupported file would either hit an
                # unbound 'reader' or silently reuse the previous file's reader
                raise Exception(
                    "Unsupported file type for '%s' (dtype '%s')" %
                    (fname, self.dtype))
            reader = reader_cls(fname, label, chunk_size=chunk_size, nevts=self.evts, \
                    preproc=self.preproc, verbose=self.verbose)
            self.reader[fname] = reader
            self.reader_counter[fname] = 0

        self.current_file = self.files[0]

        print("init MetaDataGenerator in {} sec".format(time.time() - time0))
        print("available readers")
        for fname, reader in self.reader.items():
            print("{} {}".format(fname, reader))