def merge_successful():
    output_dir = get_output_dir()
    from DeepJetCore.DataCollection import DataCollection
    alldc = []
    for s in successful:
        in_path = output_dir + '/conversion.' + str(s) + '.dc'
        dc = None
        try:
            dc = DataCollection(in_path)
        except Exception as e:
            print('problems adding ' + in_path + " will continue nevertheless... (error see below)")
            print(e)
            dc = None
        if dc:
            alldc.append(dc)  # reuse the collection that was already read in
    print("merging DataCollections")
    merged = alldc[0]
    merged_c = 1
    for i in range(1, len(alldc)):
        try:
            merged += alldc[i]
            merged_c += 1
        except Exception as e:
            print(e)
            print('...continue adding nevertheless')
    if merged_c != len(alldc):
        print('lost ' + str(100 * (1. - float(merged_c) / float(len(alldc)))) + '%')
    print("saving merged DataCollection")
    merged.writeToFile('%s/dataCollection.dc' % output_dir)
    print('successfully merged to %s/dataCollection.dc' % output_dir)
    return merged
def __init__(self, directory):
    filename = os.path.join(directory, 'dataCollection.dc')
    file_ = open(filename, 'rb')
    self.samples = pickle.load(file_)
    sampleentries = pickle.load(file_)
    originRoots = pickle.load(file_)
    nsamples = pickle.load(file_)
    useweights = pickle.load(file_)
    batchsize = pickle.load(file_)
    dataclass = pickle.load(file_)
    weighter = pickle.load(file_)
    self._means = pickle.load(file_)
    file_.close()

    # Get means dictionary
    self.means = {name: (self._means[0][i], self._means[1][i])
                  for i, name in enumerate(self._means.dtype.names)}

    # Get DeepJetCore DataCollection
    self.dataCollection = DataCollection()
    self.dataCollection.readFromFile(filename)

    # Reading first sample & get branch structure
    fullpath = self.dataCollection.getSamplePath(self.samples[0])
    self.dataCollection.dataclass.readIn(fullpath)
    self.branches = self.dataCollection.dataclass.branches

    print("Branches:")
    for i in range(len(self.branches)):
        print("Collection", i)
        for i_b, b in enumerate(self.branches[i]):
            print("  branch %2i/%2i %40s mean %8.5f var %8.5f" % (
                i, i_b, b, self.means[b][0], self.means[b][1]))
        print()
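# Hedged illustration (not from the source): the dictionary built above maps a
# branch name to its (mean, variance) pair, which could be used to standardise
# a raw input column. 'Jet_pt' and the numerical values are assumptions for
# illustration only.
import numpy as np

means = {'Jet_pt': (45.2, 900.0)}      # same layout as self.means above
mean, var = means['Jet_pt']
x = np.array([30., 50., 120.])
x_std = (x - mean) / np.sqrt(var)      # standardise, assuming the second entry is a variance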
def loadModel(inputDir, trainData, model, LoadModel,
              sampleDatasets=None, removedVars=None):
    inputModel = '%s/KERAS_check_best_model.h5' % inputDir
    # inputModel = '%s/KERAS_model.h5' % inputDir
    inputWeights = '%s/KERAS_check_best_model_weights.h5' % inputDir
    from DeepJetCore.DataCollection import DataCollection
    traind = DataCollection()
    traind.readFromFile(trainData)
    traind.dataclass.regressiontargetclasses = range(0, NBINS)
    print(traind.getNRegressionTargets())
    if LoadModel:
        evalModel = load_model(inputModel, custom_objects=global_loss_list)
        shapes = traind.getInputShapes()
    else:
        shapes = traind.getInputShapes()
        train_inputs = []
        for s in shapes:
            train_inputs.append(keras.layers.Input(shape=s))
        evalModel = model(train_inputs, traind.getNClassificationTargets(),
                          traind.getNRegressionTargets(), sampleDatasets,
                          removedVars)
        evalModel.load_weights(inputWeights)
    return evalModel
def __init__(self, input_source_files_list, training_data_collection,
             predict_dir, unbuffered=False, model_path=None, max_files=4,
             inputdir=None):
    self.input_data_files = []
    self.inputdir = None
    self.predict_dir = predict_dir
    self.unbuffered = unbuffered
    self.max_files = max_files
    print("Using HGCal predictor class")

    ## prepare input lists for different file formats
    if input_source_files_list[-6:] == ".djcdc":
        print('reading from data collection', input_source_files_list)
        predsamples = DataCollection(input_source_files_list)
        self.inputdir = predsamples.dataDir
        for s in predsamples.samples:
            self.input_data_files.append(s)
    elif input_source_files_list[-6:] == ".djctd":
        self.inputdir = os.path.abspath(os.path.dirname(input_source_files_list))
        infile = os.path.basename(input_source_files_list)
        self.input_data_files.append(infile)
    else:
        print('reading from text file', input_source_files_list)
        self.inputdir = os.path.abspath(os.path.dirname(input_source_files_list))
        with open(input_source_files_list, "r") as f:
            for s in f:
                self.input_data_files.append(s.replace('\n', '').replace(" ", ""))

    self.dc = None
    if input_source_files_list[-6:] == ".djcdc" and not training_data_collection[-6:] == ".djcdc":
        self.dc = DataCollection(input_source_files_list)
    else:
        self.dc = DataCollection(training_data_collection)

    if inputdir is not None:
        self.inputdir = inputdir

    self.model_path = model_path
    if max_files > 0:
        self.input_data_files = self.input_data_files[0:min(max_files, len(self.input_data_files))]
def sumDCandWrite(filelist, outname):
    alldc = []
    for f in filelist:
        dc = DataCollection(f)
        alldc.append(dc)
        rel = os.path.relpath(dc.dataDir, os.getcwd())
        dc.prependToSampleFiles(rel + '/')
        dc.dataDir = os.getcwd()
    merged = sum(alldc)
    print(outname)
    merged.writeToFile(outname)
def sumDCandWrite(filelist, outname):
    alldc = []
    for f in filelist:
        try:
            dc = DataCollection(f)
        except Exception:
            print('read in of ' + f + ' not working, skip')
            continue
        alldc.append(dc)
        rel = os.path.relpath(dc.dataDir, os.getcwd())
        dc.prependToSampleFiles(rel + '/')
        dc.dataDir = os.getcwd()
    merged = sum(alldc)
    print(outname)
    merged.writeToFile(outname)
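# Hedged usage sketch (hypothetical file names): the helper above could be
# called on a list of existing .djcdc collections to write a single merged
# collection into the current working directory. The input and output names
# below are assumptions for illustration only.
if __name__ == '__main__':
    sumDCandWrite(['train_part1.djcdc', 'train_part2.djcdc'],
                  'merged_dataCollection.djcdc')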
def test(self):
    passed = True
    dc = DataCollection()
    dc.dataclass = TrainData_test
    dc.sourceList = [f for f in self.files.filenames]
    dc.createDataFromRoot(TrainData_test, outputDir=self.dcoutdir.path)

    gen = dc.invokeGenerator()
    gen.setBatchSize(self.n_per_batch)

    for epoch in range(10):
        gen.prepareNextEpoch()
        print("epoch", epoch, 'batches', gen.getNBatches())
        for b in range(gen.getNBatches()):
            d, _ = next(gen.feedNumpyData())
            data, rs = d[0], d[1]
            rs = np.array(rs[:, 0], dtype='int')
            rs = rs[:rs[-1]]
            #print(data)
            #print(rs[-1])
            if not raggedtester.checkData(data, rs):
                print('epoch', epoch, 'batch', b, 'broken')
                passed = False
                break
            if rs[-1] > self.n_per_batch:
                print('maximum batch size exceeded for batch ', b, 'epoch', epoch)
        print('shuffling')
        gen.shuffleFilelist()
    return passed
def merge_successful():
    output_dir = get_output_dir()
    from DeepJetCore.DataCollection import DataCollection
    alldc = []
    for s in successful:
        in_path = output_dir + '/conversion.' + str(s) + '.dc'
        dc = None
        try:
            dc = DataCollection(in_path)
        except Exception as e:
            print('problems adding ' + in_path + " will continue nevertheless... (error see below)")
            print(e)
        if dc:
            alldc.append(dc)  # reuse the collection that was already read in
    print("merging DataCollections")
    merged = sum(alldc)
    print("saving merged DataCollection")
    merged.writeToFile('%s/dataCollection.dc' % output_dir)
    print('successfully merged to %s/dataCollection.dc' % output_dir)
    return merged
import h5py
import numpy as np
import pandas as pd
from DeepJetCore.DataCollection import DataCollection


def dcToDf(dc_file, df_out):
    dc = DataCollection()
    dc.readFromFile(dc_file)
    NENT = 1  # Can skip some events
    filelist = []
    i = 0
    storeInputs = True
    count = 0
    feature_names = dc.dataclass.branches[1]
    spectator_names = dc.dataclass.branches[0]
    labels_names = dc.getUsedTruth()
    labels_names = ['truth' + l for l in labels_names]
    for s in dc.samples:
        if count > 1000000:
            break
        spath = dc.getSamplePath(s)
        filelist.append(spath)
        h5File = h5py.File(spath, 'r')
        f = h5File
        features_val_i = [h5File['x%i' % j][()]
                          for j in range(0, h5File['x_listlength'][()][0])]
        features_val_i = features_val_i[0][::NENT, 0, :]
        #predict_test_i = model.predict(features_val)
        weights_val_i = h5File['w0'][()]
        labels_val_i = h5File['y0'][()][::NENT, :]
        spectators_val_i = h5File['z0'][()][::NENT, 0, :]
        if storeInputs:
            raw_features_val_i = h5File['z1'][()][::NENT, 0, :]
        if i == 0:
            #predict_test = predict_test_i
            weights_val = weights_val_i
            labels_val = labels_val_i
            spectators_val = spectators_val_i
            features_val = features_val_i
            if storeInputs:
                raw_features_val = raw_features_val_i
        else:
            #predict_test = np.concatenate((predict_test, predict_test_i))
            weights_val = np.concatenate((weights_val, weights_val_i))
            labels_val = np.concatenate((labels_val, labels_val_i))
            features_val = np.concatenate((features_val, features_val_i))
            spectators_val = np.concatenate((spectators_val, spectators_val_i))
            if storeInputs:
                raw_features_val = np.concatenate((raw_features_val, raw_features_val_i))
        i += 1
        count += labels_val.shape[0]

    entries = np.hstack((raw_features_val, spectators_val, labels_val,
                         weights_val.reshape((len(weights_val), 1))))
    df = pd.DataFrame(entries,
                      columns=feature_names + spectator_names + labels_names + ['weight'])
    #df = pd.DataFrame(raw_features_val + spectators_val, columns=feature_names + spectator_names)
    #print(df)
    if df_out is not None:
        df.to_pickle(df_out)
        print("Saved df to", df_out)
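# Hedged usage sketch (hypothetical paths): flatten a converted data collection
# into a pandas DataFrame pickle for quick offline inspection.
dcToDf('/path/to/dataCollection.djcdc', 'features_and_truth.pkl')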
def loadModel(inputDir, trainData, model, LoadModel,
              sampleDatasets=None, removedVars=None, adv=False):
    inputModel = '%s/KERAS_check_best_model.h5' % inputDir
    from DeepJetCore.DataCollection import DataCollection
    traind = DataCollection()
    traind.readFromFile(trainData)
    traind.dataclass.regressiontargetclasses = range(0, NBINS)
    print(traind.getNRegressionTargets())
    if LoadModel:
        evalModel = load_model(inputModel, custom_objects=global_loss_list)
        shapes = traind.getInputShapes()
    else:
        shapes = traind.getInputShapes()
        train_inputs = []
        for s in shapes:
            train_inputs.append(keras.layers.Input(shape=s))
        modelargs = {}
        if adv:
            modelargs.update({
                'nRegTargets': NBINS,
                'discTrainable': True,
                'advTrainable': True
            })
        evalModel = model(train_inputs, traind.getNClassificationTargets(),
                          traind.getNRegressionTargets(), sampleDatasets,
                          removedVars, **modelargs)
        evalModel.load_weights(inputModel)
    return evalModel
    x = Dense(nregressions, activation=None, name='dense_fracs',
              kernel_initializer=keras.initializers.RandomNormal(
                  mean=0.0, stddev=0.01))(x)
    x = Concatenate(name="concatlast", axis=-1)([x] + coords + [n_showers] + [etas_phis])
    x = Multiply()([x, mask])
    predictions = [x]
    return Model(inputs=Inputs, outputs=predictions)


train = training_base(testrun=False, resumeSilently=True, renewtokens=True)

plotdc = DataCollection(
    os.path.dirname(os.path.realpath(train.inputData)) + '/merged_test.dc')
samplefile = plotdc.getSamplePath(plotdc.samples[0])


#gets called every epoch
def decay_function(aftern_batches):
    return aftern_batches  # int(aftern_batches+5)


ppdts = [
    plot_truth_pred_plus_coords_during_training(
        samplefile=samplefile,
        output_file=train.outputDir + '/train_progress' + str(0),
        use_event=use_event,
        x_index=5,
    farr = simpleArray()
    farr.createFromNumpy(feat, rs)

    truth[:, 1] = truth[:, 16]
    tarr = simpleArray()
    tarr.createFromNumpy(truth, rs)
    #tarr.cout()

    td_out = TrainData_window()
    td_out._store([farr], [tarr], [])
    td_out.writeToFile(outdir + infile)
    print(infile, 'done')


dc = DataCollection(inputdcfile)
inputdir = dc.dataDir
if not inputdir[:-1] == os.getcwd():
    print('needs to be called in same dir as dataCollection file', inputdir, os.getcwd())

inputdatafiles = []
for s in dc.samples:
    inputdatafiles.append(s)

from multiprocessing import Pool
p = Pool()
res = p.map(replace, inputdatafiles)
import tempfile
import atexit
import os
from keras.models import load_model
from keras import backend as K
from DeepJetCore.customObjects import get_custom_objects
from DeepJetCore.training.gpuTools import DJCSetGPUs

inputdatafiles = []
inputdir = None

## prepare input lists for different file formats
if args.inputSourceFileList[-6:] == ".djcdc":
    print('reading from data collection', args.inputSourceFileList)
    predsamples = DataCollection(args.inputSourceFileList)
    inputdir = predsamples.dataDir
    for s in predsamples.samples:
        inputdatafiles.append(s)
elif args.inputSourceFileList[-6:] == ".djctd":
    inputdir = os.path.abspath(os.path.dirname(args.inputSourceFileList))
    infile = os.path.basename(args.inputSourceFileList)
    inputdatafiles.append(infile)
else:
    print('reading from text file', args.inputSourceFileList)
    inputdir = os.path.abspath(os.path.dirname(args.inputSourceFileList))
    with open(args.inputSourceFileList, "r") as f:
        for s in f:
            inputdatafiles.append(s.replace('\n', '').replace(" ", ""))
if len(args.files) < 1:
    print('you must provide at least one input file')
    exit()

if not len(args.o):
    print('you must provide an output file name')
    exit()

indir = os.path.dirname(args.files[0])
if len(indir):
    indir += "/"

class_name = args.c
if class_name in class_options:
    traind = class_options[class_name]
else:
    print('available classes:')
    for key, val in class_options.items():
        print(key)
    raise Exception('wrong class selection')

dc = DataCollection()
dc.setDataClass(traind)
for f in args.files:
    dc.samples.append(os.path.basename(f))

outfile = args.o
if not outfile[-6:] == ".djcdc":
    outfile += ".djcdc"

dc.writeToFile(indir + outfile)
LoadModel = False
removedVars = None
forceNClasses = False
signals = [1]
sigNames = ['Hbb']
backgrounds = [0]
backNames = ['QCD']
NClasses = len(signals) + len(backgrounds)

if True:
    evalModel = loadModel(trainDir, inputTrainDataCollection, trainingModel,
                          LoadModel, forceNClasses, NClasses, inputDataset,
                          removedVars)
    evalDir = opts.o

    from DeepJetCore.DataCollection import DataCollection
    testd = DataCollection()
    testd.readFromFile(inputTestDataCollection)

    if os.path.isdir(evalDir):
        raise Exception('output directory: %s must not exist yet' % evalDir)
    else:
        os.mkdir(evalDir)

    df, features_val = makePlots(testd, evalModel, evalDir)
    makeLossPlot(trainDir, evalDir)
    #df = evaluate(testd, inputTrainDataCollection, evalModel, evalDir)
    #make_plots(evalDir, savedir='Plots')
from collections import Counter
from argparse import ArgumentParser

parser = ArgumentParser('Dataset validation hplots script')
parser.add_argument('-d', help="Data collection file")
parser.add_argument('-p', help="PDF file path (will be ignored in validate mode)")
parser.add_argument('-n', help="Number of events to produce dataset stats pdf on", default="50")
parser.add_argument('--validate', dest='validate', action='store_true')
parser.set_defaults(validate=False)
args = parser.parse_args()

dc = DataCollection(args.d)
td = dc.dataclass()  # this is actually saved
#JK: this combination enforces one event per batch, then the extra row split loop is not needed
batchsize = 1
dc.setBatchSize(batchsize)

print("Invoking generator")
gen = dc.invokeGenerator()
gen.setSkipTooLargeBatches(False)
# gen.setBuffer(td)

print("n batches")
n_batches = gen.getNBatches()
print(n_batches)
print("probably ready")

#gpus = tf.config.list_physical_devices('GPU')
gpus = 0
from DeepJetCore.DataCollection import DataCollection
from pprint import pprint

dc = DataCollection()
dc.readFromFile('dc/dataCollection.dc')
#dc.readFromFile('/storage/9/dseith/DeepJet/deepCSV/results/../../Ntuples/Thu_135917_batch/dataCollections/deepCSV/train/dataCollection.dc')
#dc.readFromFile('/storage/9/dseith/DeepJet/deepCSV/results/../../Ntuples/Thu_135917_batch/dataCollections/deepFlavour_FT_reg/train/dataCollection.dc')

#pprint (dc.means[0])
#print '-'*100
#pprint (dc.means[1])
#print '-'*100
#pprint (dc.means.dtype.names)
#pprint (dc.means[0][0].dtype)
#pprint (dc.useweights)
#pprint (dc.weighter)
#pprint (dc.samples)
#pprint (dc.sampleentries)
#pprint (dc.originRoots)
#pprint (dc.nsamples)
#pprint (dc.useweights)
##pprint (dc.__batchsize)
pprint(dc.dataclass)
#pprint (dc.weighter)
#pprint (dc.means)

six_times = [
    'TagVarCSVTrk_trackJetDistVal',
    'TagVarCSVTrk_trackPtRel',
    'TagVarCSVTrk_trackDeltaR',
    'TagVarCSVTrk_trackPtRatio',
    'TagVarCSVTrk_trackSip3dSig',
    'TagVarCSVTrk_trackSip2dSig',
    'TagVarCSVTrk_trackDecayLenVal'
from argparse import ArgumentParser

parser = ArgumentParser(
    'merge or split files belonging to a dataCollection differently. '
    'The output will be written to the current working directory!')
parser.add_argument("infile", help="input \"dc\" file")
parser.add_argument("nelementsperfile",
                    help="number of entries per file (output), for ragged, maximum number of elements")
parser.add_argument("--randomise",
                    help="randomise order, could be helpful if different samples need to be mixed",
                    action='store_true')
args = parser.parse_args()

from DeepJetCore.DataCollection import DataCollection
from DeepJetCore.dataPipeline import TrainDataGenerator

infile = args.infile
nbatch = int(args.nelementsperfile)
randomise = args.randomise

dc = DataCollection(infile)
dc2 = DataCollection(infile)
samples = dc.samples
dir = dc.dataDir
if len(dir) < 1:
    dir = '.'
insamples = [dir + '/' + s for s in samples]

gen = TrainDataGenerator()
gen.setBatchSize(nbatch)
gen.setSkipTooLargeBatches(False)
gen.setFileList(insamples)
if randomise:
    gen.shuffleFileList()
args = parser.parse_args()
batchsize = int(args.b)

#if os.path.isdir(args.outputDir):
#    raise Exception('output directory must not exists yet')

custom_objs = {}
custom_objs.update(djc_global_loss_list)
custom_objs.update(djc_global_layers_list)
custom_objs.update(global_loss_list)
custom_objs.update(global_layers_list)
custom_objs.update(global_metrics_list)

model = load_model(args.inputModel, custom_objects=custom_objs)
dc = DataCollection(args.trainingDataCollection)
td = dc.dataclass()

outputs = []
inputdir = os.path.abspath(os.path.dirname(args.inputSourceFileList))
os.system('mkdir -p ' + args.outputDir)

with open(args.inputSourceFileList, "r") as f:
    for inputfile in f:
        inputfile = inputfile.replace('\n', '')
        outfilename = "pred_" + inputfile
        print('converting ' + inputfile)
        tmpdir = tempfile.mkdtemp(suffix="djcpred", dir="/dev/shm")

        def removeTmp():
            os.system("rm -rf " + tmpdir)
print('creating a dummy datacollection for means/norms and weighter (can take a while)...')

from DeepJetCore.DataCollection import DataCollection
from DeepJetCore.conversion.conversion import class_options

try:
    cls = class_options[args.c]
except KeyError:
    raise Exception('wrong class selection')

if not args.classArgs:
    args.classArgs = tuple()

dc = DataCollection(nprocs=-1)
dc.meansnormslimit = int(args.nforweighter)
try:
    dc.convertListOfRootFiles(
        args.infile,
        cls(*args.classArgs),
        args.out,
        means_only=True,
        output_name='batch_template.dc',
        relpath=('' if args.noRelativePaths else
                 os.path.dirname(os.path.realpath(args.infile))))
except Exception:
    print('The first round of root conversion failed')
    raise
if os.path.isdir(args.outputDir):
    raise Exception('output directory must not exist yet')

custom_objs = {}
custom_objs.update(global_loss_list)
custom_objs.update(global_layers_list)

model = load_model(args.inputModel, custom_objects=custom_objs)
td = testDescriptor()
if args.use:
    td.use_only = [int(i) for i in args.use.split(',')]

from DeepJetCore.DataCollection import DataCollection

testd = DataCollection()
testd.readFromFile(args.inputDataCollection)

os.mkdir(args.outputDir)

td.makePrediction(
    model, testd, args.outputDir,
    store_labels=args.labels,
    monkey_class=args.monkey_class)

td.writeToTextFile(args.outputDir + '/tree_association.txt')

# make the file reading entirely C++
# then it can be used for other studies
args = parser.parse_args()

minbatch = int(args.min)
maxbatch = int(args.max)
n_plots = int(args.n)
infile = args.inputFile
batchsize = int(args.b)

from DeepJetCore.DataCollection import DataCollection
from index_dicts import create_truth_dict, create_feature_dict
from ragged_plotting_tools import make_original_truth_shower_plot, createRandomizedColors
import matplotlib
import matplotlib.pyplot as plt
import random

dc = DataCollection(infile)
dc.setBatchSize(batchsize)
gen = dc.invokeGenerator()
nbatches = gen.getNBatches()

if maxbatch >= nbatches:
    raise ValueError("maxbatch >= nbatches in sample")
if minbatch >= maxbatch:
    raise ValueError("minbatch >= maxbatch")

events = random.sample(range(minbatch, maxbatch), n_plots)
lastev = -1
n_plots_done = 0

print('scanning...')
for i in range(nbatches):
    f, t = next(gen.feedNumpyData())
#!/usr/bin/env python3

from argparse import ArgumentParser

parser = ArgumentParser(
    'Check if all files in a dataset (datacollection) are ok or remove a specific entry\n')
parser.add_argument('inputDataCollection')
parser.add_argument('--remove', default="")
parser.add_argument('--skip_first', default=0)
args = parser.parse_args()

from DeepJetCore.DataCollection import DataCollection

dc = DataCollection(args.inputDataCollection)
dc.writeToFile(args.inputDataCollection + ".backup")

if not len(args.remove):
    dc.validate(remove=True, skip_first=int(args.skip_first))
else:
    dc.removeEntry(args.remove)

print('total size after: ' + str(dc.nsamples))
dc.writeToFile(args.inputDataCollection)
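# Hedged usage sketch: the script above can be run once to validate a
# collection in place, and again to drop a single sample entry. The script and
# file names below are assumptions for illustration; the flags are from the
# parser above.
#   check_dc.py train/dataCollection.djcdc
#   check_dc.py train/dataCollection.djcdc --remove sample_12.djctd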
    do_write = False
    for iline, line in enumerate(source):
        if iline == args.inRange[0]:
            do_write = True
        elif iline == args.inRange[1]:
            break
        if do_write:
            path = os.path.realpath(os.path.join(relpath, line))
            my_infile.write(path)
    infile = my_infile.name
    # new infile will always have absolute path
    relpath = ''

# MAIN BODY #

dc = DataCollection(nprocs=(1 if args.nothreads else -1))
dc.meansnormslimit = int(args.nforweighter)
if len(nchilds):
    dc.nprocs = int(nchilds)
if args.batch is not None:
    dc.batch_mode = True

traind = None
if class_name in class_options:
    traind = class_options[class_name]
elif not recover and not testdatafor:
    print('available classes:')
    for key, val in class_options.items():
        print(key)
    raise Exception('wrong class selection')
else:
    print('available classes:')
    for key, val in class_options.items():
        print(key)
    raise Exception('wrong class selection')

if ".dc" not in infile:
    raise Exception('wrong input file ' + infile)

dir = os.path.dirname(infile)

dcold = DCOld()
dcold.readRawFromFile(infile)

dcnew = DataCollection()
dcnew.dataclass = traind()
dcnew.samples = [s[:-4] + 'djctd' for s in dcold.samples]
print(dcnew.samples)
dcnew.sourceList = dcold.originRoots
# leave traindata undefined, no way to convert.
dcnew.__nsamples = 0  # determine again, also check

outfile = infile[:-2] + 'djcdc'
print("infile: ", infile, " outfile", outfile)


def worker(i):
    td = TDOld()
    tdnew = TrainData()
    print("converting", dcold.samples[i])
        'When running in batch mode you should also '
        'provide a means source through the --usemeansfrom option')

if args.v:
    logging.getLogger().setLevel(logging.DEBUG)
elif args.q:
    logging.getLogger().setLevel(logging.WARNING)

if infile:
    logging.info("infile = %s" % infile)
if outPath:
    logging.info("outPath = %s" % outPath)

# MAIN BODY #

dc = DataCollection(
    nprocs=(1 if args.nothreads else -1),
    useRelativePaths=True if not args.noRelativePaths else False)
if len(nchilds):
    dc.nprocs = int(nchilds)

if class_name in class_options:
    traind = class_options[class_name]
elif not recover and not testdatafor:
    print('available classes:')
    for key, val in class_options.items():
        print(key)
    raise Exception('wrong class selection')

if testdatafor:
    logging.info('converting test data, no weights applied')
    dc.createTestDataForDataCollection(
        testdatafor,
    do_write = False
    for iline, line in enumerate(source):
        if iline == args.inRange[0]:
            do_write = True
        elif iline == args.inRange[1]:
            break
        if do_write:
            path = os.path.realpath(os.path.join(relpath, line))
            my_infile.write(path)
    infile = my_infile.name
    # new infile will always have absolute path
    relpath = ''

# MAIN BODY #

dc = DataCollection(nprocs=(1 if args.nothreads else -1))
dc.meansnormslimit = int(args.nforweighter)
dc.no_copy_on_convert = args.noramcopy
if len(nchilds):
    dc.nprocs = int(nchilds)
if args.batch is not None:
    dc.batch_mode = True

traind = None
if class_name in class_options:
    traind = class_options[class_name]
elif not recover and not testdatafor:
    print('available classes:')
    for key, val in class_options.items():
        print(key)
    raise Exception('wrong class selection')
class training_base(object):

    def __init__(self,
                 splittrainandtest=0.85,
                 useweights=False,
                 testrun=False,
                 testrun_fraction=0.1,
                 resumeSilently=False,
                 renewtokens=True,
                 collection_class=DataCollection,
                 parser=None,
                 recreate_silently=False):

        import sys
        scriptname = sys.argv[0]

        if parser is None:
            parser = ArgumentParser('Run the training')
        parser.add_argument('inputDataCollection')
        parser.add_argument('outputDir')
        parser.add_argument('--modelMethod',
                            help='Method to be used to instantiate model in derived training class',
                            metavar='OPT',
                            default=None)
        parser.add_argument("--gpu",
                            help="select specific GPU",
                            metavar="OPT",
                            default="")
        parser.add_argument("--gpufraction",
                            help="select memory fraction for GPU",
                            type=float,
                            metavar="OPT",
                            default=-1)
        parser.add_argument("--submitbatch",
                            help="submits the job to condor",
                            default=False,
                            action="store_true")
        parser.add_argument("--walltime",
                            help="sets the wall time for the batch job, format: 1d5h or 2d or 3h etc",
                            default='1d')
        parser.add_argument("--isbatchrun",
                            help="is batch run",
                            default=False,
                            action="store_true")
        parser.add_argument("--valdata",
                            help="set validation dataset (optional)",
                            default="")
        parser.add_argument("--takeweights",
                            help="Applies weights from the model given as relative or absolute path. "
                                 "Matches by names and skips layers that don't match.",
                            default="")

        args = parser.parse_args()
        self.args = args
        import sys
        self.argstring = sys.argv
        # sanity check
        if args.isbatchrun:
            args.submitbatch = False
            resumeSilently = True

        if args.submitbatch:
            print('submitting batch job. Model will be compiled for testing before submission (GPU settings being ignored)')

        import matplotlib
        # if no X11 use below
        matplotlib.use('Agg')
        DJCSetGPUs(args.gpu)

        if args.gpufraction > 0 and args.gpufraction < 1:
            import sys
            import tensorflow as tf
            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpufraction)
            sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
            import keras
            from keras import backend as K
            K.set_session(sess)
            print('using gpu memory fraction: ' + str(args.gpufraction))

        import keras

        self.ngpus = 1
        self.dist_strat_scope = None
        if len(args.gpu):
            self.ngpus = len([i for i in args.gpu.split(',')])
            print('running on ' + str(self.ngpus) + ' gpus')
            if self.ngpus > 1:
                import tensorflow as tf
                self.dist_strat_scope = tf.distribute.MirroredStrategy()

        self.keras_inputs = []
        self.keras_inputsshapes = []
        self.keras_model = None
        self.keras_model_method = args.modelMethod
        self.keras_weight_model_path = args.takeweights
        self.train_data = None
        self.val_data = None
        self.startlearningrate = None
        self.optimizer = None
        self.trainedepoches = 0
        self.compiled = False
        self.checkpointcounter = 0
        self.renewtokens = renewtokens
        if args.isbatchrun:
            self.renewtokens = False
        self.callbacks = None
        self.custom_optimizer = False
        self.copied_script = ""
        self.submitbatch = args.submitbatch
        self.GAN_mode = False

        self.inputData = os.path.abspath(args.inputDataCollection) \
            if ',' not in args.inputDataCollection else \
            [os.path.abspath(i) for i in args.inputDataCollection.split(',')]
        self.outputDir = args.outputDir

        # create output dir
        isNewTraining = True
        if os.path.isdir(self.outputDir):
            if not (resumeSilently or recreate_silently):
                var = input('output dir exists. To recover a training, please type "yes"\n')
                if not var == 'yes':
                    raise Exception('output directory must not exist yet')
            isNewTraining = False
            if recreate_silently:
                isNewTraining = True
        else:
            os.mkdir(self.outputDir)
        self.outputDir = os.path.abspath(self.outputDir)
        self.outputDir += '/'

        if recreate_silently:
            os.system('rm -rf ' + self.outputDir + '*')

        # copy configuration to output dir
        if not args.isbatchrun:
            try:
                shutil.copyfile(scriptname, self.outputDir + os.path.basename(scriptname))
            except shutil.SameFileError:
                pass
            except BaseException as e:
                raise e
            self.copied_script = self.outputDir + os.path.basename(scriptname)
        else:
            self.copied_script = scriptname

        self.train_data = collection_class()
        self.train_data.readFromFile(self.inputData)
        self.train_data.useweights = useweights

        if len(args.valdata):
            print('using validation data from ', args.valdata)
            self.val_data = DataCollection(args.valdata)
        else:
            if testrun:
                if len(self.train_data) > 1:
                    self.train_data.split(testrun_fraction)
                self.train_data.dataclass_instance = None  # can't be pickled
                self.val_data = copy.deepcopy(self.train_data)
            else:
                self.val_data = self.train_data.split(splittrainandtest)

        shapes = self.train_data.getKerasFeatureShapes()
        inputdtypes = self.train_data.getKerasFeatureDTypes()
        inputnames = self.train_data.getKerasFeatureArrayNames()
        for i in range(len(inputnames)):
            if inputnames[i] == "" or inputnames[i] == "_rowsplits":
                inputnames[i] = "input_" + str(i) + inputnames[i]

        print("shapes", shapes)
        print("inputdtypes", inputdtypes)
        print("inputnames", inputnames)

        self.keras_inputs = []
        self.keras_inputsshapes = []
        counter = 0
        for s, dt, n in zip(shapes, inputdtypes, inputnames):
            self.keras_inputs.append(keras.layers.Input(shape=s, dtype=dt, name=n))
            self.keras_inputsshapes.append(s)

        if not isNewTraining:
            kfile = self.outputDir + '/KERAS_check_model_last.h5' \
                if os.path.isfile(self.outputDir + '/KERAS_check_model_last.h5') else \
                self.outputDir + '/KERAS_model.h5'
            if os.path.isfile(kfile):
                print(kfile)
                if self.dist_strat_scope is not None:
                    with self.dist_strat_scope.scope():
                        self.loadModel(kfile)
                else:
                    self.loadModel(kfile)
                self.trainedepoches = 0
                if os.path.isfile(self.outputDir + 'losses.log'):
                    for line in open(self.outputDir + 'losses.log'):
                        valloss = line.split(' ')[1][:-1]
                        if not valloss == "None":
                            self.trainedepoches += 1
                else:
                    print('incomplete epochs, starting from the beginning but with pretrained model')
            else:
                print('no model found in existing output dir, starting training from scratch')

    def __del__(self):
        if hasattr(self, 'train_data'):
            del self.train_data
            del self.val_data

    def modelSet(self):
        return (not self.keras_model == None) and not len(self.keras_weight_model_path)

    def setDJCKerasModel(self, model, *args, **kwargs):
        if len(self.keras_inputs) < 1:
            raise Exception('setup data first')
        self.keras_model = model(*args, **kwargs)
        if hasattr(self.keras_model, "_is_djc_keras_model"):
            self.keras_model.setInputShape(self.keras_inputs)
            self.keras_model.build(None)
        if not self.keras_model:
            raise Exception('Setting DJCKerasModel not successful')

    def setModel(self, model, **modelargs):
        if len(self.keras_inputs) < 1:
            raise Exception('setup data first')
        if self.dist_strat_scope is not None:
            with self.dist_strat_scope.scope():
                self.keras_model = model(self.keras_inputs, **modelargs)
        else:
            self.keras_model = model(self.keras_inputs, **modelargs)
        if hasattr(self.keras_model, "_is_djc_keras_model"):  # compatibility
            self.keras_model.setInputShape(self.keras_inputs)
            self.keras_model.build(None)
        if len(self.keras_weight_model_path):
            from DeepJetCore.modeltools import apply_weights_where_possible, load_model
            self.keras_model = apply_weights_where_possible(
                self.keras_model, load_model(self.keras_weight_model_path))
        #try:
        #    self.keras_model = model(self.keras_inputs, **modelargs)
        #except BaseException as e:
        #    print('problem in setting model. Reminder: since DJC 2.0, NClassificationTargets and RegressionTargets must not be specified anymore')
        #    raise e
        if not self.keras_model:
            raise Exception('Setting model not successful')

    def saveCheckPoint(self, addstring=''):
        self.checkpointcounter = self.checkpointcounter + 1
        self.saveModel("KERAS_model_checkpoint_" + str(self.checkpointcounter) + "_" + addstring + ".h5")

    def loadModel(self, filename):
        from keras.models import load_model
        self.keras_model = load_model(filename, custom_objects=custom_objects_list)
        self.optimizer = self.keras_model.optimizer
        self.compiled = True
        if self.ngpus > 1:
            self.compiled = False

    def setCustomOptimizer(self, optimizer):
        self.optimizer = optimizer
        self.custom_optimizer = True

    def compileModel(self,
                     learningrate,
                     clipnorm=None,
                     discriminator_loss=['binary_crossentropy'],
                     print_models=False,
                     metrics=None,
                     **compileargs):
        if not self.keras_model and not self.GAN_mode:
            raise Exception('set model first')

        if self.ngpus > 1 and not self.submitbatch:
            print('Model being compiled for ' + str(self.ngpus) + ' gpus')

        self.startlearningrate = learningrate

        if not self.custom_optimizer:
            from keras.optimizers import Adam
            if clipnorm:
                self.optimizer = Adam(lr=self.startlearningrate, clipnorm=clipnorm)
            else:
                self.optimizer = Adam(lr=self.startlearningrate)

        if self.dist_strat_scope is not None:
            with self.dist_strat_scope.scope():
                self.keras_model.compile(optimizer=self.optimizer,
                                         metrics=metrics,
                                         **compileargs)
        else:
            self.keras_model.compile(optimizer=self.optimizer,
                                     metrics=metrics,
                                     **compileargs)
        if print_models:
            print(self.keras_model.summary())
        self.compiled = True

    def compileModelWithCustomOptimizer(self, customOptimizer, **compileargs):
        raise Exception('DEPRECATED: please use setCustomOptimizer before calling compileModel')

    def saveModel(self, outfile):
        if not self.GAN_mode:
            self.keras_model.save(self.outputDir + outfile)
        else:
            self.gan.save(self.outputDir + 'GAN_' + outfile)
            self.generator.save(self.outputDir + 'GEN_' + outfile)
            self.discriminator.save(self.outputDir + 'DIS_' + outfile)
        #import h5py
        #f = h5py.File(self.outputDir + outfile, 'r+')
        #del f['optimizer_weights']
        #f.close()

    def _initTraining(self, nepochs, batchsize, use_sum_of_squares=False):
        if self.submitbatch:
            from DeepJetCore.training.batchTools import submit_batch
            submit_batch(self, self.args.walltime)
            exit()  # don't delete this!

        self.train_data.setBatchSize(batchsize)
        self.val_data.setBatchSize(batchsize)
        self.train_data.batch_uses_sum_of_squares = use_sum_of_squares
        self.val_data.batch_uses_sum_of_squares = use_sum_of_squares

        self.train_data.writeToFile(self.outputDir + 'trainsamples.djcdc')
        self.val_data.writeToFile(self.outputDir + 'valsamples.djcdc')

        # make sure tokens don't expire
        from .tokenTools import checkTokens, renew_token_process
        from _thread import start_new_thread
        if self.renewtokens:
            print('starting afs backgrounder')
            checkTokens()
            start_new_thread(renew_token_process, ())

        self.train_data.setBatchSize(batchsize)
        self.val_data.setBatchSize(batchsize)

    def trainModel(self,
                   nepochs,
                   batchsize,
                   run_eagerly=False,
                   batchsize_use_sum_of_squares=False,
                   extend_truth_list_by=0,  # extend the truth list with dummies. Useful when adding more prediction outputs than truth inputs
                   stop_patience=-1,
                   lr_factor=0.5,
                   lr_patience=-1,
                   lr_epsilon=0.003,
                   lr_cooldown=6,
                   lr_minimum=0.000001,
                   checkperiod=10,
                   backup_after_batches=-1,
                   additional_plots=None,
                   additional_callbacks=None,
                   load_in_mem=False,
                   max_files=-1,
                   plot_batch_loss=False,
                   **trainargs):

        self.keras_model.run_eagerly = run_eagerly
        # write only after the output classes have been added
        self._initTraining(nepochs, batchsize, batchsize_use_sum_of_squares)

        self.keras_model.save(self.outputDir + 'KERAS_untrained_model.h5')
        print('setting up callbacks')
        from .DeepJet_callbacks import DeepJet_callbacks
        minTokenLifetime = 5
        if not self.renewtokens:
            minTokenLifetime = -1

        self.callbacks = DeepJet_callbacks(self.keras_model,
                                           stop_patience=stop_patience,
                                           lr_factor=lr_factor,
                                           lr_patience=lr_patience,
                                           lr_epsilon=lr_epsilon,
                                           lr_cooldown=lr_cooldown,
                                           lr_minimum=lr_minimum,
                                           outputDir=self.outputDir,
                                           checkperiod=checkperiod,
                                           backup_after_batches=backup_after_batches,
                                           checkperiodoffset=self.trainedepoches,
                                           additional_plots=additional_plots,
                                           batch_loss=plot_batch_loss,
                                           minTokenLifetime=minTokenLifetime)

        if additional_callbacks is not None:
            if not isinstance(additional_callbacks, list):
                additional_callbacks = [additional_callbacks]
            self.callbacks.callbacks.extend(additional_callbacks)

        print('starting training')
        if load_in_mem:
            if match_truth_and_pred_list:
                raise ValueError("match_truth_and_pred_list not available with load_in_mem")
            print('make features')
            X_train = self.train_data.getAllFeatures(nfiles=max_files)
            X_test = self.val_data.getAllFeatures(nfiles=max_files)
            print('make truth')
            Y_train = self.train_data.getAllLabels(nfiles=max_files)
            Y_test = self.val_data.getAllLabels(nfiles=max_files)
            self.keras_model.fit(X_train,
                                 Y_train,
                                 batch_size=batchsize,
                                 epochs=nepochs,
                                 callbacks=self.callbacks.callbacks,
                                 validation_data=(X_test, Y_test),
                                 max_queue_size=1,
                                 use_multiprocessing=False,
                                 workers=0,
                                 **trainargs)
        else:
            # prepare generator
            print("setting up generator... can take a while")
            traingen = self.train_data.invokeGenerator()
            valgen = self.val_data.invokeGenerator()
            # this is fixed
            traingen.extend_truth_list_by = extend_truth_list_by
            valgen.extend_truth_list_by = extend_truth_list_by

            while (self.trainedepoches < nepochs):
                # this can change from epoch to epoch
                # calculate steps for this epoch
                # feed info below
                traingen.prepareNextEpoch()
                valgen.prepareNextEpoch()
                nbatches_train = traingen.getNBatches()  # might have changed due to shuffling
                nbatches_val = valgen.getNBatches()

                print('>>>> epoch', self.trainedepoches, "/", nepochs)
                print('training batches: ', nbatches_train)
                print('validation batches: ', nbatches_val)

                self.keras_model.fit(traingen.feedNumpyData(),
                                     steps_per_epoch=nbatches_train,
                                     epochs=self.trainedepoches + 1,
                                     initial_epoch=self.trainedepoches,
                                     callbacks=self.callbacks.callbacks,
                                     validation_data=valgen.feedNumpyData(),
                                     validation_steps=nbatches_val,
                                     max_queue_size=1,
                                     use_multiprocessing=False,
                                     workers=0,
                                     **trainargs)
                self.trainedepoches += 1
                traingen.shuffleFilelist()

        # self.saveModel("KERAS_model.h5")
        return self.keras_model, self.callbacks.history

    def change_learning_rate(self, new_lr):
        import keras.backend as K
        if self.GAN_mode:
            K.set_value(self.discriminator.optimizer.lr, new_lr)
            K.set_value(self.gan.optimizer.lr, new_lr)
        else:
            K.set_value(self.keras_model.optimizer.lr, new_lr)
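# Hedged usage sketch (illustration only): a minimal training script built on
# the training_base class above. 'my_model' is an assumed user-defined function
# mapping the prepared Keras input tensors to a keras.Model; the learning rate,
# loss, epoch count and batch size are arbitrary example values, not defaults
# taken from the source.
def my_model(inputs):
    import keras
    x = keras.layers.Dense(64, activation='relu')(inputs[0])
    x = keras.layers.Dense(2, activation='softmax')(x)
    return keras.models.Model(inputs=inputs, outputs=[x])


train = training_base(testrun=False, resumeSilently=True)
if not train.modelSet():
    train.setModel(my_model)
    train.compileModel(learningrate=1e-3, loss='categorical_crossentropy')
model, history = train.trainModel(nepochs=10, batchsize=100)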
#!/usr/bin/env python
# encoding: utf-8

from argparse import ArgumentParser
from DeepJetCore.DataCollection import DataCollection

parser = ArgumentParser(
    'convert a data collection to a single set of numpy arrays. Warning, this can produce a large output')
parser.add_argument('inputDataCollection')
parser.add_argument('outputFilePrefix')
args = parser.parse_args()

print('reading data collection')
dc = DataCollection()
dc.readFromFile(args.inputDataCollection)

print('producing feature array')
feat = dc.getAllFeatures()
print('producing truth array')
truth = dc.getAllLabels()
print('producing weight array')
weight = dc.getAllWeights()
print('producing means and norms array')
means = dc.means

from numpy import save