# Parse the raw config (defined earlier in the script) for evaluation
config = ConfigParser().parse_config(config=config, mode='evaluate')

# Load the model selected in the config; feedforward models take a flat
# window, recurrent models take a (history_length, 1) sequence
if config['model'] == 'linear':
    model = models.LinearModel(input_shape=(config['history_length'], ),
                               nb_output_units=1,
                               nb_hidden_units=config['nb_hidden_units'])
elif config['model'] == 'mlp':
    model = models.MLPModel(input_shape=(config['history_length'], ),
                            nb_output_units=1,
                            nb_hidden_units=config['nb_hidden_units'],
                            nb_layers=config['nb_layers'])
elif config['model'] == 'gru':
    model = models.GRUModel(input_shape=(config['history_length'], 1),
                            nb_output_units=1,
                            nb_hidden_units=config['nb_hidden_units'],
                            nb_layers=config['nb_layers'],
                            dropout=config['dropout'],
                            recurrent_dropout=config['recurrent_dropout'])
elif config['model'] == 'lstm':
    model = models.LSTMModel(input_shape=(config['history_length'], 1),
                             nb_output_units=1,
                             nb_hidden_units=config['nb_hidden_units'],
                             nb_layers=config['nb_layers'],
                             dropout=config['dropout'],
                             recurrent_dropout=config['recurrent_dropout'])
elif config['model'] == 'lstm_attention':
    model = models.LSTMAttentionModel(
        input_shape=(config['history_length'], 1),
        nb_output_units=1,
        nb_hidden_units=config['nb_hidden_units'],
        dropout=config['dropout'],
        # the source was truncated here; closing argument assumed to mirror
        # the GRU/LSTM branches above
        recurrent_dropout=config['recurrent_dropout'])
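# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): a config dict with
# the keys the selection logic above reads. The values are hypothetical; the
# real ones come from ConfigParser().parse_config().
example_config = {
    'model': 'lstm',            # one of: linear, mlp, gru, lstm, lstm_attention
    'history_length': 48,       # length of the input window
    'nb_hidden_units': 64,
    'nb_layers': 2,             # unused by the linear model
    'dropout': 0.1,             # recurrent models only
    'recurrent_dropout': 0.1,   # recurrent models only
}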
import sys
import time

import models  # project-local model definitions
# loadParquet, createSample and buildOptimizer are project helpers; their
# actual module is not shown in this file, so this import is an assumption.
from utils import loadParquet, createSample, buildOptimizer


def train(spark, args):
    """Train the selected BigDL model on the Spark cluster."""
    sc = spark.sparkContext
    numExecutors = int(sc._conf.get('spark.executor.instances'))
    exeCores = int(sc._conf.get('spark.executor.cores'))
    labelCol = 'encoded_label'

    # Pick the feature columns and model matching the requested architecture
    if args.model == 'gru':
        featureCol = ['GRU_input']
        model = models.GRUModel()
    elif args.model == 'hlf':
        featureCol = ['HLF_input']
        model = models.HLFmodel()
    elif args.model == 'inclusive':
        featureCol = ['GRU_input', 'HLF_input']
        model = models.InclusiveModel()
    else:
        sys.exit("Error: please provide a valid model!")

    ## Load the parquet, subsampling it when a fraction below 1 is requested
    sampleDF = args.frac != 1
    trainDF = loadParquet(spark, args.dataset, featureCol, labelCol,
                          sample=sampleDF, frac=args.frac)

    ## Convert it into an RDD of Sample
    trainRDD = createSample(trainDF, featureCol, labelCol)

    if args.validation != 'False':
        testDF = loadParquet(spark, args.validation, featureCol, labelCol,
                             sample=False, frac=args.frac)
        testRDD = createSample(testDF, featureCol, labelCol)
    else:
        testRDD = None

    # The global batch size must be a multiple of numExecutors * exeCores
    batchSize = args.batchMultiplier * numExecutors * exeCores
    appName = args.jobName + "_" + args.model + "_{}exe_{}cores".format(
        numExecutors, exeCores)

    optimizer = buildOptimizer(model=model,
                               trainRDD=trainRDD,
                               valRDD=testRDD,
                               batchSize=batchSize,
                               numEpochs=args.numEpochs,
                               appName=appName,
                               logDir=args.logDir)

    ## Start training and time it
    start = time.time()
    optimizer.optimize()
    stop = time.time()
    print("\n\nElapsed time: {:.2f}s\n\n".format(stop - start))

    if args.saveModel:
        model.saveModel(modelPath=args.modelDir + '/' + appName + '.bigdl',
                        weightPath=args.modelDir + '/' + appName + '.bin',
                        over_write=True)

    if args.saveTime:
        # Append one CSV row: multiplier, batch size, cores, executors,
        # epochs, elapsed seconds
        with open(args.model + 'Times.csv', 'a') as file:
            file.write("{},{},{},{},{},{:.2f}\n".format(
                args.batchMultiplier, batchSize, exeCores, numExecutors,
                args.numEpochs, stop - start))
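# ---------------------------------------------------------------------------
# CLI wiring sketch (an assumption, not part of the original source): shows
# how an entry point might collect the args.* fields that train() reads.
# Flag names mirror the attribute names used above; defaults are illustrative.
if __name__ == '__main__':
    import argparse
    from pyspark.sql import SparkSession

    parser = argparse.ArgumentParser(description='Distributed BigDL training')
    parser.add_argument('--model', choices=['gru', 'hlf', 'inclusive'],
                        required=True)
    parser.add_argument('--dataset', required=True,
                        help='path to the training parquet')
    parser.add_argument('--validation', default='False',
                        help="path to the validation parquet, or 'False'")
    parser.add_argument('--frac', type=float, default=1.0)
    parser.add_argument('--batchMultiplier', type=int, default=32)
    parser.add_argument('--numEpochs', type=int, default=10)
    parser.add_argument('--jobName', default='train')
    parser.add_argument('--logDir', default='/tmp/bigdl_logs')
    parser.add_argument('--modelDir', default='models')
    parser.add_argument('--saveModel', action='store_true')
    parser.add_argument('--saveTime', action='store_true')
    args = parser.parse_args()

    spark = SparkSession.builder.appName(args.jobName).getOrCreate()
    train(spark, args)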