def predictMain(modelName, sc):
    timeSteps = 30  # Number of past values used for training
    print("Going to initialize the LSTM model")
    SMARTparameters = getSMARTParameters()
    print("The following are the SMART parameters:", SMARTparameters)
    # Initializing the disk-prediction model (LSTM)
    lstm = ls.cloudLSTM(timeSteps=timeSteps, parms=SMARTparameters)
    print("Initialized the model")
    lstmModel = lstm.get_LSTM_Model()  # LSTM model used to initialize the SparkModel class
    trainSize = 0.2  # Fraction of the input used for training
    acc = 0.0        # Model accuracy
    inputFilePath = os.environ.get('DATA_FILE_PATH')  # Input CSV file path from the environment
    year = sys.argv[1]   # Year from the command-line arguments
    month = sys.argv[2]  # Month from the command-line arguments
    # E.g. "/home/user/Desktop/Cloud/Test/2014/2014-11*.csv"
    inputFilePath = inputFilePath + str(year) + "/" + str(year) + "-" + str(month) + "*.csv"
    print("InputPath", inputFilePath)
    rd.generate_DataFrame(inputFilePath, SMARTparameters)
    # E.g. "/hadoop/elephas/Output/ST4000DM000.csv"
    inputCSVFilePath = os.environ.get('MODEL_CSV_FILEPATH') + str(modelName) + ".csv"
    modelFeatures = pd.read_csv(filepath_or_buffer=inputCSVFilePath, usecols=SMARTparameters)
    modelLabel = pd.read_csv(filepath_or_buffer=inputCSVFilePath, usecols=['failure'])

    # Removing NaN values from the input dataframes
    modelFeatures = modelFeatures.fillna(0)
    modelLabel = modelLabel.fillna(0)

    # Obtaining 3D training and testing vectors
    (feature_train, label_train), (feature_test, label_test) = lstm.train_test_split(
        modelFeatures, modelLabel, trainSize, timeSteps)

    # Check whether failure cases exist in the data
    if len(feature_train) == 0:
        print("DiskModel has no failure elements. Training of the model cannot proceed!!")
        return

    # Initializing the Adam optimizer for Elephas
    adam = elephas_optimizers.Adam()
    print("Adam optimizer initialized")

    # Converting the dataframe to a Spark RDD
    rddataset = to_simple_rdd(sc, feature_train, label_train)
    print("Training data converted into a Resilient Distributed Dataset")

    # Initializing the SparkModel with optimizer, update mode and number of workers
    spark_model = SparkModel(sc, lstmModel, optimizer=adam, frequency='epoch',
                             mode='asynchronous', num_workers=2)
    print("Spark model initialized")

    # Initial training run of the model
    spark_model.train(rddataset, nb_epoch=10, batch_size=200, verbose=1, validation_split=0)

    # Retrain until the evaluation score exceeds 0.5
    score = spark_model.evaluate(feature_test, label_test, show_accuracy=True)
    while score <= 0.5:
        spark_model.train(rddataset, nb_epoch=10, batch_size=200, verbose=1, validation_split=0)
        print("LSTM model training done!!")
        score = spark_model.evaluate(feature_test, label_test, show_accuracy=True)

    # Saving the model weights
    print("Saving weights!!")
    outFilePath = os.environ.get('GATOR_SQUAD_HOME')
    outFilePath = outFilePath + "Weights/" + str(year) + "/" + str(month) + "/" + \
        str(modelName) + "_my_model_weights.h5"
    spark_model.save_weights(outFilePath)

    print("LSTM model testing commencing!!")
    predicted1 = spark_model.predict_classes(feature_test)
    df_confusion = pd.crosstab(label_test.flatten(), predicted1.flatten(),
                               rownames=['Actual'], colnames=['Predicted'], margins=True)
    print(df_confusion)
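# A minimal driver sketch for invoking predictMain above (an assumption for
# illustration, not part of the original snippet): it presumes PySpark is on
# the path, that DATA_FILE_PATH / MODEL_CSV_FILEPATH / GATOR_SQUAD_HOME are
# set in the environment, and that the script is run with year and month
# arguments, e.g. `spark-submit predict.py 2014 11`. The model name
# 'ST4000DM000' is taken from the example paths in the comments above.
from pyspark import SparkConf, SparkContext

if __name__ == '__main__':
    conf = SparkConf().setAppName('DiskFailurePrediction')
    sc = SparkContext(conf=conf)
    predictMain('ST4000DM000', sc)
    sc.stop()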
def run_train(master_name, filename, outname):
    import pyspark
    conf = pyspark.SparkConf().setAppName("CRF").setMaster(master_name)
    sc = pyspark.SparkContext(conf=conf)

    tfile = sc.textFile(filename)
    dataset = textFileToDataset(tfile)

    indexer = Indexer()
    indexer.prepareIndexer(dataset, min_count=0)
    print('[Prepare Trainloader] {} samples'.format(dataset.count()))
    trainset = indexer.convertToElephasFormat(dataset)

    embedding_size = 128
    print('[Char count] {}'.format(len(indexer.chars)))

    crf_model = CRF(5, True, name='CRF')
    cnn_model = Sequential([
        Embedding(len(indexer.chars) + 1, embedding_size),
        Conv1D(128, 3, activation='relu', padding='same',
               kernel_constraint=maxnorm(1.0), name='conv1'),
        Conv1D(128, 3, activation='relu', padding='same',
               kernel_constraint=maxnorm(1.0), name='conv2'),
        Dense(5),
        Lambda(lambda x: x)  # crf_model
    ])
    '''
    embed = Embedding(len(Indexer._chars) + 1, embedding_size)(inph)
    cnn = Conv1D(128, 3, activation='relu', padding='same')(embed)
    cnn = Conv1D(128, 3, activation='relu', padding='same')(cnn)
    tag_score = Dense(5)(cnn)
    '''
    # Attach the CRF transition matrix as a trainable weight of the last layer
    crf_model.trans = cnn_model.layers[-1].add_weight(
        name='transM',
        shape=(crf_model.num_labels, crf_model.num_labels),
        initializer=glorot_normal())
    cnn_model.compile(loss=crf_model.loss, optimizer='adam',
                      metrics=[crf_model.accuracy])
    cnn_model.summary()

    # Keras defaults are momentum=0., decay=0., nesterov=False
    optimizerE = elephas.optimizers.SGD(lr=0.0001, momentum=0.9, decay=0.7, nesterov=True)
    spark_model = SparkModel(sc, cnn_model, optimizer=optimizerE,
                             frequency='epoch', mode='asynchronous', num_workers=2)
    # custom_objects={'CRF': crf_model})
    spark_model.train(trainset, nb_epoch=2, batch_size=200,
                      validation_split=0.3, verbose=1)

    model = spark_model.master_network
    model.save(outname)
    print('Train Finish')
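# A minimal invocation sketch for run_train above (an assumption for
# illustration: the master URL, input corpus path and output model filename
# are placeholders, not values from the original snippet).
if __name__ == '__main__':
    run_train('local[2]', 'train_corpus.txt', 'crf_cnn_model.h5')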
print("Creating Training and Test Data") ((x_train, y_train), (x_test, y_test)) = train_test_split(testinput.fillna(0), testoutput.fillna(0), test_size=0.3) print("Training data : x") print(type(x_train)) print(x_train) print("Training data : y") print(type(y_train)) print(y_train) print("Test data : x") print(type(x_test)) print(x_test) print("Test data : y") print(type(y_test)) print(y_test) print("Converting training data to RDD") rddataset = to_simple_rdd(sc, x_train, y_train) print("Initializing SPark Model") sgd = elephas_optimizers.SGD() spark_model = SparkModel(sc, model, optimizer=sgd, frequency="epoch", mode="asynchronous", num_workers=2) print("Commencing training") spark_model.train(rddataset, nb_epoch=10, batch_size=200, verbose=1, validation_split=0) # model.fit(x_train, y_train, nb_epoch=5, batch_size=32) print("Training completed") sc.stop()
model.add(Activation('relu'))
model.add(Convolution2D(nb_filters, nb_conv, nb_conv))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adadelta')

## Spark
conf = SparkConf().setAppName(APP_NAME).setMaster(MASTER_IP)
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, X_train, Y_train)

# Initialize SparkModel from Keras model and Spark context
spark_model = SparkModel(sc, model)

# Train Spark model
spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size,
                  verbose=0, validation_split=0.1, num_workers=24)

# Evaluate Spark model by evaluating the underlying model
score = spark_model.get_network().evaluate(X_test, Y_test, show_accuracy=True, verbose=2)
print('Test accuracy:', score[1])
# Compile model
sgd = SGD(lr=0.1)
model.compile(loss='categorical_crossentropy', optimizer=sgd)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, x_train, y_train)

# Initialize SparkModel from Keras model and Spark context
adagrad = elephas_optimizers.Adagrad()
spark_model = SparkModel(sc, model, optimizer=adagrad, frequency='epoch',
                         mode='asynchronous', num_workers=2)

# Train Spark model
spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size,
                  verbose=2, validation_split=0.1)

# Evaluate Spark model by evaluating the underlying model
score = spark_model.master_network.evaluate(x_test, y_test, show_accuracy=True, verbose=2)
print('Test accuracy:', score[1])
batch_size = 100

# Accuracy records
stat_lines = []

adagrad = elephas_optimizers.Adagrad()
for i in range(0, 200):
    # Initialize SparkModel from Keras model and Spark context, then train it
    spark_model = SparkModel(sc, model, mode='asynchronous', frequency='epoch',
                             num_workers=1, optimizer=adagrad)
    spark_model.train(rdd, nb_epoch=num_epoch_in_one_step, batch_size=batch_size,
                      verbose=0, validation_split=0.1)

    score1 = model.evaluate(x_train, y_train, verbose=0)
    score2 = model.evaluate(x_test, y_test, verbose=0)
    print('#############################')
    print('Finished epochs', (i + 1) * num_epoch_in_one_step)
    print('Train accuracy:', score1[1])
    print('Test accuracy:', score2[1])
    print('#############################')

    stat_lines.append(str((i + 1) * 10) + ', ' + str(score1[1]) + ', ' + str(score2[1]))
    FileIO.write_lines_to_file('./cnn_1.log', stat_lines)

    if (i + 1) % 10 == 0 and i != 0:
        model.save('./models/cnn_1_' + str((i + 1) * 10) + 'ep.h5')

# sc.stop()
model = Sequential()
model.add(Dense(784, 128))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(128, 128))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(128, 10))
model.add(Activation('softmax'))

# Compile model
rms = RMSprop()
model.compile(loss='categorical_crossentropy', optimizer=rms)

# Create Spark context
conf = SparkConf().setAppName('Mnist_Spark_MLP').setMaster('local[8]')
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, X_train, Y_train)

# Initialize SparkModel from Keras model and Spark context
spark_model = SparkModel(sc, model)

# Train Spark model
spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size,
                  verbose=0, validation_split=0.1, num_workers=8)

# Evaluate Spark model by evaluating the underlying model
score = spark_model.get_network().evaluate(X_test, Y_test, show_accuracy=True, verbose=2)
print('Test accuracy:', score[1])
from elephas.utils.rdd_utils import to_labeled_point
from elephas.utils.rdd_utils import to_simple_rdd

lp_rdd = to_simple_rdd(sc, features_train, labels_train)
# print(lp_rdd.take(5))

from elephas.spark_model import SparkModel
from elephas import optimizers as elephas_optimizers

adagrad = elephas_optimizers.Adagrad()
spark_model = SparkModel(sc, model, optimizer=adagrad, frequency='epoch',
                         mode='asynchronous', num_workers=8)
spark_model.train(lp_rdd, nb_epoch=20, batch_size=32, verbose=0, validation_split=0.1)
print(spark_model)

prediction = spark_model.predict_classes(features_test)
print(prediction)
truth = [l[1] for l in labels_test]

from sklearn.metrics import confusion_matrix
print(confusion_matrix(truth, prediction))
class KerasNeuralNetworkSpark(object):
    def __init__(self, layers, spark, batch_size=64, epoch=10, num_workers=2,
                 predictionCol='prediction', labelCol='target', featuresCol='feature'):
        self._batch_size = batch_size
        self._epoch = epoch
        self._model = None
        self._spark = spark
        self._labels = labelCol
        self._features = featuresCol
        self._prediction = predictionCol
        self._layers = layers
        self._worker_num = num_workers
        self._build_model()

    def _build_model(self):
        # Build a fully connected Keras network from the layer sizes,
        # then wrap it in an Elephas SparkModel
        model = Sequential()
        adam = elephas_optimizers.Adam()
        layers = self._layers
        model.add(Dense(layers[1], input_dim=layers[0], init='normal', activation='relu'))
        for i in range(2, len(layers) - 1):
            model.add(Dense(layers[i], activation='relu'))
        model.add(Dense(layers[-1], activation='sigmoid'))
        self._model = SparkModel(self._spark.sparkContext, model, optimizer=adam,
                                 frequency='epoch', mode='asynchronous',
                                 master_loss='mse', num_workers=self._worker_num)

    def fit(self, df):
        # Terminate any parameter server left over from a previous run
        if hasattr(self._model, 'server'):
            self._model.server.terminate()
        pdf = df.toPandas()
        rdd = to_simple_rdd(self._spark.sparkContext, pdf[self._features], pdf[self._labels])
        self._model.train(rdd, self._epoch, self._batch_size, 0, 0.1)

    def transform(self, df):
        pdf = df.toPandas()
        # df.write.save('test_df.parquet')
        pnparray = pdf[self._features].values
        # Copy the column of feature vectors into a contiguous 2D array
        container = np.zeros((pnparray.shape[0], len(pnparray[0])))
        for i in range(pnparray.shape[0]):
            container[i, :] = pnparray[i][:]
        result = self._model.predict(container)
        pdf[self._prediction] = result
        new_df = self._spark.createDataFrame(pdf)
        return new_df

    def stop_server(self):
        if hasattr(self._model, 'server') and hasattr(self._model.server, 'terminate'):
            self._model.server.terminate()
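# A minimal usage sketch for the class above (an assumption for illustration:
# it presumes an existing SparkSession `spark` and a DataFrame `df` whose
# 'feature' column holds fixed-length vectors and whose 'target' column holds
# labels, matching the class's default featuresCol/labelCol; the layer sizes
# are placeholders).
nn = KerasNeuralNetworkSpark(layers=[10, 32, 16, 1], spark=spark,
                             batch_size=64, epoch=10, num_workers=2)
nn.fit(df)                 # train the wrapped Elephas SparkModel
scored = nn.transform(df)  # returns the DataFrame with a 'prediction' column
nn.stop_server()           # shut down the parameter server when done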
# output signal. Here the activation function is ReLU.
model.add(Activation('relu'))
model.add(Dropout(0.5))  # dropout is then applied

# finally, the 128 outputs of the previous FC layer are fully connected to
# num_classes neurons, which are activated by a softmax function
model.add(Dense(nb_classes, W_regularizer=l2(0.01)))
model.add(Activation('softmax'))

# write the neural network model representation to a png image
# grapher.plot(model, 'nn_mnist.png')

model.compile(loss='categorical_crossentropy', optimizer='adadelta')
# model.compile(loss='categorical_crossentropy', optimizer='sgd' or 'adam' or 'adadelta')

## Spark
conf = SparkConf().setAppName(APP_NAME)  # .setMaster(MASTER_IP)
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, X_train, Y_train)

# Initialize SparkModel from Keras model and Spark context
spark_model = SparkModel(sc, model)

# Train Spark model (num_workers might not work in early Spark versions)
spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size,
                  verbose=1, validation_split=0.15)

# Evaluate Spark model by evaluating the underlying model
score = spark_model.get_network().evaluate(X_test, Y_test, show_accuracy=True, verbose=2)
print('Test accuracy:', score[1])
# early_stopping = EarlyStopping(monitor='val_acc', patience=5)
# print('Start training...')
# model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch, verbose=verbose,
#           callbacks=[checkpointer], validation_split=validation_split,
#           shuffle=shuffle, show_accuracy=show_accuracy)

# Create Spark Context
conf = SparkConf().setAppName(MODEL)
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, X_train, Y_train)

# Initialize SparkModel from Keras model and Spark Context
rmsprop = elephas_optimizers.RMSprop()
spark_model = SparkModel(sc, model, optimizer=rmsprop, frequency='epoch',
                         mode='asynchronous', num_workers=3)
spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size,
                  verbose=2, validation_split=validation_split)

spark_model.get_network().save_weights(MODEL_FILE_NAME)
# ---(i.e. in training each worker will train on part of the data)
rdd = to_simple_rdd(sc, X_train, y_train)

# ---Initialize SparkModel from Keras model and Spark context
# ---there are two optimizers needed:
sgd = SGD(lr=0.1)                       # <--- the master optimizer
adagrad = elephas_optimizers.Adagrad()  # <--- the elephas optimizer
spark_model = SparkModel(sc, model, optimizer=adagrad, frequency='epoch',
                         mode='asynchronous', num_workers=args.N_workers,
                         master_optimizer=sgd)

# ---Train Spark model
spark_model.train(rdd, nb_epoch=args.nb_epoch, batch_size=args.batch_size,
                  verbose=1, validation_split=0.25)

# ---Evaluate Spark model by evaluating the underlying Keras master model
pred = spark_model.predict(X_test)
print(np.shape(pred))
print(np.shape(y_test))
acc = accuracy_score([np.argmax(y) for y in y_test], [np.argmax(p) for p in pred])
print("--->test accuracy: ", acc)
print("--->number of workers: ", args.N_workers)
print("--->time: ", time.time() - start_time)
model.add(Dense(784, 128))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(128, 128))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(128, 10))
model.add(Activation('softmax'))

# Compile model
rms = RMSprop()
model.compile(loss='categorical_crossentropy', optimizer=rms)

# Create Spark context
conf = SparkConf().setAppName('Mnist_Spark_MLP').setMaster('local[8]')
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, X_train, Y_train)
rdd = rdd.repartition(8)

# Initialize SparkModel from Keras model and Spark context
spark_model = SparkModel(sc, model)

# Train Spark model
spark_model.train(rdd, nb_epoch=20, batch_size=32, verbose=0, validation_split=0.1)

# Evaluate Spark model by evaluating the underlying model
score = spark_model.get_network().evaluate(X_test, Y_test, show_accuracy=True, verbose=2)
print('Test accuracy:', score[1])
# checkpointer = ModelCheckpoint(filepath=MODEL_ROOT+MODEL+".h5", verbose=1, save_best_only=False)
# early_stopping = EarlyStopping(monitor='val_acc', patience=5)
# print('Start training...')
# model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch, verbose=verbose,
#           callbacks=[checkpointer], validation_split=validation_split,
#           shuffle=shuffle, show_accuracy=show_accuracy)

# Create Spark Context
conf = SparkConf().setAppName(MODEL)
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, X_train, Y_train)

# Initialize SparkModel from Keras model and Spark Context
rmsprop = elephas_optimizers.RMSprop()
spark_model = SparkModel(sc, model, optimizer=rmsprop, frequency='epoch',
                         mode='asynchronous', num_workers=3)
spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size,
                  verbose=2, validation_split=validation_split)

spark_model.get_network().save_weights(MODEL_FILE_NAME)
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(modelpara_dict['Lable_num'], activation='softmax'))
print(model.summary())

sgd = SGD(lr=0.1)
model.compile(loss='categorical_crossentropy', optimizer=sgd)

adagrad = elephas_optimizers.Adagrad()
spark_model = SparkModel(sc, model, optimizer=adagrad, frequency='epoch',
                         mode='synchronous', num_workers=3)

# Train Spark model
spark_model.train(train_data, nb_epoch=1, batch_size=32, verbose=2, validation_split=0.1)
spark_model.master_network.save('model/' + modelname + '/' + modelname + '.h5')

# Evaluate Spark model by evaluating the underlying model
# score = spark_model.master_network.evaluate(x_test, y_test, verbose=2)
# print('Test accuracy:', score[1])
print('Test data : x')
print(type(x_test))
print(x_test)
print('Test data : y')
print(type(y_test))
print(y_test)

print('Converting training data to RDD')
rddataset = to_simple_rdd(sc, x_train, y_train)

print('Initializing Spark model')
sgd = elephas_optimizers.SGD()
spark_model = SparkModel(sc, model, optimizer=sgd, frequency='epoch',
                         mode='asynchronous', num_workers=2)

print('Commencing training')
spark_model.train(rddataset, nb_epoch=10, batch_size=200, verbose=1, validation_split=0)
# model.fit(x_train, y_train, nb_epoch=5, batch_size=32)

print('Training completed')
sc.stop()
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(10))
model.add(Activation('softmax'))

sgd = SGD(lr=0.1)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, x_train, y_train)

# Initialize SparkModel from Keras model and Spark context
adagrad = elephas_optimizers.Adagrad()
spark_model = SparkModel(sc, model, optimizer=adagrad, frequency='epoch',
                         mode='asynchronous', num_workers=2, master_optimizer=sgd)

# Train Spark model
spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size,
                  verbose=2, validation_split=0.1)

# Evaluate Spark model by evaluating the underlying model
score = spark_model.master_network.evaluate(x_test, y_test, verbose=2)
print('Test accuracy:', score[1])
model.compile(loss='categorical_crossentropy', optimizer=SGD())
model.summary()

# Create a Resilient Distributed Dataset (RDD) from training data
# TODO: get data
# TODO: is it possible to separate training data into multiple batches?
rdd = to_simple_rdd(sc, X_train, Y_train)

# Create the Elephas model instance
spark_model = SparkModel(sc, model,
                         optimizer=elephas_optimizers.Adagrad(),
                         frequency='epoch',
                         mode='asynchronous',
                         num_workers=WORKERS)

# Train model
spark_model.train(rdd,
                  nb_epoch=EPOCHS,
                  batch_size=BATCH_SIZE,
                  verbose=False,
                  validation_split=VAL_SPLIT,
                  num_workers=WORKERS)