Example 1
def main():
    gps_files = glob.glob('../data/prototype/**/gps_points.csv')
    trip_files = glob.glob('../data/prototype/**/gps_trips.csv')

    file_results = process_file(trip_file = trip_files[0], gps_file = gps_files[0])
    seq_results = build_seq(input_df = file_results['df'], unique_trips = file_results['unique_trips'])

    X = seq_results['x']
    y = seq_results['y']

    print('Building training data from files...')
    for i in range(1, len(gps_files)):
        file_results = process_file(trip_file = trip_files[i], gps_file = gps_files[i])
        seq_results = build_seq(input_df = file_results['df'], unique_trips = file_results['unique_trips'])

        X = np.vstack((X, seq_results['x']))
        y = np.vstack((y, seq_results['y']))

    x_train, x_val, y_train, y_val = train_test_split(X, y, random_state=1, train_size=0.8)

    rdd = to_simple_rdd(sc, x_train, y_train)

    model = build_model()

    spark_model = SparkModel(model, frequency='epoch', mode='asynchronous')

    spark_model.fit(rdd, epochs=5, batch_size=32, verbose=0, validation_split=0.1)
#    model.fit(x_train, y_train, epochs=5, validation_data=(x_val, y_val))

    y_pred = spark_model.predict(x_val)

    acc = sum(np.argmax(y_pred, axis=1) == np.argmax(y_val, axis=1)) / y_pred.shape[0]

    print("Validation Accuracy: {number:.{digits}f}%".format(number=(acc*100), digits=2))
Example 2
    def train_gate(self, datagen, weights_file):
        model = self.gateModel
        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(),
                      metrics=['accuracy'])
        print(model.summary())
        self.gateModel = SparkModel(model,
                                    frequency='epoch',
                                    mode='asynchronous')
        score = self.gateModel.master_network.evaluate(self.x_test,
                                                       self.y_test,
                                                       verbose=2,
                                                       batch_size=50)
        self.gateModel.fit(self.rdd, epochs=1, batch_size=50, verbose=1)
        self.gateModel = self.gateModel.master_network
        self.gateModel.save_weights(weights_file + '.hdf5')

        file = '../lib/output.txt'
        if os.path.exists(file):
            append_write = 'a'
        else:
            append_write = 'w'

        #score = self.gateModel.evaluate(self.x_test, self.y_test, verbose=2, batch_size=50)
        print("------------------------------")
        print("Score is:" + str(score[1]))
        print("-------------------------------")
        text_file = open(file, append_write)
        text_file.write("Score: %s" % score[1])
        text_file.close()
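A small aside on the file handling above: the append-or-write logic can also be written with a context manager, which closes the handle even if the write raises. A minimal equivalent sketch (same ../lib/output.txt path, with score coming from the evaluation above):

        log_path = '../lib/output.txt'
        mode = 'a' if os.path.exists(log_path) else 'w'
        with open(log_path, mode) as fh:
            fh.write("Score: %s\n" % score[1])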
def predictMain(modelName,sc):
    timeSteps= 30                                                                   # No of past values that has to be used for Training purpose
    print "Going to Initialize the LSTM model"
    SMARTparameters=getSMARTParameters()
    print("The following are the SMART parameters:",SMARTparameters)
    lstm = ls.cloudLSTM(timeSteps=timeSteps,parms=SMARTparameters)                  # Initializing the DiskPrediction Model(LSTM Model)
    print "Initialized the Model"
    lstmModel = lstm.get_LSTM_Model()                   			    # Obtaining the LSTM model for initializing SparkModel Class
    trainSize= 0.2                                                                  # Fraction of input used for Training purpose
    acc = 0.0                                                                       # Model accuracy
    inputFilePath = os.environ.get('DATA_FILE_PATH')                                # Get the Input CSV filepath from environment
    year=sys.argv[1]                                                                # get the year from the Command Line arguments
    month=sys.argv[2]                                                               # get the month from the Command Line arguments
    inputFilePath=inputFilePath+str(year)+"/"+str(year)+"-"+str(month)+"*.csv"  # For E.g "/home/user/Desktop/Cloud/Test/2014/2014-11*.csv"
    print("InputPath",inputFilePath)
    rd.generate_DataFrame(inputFilePath,SMARTparameters)
    inputCSVFilePath = os.environ.get('MODEL_CSV_FILEPATH')+str(modelName)+".csv"    # For E.g "/hadoop/elephas/Output/ST4000DM000.csv"

    modelFeatures = pd.read_csv(filepath_or_buffer=inputCSVFilePath,usecols=SMARTparameters)
    modelLabel = pd.read_csv(filepath_or_buffer=inputCSVFilePath,usecols=['failure'])   #"/hadoop/elephas/Output/ST4000DM000.csv"

    # Removing Not A Number values from the Input Dataframe
    modelFeatures = modelFeatures.fillna(0)
    modelLabel = modelLabel.fillna(0)

    # Obtaining 3D training and testing vectors
    (feature_train, label_train), (feature_test, label_test) = lstm.train_test_split(modelFeatures,modelLabel,trainSize,timeSteps)

    # Condition to check whether the failure cases exists in the data
    if len(feature_train)==0:
        print("DiskModel has no failure eleements. Training of the model cannot proceed!!")
        return
    # Initializing the Adam Optimizer for Elephas
    adam = elephas_optimizers.Adam()
    print "Adam Optimizer initialized"
    #Converting Dataframe to Spark RDD
    rddataset = to_simple_rdd(sc, feature_train, label_train)
    print "Training data converted into Resilient Distributed Dataset"
    #Initializing the SparkModel with Optimizer,Master-Worker Mode and Number of Workers
    spark_model = SparkModel(sc,lstmModel,optimizer=adam ,frequency='epoch', mode='asynchronous', num_workers=2)
    print "Spark Model Initialized"
    #Initial training run of the model
    spark_model.train(rddataset, nb_epoch=10, batch_size=200, verbose=1, validation_split=0)
    # Evaluating the model
    score = spark_model.evaluate(feature_test, label_test,show_accuracy=True)

    while(score <= 0.5):
        # Training the Input Data set
        spark_model.train(rddataset, nb_epoch=10, batch_size=200, verbose=1, validation_split=0)
        print "LSTM model training done !!"
        score = spark_model.evaluate(feature_test, label_test,show_accuracy=True)
    print "Saving weights!!"
    outFilePath=os.environ.get('GATOR_SQUAD_HOME')
    outFilePath=outFilePath+"Weights/"+str(year)+"/"+str(month)+"/"+str(modelName)+"_my_model_weights.h5"
    spark_model.save_weights(outFilePath)
    print "LSTM model testing commencing !!"
    predicted1=spark_model.predict_classes(feature_test)
    df_confusion = pd.crosstab(label_test.flatten(), predicted1.flatten(), rownames=['Actual'], colnames=['Predicted'], margins=True)
    print(df_confusion)
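For reference, to_simple_rdd (used in this and most of the other examples) does little more than zip the feature and label arrays and parallelize the resulting pairs. An illustrative sketch of the equivalent conversion (use elephas.utils.rdd_utils.to_simple_rdd in real code):

import numpy as np

def to_pairs_rdd(sc, features, labels):
    # Illustrative stand-in for elephas' to_simple_rdd: an RDD of (x, y) tuples.
    pairs = [(np.asarray(x), np.asarray(y)) for x, y in zip(features, labels)]
    return sc.parallelize(pairs)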
Example 4
def test_sequential_serialization(spark_context, classification_model):
    classification_model.compile(optimizer="sgd",
                                 loss="categorical_crossentropy",
                                 metrics=["acc"])
    spark_model = SparkModel(classification_model,
                             frequency='epoch',
                             mode='synchronous')
    spark_model.save("elephas_sequential.h5")
Example 5
def run_train(master_name, filename, outname):
    import pyspark
    conf = pyspark.SparkConf().setAppName("CRF").setMaster(master_name)
    sc = pyspark.SparkContext(conf=conf)
    tfile = sc.textFile(filename)
    dataset = textFileToDataset(tfile)
    indexer = Indexer()
    indexer.prepareIndexer(dataset, min_count=0)

    print('[Prepare Trainloader] {} samples'.format(dataset.count()))
    trainset = indexer.convertToElephasFormat(dataset)
    embedding_size = 128
    print('[Char count] {}'.format(len(indexer.chars)))

    crf_model = CRF(5, True, name='CRF')
    cnn_model = Sequential([
        Embedding(len(indexer.chars)+1, embedding_size),
        Conv1D(128, 3, activation='relu', padding='same',\
               kernel_constraint=maxnorm(1.0), name='conv1'),
        Conv1D(128, 3, activation='relu', padding='same',\
               kernel_constraint=maxnorm(1.0), name='conv2'),
        Dense(5),
        Lambda(lambda x:x)
        #crf_model
    ])
    '''
    embed=Embedding(len(Indexer._chars)+1, embedding_size)(inph)
    cnn=Conv1D(128, 3, activation='relu', padding='same')(embed)
    cnn=Conv1D(128, 3, activation='relu', padding='same')(cnn)
    tag_score=Dense(5)(cnn)
    '''
    crf_model.trans = cnn_model.layers[-1].add_weight(name='transM', \
                        shape=(crf_model.num_labels, crf_model.num_labels),\
                        initializer=glorot_normal())
    cnn_model.compile(loss=crf_model.loss,
                      optimizer='adam',
                      metrics=[crf_model.accuracy])
    cnn_model.summary()
    # momentum = 0., decay=0. nesterov=False
    optimizerE = elephas.optimizers.SGD(lr=0.0001,
                                        momentum=0.9,
                                        decay=0.7,
                                        nesterov=True)
    spark_model = SparkModel(sc, cnn_model, optimizer=optimizerE,\
                    frequency='epoch', mode='asynchronous', num_workers=2,\
                             ) #custom_objects={'CRF': crf_model})

    spark_model.train(trainset,
                      nb_epoch=2,
                      batch_size=200,
                      validation_split=0.3,
                      verbose=1)
    model = spark_model.master_network
    model.save(outname)
    print('Training finished')
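A possible invocation of run_train (master URL, input corpus and output file name below are placeholders, not taken from the original project):

if __name__ == '__main__':
    # Hypothetical arguments: a local Spark master, a text training corpus
    # and an output path for the trained CNN/CRF network.
    run_train('local[4]', 'data/train_corpus.txt', 'cnn_crf_model.h5')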
def test_sync_mode(spark_context):
    # Define basic parameters
    batch_size = 64
    nb_classes = 10
    epochs = 10

    # Load data
    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    x_train = x_train.reshape(60000, 784)
    x_test = x_test.reshape(10000, 784)
    x_train = x_train.astype("float32")
    x_test = x_test.astype("float32")
    x_train /= 255
    x_test /= 255
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # Convert class vectors to binary class matrices
    y_train = np_utils.to_categorical(y_train, nb_classes)
    y_test = np_utils.to_categorical(y_test, nb_classes)

    model = Sequential()
    model.add(Dense(128, input_dim=784))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(128))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(10))
    model.add(Activation('softmax'))

    sgd = SGD(lr=0.1)
    model.compile(sgd, 'categorical_crossentropy', ['acc'])

    # Build RDD from numpy features and labels
    rdd = to_simple_rdd(spark_context, x_train, y_train)

    # Initialize SparkModel from Keras model and Spark context
    spark_model = SparkModel(model, mode='synchronous')

    # Train Spark model
    spark_model.fit(rdd,
                    epochs=epochs,
                    batch_size=batch_size,
                    verbose=2,
                    validation_split=0.1)

    # Evaluate Spark model by evaluating the underlying model
    score = spark_model.master_network.evaluate(x_test, y_test, verbose=2)
    assert score[1] >= 0.70
Example 7
File: HAN.py Project: sd12832/HAN
    def set_model(self):
        """
        Set the HAN model according to the given hyperparameters
        """
        if self.hyperparameters['l2_regulizer'] is None:
            kernel_regularizer = None
        else:
            kernel_regularizer = regularizers.l2(
                self.hyperparameters['l2_regulizer'])
        if self.hyperparameters['dropout_regulizer'] is None:
            dropout_regularizer = 1
        else:
            dropout_regularizer = self.hyperparameters['dropout_regulizer']
        word_input = Input(shape=(self.max_senten_len, ), dtype='float32')
        word_sequences = self.get_embedding_layer()(word_input)
        word_lstm = Bidirectional(self.hyperparameters['rnn'](
            self.hyperparameters['rnn_units'],
            return_sequences=True,
            kernel_regularizer=kernel_regularizer))(word_sequences)
        word_dense = TimeDistributed(
            Dense(self.hyperparameters['dense_units'],
                  kernel_regularizer=kernel_regularizer))(word_lstm)
        word_att = AttentionWithContext()(word_dense)
        wordEncoder = Model(word_input, word_att)

        sent_input = Input(shape=(self.max_senten_num, self.max_senten_len),
                           dtype='float32')
        sent_encoder = TimeDistributed(wordEncoder)(sent_input)
        sent_lstm = Bidirectional(self.hyperparameters['rnn'](
            self.hyperparameters['rnn_units'],
            return_sequences=True,
            kernel_regularizer=kernel_regularizer))(sent_encoder)
        sent_dense = TimeDistributed(
            Dense(self.hyperparameters['dense_units'],
                  kernel_regularizer=kernel_regularizer))(sent_lstm)
        sent_att = Dropout(dropout_regularizer)(
            AttentionWithContext()(sent_dense))
        preds = Dense(len(self.classes))(sent_att)
        self.model = Model(sent_input, preds)
        self.model.compile(loss=self.hyperparameters['loss'],
                           optimizer=self.hyperparameters['optimizer'],
                           metrics=self.hyperparameters['metrics'])
        self.spark_model = SparkModel(self.model,
                                      frequency='epoch',
                                      mode='asynchronous')
def test_sequential_serialization():
    # Create Spark context
    pytest.mark.usefixtures("spark_context")

    seq_model = Sequential()
    seq_model.add(Dense(128, input_dim=784))
    seq_model.add(Activation('relu'))
    seq_model.add(Dropout(0.2))
    seq_model.add(Dense(128))
    seq_model.add(Activation('relu'))
    seq_model.add(Dropout(0.2))
    seq_model.add(Dense(10))
    seq_model.add(Activation('softmax'))

    seq_model.compile(
        optimizer="sgd", loss="categorical_crossentropy", metrics=["acc"])
    spark_model = SparkModel(seq_model, frequency='epoch', mode='synchronous')
    spark_model.save("elephas_sequential.h5")
def test_model_serialization():
    # This returns a tensor
    inputs = Input(shape=(784,))

    # a layer instance is callable on a tensor, and returns a tensor
    x = Dense(64, activation='relu')(inputs)
    x = Dense(64, activation='relu')(x)
    predictions = Dense(10, activation='softmax')(x)

    # This creates a model that includes
    # the Input layer and three Dense layers
    model = Model(inputs=inputs, outputs=predictions)
    model.compile(optimizer='rmsprop',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])

    spark_model = SparkModel(model, frequency='epoch', mode='synchronous')
    spark_model.save("elephas_model.h5")
Example 10
def make_model(data):
    data.show()
    data = data.dropna()
    nb_classes = data.select("label").distinct().count()
    input_dim = len(data.select("features").first()[0])

    print(nb_classes, input_dim)

    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=100))
    #model.add(LSTM(64,return_sequences=False,dropout=0.1,recurrent_dropout=0.1))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes, activation='softmax'))
    #sgd = optimizers.SGD(lr=0.1)
    #model.compile(sgd, 'categorical_crossentropy', ['acc'])
    model.compile(loss='binary_crossentropy', optimizer='adam')

    #model.compile(loss='categorical_crossentropy', optimizer='adam')
    spark_model = SparkModel(model, frequency='epoch', mode='asynchronous')

    adam = optimizers.Adam(lr=0.01)
    opt_conf = optimizers.serialize(adam)

    estimator = ElephasEstimator()
    estimator.setFeaturesCol("features")
    estimator.setLabelCol("label")
    estimator.set_keras_model_config(model.to_yaml())
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(nb_classes)
    estimator.set_num_workers(1)
    estimator.set_epochs(20)
    estimator.set_batch_size(128)
    estimator.set_verbosity(1)
    estimator.set_validation_split(0.15)
    estimator.set_optimizer_config(opt_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])

    #estimator = ElephasEstimator(model, epochs=20, batch_size=32, frequency='batch', mode='asynchronous', nb_classes=1)

    pipeline = Pipeline(stages=[estimator])
    #fitted_model = estimator.fit(data)
    #prediction = fitted_model.transform(data)

    fitted_pipeline = pipeline.fit(data)  # Fit model to data
    prediction = fitted_pipeline.transform(data)  # Evaluate on train data.
    # prediction = fitted_pipeline.transform(test_df) # <-- The same code evaluates test data.
    pnl = prediction.select("text", "prediction")
    pnl.show(100)

    prediction_and_label = prediction.select("prediction", "label").rdd.map(
        lambda row: (float(row.prediction), float(row.label)))
    metrics = MulticlassMetrics(prediction_and_label)
    print(metrics.precision())
    prediction.select("label", "prediction").show(100)
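Still inside make_model, a sketch of an alternative evaluation path (assuming Spark ML is available) that avoids dropping down to the RDD API:

    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    # Sketch: accuracy on the prediction DataFrame produced by the fitted pipeline.
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    print("Accuracy:", evaluator.evaluate(prediction))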
Example 11
def train_elephas_model(x, y):
    model = models.Sequential()

    # Input Layer
    adam = optimizers.Adam(lr=0.01)
    model.add(Dense(256, activation="relu", input_shape=(x.shape[1],)))
    model.add(Dropout(0.05))

    model.add(Dense(256, activation="relu", input_shape=(x.shape[1],)))
    model.add(Dropout(0.05))

    # output layer
    model.add(Dense(1))
    model.compile(optimizer=adam, loss="mse", metrics=["mse"])
    model.summary()

    rdd = to_simple_rdd(sc, x, y)
    spark_model = SparkModel(model, frequency='epoch', mode='asynchronous')
    # spark_model.fit(rdd, epochs=10, batch_size=64, verbose=1, validation_split=0.2)
    spark_model.fit(rdd, epochs=25, batch_size=64, verbose=1, validation_split=0.2)

    return spark_model
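A hypothetical call site for train_elephas_model (x and y below are placeholder arrays; sc must already be a live SparkContext, since the function picks it up from the enclosing scope):

import numpy as np

# Placeholder data: 1000 rows, 20 feature columns, one regression target.
x = np.random.rand(1000, 20).astype("float32")
y = np.random.rand(1000, 1).astype("float32")

spark_model = train_elephas_model(x, y)
print(spark_model.predict(x[:5]))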
Example 12
def dist_training(n_iter):
    sbcnn = SBCNN_Model(field_size, bands, frames, num_channels, num_labels)

    sgd = SGD(lr=0.001, momentum=0.0, decay=0.0, nesterov=False)
    sbcnn.compile(loss='categorical_crossentropy',
                  metrics=['accuracy'],
                  optimizer=sgd)

    train_arr, train_labels_arr, test_arr, test_labels_arr = get_data()
    rdd = to_simple_rdd(sc, train_arr, train_labels_arr)

    spark_model = SparkModel(sbcnn, frequency='epoch', mode='asynchronous')
    spark_model.fit(rdd,
                    epochs=n_iter,
                    batch_size=32,
                    verbose=0,
                    validation_split=0.1)

    score = spark_model.master_network.evaluate(test_arr,
                                                test_labels_arr,
                                                verbose=2)
    print('Test accuracy:', score[1])
def test_sequential_serialization():
    # Define basic parameters
    batch_size = 64
    nb_classes = 10
    epochs = 1

    # Create Spark context
    pytest.mark.usefixtures("spark_context")

    # Load data
    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    x_train = x_train.reshape(60000, 784)
    x_test = x_test.reshape(10000, 784)
    x_train = x_train.astype("float32")
    x_test = x_test.astype("float32")
    x_train /= 255
    x_test /= 255
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # Convert class vectors to binary class matrices
    y_train = np_utils.to_categorical(y_train, nb_classes)
    y_test = np_utils.to_categorical(y_test, nb_classes)

    seq_model = Sequential()
    seq_model.add(Dense(128, input_dim=784))
    seq_model.add(Activation('relu'))
    seq_model.add(Dropout(0.2))
    seq_model.add(Dense(128))
    seq_model.add(Activation('relu'))
    seq_model.add(Dropout(0.2))
    seq_model.add(Dense(10))
    seq_model.add(Activation('softmax'))

    seq_model.compile(optimizer="sgd", loss="categorical_crossentropy", metrics=["acc"])
    spark_model = SparkModel(seq_model, frequency='epoch', mode='synchronous')
    spark_model.save("elephas_sequential.h5")
Example 14
def test_training_classification(spark_context, mode, parameter_server_mode,
                                 mnist_data, classification_model):
    # Define basic parameters
    batch_size = 64
    epochs = 10

    # Load data
    x_train, y_train, x_test, y_test = mnist_data
    x_train = x_train[:1000]
    y_train = y_train[:1000]

    sgd = SGD(lr=0.1)
    classification_model.compile(sgd, 'categorical_crossentropy', ['acc'])

    # Build RDD from numpy features and labels
    rdd = to_simple_rdd(spark_context, x_train, y_train)

    # Initialize SparkModel from keras model and Spark context
    spark_model = SparkModel(classification_model,
                             frequency='epoch',
                             mode=mode,
                             parameter_server_mode=parameter_server_mode,
                             port=4000 + random.randint(0, 500))

    # Train Spark model
    spark_model.fit(rdd,
                    epochs=epochs,
                    batch_size=batch_size,
                    verbose=0,
                    validation_split=0.1)

    # run inference on trained spark model
    predictions = spark_model.predict(x_test)
    # run evaluation on trained spark model
    evals = spark_model.evaluate(x_test, y_test)

    # assert we can supply rdd and get same prediction results when supplying numpy array
    test_rdd = spark_context.parallelize(x_test)
    assert [np.argmax(x) for x in predictions
            ] == [np.argmax(x) for x in spark_model.predict(test_rdd)]

    # assert we get the same prediction result with calling predict on keras model directly
    assert [np.argmax(x) for x in predictions] == [
        np.argmax(x) for x in spark_model.master_network.predict(x_test)
    ]

    # assert we get the same evaluation results when calling evaluate on keras model directly
    assert isclose(evals[0],
                   spark_model.master_network.evaluate(x_test, y_test)[0],
                   abs_tol=0.01)
    assert isclose(evals[1],
                   spark_model.master_network.evaluate(x_test, y_test)[1],
                   abs_tol=0.01)
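The classification and regression tests above depend on pytest fixtures (spark_context, mnist_data, classification_model, mode, parameter_server_mode) defined elsewhere in the test suite. A rough sketch of what the data and model fixtures could look like (an assumption for readability, not the project's actual conftest.py):

import pytest
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical


@pytest.fixture
def mnist_data():
    # Flattened, normalized MNIST with one-hot labels.
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train = x_train.reshape(60000, 784).astype("float32") / 255
    x_test = x_test.reshape(10000, 784).astype("float32") / 255
    return x_train, to_categorical(y_train, 10), x_test, to_categorical(y_test, 10)


@pytest.fixture
def classification_model():
    # A small MLP classifier compatible with the 784-dimensional inputs above.
    model = Sequential()
    model.add(Dense(128, input_dim=784, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(10, activation='softmax'))
    return model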
    def _build_model(self):
        model = Sequential()
        adam = elephas_optimizers.Adam()
        layers = self._layers
        model.add(Dense(layers[1], input_dim=layers[0], init='normal', activation='relu'))
        for i in range(2, len(layers) - 1):
            model.add(Dense(layers[i], activation='relu'))

        model.add(Dense(layers[-1], activation='sigmoid'))
        self._model = SparkModel(self._spark.sparkContext, model,
                                 optimizer=adam,
                                 frequency='epoch',
                                 mode='asynchronous',
                                 master_loss='mse',
                                 num_workers=self._worker_num)
Example 16
def test_training_regression(spark_context, mode, parameter_server_mode,
                             boston_housing_dataset, regression_model):
    x_train, y_train, x_test, y_test = boston_housing_dataset
    rdd = to_simple_rdd(spark_context, x_train, y_train)

    # Define basic parameters
    batch_size = 64
    epochs = 10
    sgd = SGD(lr=0.0000001)
    regression_model.compile(sgd, 'mse', ['mae'])
    spark_model = SparkModel(regression_model,
                             frequency='epoch',
                             mode=mode,
                             parameter_server_mode=parameter_server_mode,
                             port=4000 + random.randint(0, 500))

    # Train Spark model
    spark_model.fit(rdd,
                    epochs=epochs,
                    batch_size=batch_size,
                    verbose=0,
                    validation_split=0.1)

    # run inference on trained spark model
    predictions = spark_model.predict(x_test)
    # run evaluation on trained spark model
    evals = spark_model.evaluate(x_test, y_test)

    # assert we can supply rdd and get same prediction results when supplying numpy array
    test_rdd = spark_context.parallelize(x_test)
    assert all(
        np.isclose(x, y, 0.01)
        for x, y in zip(predictions, spark_model.predict(test_rdd)))

    # assert we get the same prediction result with calling predict on keras model directly
    assert all(
        np.isclose(x, y, 0.01) for x, y in zip(
            predictions, spark_model.master_network.predict(x_test)))

    # assert we get the same evaluation results when calling evaluate on keras model directly
    assert isclose(evals[0],
                   spark_model.master_network.evaluate(x_test, y_test)[0],
                   abs_tol=0.01)
    assert isclose(evals[1],
                   spark_model.master_network.evaluate(x_test, y_test)[1],
                   abs_tol=0.01)
Example 17
def test_training_custom_activation(mode, spark_context):
    def custom_activation(x):
        return sigmoid(x) + 1

    model = Sequential()
    model.add(Dense(1, input_dim=1, activation=custom_activation))
    model.add(Dense(1, activation='sigmoid'))

    sgd = SGD(lr=0.1)
    model.compile(sgd, 'binary_crossentropy', ['acc'])

    x_train = np.random.rand(1000)
    y_train = np.zeros(1000)
    x_test = np.random.rand(100)
    y_test = np.zeros(100)
    y_train[:500] = 1
    rdd = to_simple_rdd(spark_context, x_train, y_train)

    spark_model = SparkModel(model, frequency='epoch', mode=mode,
                             custom_objects={'custom_activation': custom_activation})
    spark_model.fit(rdd, epochs=1, batch_size=16, verbose=0, validation_split=0.1)
    assert spark_model.predict(x_test)
    assert spark_model.evaluate(x_test, y_test)
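The sigmoid used inside custom_activation is not imported in the excerpt; a plausible header for it (an assumption, adjust to the Keras flavour actually in use) would be:

from tensorflow.keras.activations import sigmoid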
Example 18
               input_shape=(modelpara_dict['Column_num'], 1)))

    model.add(MaxPooling1D(pool_size=4))

    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))

    model.add(Dense(modelpara_dict['Lable_num'], activation='softmax'))

    print(model.summary())

    sgd = SGD(lr=0.1)
    model.compile(loss='categorical_crossentropy', optimizer=sgd)
    adagrad = elephas_optimizers.Adagrad()
    spark_model = SparkModel(sc,
                             model,
                             optimizer=adagrad,
                             frequency='epoch',
                             mode='synchronous',
                             num_workers=3)

    # Train Spark model
    spark_model.train(train_data,
                      nb_epoch=1,
                      batch_size=32,
                      verbose=2,
                      validation_split=0.1)
    spark_model.master_network.save('model/' + modelname + '/' + modelname +
                                    '.h5')
# Evaluate Spark model by evaluating the underlying model
#score = spark_model.master_network.evaluate(x_test, y_test, verbose=2)
#print('Test accuracy:', score[1])
Example 19
    print("Creating Training and Test Data")
    ((x_train, y_train), (x_test, y_test)) = train_test_split(testinput.fillna(0), testoutput.fillna(0), test_size=0.3)

    print("Training data : x")
    print(type(x_train))
    print(x_train)
    print("Training data : y")
    print(type(y_train))
    print(y_train)

    print("Test data : x")
    print(type(x_test))
    print(x_test)
    print("Test data : y")
    print(type(y_test))
    print(y_test)

    print("Converting training data to RDD")
    rddataset = to_simple_rdd(sc, x_train, y_train)

    print("Initializing SPark Model")
    sgd = elephas_optimizers.SGD()
    spark_model = SparkModel(sc, model, optimizer=sgd, frequency="epoch", mode="asynchronous", num_workers=2)

    print("Commencing training")
    spark_model.train(rddataset, nb_epoch=10, batch_size=200, verbose=1, validation_split=0)
    # model.fit(x_train, y_train, nb_epoch=5, batch_size=32)
    print("Training completed")

    sc.stop()
Example 20
#early_stopping = EarlyStopping(monitor='val_acc', patience=5)
#print 'Start training...'
#model.fit( X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch, verbose=verbose, callbacks=[checkpointer],validation_split=validation_split, shuffle=shuffle,show_accuracy=show_accuracy)

# Create Spark Context
conf = SparkConf().setAppName(MODEL)
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, X_train, Y_train)

# Initialize SparkModel from Keras model and Spark Context

rmsprop = elephas_optimizers.RMSprop()

spark_model = SparkModel(sc,\
                        model,\
                        optimizer=rmsprop,\
                        frequency='epoch',\
                        mode='asynchronous',\
                        num_workers=3)

spark_model.train(rdd,\
                    nb_epoch=nb_epoch,\
                    batch_size=batch_size,\
                    verbose=2,\
                    validation_split=validation_split)

spark_model.get_network().save_weights(MODEL_FILE_NAME)

Example 21
File: HAN.py Project: sd12832/HAN
class HAN(object):
    """
    HAN model is implemented here.
    """
    def __init__(self,
                 text,
                 labels,
                 pretrained_embedded_vector_path,
                 max_features,
                 max_senten_len,
                 max_senten_num,
                 embedding_size,
                 num_categories=None,
                 validation_split=0.2,
                 verbose=0):
        """Initialize the HAN module
        Keyword arguments:
        text -- list of the articles for training.
        labels -- labels corresponding the given `text`.
        pretrained_embedded_vector_path -- path of any pretrained vector
        max_features -- maximum number of features the embedding matrix can have. For more, check out https://keras.io/layers/embeddings/
        max_senten_len -- maximum sentence length. It is recommended not to use the absolute maximum but the value that covers the 0.95 quantile of the data.
        max_senten_num -- maximum number of sentences. It is recommended not to use the absolute maximum but the value that covers the 0.95 quantile of the data.
        embedding_size -- size of the embedding vector
        num_categories -- total number of categories.
        validation_split -- fraction of the data held out for validation.
        verbose -- verbosity level.
        """
        try:
            self.verbose = verbose
            self.max_features = max_features
            self.max_senten_len = max_senten_len
            self.max_senten_num = max_senten_num
            self.embed_size = embedding_size
            self.validation_split = validation_split
            self.embedded_dir = pretrained_embedded_vector_path
            self.text = pd.Series(text)
            self.categories = pd.Series(labels)
            self.classes = self.categories.unique().tolist()
            # Initialize default hyperparameters
            # You can change it using `set_hyperparameters` function
            self.hyperparameters = {
                'l2_regulizer': None,
                'dropout_regulizer': None,
                'rnn': LSTM,
                'rnn_units': 150,
                'dense_units': 200,
                'activation': 'softmax',
                'optimizer': 'adam',
                'metrics': ['acc'],
                'loss': 'categorical_crossentropy'
            }
            if num_categories is not None:
                assert (num_categories == len(self.classes))
            assert (self.text.shape[0] == self.categories.shape[0])
            self.data, self.labels = self.preprocessing()
            self.x_train, self.y_train, self.x_val, self.y_val = self.split_dataset(
            )
            self.embedding_index = self.add_glove_model()
            self.set_model()
        except AssertionError:
            print('Input and label data must be of same size')

        # Implement this after you have seen all the different kinds of errors
        # try:
        #     conf = SparkConf().setAppName('HANMusicClassifier').setMaster('')
        #     self.sc = SparkContext(conf=conf)
        # except Error:
        conf = SparkConf().setAppName('HANMusicClassifier')
        self.sc = SparkContext(conf=conf)

    def set_hyperparameters(self, tweaked_instances):
        """Set hyperparameters of HAN model.
        Keyword arguments:
        tweaked_instances -- dictionary of all those keys you want to change
        """
        for key, value in tweaked_instances.items():
            if key in self.hyperparameters:
                self.hyperparameters[key] = value
            else:
                raise KeyError(key + ' does not exist in hyperparameters')
            self.set_model()

    def show_hyperparameters(self):
        """To check the values of all the current hyperparameters
        """
        print('Hyperparameter\tCorresponding Value')
        for key, value in self.hyperparameters.items():
            print(key, '\t\t', value)

    def clean_string(self, string):
        """
        Tokenization/string cleaning for the dataset.
        Every string is lower cased.
        """
        string = re.sub(r"\\", "", string)
        string = re.sub(r"\'", "", string)
        string = re.sub(r"\"", "", string)
        return string.strip().lower()

    def add_dataset(self, text, labels):
        try:
            self.text = pd.concat([self.text, pd.Series(text)])
            self.categories = pd.concat([self.categories, pd.Series(labels)])
            assert (len(self.classes) == len(self.categories.unique()))
        except AssertionError:
            print("New class cannot be added in this manner")

    def preprocessing(self):
        """Preprocessing of the text to make it more resonant for training
        """
        paras = []
        labels = []
        texts = []
        for idx in range(self.text.shape[0]):
            text = self.clean_string(self.text[idx])
            texts.append(text)
            sentences = tokenize.sent_tokenize(text)
            paras.append(sentences)
        tokenizer = Tokenizer(num_words=self.max_features, oov_token=True)
        tokenizer.fit_on_texts(texts)
        data = np.zeros((len(texts), self.max_senten_num, self.max_senten_len),
                        dtype='int32')
        for i, sentences in enumerate(paras):
            for j, sent in enumerate(sentences):
                if j < self.max_senten_num:
                    wordTokens = text_to_word_sequence(sent)
                    k = 0
                    for _, word in enumerate(wordTokens):
                        if k < self.max_senten_len and word in tokenizer.word_index and tokenizer.word_index[
                                word] < self.max_features:
                            data[i, j, k] = tokenizer.word_index[word]
                            k = k + 1
        self.word_index = tokenizer.word_index
        if self.verbose == 1:
            print('Total %s unique tokens.' % len(self.word_index))
        labels = pd.get_dummies(self.categories)
        if self.verbose == 1:
            print('Shape of data tensor:', data.shape)
            print('Shape of labels tensor:', labels.shape)
        assert (len(self.classes) == labels.shape[1])
        assert (data.shape[0] == labels.shape[0])
        return data, labels

    def split_dataset(self):
        indices = np.arange(self.data.shape[0])
        np.random.shuffle(indices)
        self.data = self.data[indices]
        self.labels = self.labels.iloc[indices]
        nb_validation_samples = int(self.validation_split * self.data.shape[0])

        x_train = self.data[:-nb_validation_samples]
        y_train = self.labels[:-nb_validation_samples]
        x_val = self.data[-nb_validation_samples:]
        y_val = self.labels[-nb_validation_samples:]
        if self.verbose == 1:
            print(
                'Number of positive and negative reviews in training and validation set'
            )
            print(y_train.columns.tolist())
            print(y_train.sum(axis=0).tolist())
            print(y_val.sum(axis=0).tolist())
        return x_train, y_train, x_val, y_val

    def get_model(self):
        """
        Returns the HAN model so that it can be used as a part of pipeline
        """
        return self.model

    def add_glove_model(self):
        """
        Read and save Pretrained Embedding model
        """
        embeddings_index = {}
        try:
            f = open(self.embedded_dir)
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                assert (coefs.shape[0] == self.embed_size)
                embeddings_index[word] = coefs
            f.close()
        except OSError:
            print('Embedding file not found')
            exit()
        except AssertionError:
            print(
                "Embedding vector size does not match with given embedded size"
            )
        return embeddings_index

    def get_embedding_matrix(self):
        """
        Returns Embedding matrix
        """
        embedding_matrix = np.random.random(
            (len(self.word_index) + 1, self.embed_size))
        absent_words = 0
        for word, i in self.word_index.items():
            embedding_vector = self.embedding_index.get(word)
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                embedding_matrix[i] = embedding_vector
            else:
                absent_words += 1
        if self.verbose == 1:
            print('Total absent words are', absent_words, 'which is',
                  "%0.2f" % (absent_words * 100 / len(self.word_index)),
                  '% of total words')
        return embedding_matrix

    def get_embedding_layer(self):
        """
        Returns Embedding layer
        """
        embedding_matrix = self.get_embedding_matrix()
        return Embedding(len(self.word_index) + 1,
                         self.embed_size,
                         weights=[embedding_matrix],
                         input_length=self.max_senten_len,
                         trainable=False)

    def set_model(self):
        """
        Set the HAN model according to the given hyperparameters
        """
        if self.hyperparameters['l2_regulizer'] is None:
            kernel_regularizer = None
        else:
            kernel_regularizer = regularizers.l2(
                self.hyperparameters['l2_regulizer'])
        if self.hyperparameters['dropout_regulizer'] is None:
            dropout_regularizer = 1
        else:
            dropout_regularizer = self.hyperparameters['dropout_regulizer']
        word_input = Input(shape=(self.max_senten_len, ), dtype='float32')
        word_sequences = self.get_embedding_layer()(word_input)
        word_lstm = Bidirectional(self.hyperparameters['rnn'](
            self.hyperparameters['rnn_units'],
            return_sequences=True,
            kernel_regularizer=kernel_regularizer))(word_sequences)
        word_dense = TimeDistributed(
            Dense(self.hyperparameters['dense_units'],
                  kernel_regularizer=kernel_regularizer))(word_lstm)
        word_att = AttentionWithContext()(word_dense)
        wordEncoder = Model(word_input, word_att)

        sent_input = Input(shape=(self.max_senten_num, self.max_senten_len),
                           dtype='float32')
        sent_encoder = TimeDistributed(wordEncoder)(sent_input)
        sent_lstm = Bidirectional(self.hyperparameters['rnn'](
            self.hyperparameters['rnn_units'],
            return_sequences=True,
            kernel_regularizer=kernel_regularizer))(sent_encoder)
        sent_dense = TimeDistributed(
            Dense(self.hyperparameters['dense_units'],
                  kernel_regularizer=kernel_regularizer))(sent_lstm)
        sent_att = Dropout(dropout_regularizer)(
            AttentionWithContext()(sent_dense))
        preds = Dense(len(self.classes))(sent_att)
        self.model = Model(sent_input, preds)
        self.model.compile(loss=self.hyperparameters['loss'],
                           optimizer=self.hyperparameters['optimizer'],
                           metrics=self.hyperparameters['metrics'])
        self.spark_model = SparkModel(self.model,
                                      frequency='epoch',
                                      mode='asynchronous')

    # Currently cannot plot learning curve
    def train_model(self,
                    rdd,
                    epochs,
                    batch_size,
                    verbose=1,
                    validation_split=0.1):
        """Training the model
        rdd  -- The actual data
        epochs -- Total number of epochs
        batch_size -- size of a batch
        verbose -- Whether or not we want verbose feedback
        validation_split -- What percentage of the data from the rdd is actually used as a validation set
        """

        self.spark_model.fit(rdd,
                             epochs=epochs,
                             batch_size=batch_size,
                             verbose=verbose,
                             validation_split=validation_split)

    def predict(self, rdd):
        return self.spark_model.predict(rdd)

    def plot_results(self):
        """
        Plotting learning curve of last trained model. 
        """
        # summarize history for accuracy
        plt.subplot(211)
        plt.plot(self.history.history['acc'])
        plt.plot(self.history.history['val_acc'])
        plt.title('model accuracy')
        plt.ylabel('accuracy')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')

        # summarize history for loss
        plt.subplot(212)
        plt.plot(self.history.history['val_loss'])
        plt.plot(self.history.history['loss'])
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()
        time.sleep(10)
        plt.close()
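A hypothetical end-to-end use of the HAN class above (the documents, labels and GloVe path are placeholders; to_simple_rdd comes from elephas.utils.rdd_utils, and the labels frame produced by split_dataset is converted to a numpy array before building the RDD):

from elephas.utils.rdd_utils import to_simple_rdd

# Placeholder corpus: in practice these would be real articles and labels.
articles = ["some training document number %d ..." % i for i in range(100)]
labels = ["classA" if i % 2 == 0 else "classB" for i in range(100)]

han = HAN(text=articles,
          labels=labels,
          pretrained_embedded_vector_path="glove.6B.100d.txt",  # assumed local path
          max_features=20000,
          max_senten_len=40,
          max_senten_num=6,
          embedding_size=100,
          verbose=1)

rdd = to_simple_rdd(han.sc, han.x_train, han.y_train.values)
han.train_model(rdd, epochs=5, batch_size=16)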
Example 22
model.add(Activation('relu'))
model.add(Convolution2D(nb_filters, nb_conv, nb_conv))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool)))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adadelta')

## spark
conf = SparkConf().setAppName(APP_NAME).setMaster(MASTER_IP)
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, X_train, Y_train)

# Initialize SparkModel from Keras model and Spark context
spark_model = SparkModel(sc,model)

# Train Spark model
spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size, verbose=0, validation_split=0.1, num_workers=24)

# Evaluate Spark model by evaluating the underlying model
score = spark_model.get_network().evaluate(X_test, Y_test, show_accuracy=True, verbose=2)
print('Test accuracy:', score[1])
Example 23
# score = model.evaluate(x_test, y_test, verbose=0)
#
# print('Test score:', score[0])
# print('Test accuracy:', score[1])

# Create Spark context
conf = SparkConf().setAppName('Mnist_Spark_MLP')
# .setMaster('local[8]')
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
# lp_rdd = to_labeled_point(sc, x_train, y_train, categorical=True)
rdd = to_simple_rdd(sc, x_train, y_train)

# Train Spark model
spark_model = SparkModel(model, frequency='epoch', mode='asynchronous')

spark_model.fit(rdd,
                epochs=epochs,
                batch_size=batch_size,
                verbose=2,
                validation_split=0.1)

# Evaluate Spark model by evaluating the underlying model
score = spark_model.master_network.evaluate(x_test, y_test, verbose=2)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

model_file = 'save/mlp.h5'
import os
if not os.path.exists("save/"):
Example 24
model = Sequential()
model.add(Dense(18, input_dim=26))
model.add(Activation('sigmoid'))
model.add(Dense(6))
model.add(Activation('sigmoid'))
model.add(Dense(1))
model.add(Activation('sigmoid'))

spark = SparkSession.builder.appName('ElephasTest').getOrCreate()
rdd = to_simple_rdd(spark.sparkContext, train, y_train)

sgd = SGD(lr=0.1)
adagrad = elephas_optimizers.Adagrad()
spark_model = SparkModel(spark.sparkContext,
                         model,
                         optimizer=adagrad,
                         frequency='epoch',
                         mode='asynchronous',
                         master_loss='mse',
                         num_workers=2, master_optimizer=sgd)

# Train Spark model
spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size, verbose=2, validation_split=0.1)

# Evaluate Spark model by evaluating the underlying model
score = spark_model.master_network.evaluate(test.values, y_test, verbose=2)
print('Test accuracy:', score[1])
print(spark_model.predict(test.values))
print(y_test)
Example 25
    .getOrCreate())

sc = spark.sparkContext
"""### Training Model"""

from elephas.spark_model import SparkModel
from elephas.utils.rdd_utils import to_simple_rdd
# Compile the model.
model_9.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])
# Build RDD from features and labels.
rdd = to_simple_rdd(sc, x_train, y_train)
# Initialize SparkModel from Keras model and Spark context.
spark_model = SparkModel(model_9,
                         frequency='epoch',
                         mode='asynchronous',
                         num_workers=3)
# Train the Spark model.
spark_model.fit(rdd, epochs=10, batch_size=32, verbose=1, validation_split=0.1)

score = spark_model.master_network.evaluate(x_test, y_test, verbose=1)
print('Test accuracy:', score)
"""### Predcit and evaluate Model"""
"""### Save Model"""

import json
# let's assume 'model_9' is the main model
model_json = model_9.to_json()
with open("model_in_json.json", "w") as json_file:
    json.dump(model_json, json_file)
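To make the saved architecture reusable, the weights need to be stored as well; a hedged sketch of the full round trip (the weights file name is an assumption):

model_9.save_weights("model_weights.h5")

# Later: rebuild the network from the JSON file and reload the weights.
from tensorflow.keras.models import model_from_json

with open("model_in_json.json", "r") as json_file:
    loaded_json = json.load(json_file)
restored_model = model_from_json(loaded_json)
restored_model.load_weights("model_weights.h5")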
Example 26
model = Sequential()
model.add(Dense(784, 128))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(128, 128))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(128, 10))
model.add(Activation('softmax'))

# Compile model
rms = RMSprop()
model.compile(loss='categorical_crossentropy', optimizer=rms)

# Create Spark context
conf = SparkConf().setAppName('Mnist_Spark_MLP').setMaster('local[8]')
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, X_train, Y_train)

# Initialize SparkModel from Keras model and Spark context
spark_model = SparkModel(sc,model)

# Train Spark model
spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size, verbose=0, validation_split=0.1, num_workers=8)

# Evaluate Spark model by evaluating the underlying model
score = spark_model.get_network().evaluate(X_test, Y_test, show_accuracy=True, verbose=2)
print('Test accuracy:', score[1])
import tensorflow as tf
(x_train, y_train), (x_test, y_test) = mnist.load_data()
print("length =  ( ", len(x_train), ", ", len(y_train), " )")
print("shape of the dataset = ", tf.shape(y_train))

x_train = x_train.reshape(60000, 784)
x_test = x_test.reshape(10000, 784)
x_train = x_train.astype("float32")
x_test = x_test.astype("float32")
x_train /= 255
x_test /= 255
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

nb_classes = 10
# Convert class vectors to binary class matrices
y_train = to_categorical(y_train, nb_classes)
y_test = to_categorical(y_test, nb_classes)

rdd = to_simple_rdd(sc, x_train, y_train)
print("rdd = ", rdd)

from elephas.spark_model import SparkModel
spark_model = SparkModel(model,
                         frequency='epoch',
                         mode='asynchronous',
                         num_workers=2)
spark_model.fit(rdd, epochs=10, batch_size=32, verbose=0, validation_split=0.1)
score = spark_model.evaluate(x_test, y_test, verbose=2)
print('Test accuracy:', score)
    print("Test size: %d" % test_data.count())

    # create model object
    model = Sequential()
    model.add(LSTM(128, activation="sigmoid", input_shape=(1, input_dim)))
    model.add(Dropout(0.2))
    model.add(Dense(1))

    metrics = ['MeanSquaredError', 'MeanAbsoluteError']
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=metrics)
    print(model.summary())

    rdd = train_data.rdd.map(lambda x:
                             (x[0].toArray().reshape(1, len(x[0])), x[1]))
    spark_model = SparkModel(model,
                             frequency='epoch',
                             mode='synchronous',
                             metrics=metrics)
    start = time()
    spark_model.fit(rdd,
                    epochs=1,
                    batch_size=64,
                    verbose=0,
                    validation_split=0.1)
    fit_dt = time() - start
    print(f"Fit took: {fit_dt}")

    x_test = test_data.toPandas()['features']
    x_test = np.asarray(test_data.rdd.map(lambda x: x[0].toArray()).collect())
    x_test = x_test.reshape((x_test.shape[0], 1, x_test.shape[1]))
    y_test = test_data.toPandas()["Weighted_Price"].to_numpy()
    y_test = y_test.reshape((len(y_test), 1, 1))
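The snippet stops after shaping the hold-out arrays; a plausible next step (a sketch, reusing the metrics list defined above) is to score the trained master network on them:

    # Sketch: evaluate the underlying Keras network on the reshaped test data.
    score = spark_model.master_network.evaluate(x_test,
                                                y_test.reshape(len(y_test), 1),
                                                verbose=0)
    print("Test MSE / MAE:", score[1:])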
Example 29
class SparseGate(ModelFrame):
    def __init__(self, x_train, y_train, x_test, y_test, inputs,
                 spark_context):
        ModelFrame.__init__(self, x_train, y_train, x_test, y_test,
                            spark_context)
        self.gateModel = None
        self.inputs = inputs

    def gating_network(self):
        c1 = Conv2D(32, (3, 3),
                    padding='same',
                    kernel_regularizer=regularizers.l2(weight_decay),
                    input_shape=self.x_train.shape[1:],
                    name='gate1')(self.inputs)
        c2 = Activation('elu', name='gate2')(c1)
        c3 = BatchNormalization(name='gate3')(c2)
        c4 = Conv2D(32, (3, 3),
                    padding='same',
                    kernel_regularizer=regularizers.l2(weight_decay),
                    name='gate4')(c3)
        c5 = Activation('elu', name='gate5')(c4)
        c6 = BatchNormalization(name='gate6')(c5)
        c7 = MaxPooling2D(pool_size=(2, 2), name='gate7')(c6)
        c8 = Dropout(0.2, name='gate26')(c7)
        c9 = Conv2D(32 * 2, (3, 3),
                    name='gate8',
                    padding='same',
                    kernel_regularizer=regularizers.l2(weight_decay))(c8)
        c10 = Activation('elu', name='gate9')(c9)
        c11 = BatchNormalization(name='gate25')(c10)
        c12 = Conv2D(32 * 2, (3, 3),
                     name='gate10',
                     padding='same',
                     kernel_regularizer=regularizers.l2(weight_decay))(c11)
        c13 = Activation('elu', name='gate11')(c12)
        c14 = BatchNormalization(name='gate12')(c13)
        c15 = MaxPooling2D(pool_size=(2, 2), name='gate13')(c14)
        c16 = Dropout(0.3, name='gate14')(c15)

        c25 = Flatten(name='gate23')(c16)
        c26 = Dense(5, name='gate24', activation='elu')(c25)

        model = Model(inputs=self.inputs, outputs=c26)
        return model

    def create_gate_model(self, expert_models):
        gate_network = self.gating_network()
        merged = Lambda(lambda x: K.tf.transpose(
            sum(
                K.tf.transpose(x[i]) * x[0][:, i - 1] for i in range(
                    1, len(x)))))([gate_network.layers[-1].output] +
                                  [m.layers[-1].output for m in expert_models])
        b = Activation('softmax', name='gatex')(merged)
        model = Model(inputs=self.inputs, outputs=b)
        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(),
                      metrics=['accuracy'])
        return model

    def train_gate(self, datagen, weights_file):
        model = self.gateModel
        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(),
                      metrics=['accuracy'])
        print(model.summary())
        self.gateModel = SparkModel(model,
                                    frequency='epoch',
                                    mode='asynchronous')
        score = self.gateModel.master_network.evaluate(self.x_test,
                                                       self.y_test,
                                                       verbose=2,
                                                       batch_size=50)
        self.gateModel.fit(self.rdd, epochs=1, batch_size=50, verbose=1)
        self.gateModel = self.gateModel.master_network
        self.gateModel.save_weights(weights_file + '.hdf5')

        file = '../lib/output.txt'
        if os.path.exists(file):
            append_write = 'a'
        else:
            append_write = 'w'

        #score = self.gateModel.evaluate(self.x_test, self.y_test, verbose=2, batch_size=50)
        print("------------------------------")
        print("Score is:" + str(score[1]))
        print("-------------------------------")
        text_file = open(file, append_write)
        text_file.write("Score: %s" % score[1])
        text_file.close()

    def load_gate_weights(self,
                          model_old,
                          weights_file='../lib/weights/moe_full.hdf5'):
        model_old.load_weights(weights_file)
        for l in self.gateModel.layers:
            for b in model_old.layers:
                if (l.name == b.name):
                    l.set_weights(b.get_weights())
                    print("loaded gate layer " + str(l.name))
early_stop = EarlyStopping(monitor='val_loss', patience=4, verbose=1)

model = Sequential()
model.add(Dense(32, input_shape=(239, ), activation='tanh'))
model.add(Dense(1))

opt = Adam(lr=0.001)
model.compile(loss='mean_squared_error', optimizer=opt)

model.summary()

from elephas.utils.rdd_utils import to_simple_rdd

rdd = train_data.rdd

from elephas.spark_model import SparkModel
from elephas.optimizers import Adam

spark_model = SparkModel(model,
                         frequency='epoch',
                         mode='synchronous',
                         num_workers=4,
                         elephas_optimizer=Adam())
spark_model.fit(rdd,
                epochs=20,
                batch_size=500,
                verbose=1,
                validation_split=0.1)

Example 31
from elephas.mllib.adapter import to_vector, from_vector

from pyspark import SparkContext, SparkConf

# Create Spark context
conf = SparkConf().setAppName('LSTM_Spark_MLP')
sc = SparkContext(conf=conf)


def deal_x(x):
    x = np.array(x)
    x_data = x[1:]
    x_data = np.expand_dims(x_data, axis=2)
    return x_data


test_data = sc.textFile("output/data/z2.csv").map(
    lambda ln: deal_x([float(x) for x in ln.split(',')]))

model = load_model('model.h5')
adagrad = elephas_optimizers.Adagrad()
spark_model = SparkModel(sc,
                         model,
                         optimizer=adagrad,
                         frequency='epoch',
                         mode='synchronous',
                         num_workers=3)

# Test Spark model
spark_model.predict_classes(test_data, "output/data/prediction")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, x_train, y_train)
# Epoch Before Check Point
num_epoch_in_one_step = 10
batch_size = 100
# Accuracy records
stat_lines = []
adagrad = elephas_optimizers.Adagrad()
for i in range(0, 200):
    # Train Spark model
    # Initialize SparkModel from Keras model and Spark context
    spark_model = SparkModel(sc,
                             model,
                             mode='asynchronous',
                             frequency='epoch',
                             num_workers=1,
                             optimizer=adagrad)
    spark_model.train(rdd,
                      nb_epoch=num_epoch_in_one_step,
                      batch_size=batch_size,
                      verbose=0,
                      validation_split=0.1)
    score1 = model.evaluate(x_train, y_train, verbose=0)
    score2 = model.evaluate(x_test, y_test, verbose=0)
    print('#############################')
    print('Finished epochs', (i + 1) * num_epoch_in_one_step)
    print('Train accuracy:', score1[1])
    print('Test accuracy:', score2[1])
    print('#############################')
    stat_lines.append(
Example 33
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(10))
model.add(Activation('softmax'))

sgd = SGD(lr=0.1)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, x_train, y_train)

# Initialize SparkModel from Keras model and Spark context
adagrad = elephas_optimizers.Adagrad()
spark_model = SparkModel(sc,
                         model,
                         optimizer=adagrad,
                         frequency='epoch',
                         mode='asynchronous',
                         num_workers=2,
                         master_optimizer=sgd)

# Train Spark model
spark_model.train(rdd,
                  nb_epoch=nb_epoch,
                  batch_size=batch_size,
                  verbose=2,
                  validation_split=0.1)

# Evaluate Spark model by evaluating the underlying model
score = spark_model.master_network.evaluate(x_test, y_test, verbose=2)
print('Test accuracy:', score[1])
Example n. 34
model.add(Dropout(0.2))
model.add(Dense(10))
model.add(Activation('softmax'))

# Compile model
sgd = SGD(lr=0.1)
model.compile(loss='categorical_crossentropy', optimizer=sgd)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, x_train, y_train)

# Initialize SparkModel from Keras model and Spark context
adagrad = elephas_optimizers.Adagrad()
spark_model = SparkModel(sc,
                         model,
                         optimizer=adagrad,
                         frequency='epoch',
                         mode='asynchronous',
                         num_workers=2)

# Train Spark model
spark_model.train(rdd,
                  nb_epoch=nb_epoch,
                  batch_size=batch_size,
                  verbose=2,
                  validation_split=0.1)

# Evaluate Spark model by evaluating the underlying model
score = spark_model.master_network.evaluate(x_test,
                                            y_test,
                                            show_accuracy=True,
                                            verbose=2)
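With show_accuracy=True the old Keras evaluate call above returns a (loss, accuracy) pair, so the held-out accuracy can be printed the same way the other examples do (a one-line sketch, not part of the original snippet):

print('Test accuracy:', score[1])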
class KerasNeuralNetworkSpark(object):
    def __init__(self, layers, spark, batch_size=64, epoch=10, num_workers=2, predictionCol='prediction',
                 labelCol='target', featuresCol='feature'):
        self._batch_size = batch_size
        self._epoch = epoch
        self._model = None
        self._spark = spark
        self._labels = labelCol
        self._features = featuresCol
        self._prediction = predictionCol
        self._layers = layers
        self._worker_num = num_workers
        self._build_model()

    def _build_model(self):
        model = Sequential()
        adam = elephas_optimizers.Adam()
        layers = self._layers
        model.add(Dense(layers[1], input_dim=layers[0], init='normal', activation='relu'))
        for i in range(2, len(layers) - 1):
            model.add(Dense(layers[i], activation='relu'))

        model.add(Dense(layers[-1], activation='sigmoid'))
        self._model = SparkModel(self._spark.sparkContext, model,
                                 optimizer=adam,
                                 frequency='epoch',
                                 mode='asynchronous',
                                 master_loss='mse',
                                 num_workers=self._worker_num)

    def fit(self, df):
        # Terminate any parameter server left over from a previous fit before retraining.
        if hasattr(self._model, 'server'):
            self._model.server.terminate()
        # Collect the DataFrame on the driver and rebuild the (features, label) RDD elephas expects.
        pdf = df.toPandas()

        rdd = to_simple_rdd(self._spark.sparkContext, pdf[self._features], pdf[self._labels])
        self._model.train(rdd, nb_epoch=self._epoch, batch_size=self._batch_size,
                          verbose=0, validation_split=0.1)

    def transform(self, df):
        pdf = df.toPandas()
        # df.write.save('test_df.parquet')
        pnparray = pdf[self._features].values
        container = np.zeros((pnparray.shape[0], len(pnparray[0])))
        for i in range(pnparray.shape[0]):
            container[i, :] = pnparray[i][:]
        result = self._model.predict(container)

        pdf[self._prediction] = result

        # import pickle
        # with open('ann_result.p', 'w') as f:
        #     pickle.dump(result, f)

        # result_df = pd.DataFrame(pdf
        new_df = self._spark.createDataFrame(pdf)
        # df.join(new_df)
        return new_df

    def stop_server(self):
        if hasattr(self._model, 'server') and hasattr(self._model.server, 'terminate'):
            self._model.server.terminate()
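
A minimal usage sketch for the wrapper above, assuming an existing SparkSession named spark and DataFrames whose feature/target columns match the class defaults; the layer sizes and DataFrame names are illustrative assumptions:

# Hypothetical usage of KerasNeuralNetworkSpark; layer sizes and DataFrames are
# assumptions for illustration, not part of the original snippet.
nn = KerasNeuralNetworkSpark(layers=[20, 16, 8, 1], spark=spark,
                             batch_size=64, epoch=10, num_workers=2)
nn.fit(train_df)                     # distributed training via elephas
scored_df = nn.transform(test_df)    # returns a DataFrame with a 'prediction' column
nn.stop_server()                     # shut down the parameter server when finished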
                                  # output signal. Here the activation function is ReLU.
model.add(Activation('relu'))
model.add(Dropout(0.5))           # dropout is then applied 

# finally the 128 outputs of the previous FC layer are fully connected to num_classes of neurons, which 
# is activated by a softmax function
model.add(Dense(nb_classes, W_regularizer=l2(0.01)))
model.add(Activation('softmax'))
# write the neural network model representation to a png image
#grapher.plot(model, 'nn_mnist.png')

model.compile(loss='categorical_crossentropy', optimizer='adadelta')
# model.compile(loss='categorical_crossentropy', optimizer='sgd' or 'adam' or 'adadelta')

## spark
conf = SparkConf().setAppName(APP_NAME) #.setMaster(MASTER_IP)
sc = SparkContext(conf=conf)

# Build RDD from numpy features and labels
rdd = to_simple_rdd(sc, X_train, Y_train)

# Initialize SparkModel from Keras model and Spark context
spark_model = SparkModel(sc, model)

# Train Spark model
spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size, verbose=1, validation_split=0.15)  # num_workers may not be supported in early versions

# Evaluate Spark model by evaluating the underlying model
score = spark_model.get_network().evaluate(X_test, Y_test, show_accuracy=True, verbose=2)
print('Test accuracy:', score[1])
Example n. 37
    print('Test data : x')
    print(type(x_test))
    print(x_test)
    print('Test data : y')
    print(type(y_test))
    print(y_test)

    print('Converting training data to RDD')
    rddataset = to_simple_rdd(sc, x_train, y_train)

    print('Initializing Spark Model')
    sgd = elephas_optimizers.SGD()
    spark_model = SparkModel(sc,
                             model,
                             optimizer=sgd,
                             frequency='epoch',
                             mode='asynchronous',
                             num_workers=2)

    print('Commencing training')
    spark_model.train(rddataset,
                      nb_epoch=10,
                      batch_size=200,
                      verbose=1,
                      validation_split=0)
    #model.fit(x_train, y_train, nb_epoch=5, batch_size=32)
    print('Training completed')
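    # Hypothetical follow-up (not in the original): evaluate the trained master
    # network on the held-out data shown above, as the other examples here do.
    score = spark_model.master_network.evaluate(x_test, y_test, verbose=2)
    print('Evaluation score:', score)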

    sc.stop()
# Define elephas optimizer (which tells the model how to aggregate updates on the Spark master)
adadelta = elephas_optimizers.Adadelta()

from elephas.utils.rdd_utils import to_labeled_point
from elephas.utils.rdd_utils import to_simple_rdd
lp_rdd = to_simple_rdd(sc, features_train, labels_train)

#print(lp_rdd.take(5))

from elephas.spark_model import SparkModel
from elephas import optimizers as elephas_optimizers

adagrad = elephas_optimizers.Adagrad()
spark_model = SparkModel(sc,
                         model,
                         optimizer=adagrad,
                         frequency='epoch',
                         mode='asynchronous',
                         num_workers=8)
spark_model.train(lp_rdd,
                  nb_epoch=20,
                  batch_size=32,
                  verbose=0,
                  validation_split=0.1)

print(spark_model)

prediction = spark_model.predict_classes(features_test)
print(prediction)
truth = [l[1] for l in labels_test]

from sklearn.metrics import confusion_matrix
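The snippet stops at the import; a one-line continuation, assuming the truth and prediction arrays built above, would tabulate the results (a sketch, not part of the original):

print(confusion_matrix(truth, prediction))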
Example n. 39
        model_config.input_split_index = encoder_train_input.shape[1]
        training_generator = WMTSequence(encoder_train_input, decoder_train_input, decoder_train_target, model_config)

        for raw_train_input, decoder_train_target in training_generator:
            encoder_train_input, decoder_train_input = raw_train_input
            train_input = np.hstack((encoder_train_input, decoder_train_input))
            train_rdd = to_simple_rdd(sc, train_input, decoder_train_target)

            if args.ensemble:
                model = DistributedEnsembleSeq2Seq(model_config)
            else:
                model = DistributedSeq2Seq(model_config)

            spark_model = SparkModel(model.model,
                                     frequency='epoch',
                                     mode='synchronous',
                                     batch_size=args.batch_size,
                                     custom_objects={'EncoderSlice': EncoderSlice, 'DecoderSlice': DecoderSlice})

            spark_model.fit(train_rdd,
                            batch_size=model_config.batch_size,
                            epochs=model_config.epochs,
                            validation_split=0.0,
                            verbose=1)

        model.evaluate(encoder_test_input, raw_test_target)

    else:
        training_generator = WMTSequence(encoder_train_input, decoder_train_input, decoder_train_target, model_config)
        validation_generator = WMTSequence(encoder_dev_input, decoder_dev_input, decoder_dev_target, model_config)
Example n. 40
model.compile(loss='categorical_crossentropy', optimizer=SGD())
model.summary()


# Create a Resilient Distributed Dataset (RDD) from training data

# TODO: get data
# TODO: is it possible to separate training data into multiple batches? (see the note after the RDD below)

rdd = to_simple_rdd(sc, X_train, Y_train)
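# Note on the TODO above (an assumption about elephas internals, not original code):
# to_simple_rdd parallelizes the (features, label) pairs, and elephas repartitions the
# RDD across num_workers at training time, so each worker already trains on its own
# shard; batch_size below only sets the per-worker Keras mini-batch size. An explicit
# split is also possible, e.g. rdd = rdd.repartition(WORKERS).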


# Create the Elephas model instance
spark_model = SparkModel(sc,
                         model,
                         optimizer = elephas_optimizers.Adagrad(),
                         frequency = 'epoch',
                         mode = 'asynchronous',
                         num_workers = WORKERS
                         )

# Train model
spark_model.train(rdd,
                  nb_epoch = EPOCHS,
                  batch_size = BATCH_SIZE,
                  verbose = False,
                  validation_split = VAL_SPLIT,
                  num_workers = WORKERS
                  )
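
The example ends after training; following the evaluation pattern used throughout these snippets, a hedged follow-up (assuming held-out X_test and Y_test arrays exist) might look like:

# Hypothetical follow-up: evaluate the trained weights on the master network.
score = spark_model.master_network.evaluate(X_test, Y_test, verbose=2)
print('Test loss:', score)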