def predict(filename):
    raw_text = getText(filename)
    data = getDataset(raw_text)
    model = defineModel(data)
    loadWeights(model)
    generateText(model, raw_text)
def main(toolName, datasetName):
    tool = getToolObject(toolName)

    # Load the dataset [used as data in the Shapley plot]
    sequences = getDataset(datasetName, toolName)
    print("Loaded the dataset " + datasetName + " of " + str(len(sequences)) + " data points.")

    # Calculate the features for the dataset
    print("Calculating features for the dataset. [Might take a while for wu-crispr]")
    feature_set = []
    cnt = 1
    for seq in sequences:
        features = tool.getFeatures(seq)
        feature_set.append(features)
        if toolName == 'wu-crispr' and cnt % 100 == 0:
            # Inform on progress [wu-crispr takes a while]
            print("-- Calculated features for " + str(cnt))
        cnt = cnt + 1
    print("Calculated the features for this dataset.")

    # Get feature names [for printing in the Shapley plot]
    feature_names = tool.loadFeatureNames()
    print("Loaded the names of all " + str(len(feature_names)) + " features.")

    # Put together features with names in one dataframe
    dataset_df = pd.DataFrame(np.array(feature_set), columns=feature_names)

    # Training set (must be loaded before the model)
    train_df = tool.loadTrainingSet()
    print("Loaded the training set used for tool " + toolName + ", size: " + str(train_df.shape) + ".")

    # Load the model of the tool
    model = tool.loadModel()
    print("Loaded the model for tool " + toolName + ".")

    # Summarize the training set and subsample the test set
    summary_train_df = shap.kmeans(train_df, 2)
    dataset_sub_df = dataset_df  # optionally use dataset_df.sample(400) to speed things up

    # Compute and plot Shapley values
    shap_explainer = shap.KernelExplainer(model.predict, summary_train_df)
    print(dataset_sub_df)
    shap_values = shap_explainer.shap_values(dataset_sub_df)

    # Save the values
    with open("../results/SHAP-" + toolName + "-" + datasetName, 'wb') as file:
        pickle.dump(shap_values, file)     # the SHAP values
        pickle.dump(dataset_sub_df, file)  # the data used
    print("Computed and saved SHAP values.")

    # Plotting
    shap.summary_plot(shap_values, dataset_sub_df)
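# A minimal sketch of reading the saved results back. The file above holds two
# objects pickled in sequence, so they must be unpickled in the same order they
# were written. The toolName/datasetName values here are hypothetical
# placeholders, not names taken from the project.
import pickle

with open("../results/SHAP-wu-crispr-example_dataset", 'rb') as file:
    shap_values = pickle.load(file)     # the SHAP values, written first
    dataset_sub_df = pickle.load(file)  # the data used, written second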
def __init__(self, dataSetPath):
    self.action_space = gym.spaces.Discrete(3)
    self.observation_space = gym.spaces.Box(low=np.array([0.0]), high=np.array([1.0]))
    self.action = ["hold", "buy", "sell"]
    self.counter = 0
    self.data = getDataset(dataSetPath)
    self.dataLength = len(self.data)
    self.inventory = []
    self.total_profit = 0
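# A minimal sketch of driving this trading environment with a random agent.
# It assumes the class is a gym.Env subclass named TradingEnv that also defines
# the classic-gym reset()/step() methods (not shown above), and that "data.csv"
# exists; all of these names are placeholders, not part of the original code.
env = TradingEnv("data.csv")
obs = env.reset()
done = False
while not done:
    action = env.action_space.sample()  # pick hold/buy/sell at random
    obs, reward, done, info = env.step(action)
print("total profit:", env.total_profit)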
def marvin_model_test():
    """
    Tests the SVM Marvin hotword detector
    :return: None
    """
    # Download data
    downloadData(data_path="/input/speech_commands/")

    # Get dictionary with files and labels
    dataDict = getDataDict(data_path="/input/speech_commands/")

    # Obtain dataframe by merging dev and test datasets
    devDF = getDataframe(dataDict["dev"], include_unknown=True)
    testDF = getDataframe(dataDict["test"], include_unknown=True)
    evalDF = pd.concat([devDF, testDF], ignore_index=True)
    print("Test files: {}".format(evalDF.shape[0]))

    # Obtain Marvin - Other separated data
    evalDF["class"] = evalDF.apply(lambda row: 1 if row["category"] == "marvin" else -1, axis=1)
    evalDF = evalDF.drop("category", axis=1)  # assign the result; drop() is not in-place by default
    test_true_labels = evalDF["class"].tolist()
    eval_data, _ = getDataset(df=evalDF, batch_size=BATCH_SIZE, cache_file="kws_val_cache", shuffle=False)

    # Load trained model
    model = load_model("../models/marvin_kws.h5")
    layer_name = "features256"
    feature_extractor = Model(inputs=model.input, outputs=model.get_layer(layer_name).output)

    # Load trained PCA object
    with open("../models/marvin_kws_pca.pickle", "rb") as file:
        pca = pickle.load(file)

    # Load trained SVM
    with open("../models/marvin_kws_svm.pickle", "rb") as file:
        marvin_svm = pickle.load(file)

    # Extract the feature embeddings and evaluate using the SVM
    X_test = feature_extractor.predict(eval_data, use_multiprocessing=True)
    X_test_scaled = pca.transform(X_test)
    test_pred_labels = marvin_svm.predict(X_test_scaled)
    OC_Statistics(test_pred_labels, test_true_labels, "marvin_cm_without_noise")
def main(job_id, params):
    from utils import getModel
    from utils import getDataset

    logger = get_module_logger(__name__)
    logger.info('Starting job with id: %d' % job_id)

    logger.info('Fetching dataset %s' % params['dataset'])
    dataset = getDataset(params['dataset'])()

    logger.info('Using model %s' % params['experiment_name'])
    simulation = getModel(params['experiment_name'])(**params)

    logger.info('Starting training ...')
    score = simulation.run(dataset)
    return score
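# A minimal usage sketch: getDataset/getModel resolve names via utils and
# return factories, so main() needs a job id plus a params dict carrying at
# least the 'dataset' and 'experiment_name' keys. The concrete values below
# are hypothetical placeholders, not names taken from the project.
params = {
    'dataset': 'example_dataset',        # resolved by utils.getDataset
    'experiment_name': 'example_model',  # resolved by utils.getModel
}
score = main(job_id=1, params=params)
print('score:', score)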
def train(filename):
    raw_text = getText(filename)
    data = getDataset(raw_text)
    model = defineModel(data)
    fitModel(model, data)
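# A minimal sketch of the intended train-then-generate flow, combining this
# train() with the predict() defined above: fitModel() presumably saves the
# weights that loadWeights() later restores. "corpus.txt" is a hypothetical
# input file, not one referenced by the original code.
train("corpus.txt")    # build the dataset, fit the model, save weights
predict("corpus.txt")  # rebuild the model, load weights, generate text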
def model_train():
    """
    Trains the model which is used as a feature extractor
    :return: None
    """
    # Download data
    downloadData(data_path="/input/speech_commands/")

    # Get data dictionary
    dataDict = getDataDict(data_path="/input/speech_commands/")

    # Obtain dataframe for each dataset
    trainDF = getDataframe(dataDict["train"])
    valDF = getDataframe(dataDict["val"])
    devDF = getDataframe(dataDict["dev"])
    testDF = getDataframe(dataDict["test"])

    print("Dataset statistics")
    print("Train files: {}".format(trainDF.shape[0]))
    print("Validation files: {}".format(valDF.shape[0]))
    print("Dev test files: {}".format(devDF.shape[0]))
    print("Test files: {}".format(testDF.shape[0]))

    # Use the TF Data API for efficient data input
    train_data, train_steps = getDataset(df=trainDF, batch_size=BATCH_SIZE, cache_file="train_cache", shuffle=True)
    val_data, val_steps = getDataset(df=valDF, batch_size=BATCH_SIZE, cache_file="val_cache", shuffle=False)

    model = create_model()
    model.summary()

    # Stop training if the validation loss doesn't improve
    earlyStopping = EarlyStopping(monitor="val_loss", patience=PATIENCE, verbose=1)

    # Reduce LR on validation loss plateau
    reduceLR = ReduceLROnPlateau(monitor="val_loss", patience=PATIENCE, verbose=1)

    # Compile the model
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=Adam(learning_rate=LEARNING_RATE),
        metrics=["sparse_categorical_accuracy"],
    )

    # Train the model
    history = model.fit(
        train_data.repeat(),
        steps_per_epoch=train_steps,
        validation_data=val_data.repeat(),
        validation_steps=val_steps,
        epochs=EPOCHS,
        callbacks=[earlyStopping, reduceLR],
    )

    # Save model
    print("Saving model")
    model.save("../models/marvin_kws.h5")

    # Save history data
    print("Saving training history")
    with open("../models/marvin_kws_history.pickle", "wb") as file:
        pickle.dump(history.history, file, protocol=pickle.HIGHEST_PROTOCOL)

    plot_history(history=history)
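# A minimal sketch of reloading the saved history for later inspection. Given
# the compile() call above, history.history should contain 'loss',
# 'sparse_categorical_accuracy', and their 'val_'-prefixed counterparts
# (ReduceLROnPlateau typically adds 'lr' as well); treat the exact keys as an
# assumption until inspected.
import pickle

with open("../models/marvin_kws_history.pickle", "rb") as file:
    history_dict = pickle.load(file)
print(history_dict.keys())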
def marvin_kws_model():
    """
    Trains a One-Class SVM for hotword detection
    :return: None
    """
    # Download data
    downloadData(data_path="/input/speech_commands/")

    # Get dictionary with files and labels
    dataDict = getDataDict(data_path="/input/speech_commands/")

    # Obtain dataframe for each dataset
    trainDF = getDataframe(dataDict["train"])
    valDF = getDataframe(dataDict["val"])

    # Obtain Marvin data from training data
    marvin_data, _ = getDataset(
        df=trainDF.loc[trainDF["category"] == "marvin", :],
        batch_size=BATCH_SIZE,
        cache_file="kws_marvin_cache",
        shuffle=False,
    )

    # Obtain Marvin - Other separated data from validation data
    valDF["class"] = valDF.apply(lambda row: 1 if row["category"] == "marvin" else -1, axis=1)
    valDF = valDF.drop("category", axis=1)  # assign the result; drop() is not in-place by default
    val_true_labels = valDF["class"].tolist()
    val_data, _ = getDataset(df=valDF, batch_size=BATCH_SIZE, cache_file="kws_val_cache", shuffle=False)

    # Load model and create feature extractor
    model = load_model("../models/marvin_kws.h5")
    layer_name = "features256"
    feature_extractor = Model(inputs=model.input, outputs=model.get_layer(layer_name).output)

    # Obtain the feature embeddings
    X_train = feature_extractor.predict(marvin_data, use_multiprocessing=True)
    X_val = feature_extractor.predict(val_data, use_multiprocessing=True)

    # Apply PCA to reduce dimensionality
    pca = PCA(n_components=32)
    pca.fit(X_train)
    print("Variance captured = ", sum(pca.explained_variance_ratio_))
    X_train_transformed = pca.transform(X_train)
    X_val_transformed = pca.transform(X_val)

    # SVM hyper-parameter tuning using a Gaussian process
    marvin_svm = svm.OneClassSVM()
    svm_space = [
        Real(10**-5, 10**0, "log-uniform", name="gamma"),
        Real(10**-5, 10**0, "log-uniform", name="nu"),
    ]

    @use_named_args(svm_space)
    def svm_objective(**params):
        marvin_svm.set_params(**params)
        marvin_svm.fit(X_train_transformed)
        val_pred_labels = marvin_svm.predict(X_val_transformed)
        score = f1_score(val_true_labels, val_pred_labels)  # (y_true, y_pred) order
        return -1 * score

    res_gp_svm = gp_minimize(func=svm_objective, dimensions=svm_space, n_calls=100, n_jobs=-1, verbose=False, random_state=1)
    print("Best F1 score={:.4f}".format(-res_gp_svm.fun))
    plot_convergence(res_gp_svm)
    plt.savefig("../docs/results/marvin_svm.png", dpi=300)
    plt.show()

    # Instantiate an SVM with the optimal hyper-parameters
    best_params_svm = {k.name: x for k, x in zip(svm_space, res_gp_svm.x)}
    marvin_kws = svm.OneClassSVM()
    marvin_kws.set_params(**best_params_svm)
    marvin_kws.fit(X_train_transformed)

    # Performance on the validation set
    val_pred_labels = marvin_kws.predict(X_val_transformed)
    OC_Statistics(val_pred_labels, val_true_labels, "marvin_cm_training")

    print("Saving PCA object")
    with open("../models/marvin_kws_pca.pickle", "wb") as file:
        pickle.dump(pca, file, protocol=pickle.HIGHEST_PROTOCOL)

    print("Saving Marvin SVM")
    with open("../models/marvin_kws_svm.pickle", "wb") as file:
        # Save the SVM refit with the best hyper-parameters, not the tuning
        # instance, which is left fitted with the last-evaluated parameters
        pickle.dump(marvin_kws, file, protocol=pickle.HIGHEST_PROTOCOL)
evalMode = args.eval
verbose_eval = args.verbose_eval
save_model = args.save_model == 1
# save_model = False
# filterMode = args.filter
# pre = "test_bpr-he_d10.last.h5"
# num_negatives = 1
topK = 100 if evalMode == "all" else 10
evaluation_threads = 1

# Loading data
t1 = time()
dataset = getDataset(data, path, evalMode)
train, trainSeq, df, testRatings, testNegatives = (
    dataset.trainMatrix,
    dataset.trainSeq,
    dataset.df,
    dataset.testRatings,
    dataset.testNegatives,
)
uNum, iNum = df.uid.max() + 1, df.iid.max() + 1
# uNum = max(uNum, len(testRatings))

stat = "Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d" % (
    time() - t1, uNum, iNum, len(df), len(testRatings)
)
# User and item indices start at 1, not 0, so the exact number of users and items is num - 1

# Initialise model
if modelName == "mf":
    ranker = MatrixFactorization(uNum, iNum, dim)
elif modelName == "bpr":
    ranker = BPR(uNum, iNum, dim)