def predict(filename):
    raw_text = getText(filename)
    data = getDataset(raw_text)
    model = defineModel(data)

    loadWeights(model)
    generateText(model, raw_text)
Example #2
def main(toolName, datasetName):

    tool = getToolObject(toolName)

    #load the dataset [used as data in shapley plot]
    sequences = getDataset(datasetName, toolName)
    print("Loaded the dataset " + datasetName + " of " + str(len(sequences)) +
          " data points.")

    #Calculate the features for the dataset
    print(
        "Calculating features for the dataset. [Might take a while for wu-crispr]"
    )
    feature_set = []
    cnt = 1
    for seq in sequences:
        features = tool.getFeatures(seq)
        feature_set.append(features)
        if toolName == 'wu-crispr' and cnt % 100 == 0:  #inform on progress [wu-crispr takes a while]
            print("-- Calculated features for " + str(cnt))
        cnt = cnt + 1
    print("Calculated the features for this dataset.")

    #Get feature names [for printing in the shapley plot]
    feature_names = tool.loadFeatureNames()
    print("Loaded the names of all " + str(len(feature_names)) + " features.")

    #Put together features with names in one dataframe
    dataset_df = pd.DataFrame(np.array(feature_set), columns=feature_names)

    #training set (must be loaded before model)
    train_df = tool.loadTrainingSet()
    print("Loaded the training set used for tool " + toolName + ", size: " +
          str(train_df.shape) + ".")

    #load model of the tool
    model = tool.loadModel()
    print("Loaded the model for tool " + toolName + ".")

    #summarize training set and subsample test set
    summary_train_df = shap.kmeans(train_df, 2)
    dataset_sub_df = dataset_df  #optionally use dataset_df.sample(400) to speed things up

    #compute and plot shapley values
    shap_explainer = shap.KernelExplainer(model.predict, summary_train_df)
    print(dataset_sub_df)
    shap_values = shap_explainer.shap_values(dataset_sub_df)

    #save the values
    with open("../results/SHAP-" + toolName + "-" + datasetName, 'wb') as file:
        pickle.dump(shap_values, file)  #the SHAP values
        pickle.dump(dataset_sub_df, file)  #the data used
    print("Computed and saved SHAP values.")

    #plotting
    shap.summary_plot(shap_values, dataset_sub_df)
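
A minimal sketch of how the snippet above might be wired to the command line; the argparse flags are assumptions for illustration, and only main(toolName, datasetName) comes from the example:

if __name__ == "__main__":
    import argparse

    # hypothetical CLI wrapper around the example's main(); flag names are assumptions
    parser = argparse.ArgumentParser(
        description="Compute and plot SHAP values for a tool/dataset pair")
    parser.add_argument("tool", help="tool name, e.g. 'wu-crispr'")
    parser.add_argument("dataset", help="dataset name passed to getDataset")
    args = parser.parse_args()
    main(args.tool, args.dataset)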
Example #3
def __init__(self, dataSetPath):
    self.action_space = gym.spaces.Discrete(3)
    self.observation_space = gym.spaces.Box(low=np.array([0.0]),
                                            high=np.array([1.0]))
    self.action = ["hold", "buy", "sell"]
    self.counter = 0
    self.data = getDataset(dataSetPath)
    self.dataLength = len(self.data)
    self.inventory = []
    self.total_profit = 0
Example #4
def marvin_model_test():
    """
    Tests the SVM Marvin hotword detector
    :return: None
    """

    # Download data
    downloadData(data_path="/input/speech_commands/")

    # Get dictionary with files and labels
    dataDict = getDataDict(data_path="/input/speech_commands/")

    # Obtain dataframe by merging dev and test dataset
    devDF = getDataframe(dataDict["dev"], include_unknown=True)
    testDF = getDataframe(dataDict["test"], include_unknown=True)

    evalDF = pd.concat([devDF, testDF], ignore_index=True)

    print("Test files: {}".format(evalDF.shape[0]))

    # Obtain Marvin - Other separated data
    evalDF["class"] = evalDF.apply(lambda row: 1
                                   if row["category"] == "marvin" else -1,
                                   axis=1)
    evalDF = evalDF.drop("category", axis=1)  #drop returns a copy, so reassign
    test_true_labels = evalDF["class"].tolist()

    eval_data, _ = getDataset(df=evalDF,
                              batch_size=BATCH_SIZE,
                              cache_file="kws_val_cache",
                              shuffle=False)

    # Load trained model
    model = load_model("../models/marvin_kws.h5")

    layer_name = "features256"
    feature_extractor = Model(inputs=model.input,
                              outputs=model.get_layer(layer_name).output)

    # Load trained PCA object
    with open("../models/marvin_kws_pca.pickle", "rb") as file:
        pca = pickle.load(file)

    # Load trained SVM
    with open("../models/marvin_kws_svm.pickle", "rb") as file:
        marvin_svm = pickle.load(file)

    # Extract the feature embeddings and evaluate using SVM
    X_test = feature_extractor.predict(eval_data, use_multiprocessing=True)

    X_test_scaled = pca.transform(X_test)
    test_pred_labels = marvin_svm.predict(X_test_scaled)

    OC_Statistics(test_pred_labels, test_true_labels,
                  "marvin_cm_without_noise")
Example #5
def main(job_id, params):
    from utils import getModel
    from utils import getDataset

    logger = get_module_logger(__name__)
    logger.info('Starting job with id: %d' % job_id)
    logger.info('Fetching dataset %s' % params['dataset'])
    dataset = getDataset(params['dataset'])()

    logger.info('Using model %s' % params['experiment_name'])
    simulation = getModel(params['experiment_name'])(**params)
    logger.info('Starting training ...')
    score = simulation.run(dataset)
    return score
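
As a rough usage sketch, the entry point above could be driven directly as shown below; the 'dataset' and 'experiment_name' keys are taken from the snippet, while the concrete values are placeholders, not from the original:

if __name__ == "__main__":
    # hypothetical invocation; parameter values are placeholders
    params = {"dataset": "mnist", "experiment_name": "baseline_model"}
    score = main(1, params)
    print("Final score: %s" % score)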
Example #6
def train(filename):
    raw_text = getText(filename)
    data = getDataset(raw_text)
    model = defineModel(data)
    fitModel(model, data)
Example #7
def model_train():
    """
    Trains the model that is used as a feature extractor
    :return: None
    """

    # Download data
    downloadData(data_path="/input/speech_commands/")

    # Get data dictionary
    dataDict = getDataDict(data_path="/input/speech_commands/")

    # Obtain dataframe for each dataset
    trainDF = getDataframe(dataDict["train"])
    valDF = getDataframe(dataDict["val"])
    devDF = getDataframe(dataDict["dev"])
    testDF = getDataframe(dataDict["test"])

    print("Dataset statistics")
    print("Train files: {}".format(trainDF.shape[0]))
    print("Validation files: {}".format(valDF.shape[0]))
    print("Dev test files: {}".format(devDF.shape[0]))
    print("Test files: {}".format(testDF.shape[0]))

    # Use TF Data API for efficient data input
    train_data, train_steps = getDataset(df=trainDF,
                                         batch_size=BATCH_SIZE,
                                         cache_file="train_cache",
                                         shuffle=True)

    val_data, val_steps = getDataset(df=valDF,
                                     batch_size=BATCH_SIZE,
                                     cache_file="val_cache",
                                     shuffle=False)

    model = create_model()
    model.summary()

    # Stop training if the validation loss doesn't improve
    earlyStopping = EarlyStopping(monitor="val_loss",
                                  patience=PATIENCE,
                                  verbose=1)

    # Reduce LR on validation loss plateau
    reduceLR = ReduceLROnPlateau(monitor="val_loss",
                                 patience=PATIENCE,
                                 verbose=1)

    # Compile the model
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=Adam(learning_rate=LEARNING_RATE),
        metrics=["sparse_categorical_accuracy"],
    )

    # Train the model
    history = model.fit(
        train_data.repeat(),
        steps_per_epoch=train_steps,
        validation_data=val_data.repeat(),
        validation_steps=val_steps,
        epochs=EPOCHS,
        callbacks=[earlyStopping, reduceLR],
    )

    # Save model
    print("Saving model")
    model.save("../models/marvin_kws.h5")

    # Save history data
    print("Saving training history")
    with open("../models/marvin_kws_history.pickle", "wb") as file:
        pickle.dump(history.history, file, protocol=pickle.HIGHEST_PROTOCOL)

    plot_history(history=history)
Example #8
def marvin_kws_model():
    """
    Trains a One-Class SVM for hotword detection
    :return: None
    """

    # Download data
    downloadData(data_path="/input/speech_commands/")

    # Get dictionary with files and labels
    dataDict = getDataDict(data_path="/input/speech_commands/")

    # Obtain dataframe for each dataset
    trainDF = getDataframe(dataDict["train"])
    valDF = getDataframe(dataDict["val"])

    # Obtain Marvin data from training data
    marvin_data, _ = getDataset(
        df=trainDF.loc[trainDF["category"] == "marvin", :],
        batch_size=BATCH_SIZE,
        cache_file="kws_marvin_cache",
        shuffle=False,
    )

    # Obtain Marvin - Other separated data from validation data
    valDF["class"] = valDF.apply(lambda row: 1
                                 if row["category"] == "marvin" else -1,
                                 axis=1)
    valDF = valDF.drop("category", axis=1)  #drop returns a copy, so reassign
    val_true_labels = valDF["class"].tolist()

    val_data, _ = getDataset(df=valDF,
                             batch_size=BATCH_SIZE,
                             cache_file="kws_val_cache",
                             shuffle=False)

    # Load model and create feature extractor
    model = load_model("../models/marvin_kws.h5")

    layer_name = "features256"
    feature_extractor = Model(inputs=model.input,
                              outputs=model.get_layer(layer_name).output)

    # Obtain the feature embeddings
    X_train = feature_extractor.predict(marvin_data, use_multiprocessing=True)
    X_val = feature_extractor.predict(val_data, use_multiprocessing=True)

    # Apply PCA to reduce dimensionality
    pca = PCA(n_components=32)
    pca.fit(X_train)
    print("Variance captured = ", sum(pca.explained_variance_ratio_))

    X_train_transformed = pca.transform(X_train)
    X_val_transformed = pca.transform(X_val)

    # SVM hyper-parameter tuning using Gaussian process
    marvin_svm = svm.OneClassSVM()

    svm_space = [
        Real(10**-5, 10**0, "log-uniform", name="gamma"),
        Real(10**-5, 10**0, "log-uniform", name="nu"),
    ]

    @use_named_args(svm_space)
    def svm_objective(**params):
        marvin_svm.set_params(**params)

        marvin_svm.fit(X_train_transformed)
        val_pred_labels = marvin_svm.predict(X_val_transformed)

        score = f1_score(val_true_labels, val_pred_labels)

        return -1 * score

    res_gp_svm = gp_minimize(func=svm_objective,
                             dimensions=svm_space,
                             n_calls=100,
                             n_jobs=-1,
                             verbose=False,
                             random_state=1)

    print("Best F1 score={:.4f}".format(-res_gp_svm.fun))

    ax = plot_convergence(res_gp_svm)
    plt.savefig("../docs/results/marvin_svm.png", dpi=300)
    plt.show()

    # Instantiate a SVM with the optimal hyper-parameters
    best_params_svm = {k.name: x for k, x in zip(svm_space, res_gp_svm.x)}
    marvin_kws = svm.OneClassSVM()
    marvin_kws.set_params(**best_params_svm)

    marvin_kws.fit(X_train_transformed)

    # Performance on the validation set
    val_pred_labels = marvin_kws.predict(X_val_transformed)
    OC_Statistics(val_pred_labels, val_true_labels, "marvin_cm_training")

    print("Saving PCA object")
    with open("../models/marvin_kws_pca.pickle", "wb") as file:
        pickle.dump(pca, file, protocol=pickle.HIGHEST_PROTOCOL)

    print("Saving Marvin SVM")
    with open("../models/marvin_kws_svm.pickle", "wb") as file:
        pickle.dump(marvin_kws, file, protocol=pickle.HIGHEST_PROTOCOL)  #save the SVM refit with the best hyper-parameters
Example #9
    evalMode = args.eval
    verbose_eval = args.verbose_eval
    save_model = True if args.save_model == 1 else False
    # save_model = False
    # filterMode = args.filter

    # pre = "test_bpr-he_d10.last.h5"

    # num_negatives = 1
    topK = 100 if evalMode == "all" else 10
    evaluation_threads = 1

    # Loading data
    t1 = time()

    dataset = getDataset(data, path, evalMode)

    train, trainSeq, df, testRatings, testNegatives = (
        dataset.trainMatrix, dataset.trainSeq, dataset.df,
        dataset.testRatings, dataset.testNegatives)
    uNum, iNum = df.uid.max() + 1, df.iid.max() + 1
    # uNum = max(uNum, len(testRatings))

    stat = "Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d" % (
        time() - t1, uNum, iNum, len(df), len(testRatings)
    )  # user and item index start at 1, not zero, so the exact number of users and items = num - 1

    # Initialise Model

    if modelName == "mf":
        ranker = MatrixFactorization(uNum, iNum, dim)
    elif modelName == "bpr":
        ranker = BPR(uNum, iNum, dim)