Esempio n. 1
0
def Svm(xtrain_count,xvalid_count,train_y,valid_y,my_tags,xtrain_tfidf,xvalid_tfidf,xtrain_tfidf_ngram,xvalid_tfidf_ngram,xtrain_tfidf_ngram_chars,xvalid_tfidf_ngram_chars):
    """Train a linear SVM (C=1) on four feature representations and print
    the confusion matrix, accuracy and classification report for each.

    Relies on the module-level helper ``md.train_model`` and the sklearn
    metric functions imported elsewhere in this file.
    """
    from sklearn.svm import SVC

    def _new_svm():
        # A fresh, untrained linear SVM for every representation.
        return SVC(kernel='linear', C=1)

    # Bag-of-words counts: matrix first, then accuracy line, then report.
    preds = md.train_model(_new_svm(), xtrain_count, train_y, xvalid_count)
    print(confusion_matrix(valid_y, preds))
    print('SVM Bow accuracy %s' % accuracy_score(preds, valid_y))
    print(classification_report(valid_y, preds, target_names=my_tags))

    # Word-level TF-IDF: accuracy line first, then matrix and report.
    preds = md.train_model(_new_svm(), xtrain_tfidf, train_y, xvalid_tfidf)
    print("SVM, WordLevel TF-IDF: ", accuracy_score(preds, valid_y))
    print(confusion_matrix(valid_y, preds))
    print(classification_report(valid_y, preds, target_names=my_tags))

    # Character-level TF-IDF.
    preds = md.train_model(_new_svm(), xtrain_tfidf_ngram_chars, train_y,
                           xvalid_tfidf_ngram_chars)
    print("SVM, CharLevel Vectors: ", accuracy_score(preds, valid_y))
    print(confusion_matrix(valid_y, preds))
    print(classification_report(valid_y, preds, target_names=my_tags))

    # Word n-gram TF-IDF.
    preds = md.train_model(_new_svm(), xtrain_tfidf_ngram, train_y,
                           xvalid_tfidf_ngram)
    print("SVM, N-Gram Vectors: ", accuracy_score(preds, valid_y))
    print(confusion_matrix(valid_y, preds))
    print(classification_report(valid_y, preds, target_names=my_tags))
Esempio n. 2
0
def dtree(train, train_y,valid_y,my_tags,xtrain_tfidf,xvalid_tfidf,xtrain_tfidf_ngram,xvalid_tfidf_ngram,xtrain_tfidf_ngram_chars,xvalid_tfidf_ngram_chars):
    """Train a depth-2 decision tree on four feature representations and
    print the confusion matrix, accuracy and classification report for each.

    ``train`` is a two-element sequence holding the bag-of-words matrices
    for the training and validation splits, in that order.
    """
    from sklearn.tree import DecisionTreeClassifier

    bow_train, bow_valid = train[0], train[1]

    def _report(preds, heading):
        # Shared metric printout: confusion matrix, accuracy line, report.
        print(confusion_matrix(valid_y, preds))
        print('%s accuracy %s' % (heading, accuracy_score(preds, valid_y)))
        print(classification_report(valid_y, preds, target_names=my_tags))

    runs = (
        ('Bow Dtree', bow_train, bow_valid),
        ('Word level TF IDF Vectors Dtree', xtrain_tfidf, xvalid_tfidf),
        ('Ngram Level TF IDF Vectors Dtree', xtrain_tfidf_ngram,
         xvalid_tfidf_ngram),
        ('Character Level TF IDF Vectors Dtree', xtrain_tfidf_ngram_chars,
         xvalid_tfidf_ngram_chars),
    )
    for heading, x_fit, x_eval in runs:
        preds = md.train_model(DecisionTreeClassifier(max_depth=2),
                               x_fit, train_y, x_eval)
        _report(preds, heading)
Esempio n. 3
0
def main():
    """Load molecule data, build the element index and train the model.

    Paths and grid parameters are hard-coded for a local workspace;
    ``processing`` and ``modeling`` are project modules imported elsewhere
    in this file.
    """
    data_path = Path("/home/gonzalo_franco/workspaces/python/molecules/data")
    resolution = 0.7
    min_xyz = -5
    max_xyz = 5
    possible_elements = ['N', 'C', 'H', 'O', 'F']
    # Map each element symbol to its index (dict comprehension replaces
    # the manual enumerate loop).
    possible_elements_dict = {e: i for i, e in enumerate(possible_elements)}

    # Coordinate grid spanning [2*min_xyz, 2*max_xyz] at the given step,
    # rounded to 3 decimals to avoid floating-point drift in the values.
    range_of_values = np.round(
        np.arange(2 * min_xyz, 2 * max_xyz + resolution, resolution), 3)

    print("Loading data")
    molecule_structure_dict, train_data = processing.load_data(
        data_path, min_xyz, max_xyz, resolution)

    print("Training model")
    modeling.train_model(train_data,
                         molecule_structure_dict,
                         possible_elements_dict,
                         range_of_values,
                         batch_size=32,
                         tensorboard_dir="tensorboard/test_2")
Esempio n. 4
0
def SGD_Svm(xtrain_count, xvalid_count, train_y, valid_y, my_tags,
            xtrain_tfidf, xvalid_tfidf, xtrain_tfidf_ngram, xvalid_tfidf_ngram,
            xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars):
    """Train an SGD-optimised linear SVM (hinge loss) on four feature
    representations and print the evaluation metrics for each.

    Uses the module-level helper ``md.train_model`` and the sklearn metric
    functions imported elsewhere in this file.
    """
    from sklearn.linear_model import SGDClassifier

    def _classifier():
        # Identical hyper-parameters for every representation.
        return SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                             random_state=42, max_iter=5, tol=None)

    # Bag of words: confusion matrix first, then accuracy line and report.
    preds = md.train_model(_classifier(), xtrain_count, train_y, xvalid_count)
    print(confusion_matrix(valid_y, preds))
    print('SVM_SGD for Bow accuracy %s' % accuracy_score(preds, valid_y))
    print(classification_report(valid_y, preds, target_names=my_tags))

    # The remaining representations report accuracy before the matrix.
    runs = (
        ("SVM_sgd, WordLevel TF-IDF: ", xtrain_tfidf, xvalid_tfidf),
        ("SVM_sgd, N-Gram Vectors: ", xtrain_tfidf_ngram, xvalid_tfidf_ngram),
        ("SVM_sgd, CharLevel Vectors: ", xtrain_tfidf_ngram_chars,
         xvalid_tfidf_ngram_chars),
    )
    for heading, x_fit, x_eval in runs:
        preds = md.train_model(_classifier(), x_fit, train_y, x_eval)
        print(heading, accuracy_score(preds, valid_y))
        print(confusion_matrix(valid_y, preds))
        print(classification_report(valid_y, preds, target_names=my_tags))
Esempio n. 5
0
def model_sensitivity_method(data, args, visualizer=None, title=None):
    """
    Given a dataset `data` and arguments `args`, run a full test of private
    prediction using the model sensitivity method.

    Trains a non-private linear classifier on an L2-regularized loss, then
    adds calibrated noise directly to the parameter vector before inference.
    Returns a `dict` mapping each data split name to its predicted labels.

    Note: This algorithm only guarantees privacy for models with convex losses.
    """
    assert args.model == "linear", f"Model {args.model} not supported."

    # initialize model and criterion:
    # class count inferred from the largest target label; assumes labels
    # are 0..K-1 -- TODO confirm against dataset construction
    num_classes = int(data["train"]["targets"].max()) + 1
    num_samples, num_features = data["train"]["features"].size()
    model = modeling.initialize_model(num_features,
                                      num_classes,
                                      device=args.device)
    criterion = nn.CrossEntropyLoss()
    # L2 penalty is folded into the criterion rather than the optimizer.
    regularized_criterion = modeling.add_l2_regularization(
        criterion, model, args.weight_decay)

    # train classifier:
    logging.info("Training non-private classifier...")
    modeling.train_model(model,
                         data["train"],
                         criterion=regularized_criterion,
                         optimizer=args.optimizer,
                         num_epochs=args.num_epochs,
                         learning_rate=args.learning_rate,
                         batch_size=args.batch_size,
                         visualizer=visualizer,
                         title=title)

    # perturb model parameters:
    # noise scale is derived from the sensitivity of the regularized
    # training objective (epsilon/delta privacy parameters).
    logging.info("Applying model sensitivity method...")
    scale = sensitivity_scale(args.epsilon, args.delta, args.weight_decay,
                              criterion, num_samples, args.noise_dist)
    param = modeling.get_parameter_vector(model)
    mean = torch.zeros_like(param)
    # "advanced_gaussian" only changes how the scale is computed above;
    # the sampling distribution itself is still gaussian.
    noise_dist = "gaussian" if args.noise_dist in ["gaussian", "advanced_gaussian"] \
        else args.noise_dist
    perturbation = getattr(noise, noise_dist)(mean, scale)

    # add noise in-place (no autograd tracking), then write the perturbed
    # vector back into the model's parameters.
    with torch.no_grad():
        param.add_(perturbation)
    modeling.set_parameter_vector(model, param)

    # perform inference on both training and test set:
    logging.info("Performing inference with perturbed predictor...")
    predictions = {
        split: modeling.test_model(model, data_split).argmax(dim=1)
        for split, data_split in data.items()
    }
    return predictions
Esempio n. 6
0
def knn(xtrain_count, xvalid_count, train_y, valid_y, my_tags, xtrain_tfidf,
        xvalid_tfidf, xtrain_tfidf_ngram, xvalid_tfidf_ngram,
        xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars):
    """Train a 7-nearest-neighbour classifier on four feature
    representations (bag of words, word-level TF-IDF, n-gram TF-IDF and
    character-level TF-IDF) and print the confusion matrix, accuracy and
    classification report for each.

    Uses the module-level helper ``md.train_model`` and sklearn metric
    functions imported elsewhere in this file.
    """

    from sklearn.neighbors import KNeighborsClassifier
    # training a KNN classifier on Bag of words
    predictions = md.train_model(KNeighborsClassifier(n_neighbors=7),
                                 xtrain_count, train_y, xvalid_count)
    cm = confusion_matrix(valid_y, predictions)
    print(cm)
    print('For Bow KNN accuracy %s' % accuracy_score(predictions, valid_y))
    print(classification_report(valid_y, predictions, target_names=my_tags))

    # training a KNN classifier on Word Level TF IDF Vectors
    predictions = md.train_model(KNeighborsClassifier(n_neighbors=7),
                                 xtrain_tfidf, train_y, xvalid_tfidf)
    cm = confusion_matrix(valid_y, predictions)
    print(cm)
    print('For Word Level TF IDF Vectors KNN accuracy %s' %
          accuracy_score(predictions, valid_y))
    print(classification_report(valid_y, predictions, target_names=my_tags))

    # training a KNN classifier on Ngram Level TF IDF Vectors
    predictions = md.train_model(KNeighborsClassifier(n_neighbors=7),
                                 xtrain_tfidf_ngram, train_y,
                                 xvalid_tfidf_ngram)
    cm = confusion_matrix(valid_y, predictions)
    print(cm)
    print('For Ngram Level TF IDF Vectors KNN accuracy %s' %
          accuracy_score(predictions, valid_y))
    print(classification_report(valid_y, predictions, target_names=my_tags))

    # training a KNN classifier on Character Level TF IDF Vectors
    # (comment previously mislabelled this section as "Ngram Level")
    predictions = md.train_model(KNeighborsClassifier(n_neighbors=7),
                                 xtrain_tfidf_ngram_chars, train_y,
                                 xvalid_tfidf_ngram_chars)
    cm = confusion_matrix(valid_y, predictions)
    print(cm)
    print('For Character Level TF IDF Vectors KNN accuracy %s' %
          accuracy_score(predictions, valid_y))
    print(classification_report(valid_y, predictions, target_names=my_tags))
Esempio n. 7
0
def main():
    """Preprocess the DTM raster and escarpment mask into training patches
    and launch model training.

    Relies on module-level helpers (``readDtm``, ``readEscarpmentMask``,
    ``getTileExtents``, ``makePatches``, ``train_model``) defined or
    imported elsewhere in this file. Stale commented-out debug plotting
    code has been removed.
    """
    dtm = readDtm()
    dtm = (dtm - np.mean(dtm)) / np.std(dtm)  # normalize dtm distribution
    dtm = np.flipud(dtm)  # input data set has wrong orientation
    mask = readEscarpmentMask()

    # Split the raster into fixed-size patches covering the tile grid.
    row_list, col_list = getTileExtents(dtm)
    data = makePatches(dtm, mask, row_list, col_list)

    # Binarise the mask with a 0.5 threshold.
    data['mask'][data['mask'] >= 0.5] = 1
    data['mask'][data['mask'] < 0.5] = 0

    # Add a trailing singleton channel axis for the network input.
    data['dtm'] = np.expand_dims(data['dtm'], axis=3)
    data['mask'] = np.expand_dims(data['mask'], axis=3)

    train_model(data['dtm'],
                data['mask'],
                model_fname='model.h5',
                N=128,
                channels=1)
Esempio n. 8
0
def Naive_Bayes(xtrain_count, xvalid_count, train_y, valid_y, my_tags,
                xtrain_tfidf, xvalid_tfidf, xtrain_tfidf_ngram,
                xvalid_tfidf_ngram, xtrain_tfidf_ngram_chars,
                xvalid_tfidf_ngram_chars):
    """Train a multinomial naive Bayes classifier on four feature
    representations and print accuracy, confusion matrix and
    classification report for each.
    """
    from sklearn import naive_bayes

    feature_sets = (
        ("MultinomialNB, Bag of words: ", xtrain_count, xvalid_count),
        ("MultinomialNB, WordLevel TF-IDF: ", xtrain_tfidf, xvalid_tfidf),
        ("MultinomialNB, N-Gram TF-IDF: ", xtrain_tfidf_ngram,
         xvalid_tfidf_ngram),
        ("MultinomialNB, CharLevel TF-IDF: ", xtrain_tfidf_ngram_chars,
         xvalid_tfidf_ngram_chars),
    )
    for heading, x_fit, x_eval in feature_sets:
        # A fresh classifier per representation, trained via md.train_model.
        preds = md.train_model(naive_bayes.MultinomialNB(), x_fit,
                               train_y, x_eval)
        print(heading, accuracy_score(preds, valid_y))
        print(confusion_matrix(valid_y, preds))
        print(classification_report(valid_y, preds, target_names=my_tags))
def Gaussian_Naive_Bayes(xtrain_count,xvalid_count,train_y,valid_y,my_tags,xtrain_tfidf,xvalid_tfidf,xtrain_tfidf_ngram,xvalid_tfidf_ngram,xtrain_tfidf_ngram_chars,xvalid_tfidf_ngram_chars):
    """Train a Gaussian naive Bayes classifier on four feature sets and
    print the confusion matrix, accuracy and classification report.

    GaussianNB does not accept sparse matrices, so every feature matrix is
    densified with ``.toarray()`` before training.
    """
    from sklearn.naive_bayes import GaussianNB

    feature_sets = (
        ('Bow', xtrain_count, xvalid_count),
        ('word level TF-IDF', xtrain_tfidf, xvalid_tfidf),
        ('N-gram TF-IDF', xtrain_tfidf_ngram, xvalid_tfidf_ngram),
        ('character level TF-IDF', xtrain_tfidf_ngram_chars,
         xvalid_tfidf_ngram_chars),
    )
    for label, x_fit, x_eval in feature_sets:
        preds = md.train_model(GaussianNB(), x_fit.toarray(), train_y,
                               x_eval.toarray())
        print(confusion_matrix(valid_y, preds))
        print('gnb accuracy for %s %s' % (label,
                                          accuracy_score(preds, valid_y)))
        print(classification_report(valid_y, preds, target_names=my_tags))
def Bernoulli_Naive_Bayes(xtrain_count, xvalid_count, train_y, valid_y,
                          my_tags, xtrain_tfidf, xvalid_tfidf,
                          xtrain_tfidf_ngram, xvalid_tfidf_ngram,
                          xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars):
    """Train a Bernoulli naive Bayes classifier on four feature sets and
    print the confusion matrix, accuracy line and classification report
    for each.
    """
    from sklearn.naive_bayes import BernoulliNB

    # (accuracy-line format string, training matrix, validation matrix)
    runs = (
        ('Bnb accuracy %s', xtrain_count, xvalid_count),
        ('Bnb accuracy for word level TF-IDF %s', xtrain_tfidf,
         xvalid_tfidf),
        ('Bnb accuracy for N-gram TF-IDF %s', xtrain_tfidf_ngram,
         xvalid_tfidf_ngram),
        ('Bnb accuracy for character level TF-IDF %s',
         xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars),
    )
    for template, x_fit, x_eval in runs:
        preds = md.train_model(BernoulliNB(), x_fit, train_y, x_eval)
        print(confusion_matrix(valid_y, preds))
        print(template % accuracy_score(preds, valid_y))
        print(classification_report(valid_y, preds, target_names=my_tags))
def main():
    """Run the image-cleaning pipeline: optional preprocessing steps,
    train/validation split, model training and optional persistence.

    Reads configuration from the module-level ``prep`` dict and the
    DATA_DIR / WORK_DIR constants defined elsewhere in this file.
    """
    set_seed(42)
    if prep['clear_tb']:
        print('Clearing tensorboard logs directory')
        clear_tb_logs()

    # Work on a copy so the raw data directory stays untouched.
    print('Creating working copy of data directory')
    create_working_copy(DATA_DIR, WORK_DIR)

    # Optional augmentation: rotated variants of the training images.
    if prep['rotation']:
        print('Generating rotated images')
        generate_rotated_images(pjoin(WORK_DIR, 'train', 'dirty'))
        generate_rotated_images(pjoin(WORK_DIR, 'train', 'cleaned'))

    # Optional preprocessing: background removal on train and test images.
    if prep['bg_removal']:
        print('Removing background from images')
        remove_bg(pjoin(WORK_DIR, 'train', 'dirty'))
        remove_bg(pjoin(WORK_DIR, 'train', 'cleaned'))
        remove_bg(pjoin(WORK_DIR, 'test'))

    train_val_split(WORK_DIR)
    prepare_test_dir(WORK_DIR)

    train_dl, valid_dl, test_dl = get_dataloaders(
        pjoin(WORK_DIR, 'train_split'), pjoin(WORK_DIR, 'valid_split'),
        pjoin(WORK_DIR, 'test'))

    net = NNet(backbone=models.resnet18)

    model, losses, accuracies = train_model(net, train_dl, valid_dl, nepoch=10)
    model.eval()

    # Persist the trained network under a timestamped filename.
    if prep['save_model']:
        if not os.path.isdir('models'):
            os.makedirs('models')
        pth = pjoin('models', str(round(time.time())) + '.pt')
        print(f'Saving trained model to {pth}')
        net.save(pth)
Esempio n. 12
0
def main_function():
    """Load the Miscellaneous wildfire dataset, train the CNN defined in
    ``modeling.create_model`` and plot the accuracy curve.

    Prints a warning when TensorFlow reports no visible GPU device.
    """
    if tf.test.gpu_device_name():
        print('Default GPU Device:{}'.format(tf.test.gpu_device_name()))
    else:
        print("Please install GPU version of TF")

    # Dataset category name (renamed from `type`, which shadowed the builtin).
    category = 'Miscellaneous'
    train_x = np.load('Data/' + category + '/train/' + category + '_train.npy')
    train_x = train_x / 255.0  # scale pixel values into [0, 1]
    train_y = pd.read_csv('Data/' + category + '/train/' + category +
                          '_2014_2015_train.csv')
    train_y = labels_to_ints(train_y['FIRE_SIZE_CLASS'])
    # One-hot encode the integer labels. (The previous intermediate
    # `np.array(train_y)` assignment was dead code and has been removed.)
    train_y_array = tf.keras.utils.to_categorical(train_y)

    input_shape = (train_x.shape[1], train_x.shape[2], train_x.shape[3])
    output_shape = train_y_array.shape[1]

    # Build the model (learning rate must be chosen before construction).
    learning_rate = 0.05
    my_model = modeling.create_model(learning_rate, input_shape, output_shape)

    # Training hyper-parameters.
    epochs = 10
    batch_size = 100
    validation_split = 0.3

    # Train the model, then plot the accuracy curve.
    epochs, hist = modeling.train_model(my_model, train_x, train_y_array,
                                        epochs, batch_size, validation_split)
    modeling.plot_curve(epochs, hist, 'accuracy')
    print('Test modeling done')
Esempio n. 13
0
def loss_perturbation_method(data, args, visualizer=None, title=None):
    """
    Given a dataset `data` and arguments `args`, run a full test of the private
    prediction algorithms of Chaudhuri et al. (2011) / Kifer et al. (2012)
    generalized to the multi-class setting. Returns a `dict` containing the
    `predictions` for the training and test data.

    Note: This algorithm only guarantees privacy under the following assumptions:
    - The loss is strictly convex and has a continuous Hessian.
    - The model is linear.
    - The inputs have a 2-norm restricted to be less than or equal 1.
    - The Lipschitz constant of the loss function and the spectral
        norm of the Hessian must be bounded.
    """
    assert args.model == "linear", f"Model {args.model} not supported."
    assert args.noise_dist != "advanced_gaussian", \
        "Advanced Gaussian method not supported for loss perturbation."

    # get dataset properties:
    # class count inferred from the largest target label; assumes labels
    # are 0..K-1 -- TODO confirm against dataset construction
    num_classes = int(data["train"]["targets"].max()) + 1
    num_samples, num_features = data["train"]["features"].size()

    # initialize model and criterion:
    model = modeling.initialize_model(num_features,
                                      num_classes,
                                      device=args.device)
    criterion = nn.CrossEntropyLoss()

    # derive the noise precision and the minimum L2 weight decay required
    # by the privacy analysis; never regularize less than the user asked.
    precision, weight_decay = loss_perturbation_params(args.epsilon,
                                                       args.delta,
                                                       args.noise_dist,
                                                       criterion, num_samples,
                                                       num_classes)
    weight_decay = max(weight_decay, args.weight_decay)

    # sample loss perturbation vector:
    # a single flat noise vector is sampled and then scattered into
    # per-parameter tensors matching the model's parameter shapes.
    param = modeling.get_parameter_vector(model)
    mean = torch.zeros_like(param)
    perturbation = getattr(noise, args.noise_dist)(mean, precision)
    perturbations = [torch.zeros_like(p) for p in model.parameters()]
    modeling.set_parameter_vector(perturbations, perturbation)

    # closure implementing the loss-perturbation criterion:
    # adds the linear term <theta, b>/n to the loss, which is the
    # objective perturbation of Chaudhuri et al.
    def loss_perturbation_criterion(predictions, targets):
        loss = criterion(predictions, targets)
        for param, perturb in zip(model.parameters(), perturbations):
            loss += ((param * perturb).sum() / num_samples)
        return loss

    # add L2-regularizer to the loss:
    regularized_criterion = modeling.add_l2_regularization(
        loss_perturbation_criterion, model, weight_decay)

    # train classifier:
    logging.info("Training classifier with loss perturbation...")
    modeling.train_model(model,
                         data["train"],
                         criterion=regularized_criterion,
                         optimizer=args.optimizer,
                         num_epochs=args.num_epochs,
                         learning_rate=args.learning_rate,
                         batch_size=args.batch_size,
                         visualizer=visualizer,
                         title=title)

    # perform inference on both training and test set:
    logging.info("Performing inference with loss-perturbed predictor...")
    predictions = {
        split: model(data_split["features"]).argmax(dim=1)
        for split, data_split in data.items()
    }
    return predictions
Esempio n. 14
0
def subsagg_method(data, args, visualizer=None, title=None):
    """
    Given a dataset `data` and arguments `args`, run a full test of the private
    prediction algorithm of Dwork & Feldman (2018). Returns a `dict` containing
    the `predictions` for the training and test data.

    Trains `args.num_models` classifiers on disjoint subsets of the training
    data and aggregates their votes privately; predictions are computed for
    every inference budget under consideration.
    """

    # unspecified inference budgets means we are trying many values:
    if args.inference_budget == -1:
        inference_budgets = INFERENCE_BUDGETS
    else:
        inference_budgets = [args.inference_budget]

    # split training set into disjoint subsets:
    data["split_train"] = split_dataset(data["train"], args.num_models)

    # train all classifiers:
    logging.info(f"Training {args.num_models} disjoint classifiers...")
    models = [None] * args.num_models
    for idx in range(args.num_models):

        # initialize model:
        # class count from largest label; assumes labels are 0..K-1
        logging.info(f" => training model {idx + 1} of {args.num_models}:")
        num_classes = int(data["train"]["targets"].max()) + 1
        num_features = data["split_train"][idx]["features"].size(1)
        models[idx] = modeling.initialize_model(num_features,
                                                num_classes,
                                                model=args.model,
                                                device=args.device)

        # train using L2-regularized loss:
        # data augmentation only for non-linear models; note that
        # `augmentation` deliberately remains in scope after this loop
        # and is reused during inference below.
        regularized_criterion = modeling.add_l2_regularization(
            nn.CrossEntropyLoss(), models[idx], args.weight_decay)
        augmentation = (args.model != "linear")
        modeling.train_model(models[idx],
                             data["split_train"][idx],
                             criterion=regularized_criterion,
                             optimizer=args.optimizer,
                             num_epochs=args.num_epochs,
                             learning_rate=args.learning_rate,
                             batch_size=args.batch_size,
                             augmentation=augmentation,
                             visualizer=visualizer,
                             title=title)

    # clean up:
    del data["split_train"]

    # perform inference on both training and test set:
    logging.info("Performing inference with private predictor...")
    predictions = {}
    for split in data.keys():

        # compute predictions of each model:
        # linear models evaluate the whole split in one batch; others
        # use mini-batches of 128.
        batch_size = data[split]["targets"].size(
            0) if args.model == "linear" else 128
        preds = [
            modeling.test_model(
                model,
                data[split],
                augmentation=augmentation,
                batch_size=batch_size,
            ) for model in models
        ]
        # stack per-model label votes into shape (num_examples, num_models)
        preds = [pred.argmax(dim=1) for pred in preds]
        preds = torch.stack(preds, dim=1)

        # compute private predictions:
        if split not in predictions:
            predictions[split] = {}
        for inference_budget in inference_budgets:
            # privacy parameter must be corrected for inference budget:
            # per-query epsilon is the budget split naively, or the
            # (looser of the two) advanced-composition bound when delta > 0.
            epsilon = args.epsilon / float(inference_budget)
            if args.delta > 0:
                eps, _ = advanced_compose(args.epsilon, args.delta,
                                          inference_budget, args.delta)
                epsilon = max(eps, epsilon)

            # compute and store private predictions:
            predictions[split][inference_budget] = \
                private_prediction(preds, epsilon=epsilon)

    # return predictions:
    return predictions
Esempio n. 15
0
def main():
    """End-to-end price-model pipeline: optional preprocessing, dataset and
    dataloader construction, training or loading of the model, and final
    evaluation with saved visualizations.

    Reads its configuration from module-level constants (RANDOM_SEED,
    MODEL_ROOT, PREPROCESS, TRAINING, etc.) defined elsewhere in this file.
    """
    np.random.seed(RANDOM_SEED)
    torch.manual_seed(RANDOM_SEED)
    CURRENT_TIME = time.strftime("%Y-%m-%dT%H:%M", time.localtime())
    if MODEL_LOAD_NAME:
        model_save_path = os.path.join(MODEL_ROOT, MODEL_LOAD_NAME)
    else:
        model_save_path = os.path.join(MODEL_ROOT, CURRENT_TIME + "_" + MODEL_SAVE_NAME)
    # exist_ok replaces the previous bare `except: pass`, which silently
    # swallowed every error (e.g. permission failures), not just
    # "directory already exists".
    os.makedirs(model_save_path, exist_ok=True)
    LOGGER = getLogger(name="main", model_save_path=model_save_path)
    if PREPROCESS:
        LOGGER.info("START preprocessing")
        scaler_dict = csv2pickle(raw_filepath=RAW_FILEPATH,
                                 score_filepath=SCORE_FILEPATH,
                                 image_root=IMAGE_ROOT,
                                 train_filepath=TRAIN_FILEPATH,
                                 val_filepath=VAL_FILEPATH,
                                 model_save_path=model_save_path,
                                 random_seed=RANDOM_SEED)
    # NOTE(review): if PREPROCESS is False and TRAINING is True,
    # `scaler_dict` is referenced below without being assigned — confirm
    # intended configuration combinations with the module constants.

    LOGGER.info("START Initiating Datasets")
    LOGGER.info("Build Datasets")
    LOGGER.info("Build Dataloaders")
    datasets, dataloaders = init_datasets(train_filepath=TRAIN_FILEPATH,
                                          val_filepath=VAL_FILEPATH,
                                          image_root=IMAGE_ROOT,
                                          num_col_ids=NUM_COLUMN_IDS,
                                          array_col_id=ARRAY_COLUMN_ID,
                                          transform=TRANSFORM,
                                          batch_size=BATCH_SIZE)
    dataset_sizes = {x: len(datasets[x]) for x in ["train", "val"]}
    LOGGER.info("Sample size: Train %d" % dataset_sizes['train'])
    LOGGER.info("Sample size:  Val  %d" % dataset_sizes['val'])
    num_tabular_features = len(datasets["train"][0]["ftrs"])
    LOGGER.info("Numeric features count: %d" % len(NUM_COLUMN_IDS))
    LOGGER.info("Embedding features dimension: %d" % (num_tabular_features - len(NUM_COLUMN_IDS)))
    LOGGER.info("Features name: %s" % [nn for nn in datasets['train'].csv_file.columns[NUM_COLUMN_IDS+[ARRAY_COLUMN_ID]]])

    LOGGER.info("START Initiating Model")
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    criterion = nn.MSELoss()
    if TRAINING:
        model = PriceModel(num_ftrs=num_tabular_features,
                           hidden_units=HIDDEN_UNITS,
                           fine_tune=FINE_TUNE).to(device)
        params = model.parameters()
        optimizer = optim.SGD(params, lr=LEARNING_RATE, momentum=MOMENTUM)
        if SCHEDULER_REDUCE_ON_PLATEAU:
            exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, verbose=True)
        else:
            exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=SCHEDULER_STEP_SIZE)
        LOGGER.info("Model parameters: Training epochs = %d" % NUM_EPOCHS)
        LOGGER.info("Model parameters: Learning rate = %.3f" % LEARNING_RATE)
        LOGGER.info("Model parameters: Momentum = %.2f" % MOMENTUM)
        LOGGER.info("Model parameters: Scheduler step size = %d" % SCHEDULER_STEP_SIZE)
        LOGGER.info("Model parameters: FC layers = %s" % ([2048+num_tabular_features]+HIDDEN_UNITS+[1]))
        LOGGER.info("Model parameters: Fine tuning last %d layers" % FINE_TUNE)

        # train/save/read model
        LOGGER.info("START Training Model")
        model, all_records, best_records = train_model(dataloaders=dataloaders,
                                                       dataset_sizes=dataset_sizes,
                                                       model=model,
                                                       criterion=criterion,
                                                       optimizer=optimizer,
                                                       scheduler=exp_lr_scheduler,
                                                       num_epochs=NUM_EPOCHS,
                                                       device=device,
                                                       min_max_scaler=scaler_dict["winning_bid"],
                                                       model_save_path=model_save_path)
        model_load_path = model_save_path
        LOGGER.info("Save model to %s" % model_save_path)
        save_model(model_save_path=model_save_path,
                   model=model,
                   all_records=all_records,
                   best_records=best_records,
                   scaler_dict=scaler_dict)
    else:
        model_load_path = os.path.join(MODEL_ROOT, MODEL_LOAD_NAME)
        LOGGER.info("Load model from %s" % model_load_path)
        model, all_records, best_records, scaler_dict = load_model(model_load_path=model_load_path)

    # eval model
    LOGGER.info("START Evaluation")
    model = model.to("cpu")
    evaluate_model_price(model, device, datasets, "train", scaler_dict, TRAIN_FILEPATH, model_load_path)
    evaluate_model_price(model, device, datasets, "val", scaler_dict, VAL_FILEPATH, model_load_path)
    visualization_save_path = os.path.join(model_load_path, "eval_visualizations")
    # Same bare-except cleanup as above for the visualization directory.
    os.makedirs(visualization_save_path, exist_ok=True)
    evaluate_model(model, dataset=datasets["val"], idxs=EVALUATION_IDXS, scaler=scaler_dict["winning_bid"], 
                   save_path=visualization_save_path, model_load_path=model_load_path)

    return None
Esempio n. 16
0
def process(args):
    """Dispatch the aircon-engine command given on the command line.

    Supported values of ``args.command``:
      * ``process``   -- read the raw sensor CSV, clean it, save the result.
      * ``train``     -- build TURN-ON / TURN-OFF models and save them.
      * ``predict``   -- classify one sensor reading into an aircon action.
      * ``evaluate``  -- cross-validate both models, print confusion matrices.
      * ``reinforce`` -- run the reinforcement step.

    Raises:
        EngineError: if ``args.command`` is not one of the above.
    """
    global log_to_file
    global SENSOR_CSV

    # Log to the file given on the command line, or to stderr by default.
    if args.log:
        log_to_file = True
        logging.basicConfig(filename=args.log, level=logging.INFO)
    else:
        logging.basicConfig(stream=sys.stderr, level=logging.INFO)

    if (args.command == 'process'):
        from modeling import process_data

        # Fall back to the default sensor CSV when none was supplied.
        if not args.csv_file:
            args.csv_file = SENSOR_CSV
        # process the raw data
        df = process_data(args.csv_file)
        # save data to file
        save_data(df, args.data_file)

    elif (args.command == 'train'):
        from modeling import process_data
        from modeling import train_model

        if (args.csv_file):  # input is csv file
            # process the raw data
            df = process_data(args.csv_file)
            # save data to file
            save_data(df, args.data_file)
        else:  # input is processed data file
            # load the processed data
            df = load_data(args.data_file)

        # for aircon in ON status, build model to predict TURN OFF action
        on_model = train_model(df[0], df[1], args.classifier)

        # for aircon in OFF status, build model to predict TURN ON action
        off_model = train_model(df[2], df[3], args.classifier)

        # save models to file
        save_model([on_model, off_model], args.model_file)

    elif (args.command == 'predict'):
        from modeling import ACTION_TURN_OFF
        from modeling import ACTION_TURN_ON
        from modeling import ACTION_NOTHING
        # parse the input values from sensors
        status, inputs = parse_sensors(args.sensors)

        # load prediction models
        on_model, off_model = load_model(args.model_file)

        if status == 0:  # aircon is OFF, predict TURN ON
            action = predict(off_model, inputs)
        else:  # aircon is ON, predict TURN OFF
            action = predict(on_model, inputs)

        # Single-argument print(...) behaves identically under Python 2 and
        # Python 3; the rest of this file uses Python 3 syntax.
        if action == ACTION_TURN_ON:
            print("TURN_ON")
        elif action == ACTION_TURN_OFF:
            print("TURN_OFF")
        else:
            print("DO_NOTHING")

    elif (args.command == 'evaluate'):
        import modeling

        if (args.csv_file):  # input is raw data
            # process raw data
            df = modeling.process_data(args.csv_file)
        else:  # input is processed data
            # load processed data
            df = load_data(args.data_file)

        print("\n\nPerformance for TURN ON prediction")

        con_mats = modeling.evaluate_model(df[2], df[3], args.classifier)
        fold = 1
        for c1, c2 in con_mats:
            print("\nPrediction performance for fold " + str(fold))
            print("\n... on training data")
            print_confusion_matrix(c1, ("NOTHING", "TURN-ON"))
            print("\n... on testing data")
            print_confusion_matrix(c2, ("NOTHING", "TURN-ON"))
            fold += 1

        print("\n\nPerformance for TURN OFF prediction")

        # cross validation evaluation
        con_mats = modeling.evaluate_model(df[0], df[1], args.classifier)

        # report the performance
        fold = 1
        for c1, c2 in con_mats:
            print("\nPrediction performance for fold " + str(fold))
            print("\n... on training data")
            print_confusion_matrix(c1, ("DO-NOTHING", "TURN-OFF"))
            print("\n... on testing data")
            print_confusion_matrix(c2, ("DO-NOTHING", "TURN-OFF"))
            fold += 1

    elif (args.command == 'reinforce'):
        reinforce()
    else:
        raise EngineError("unknown command")
Esempio n. 17
0
def logit_sensitivity_method(data, args, visualizer=None, title=None):
    """
    Given a dataset `data` and arguments `args`, run a full test of the logit
    sensitivity method. Returns a `dict` containing the `predictions` for the
    training and test data.

    The model is trained non-privately; privacy comes from adding calibrated
    noise to the model's output logits at inference time, with the
    (epsilon, delta) budget split across the allowed number of queries
    (`inference_budget`).

    Note: This algorithm only guarantees privacy for models with convex losses.
    """
    assert args.model == "linear", f"Model {args.model} not supported."

    # unspecified inference budgets means we are trying many values:
    if args.inference_budget == -1:
        inference_budgets = INFERENCE_BUDGETS
    else:
        inference_budgets = [args.inference_budget]

    # initialize model and criterion:
    num_classes = int(data["train"]["targets"].max()) + 1
    num_samples, num_features = data["train"]["features"].size()
    model = modeling.initialize_model(num_features,
                                      num_classes,
                                      device=args.device)
    criterion = nn.CrossEntropyLoss()
    # L2 regularization is passed to the sensitivity computation below via
    # args.weight_decay; the un-regularized `criterion` is kept for that call.
    regularized_criterion = modeling.add_l2_regularization(
        criterion, model, args.weight_decay)

    # train classifier:
    logging.info("Training non-private classifier...")
    modeling.train_model(model,
                         data["train"],
                         criterion=regularized_criterion,
                         optimizer=args.optimizer,
                         num_epochs=args.num_epochs,
                         learning_rate=args.learning_rate,
                         batch_size=args.batch_size,
                         visualizer=visualizer,
                         title=title)

    # perform inference on both training and test set:
    logging.info("Performing inference with private predictor...")
    predictions = {}
    for split in data.keys():
        if split not in predictions:
            predictions[split] = {}
        for inference_budget in inference_budgets:

            # account for the budget in the noise scale: each of the
            # `inference_budget` queries gets an equal share of the total
            # epsilon and delta (basic composition):
            scale = sensitivity_scale(args.epsilon / float(inference_budget),
                                      args.delta / float(inference_budget),
                                      args.weight_decay, criterion,
                                      num_samples, args.noise_dist)
            if args.delta > 0:
                # linearly search for the optimal noise scale under advanced
                # composition:
                del_primes = torch.linspace(0, args.delta, 1000)[1:-1]
                ind_eps_del = [
                    advanced_compose(args.epsilon, args.delta,
                                     inference_budget, dp) for dp in del_primes
                ]
                scales = [
                    sensitivity_scale(epsilon, delta, args.weight_decay,
                                      criterion, num_samples, args.noise_dist)
                    for epsilon, delta in ind_eps_del
                ]
                # for small budgets the naive scale may be better:
                scale = max(max(scales), scale)

            # make private predictions:
            # "advanced_gaussian" only changes the composition above — the
            # noise itself is still drawn from the plain gaussian mechanism.
            noise_dist = "gaussian" if args.noise_dist in ["gaussian", "advanced_gaussian"] \
                else args.noise_dist
            preds = modeling.test_model(model, data[split])
            mean = torch.zeros_like(preds).T
            preds += getattr(noise, noise_dist)(mean, scale).T

            # release only the argmax class for each sample:
            predictions[split][inference_budget] = preds.argmax(dim=1)

    # return predictions:
    return predictions
Esempio n. 18
0
def process(args):
    """Dispatch the aircon-engine command given on the command line.

    Supported values of ``args.command``:
      * ``process``   -- read the raw sensor CSV, clean it, save the result.
      * ``train``     -- build TURN-ON / TURN-OFF models and save them.
      * ``predict``   -- classify one sensor reading into an aircon action.
      * ``evaluate``  -- cross-validate both models, print confusion matrices.
      * ``reinforce`` -- run the reinforcement step.

    Raises:
        EngineError: if ``args.command`` is not one of the above.
    """
    global log_to_file
    global SENSOR_CSV

    # Log to the file given on the command line, or to stderr by default.
    if args.log:
        log_to_file = True
        logging.basicConfig(filename=args.log, level=logging.INFO)
    else:
        logging.basicConfig(stream=sys.stderr, level=logging.INFO)

    if (args.command == 'process'):
        from modeling import process_data

        # Fall back to the default sensor CSV when none was supplied.
        if not args.csv_file:
            args.csv_file = SENSOR_CSV
        # process the raw data
        df = process_data(args.csv_file)
        # save data to file
        save_data(df, args.data_file)

    elif (args.command == 'train'):
        from modeling import process_data
        from modeling import train_model

        if (args.csv_file):  # input is csv file
            # process the raw data
            df = process_data(args.csv_file)
            # save data to file
            save_data(df, args.data_file)
        else:  # input is processed data file
            # load the processed data
            df = load_data(args.data_file)

        # for aircon in ON status, build model to predict TURN OFF action
        on_model = train_model(df[0], df[1], args.classifier)

        # for aircon in OFF status, build model to predict TURN ON action
        off_model = train_model(df[2], df[3], args.classifier)

        # save models to file
        save_model([on_model, off_model], args.model_file)

    elif (args.command == 'predict'):
        from modeling import ACTION_TURN_OFF
        from modeling import ACTION_TURN_ON
        from modeling import ACTION_NOTHING
        # parse the input values from sensors
        status, inputs = parse_sensors(args.sensors)

        # load prediction models
        on_model, off_model = load_model(args.model_file)

        if status == 0:  # aircon is OFF, predict TURN ON
            action = predict(off_model, inputs)
        else:  # aircon is ON, predict TURN OFF
            action = predict(on_model, inputs)

        # Single-argument print(...) behaves identically under Python 2 and
        # Python 3; the rest of this file uses Python 3 syntax.
        if action == ACTION_TURN_ON:
            print("TURN_ON")
        elif action == ACTION_TURN_OFF:
            print("TURN_OFF")
        else:
            print("DO_NOTHING")

    elif (args.command == 'evaluate'):
        import modeling

        if (args.csv_file):  # input is raw data
            # process raw data
            df = modeling.process_data(args.csv_file)
        else:  # input is processed data
            # load processed data
            df = load_data(args.data_file)

        print("\n\nPerformance for TURN ON prediction")

        con_mats = modeling.evaluate_model(df[2], df[3], args.classifier)
        fold = 1
        for c1, c2 in con_mats:
            print("\nPrediction performance for fold " + str(fold))
            print("\n... on training data")
            print_confusion_matrix(c1, ("NOTHING", "TURN-ON"))
            print("\n... on testing data")
            print_confusion_matrix(c2, ("NOTHING", "TURN-ON"))
            fold += 1

        print("\n\nPerformance for TURN OFF prediction")

        # cross validation evaluation
        con_mats = modeling.evaluate_model(df[0], df[1], args.classifier)

        # report the performance
        fold = 1
        for c1, c2 in con_mats:
            print("\nPrediction performance for fold " + str(fold))
            print("\n... on training data")
            print_confusion_matrix(c1, ("DO-NOTHING", "TURN-OFF"))
            print("\n... on testing data")
            print_confusion_matrix(c2, ("DO-NOTHING", "TURN-OFF"))
            fold += 1

    elif (args.command == 'reinforce'):
        reinforce()
    else:
        raise EngineError("unknown command")
Esempio n. 19
0
def dpsgd_method(data, args, visualizer=None, title=None):
    """
    Given a dataset `data` and arguments `args`, run a full test of private
    prediction using the differentially private SGD training method of dpsgd
    et al. (2016).

    Returns a `dict` mapping each split name in `data` to a tensor of
    predicted class indices (argmax over the model outputs).

    Raises:
        ValueError: if `args.optimizer` is not "sgd", or `args.delta` is not
            strictly positive (DP-SGD's privacy accounting requires delta > 0).
    """

    # assertions:
    if args.optimizer != "sgd":
        raise ValueError(
            f"DP-SGD does not work with {args.optimizer} optimizer.")
    if args.delta <= 0.:
        raise ValueError(
            f"Specified delta must be positive (not {args.delta}).")

    # initialize model and criterion:
    num_classes = int(data["train"]["targets"].max()) + 1
    num_samples = data["train"]["features"].size(0)
    num_features = data["train"]["features"].size(1)
    model = modeling.initialize_model(num_features,
                                      num_classes,
                                      model=args.model,
                                      device=args.device)
    regularized_criterion = modeling.add_l2_regularization(
        nn.CrossEntropyLoss(), model, args.weight_decay)

    # compute standard deviation of noise to add to gradient
    # (num_samples was already computed above):
    std, eps = dpsgd_privacy.compute_noise_multiplier(args.epsilon, args.delta,
                                                      num_samples,
                                                      args.batch_size,
                                                      args.num_epochs)
    logging.info(f"DP-SGD with noise multiplier (sigma) of {std}.")
    logging.info(f"Epsilon error is {abs(eps - args.epsilon):.5f}.")

    # convert model to make differentially private gradient updates:
    model = modeling.privatize_model(model, args.clip, std)

    # train classifier:
    logging.info("Training classifier using private SGD...")
    # data augmentation is only enabled for non-linear models:
    augmentation = (args.model != "linear")
    modeling.train_model(model,
                         data["train"],
                         optimizer=args.optimizer,
                         criterion=regularized_criterion,
                         num_epochs=args.num_epochs,
                         learning_rate=args.learning_rate,
                         batch_size=args.batch_size,
                         momentum=0.0,
                         use_lr_scheduler=args.use_lr_scheduler,
                         augmentation=augmentation,
                         visualizer=visualizer,
                         title=title)

    # convert model back to "regular" model:
    model = modeling.unprivatize_model(model)

    # perform inference on both training and test set:
    logging.info("Performing inference with DP-SGD predictor...")
    predictions = {
        split: modeling.test_model(model,
                                   data_split,
                                   augmentation=augmentation).argmax(dim=1)
        for split, data_split in data.items()
    }
    return predictions
Esempio n. 20
0
# NYU Depth V2 train/val datasets with their split-specific transforms.
nyu = {
    'train':
    NyuV2(os.path.join(data_path, 'train'), transform=transformers['train']),
    'val':
    NyuV2(os.path.join(data_path, 'val'), transform=transformers['val'])
}

# Batched loaders; both splits are shuffled and read with 8 worker processes.
dataloaders = {
    'train':
    data.DataLoader(nyu['train'],
                    num_workers=8,
                    batch_size=batch_size,
                    shuffle=True),
    'val':
    data.DataLoader(nyu['val'],
                    num_workers=8,
                    batch_size=batch_size,
                    shuffle=True)
}

# Depth-estimation network initialized from pretrained ResNet weights.
resnet_wts = './models/pretrained_resnet/model.pt'
model = DEN(resnet_wts)
model = model.to(device)

# NOTE(review): `params_to_update` is computed but the optimizer is built
# over `model.parameters()` instead — if `utils.params_to_update` selects a
# subset (e.g. to freeze layers), the optimizer ignores that selection.
# Confirm intent before changing; identical only if all params require grad.
params_to_update = utils.params_to_update(model)
optimizer = optim.Adam(model.parameters(), lr=16e-5)
# Sum-reduced MSE over predicted depth maps.
criterion = nn.MSELoss(reduction='sum')

train_model(model, dataloaders, criterion, optimizer, n_epochs, device,
            exp_dir, early_stopping_th)