def test(self, test):
    """Run the prediction model on the testing dataset.

    Every entry of ``test`` is passed through ``process`` and the result
    is handed to ``self.classify``.

    Returns:
        A dict mapping the position of each entry in ``test`` to the value
        returned by ``self.classify`` for it.
    """
    return {
        index: self.classify(process(text))
        for index, text in enumerate(test)
    }
def load_dataset(batch_size=128, load_caption=False):
    """Build the training and validation batch objects.

    Args:
        batch_size: forwarded to both ``process_data`` calls.
        load_caption: forwarded to both ``process_data`` calls.

    Returns:
        A ``(batch_train, batch_val)`` tuple. When caption sharing fails the
        pair is still returned; the failure is only reported on stdout.
    """
    batch_train = process_data.process(
        batch_size=batch_size,
        extract_center=True,
        load_caption=load_caption)
    # NOTE: ``inital_process`` (sic) is the name exposed by ``process_data``.
    batch_val = process_data.inital_process(
        nb_sub=2000,
        batch_size=batch_size,
        img_path='val2014',
        extract_center=True,
        load_caption=load_caption)

    try:
        # The validation set reuses the vocabulary and mapping of the
        # training set before processing its own captions.
        batch_val.vocab = batch_train.vocab
        batch_val.mapping = batch_train.mapping
        batch_val.process_captions()
    except Exception as e:
        # Deliberately best-effort: report the problem and carry on.
        # Single-argument call form prints identically on Python 2 and 3.
        print("Captions not processed")
        print(e)

    return batch_train, batch_val
def TF_and_IDF(self):
    """Accumulate per-class term and document frequencies.

    Sets ``self.spam_num`` / ``self.ham_num`` from the label counts (labels
    1 and 0 respectively), then walks every message in ``self.sms``:

    * ``tf_spam`` / ``tf_ham`` are incremented once per word occurrence;
    * ``idf_spam`` / ``idf_ham`` are incremented once per message in which
      the word appears.

    A truthy ``self.label[i]`` selects the spam dictionaries, a falsy one
    the ham dictionaries.
    """
    # Count the labels once instead of once per class.
    label_counts = self.label.value_counts()
    self.spam_num = label_counts[1]
    self.ham_num = label_counts[0]

    for i in range(self.sms.shape[0]):
        message = process(self.sms[i])

        # The label is constant for the whole message, so choose the target
        # dictionaries once rather than re-testing it for every word.
        if self.label[i]:
            tf, idf = self.tf_spam, self.idf_spam
        else:
            tf, idf = self.tf_ham, self.idf_ham

        # A set gives O(1) "already seen in this message" checks; the
        # original list-based check was quadratic in the message length.
        seen = set()
        for word in message:
            # Calculate TF: every occurrence counts.
            tf[word] = tf.get(word, 0) + 1
            # Calculate IDF: only the first occurrence per message counts.
            if word not in seen:
                seen.add(word)
                idf[word] = idf.get(word, 0) + 1
min_dist, min_offset = None, None for j in range(polygon_points.shape[0]): node_value, polygon_point = node_values[i], polygon_points[j] if min_dist is None or np.linalg.norm(polygon_point - node_value) < min_dist: min_dist = np.linalg.norm(polygon_point - node_value) min_offset = [ polygon_point[0] - node_value[0], polygon_point[1] - node_value[1] ] offsets.append(min_offset) return np.array(offsets) # Get training data bboxes, polygon_labels = process() # Define GCN models epochs_per_image = 10 gcn_models = [None] * epochs_per_image # Run stochastic training for image in range(len(bboxes)): print("Training image {0} of {1}".format(image + 1, len(bboxes))) # Process bounding box bbox, polygon_points = bboxes[image], polygon_labels[image] resized_bb = cv2.resize(bbox, (224, 224), interpolation=cv2.INTER_AREA) resized_bb_exp = preprocess_input(np.expand_dims(resized_bb, axis=0)) # Compute feature map resnet_model = ResNet50V2(weights='imagenet')
import torch
import os
from torch.optim import Adam
from H_parse import H_parse
from model import build_model
from optimizer import *
from process_data import process

# Parsed run options.
parse = H_parse()

# CUDA is used only when it is available AND the options do not force "cpu".
_cuda_available = torch.cuda.is_available()
_cpu_forced = parse.device == "cpu"
DEVICE = torch.device("cuda" if _cuda_available and not _cpu_forced else "cpu")

data = process(parse)
model = build_model()

# Adam wrapped in NoamOpt, using the d_model of the first source-embedding
# module with factor 1 and 4000 as the third argument.
_d_model = model.src_embed[0].d_model
_adam = Adam(model.parameters(), lr=parse.lr_rate, betas=[0.9, 0.98], eps=1e-9)
optim = NoamOpt(_d_model, 1, 4000, _adam)

# Label smoothing over the target vocabulary, given the "<pad>" index.
_pad_index = data.vocab["tgt"]["<pad>"]
label_sm = LabelSmoothing(parse.tgt_vocab_len, _pad_index, smoothing=0.1)

losscompute = LossCompute(model.generator, label_sm, optim)
# Reset index in training data set
trainData.reset_index(inplace=True)
# value_counts() orders its result by frequency, not by label, so the
# spam/ham figures must be looked up by label (1 = spam, 0 = ham) rather
# than by position -- otherwise the two numbers swap whenever ham is the
# majority class.
_train_by_label = trainData['v1'].value_counts()
training_counts = _train_by_label.tolist()  # kept for any later use
print("\nTraining data set: 80% of data set\nNumber of spam: ",
      _train_by_label.get(1, 0),
      "\nNumber of ham: ", _train_by_label.get(0, 0))

# Reset index in testing data set
testData.reset_index(inplace=True)
_test_by_label = testData['v1'].value_counts()
testing_counts = _test_by_label.tolist()  # kept for any later use
print("\nTesting data set: 20% of data set\nNumber of spam: ",
      _test_by_label.get(1, 0),
      "\nNumber of ham: ", _test_by_label.get(0, 0))

# Training the TF-IDF model
tfidf = TFIDF_model(trainData)
tfidf.TF_and_IDF()
tfidf.TFIDF()
# Score the predictions for the test messages against their labels.
metric(testData['v1'], tfidf.test(testData['v2']))

# Running examples
message1 = 'OMW. I will call you later.'
process1 = process(message1)
print("\nMessage 1: ", message1, "\nSpam = 1, Ham = 0: ", tfidf.classify(process1))

message2 = 'I will text you when I finish work'
process2 = process(message2)
print("\nMessage 2: ", message2, "\nSpam = 1, Ham = 0: ", tfidf.classify(process2))

message3 = 'You win a trip to Europe! Call now to redeem'
process3 = process(message3)
print("\nMessage 3: ", message3, "\nSpam = 1, Ham = 0: ", tfidf.classify(process3))

message4 = 'Text or call now for a week of FREE membership.'
process4 = process(message4)
print("\nMessage 4: ", message4, "\nSpam = 1, Ham = 0: ", tfidf.classify(process4))
import tensorflow as tf
import model as m
import process_data
import matplotlib.pyplot as plt

num_samples = 7260  # Number of samples to train on.

encoder_input_data, decoder_input_data, decoder_target_data = process_data.process()

model = m.seq2seq()

# If 'val_loss' does not improve over 2 epochs, the training stops.
_early_stopping = tf.keras.callbacks.EarlyStopping(patience=2, monitor='val_loss')
# Record logs for displaying on tensor board
_tensor_board = tf.keras.callbacks.TensorBoard(log_dir='./tensor_board')
callbacks = [_early_stopping, _tensor_board]

# Batch size and epoch count come from the model module; 20% of the data
# is held out for validation.
_fit_inputs = [encoder_input_data, decoder_input_data]
history = model.fit(
    _fit_inputs,
    decoder_target_data,
    callbacks=callbacks,
    batch_size=m.batch_size,
    epochs=m.epochs,
    validation_split=0.2,
)

# Save model
model.save_weights('./pretrained_weights/t1_savedModel', save_format='tf')

# Plot training & validation loss values
for _curve in ('loss', 'val_loss'):
    plt.plot(history.history[_curve])
# for evaluating against train size
train_performance = []
val_performance = []
test_performance = []
features = None
encodings = None

for train_frac in train_fracs:
    # Split ONCE per training fraction. Re-running process() for every
    # kernel meant candidates could be fitted on different splits and the
    # winner scored on a split it was never trained on.
    (x_train, y_train), (x_val, y_val), (x_test, y_test) = process(
        all_data, train_frac, val_frac, test_frac)

    acc = []
    clfs = []
    for kernel in kernel_funcs:
        clf = SVC(kernel=kernel)
        print(x_train.shape, y_train.shape)
        clf.fit(x_train, y_train)
        clfs.append(clf)
        # accuracy_score takes (y_true, y_pred).
        acc.append(accuracy_score(y_val, clf.predict(x_val)))

    # Keep the kernel with the best validation accuracy (first one on ties).
    optimal_index = acc.index(max(acc))
    optimal_kernel = kernel_funcs[optimal_index]
    optimal_clf = clfs[optimal_index]

    train_performance.append(accuracy_score(y_train, optimal_clf.predict(x_train)))
    val_performance.append(accuracy_score(y_val, optimal_clf.predict(x_val)))
    test_performance.append(accuracy_score(y_test, optimal_clf.predict(x_test)))

print(train_performance)
print(val_performance)
print(test_performance)
import csv
from process_data import process
from sklearn.decomposition import PCA

# Read the whole CSV up front; the context manager closes the handle, which
# the original left open for the lifetime of the process.
with open('../data/mushroom-classification/mushrooms.csv') as file:
    all_data = list(csv.reader(file))

data_size = len(all_data) - 1
train_frac = 1.0
features = None
encodings = None
# Everything goes into the training split (validation/test fractions are 0).
features, encodings, ((x_train, y_train), _, _) = process(all_data, train_frac, 0, 0, modify=True)

# Project the training features onto their first two principal components.
pca = PCA(n_components=2)
pca.fit(x_train)

import pandas as pd
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import numpy as np

reduced = np.array(pca.transform(x_train))
x = reduced[:, 0]
y = reduced[:, 1]

colors = []
eps = 1e-4
for elt in y_train:
    # Labels within eps of 1 are coloured red.
    if abs(1 - elt) < eps:
        colors.append('r')
def run():
    """Train the two-layer classifier batch by batch and plot its loss.

    Loads ``nba_data2016-2018.csv`` through ``process``, keeps the first 75%
    of the rows for training, splits them into 583 equal batches and applies
    one parameter update per batch. The value of ``accuracy`` (times 100) is
    summed over the batches with index above 481.

    Returns:
        The accumulated rate divided by 100 (also printed).
    """
    x, y = process('nba_data2016-2018.csv')

    # learning rate for the algorithm
    learning_rate = 0.001

    # split into 75% train and 25% test
    train_size = .75
    x_cut = int(x.shape[0] * train_size)
    y_cut = int(y.shape[0] * train_size)
    X_train, X_test = x[:x_cut, :], x[x_cut:, :]
    Y_train, Y_test = y[:y_cut], y[y_cut:]

    # Random initial parameters: D inputs -> M hidden units -> 1 output.
    # The draw order (B, W1, B2, W2) is kept so a seeded run is unchanged.
    D = x.shape[1]
    M = 20
    B = np.random.rand(M)
    W1 = np.random.rand(D, M)
    B2 = np.random.rand(1)
    W2 = np.random.rand(M, 1)

    # each batch is now 6 "lines" because 3498/583=6
    batches = 583
    X_t = np.split(X_train, batches, axis=0)
    Y_t = np.split(Y_train, batches, axis=0)

    # IMPORTANT: statistic = (Yes/No - No/Yes)^2 / (Yes/No + No/Yes), Is the
    # Mcnemar's test (a type of chi-square), to compare between 2 binary
    # classification algorithms; with an alpha level of .05, the critical
    # value is 3.84
    losses = []
    rates = 0
    for i, (X, Y) in enumerate(zip(X_t, Y_t)):
        parameters = feedforward(X, B, W1, B2, W2)
        Z2 = parameters["Z2"]
        Z1 = parameters["Z1"]

        losses.append(-cost(Z2, Y))

        # Update order matters: the W1 and B steps read the already-updated W2.
        W2 += learning_rate * back_propW2(gradientDesc(Z2, Y), parameters)
        B2 += learning_rate * back_propB2(gradientDesc(Z2, Y), Z2)
        W1 += learning_rate * back_propW1(gradientDesc(Z2, Y), W2, parameters)
        B += learning_rate * back_propB1(gradientDesc(Z2, Y), W2, parameters)

        if i > 481:
            rates += accuracy(Z2, Y) * 100

    final_rate = rates / 100
    print(final_rate)

    plt.title('Classifier 1 (3 point percentage)')
    plt.plot(losses)
    plt.show()
    return final_rate
# NOTE(review): 0.006 and 0.008 break the otherwise ascending order (and
# 0.008 appears twice) -- possibly meant to be 0.0006 / 0.0008; confirm.
train_fracs = [0.0002, 0.0003, 0.0004, 0.0005, 0.006, 0.008, 0.001, 0.002,
               0.005, 0.008, 0.01, 0.02, 0.05, 0.1, 0.2, 0.4, 0.6]

# for evaluating against train size
train_performance = []
val_performance = []
test_performance = []
features = None
encodings = None

for train_frac in train_fracs:
    # Split ONCE per training fraction so every depth is fitted and scored
    # on the same data; the original re-ran process() for every depth.
    # The first fraction uses modify=True and captures the features and
    # encodings that all later calls reuse.
    if train_frac == train_fracs[0]:
        features, encodings, ((x_train, y_train), (x_val, y_val), (x_test, y_test)) = process(
            all_data, train_frac, val_frac, test_frac, modify=True)
    else:
        _, _, ((x_train, y_train), (x_val, y_val), (x_test, y_test)) = process(
            all_data, train_frac, val_frac, test_frac,
            features=features, encodings=encodings)

    acc = []
    clfs = []
    for depth in depths:
        clf = DT(max_depth=depth)
        print(x_train.shape, y_train.shape)
        clf.fit(x_train, y_train)
        clfs.append(clf)
        # accuracy_score takes (y_true, y_pred).
        acc.append(accuracy_score(y_val, clf.predict(x_val)))

    # Keep the depth with the best validation accuracy (first one on ties).
    optimal_index = acc.index(max(acc))
    optimal_depth = depths[optimal_index]
    optimal_clf = clfs[optimal_index]

    train_performance.append(accuracy_score(y_train, optimal_clf.predict(x_train)))
    val_performance.append(accuracy_score(y_val, optimal_clf.predict(x_val)))
    test_performance.append(accuracy_score(y_test, optimal_clf.predict(x_test)))

print(train_performance)
print(val_performance)
def main():
    """Entry point: call ``process`` with the identifier "c444b776"."""
    identifier = "c444b776"
    process(identifier)
def main():
    """Entry point: call ``process`` with the identifier "6d75e8bb"."""
    identifier = "6d75e8bb"
    process(identifier)
def clean_database():
    """
    Clean ec_students_[semester] and ec_classes_[semester] table

    The connection and cursor are always closed, even when a TRUNCATE
    statement raises.

    :return: none
    """
    # The table suffix is the same for both statements; compute it once.
    # It comes from project settings and is interpolated because SQL
    # identifiers cannot be passed as bound parameters.
    semester_code = get_semester_code_for_db(settings.SEMESTER)

    conn = mysql.connector.connect(**settings.MYSQL_CONFIG)
    try:
        cursor = conn.cursor()
        try:
            for template in ("TRUNCATE ec_students_%s",
                             "TRUNCATE ec_classes_%s"):
                cursor.execute(template % semester_code)
        finally:
            cursor.close()
    finally:
        conn.close()


if __name__ == "__main__":
    # The name of the previous data file is recorded in the version file.
    with open("stu_data_version.json") as f:
        old_json_file = json.load(f)["stu_data_json_name"]

    fix_json(old_json_file)
    clean_directory()
    retrieve()
    clean_database()
    process()
    verify()
features = None
encodings = None

for train_frac in train_fracs:
    acc = []
    clfs = []
    best_acc = 0
    optimal_depth = None
    optimal_est = None
    optimal_clf = None

    # Split ONCE per training fraction. The original re-ran process() for
    # every (depth, n_estimators) pair, so candidates could be compared on
    # different splits and the winner scored on data it was not fitted on.
    ((x_train, y_train), (x_val, y_val), (x_test, y_test)) = process(
        all_data, train_frac, val_frac, test_frac)

    for depth in max_depths:
        for est in n_estimators:
            clf = AdaBoostClassifier(base_estimator=DT(max_depth=depth),
                                     n_estimators=est)
            clf.fit(x_train, y_train)
            # accuracy_score takes (y_true, y_pred).
            accuracy = accuracy_score(y_val, clf.predict(x_val))
            # Always accept the first candidate so optimal_clf is never left
            # as None when every validation accuracy is 0.
            if optimal_clf is None or accuracy > best_acc:
                best_acc = accuracy
                optimal_depth = depth
                optimal_est = est
                optimal_clf = clf

    train_performance.append(accuracy_score(y_train, optimal_clf.predict(x_train)))
    val_performance.append(accuracy_score(y_val, optimal_clf.predict(x_val)))
    test_performance.append(accuracy_score(y_test, optimal_clf.predict(x_test)))

f1, ax1 = plt.subplots()
val_loss = criterion( output, targets.view(batch_size * seq_length).long()) val_losses.append(val_loss.item()) net.train( ) # reset to train mode after iterationg through validation data print("Epoch: {}/{}...".format(e + 1, epochs), "Step: {}...".format(counter), "Loss: {:.4f}...".format(loss.item()), "Val Loss: {:.4f}".format(np.mean(val_losses))) chars, encoded = process('../data/quotes_data.txt') print(len(chars), len(encoded)) # Define and print the net n_hidden = 512 n_layers = 2 net = CharRNN(chars, n_hidden, n_layers) print(net) # Declaring the hyperparameters batch_size = 128 seq_length = 100 n_epochs = 20 # start smaller if you are just testing initial behavior
from sklearn.decomposition import PCA from process_data import process import csv file = open('../data/pima-indians-diabetes-database/diabetes.csv') all_data = list(csv.reader(file)) data_size = len(all_data) - 1 train_frac = 1 val_frac = 0.2 test_frac = 0.2 (x_train, y_train), _, _ = process(all_data, train_frac, 0, 0) pca = PCA(n_components=2) pca.fit(x_train) import pandas as pd import seaborn as sns sns.set() import matplotlib.pyplot as plt import numpy as np reduced = np.array(pca.transform(x_train)) x = reduced[:, 0] y = reduced[:, 1] colors = [] eps = 1e-4 for elt in y_train: if abs(1 - elt) < eps: colors.append('r') else:
import process_data
import tensorflow as tf

encoder_input_data, decoder_input_data, decoder_target_data = process_data.process(is_train=False)
base_model = tf.keras.models.load_model('./pretrained_weights/t1_baseline.h5')

# Report the shape of each loaded array.
for _array in (encoder_input_data, decoder_input_data, decoder_target_data):
    print(str(_array.shape))

# Run the model on the first ten sequences, one at a time, and compare its
# raw output element-wise against the target data.
for seq_index in range(10):
    # A length-1 slice keeps the leading batch dimension.
    _window = slice(seq_index, seq_index + 1)
    input_seq = encoder_input_data[_window]
    decoder_input = decoder_input_data[_window]
    target_data = decoder_target_data[_window]

    decoded_sentence = base_model.predict([input_seq, decoder_input])

    print('input shape : ' + str(input_seq.shape))
    print(input_seq)
    print('output shape : ' + str(decoded_sentence.shape))
    print(decoded_sentence)
    print('result : ' + str(target_data == decoded_sentence))
val_performance = []
test_performance = []
features = None
encodings = None

for train_frac in train_fracs:
    # Split ONCE per training fraction so every hidden-layer configuration
    # is fitted and scored on the same data; the original re-ran process()
    # for every configuration. The first fraction uses modify=True and
    # captures the features and encodings that all later calls reuse.
    if train_frac == train_fracs[0]:
        features, encodings, ((x_train, y_train), (x_val, y_val), (x_test, y_test)) = process(
            all_data, train_frac, val_frac, test_frac, modify=True)
    else:
        _, _, ((x_train, y_train), (x_val, y_val), (x_test, y_test)) = process(
            all_data, train_frac, val_frac, test_frac,
            features=features, encodings=encodings)

    acc = []
    clfs = []
    for hid_layer_specific in hid_layers:
        clf = NN(hid_layer_specific, activation='relu')
        print(x_train.shape, y_train.shape)
        clf.fit(x_train, y_train)
        clfs.append(clf)
        # accuracy_score takes (y_true, y_pred).
        acc.append(accuracy_score(y_val, clf.predict(x_val)))

    # Index of the configuration with the best validation accuracy.
    optimal_index = acc.index(max(acc))
def get(self):
    """Print ``path`` and return ``process_data.process(path)``.

    ``path`` is neither a parameter nor an attribute of ``self``; it is
    resolved from the enclosing scope.
    """
    print(path)
    result = process_data.process(path)
    return result
def main():
    """Entry point: call ``process`` with the identifier '7468f01a'."""
    identifier = '7468f01a'
    process(identifier)
train_frac = 0.7  # [0.0002, 0.0003, 0.0004, 0.0005,

# for evaluating against train size
train_performance, val_performance, test_performance = [], [], []
features = encodings = None

clf = NN(hid_layers, activation='relu')

# One split; modify=True also yields the features and encodings.
features, encodings, _splits = process(
    all_data, train_frac, val_frac, test_frac, modify=True)
(x_train, y_train), (x_val, y_val), (x_test, y_test) = _splits
print(x_train.shape, y_train.shape)

# Fit first, then discard the fitted weights so annealing can supply them.
clf.fit(x_train, y_train)

print("Simulated annealing:")
acc_anneal, test_anneal = [], []
clf.coefs_ = []
clf.intercepts_ = []

# uses simulated annealing to find the optimal weights
anneal = NNAnneal(clf, hid_layers, x_train, y_train)
_weights, e = anneal.anneal()
# The first returned item unpacks into exactly (coefs, intercepts).
clf.coefs_, clf.intercepts_ = _weights