def main():
    """Sparse-code the images with a learned dictionary and classify with an SVC.

    Loads and standardizes the labeled, unlabeled and public-test sets, learns
    a 500-atom dictionary on the labeled images, encodes both the labeled and
    the test images against it, prints 10-fold cross-validation statistics and
    writes the test predictions to '../svm_res.csv'.
    """
    images, labels = load_labeled_training(flatten=True)
    images = standardize(images)

    # The unlabeled set is only consumed by the (disabled) KernelPCA/SparsePCA
    # alternatives; it is still loaded here exactly as before.
    unlabeled = standardize(load_unlabeled_training(flatten=True))

    public_test = standardize(load_public_test(flatten=True))

    shuffle_in_unison(images, labels)

    learner = MiniBatchDictionaryLearning(n_components=500,
                                          n_iter=500,
                                          verbose=True)
    dictionary = learner.fit(images)
    coder = SparseCoder(dictionary.components_)

    train_codes = coder.transform(images)
    test_codes = coder.transform(public_test)

    classifier = SVC()
    cv_scores = cross_validation.cross_val_score(classifier,
                                                 train_codes,
                                                 labels,
                                                 cv=10)
    print(cv_scores)
    print(np.mean(cv_scores))
    print(np.var(cv_scores))

    classifier.fit(train_codes, labels)
    write_results(classifier.predict(test_codes), '../svm_res.csv')
def sample(self, n, seed=55):
    """Draw `n` paired samples from `self.ps` and return them standardized.

    Both coordinates are passed through `util.standardize` and checked to be
    finite. The returned `PairedData` carries the source label with '_std'
    appended, or no label when the source sample has none.
    """
    drawn = self.ps.sample(n, seed=seed)
    X, Y = drawn.xy()

    Zx, Zy = util.standardize(X), util.standardize(Y)
    for z in (Zx, Zy):
        assert np.all(np.isfinite(z))

    if drawn.label is None:
        new_label = None
    else:
        new_label = drawn.label + '_std'
    return PairedData(Zx, Zy, label=new_label)
def main():
    """Train a one-hidden-layer softmax network with backprop and report errors.

    The labeled images are standardized, shuffled and split 20/80 into test
    and training sets. Errors are printed once before training and then after
    every block of 10 epochs, for 100 blocks.
    """
    images, labels = load_labeled_training(flatten=True)
    images = standardize(images)
    # Alternative input: images, labels = load_pca_proj(K=100)
    shuffle_in_unison(images, labels)

    dataset = ClassificationDataSet(images.shape[1], 1, nb_classes=7)
    for image, label in zip(images, labels):
        # Targets are stored zero-based.
        dataset.addSample(image, [label - 1])

    test, train = dataset.splitWithProportion(0.2)
    for subset in (test, train):
        subset._convertToOneOfMany()

    net = shortcuts.buildNetwork(train.indim,
                                 1000,
                                 train.outdim,
                                 outclass=SoftmaxLayer)
    trainer = BackpropTrainer(net,
                              dataset=train,
                              momentum=0.1,
                              learningrate=0.01,
                              weightdecay=0.05)
    net.randomize()

    # Baseline errors of the freshly randomized network.
    initial_train_labels = net.activateOnDataset(train).argmax(axis=1)
    train_err = percentError(initial_train_labels, train["class"])
    test_err = percentError(trainer.testOnClassData(dataset=test),
                            test["class"])
    print("Training error: %.10f, Test error: %.10f" % (train_err, test_err))
    print("Iters: %d" % trainer.totalepochs)

    for _ in range(100):
        trainer.trainEpochs(10)
        train_err = percentError(trainer.testOnClassData(dataset=train),
                                 train["class"])
        test_err = percentError(trainer.testOnClassData(dataset=test),
                                test["class"])
        train_mse = trainer.testOnData(dataset=train)
        test_mse = trainer.testOnData(dataset=test)
        print("Iteration: %d, Training error: %.5f, Test error: %.5f" %
              (trainer.totalepochs, train_err, test_err))
        print("Training MSE: %.5f, Test MSE: %.5f" % (train_mse, test_mse))
def markov_model_classify(self, sentence):
    """Classify `sentence` with a per-class bigram (first-order Markov) model.

    For each of the two classes the score starts at the log prior, adds the
    log unigram probability of the first word and then the log conditional
    probability of every following word given its predecessor. Unseen
    unigrams and bigrams contribute log(OUT_OF_VOCAB_PROB).

    A sentence that tokenizes to nothing is scored by the prior alone
    (previously this raised IndexError on `words[0]`), matching how
    `naive_bayes_classify` treats an empty token list.

    Returns:
        (index of the best class, its score after `standardize`).
    """
    probs = []
    words = tokenize(sentence)
    for i in range(0, 2):
        # Set initial value to prior
        prob = math.log(self.sentiment_counts[i])
        prob -= math.log(self.total_examples)

        if words:
            # Handle first word special case: it has no predecessor, so it is
            # scored with the unigram model.
            prev = words[0]
            if prev in self.word_counts[i]:
                prob += math.log(self.word_counts[i].get(prev))
                prob -= math.log(self.total_words[i])
            else:
                prob += math.log(OUT_OF_VOCAB_PROB)

            # Iterate over rest as bigrams
            for word in words[1:]:
                bigram = (prev, word)
                if bigram in self.bigram_counts[i]:
                    prob += math.log(self.bigram_counts[i].get(bigram))
                    prob -= math.log(self.bigram_denoms[i].get(prev))
                else:
                    prob += math.log(OUT_OF_VOCAB_PROB)
                prev = word

        probs.append(prob)

    probs = standardize(probs)
    best = max(probs)
    return probs.index(best), best
def _hist2d_wrapper(x, y, z, density=True, bins=10, range=None, **kwargs):
    """Render a z-weighted 2D histogram of (x, y) through `_imshow_wrapper`.

    The histogram is passed through `util.standardize` and then mapped
    linearly onto `range`, which defaults to `[z.min(), z.max()]`.

    The bin edges returned by `np.histogram2d` are forwarded as axis
    positions. The previous code indexed the `bins` argument instead, which
    raised TypeError for the default `bins=10` and passed counts rather than
    positions for `bins=[nx, ny]`. When `bins` is already a pair of edge
    arrays the returned edges hold the same values, so those callers are
    unaffected.

    Args:
        x, y: sample coordinates.
        z: per-sample weights; must support `.min()`/`.max()` when `range`
            is None.
        density, bins: forwarded to `np.histogram2d`.
        range: optional (low, high) output value range.
        **kwargs: forwarded to `_imshow_wrapper`.
    """
    hist, x_edges, y_edges = np.histogram2d(x, y, weights=z,
                                            density=density, bins=bins)
    if range is None:
        low, high = z.min(), z.max()
    else:
        low, high = range[0], range[1]
    hist = util.standardize(hist) * (high - low) + low
    # supply the bin edges as positions s.t. the axis range equals the bins
    # range
    return _imshow_wrapper(x_edges, y_edges, hist, **kwargs)
def main():
    """Train a two-hidden-layer softmax network and write public-test predictions.

    The labeled images are standardized, shuffled and split 20/80 into test
    and training sets. After 10 blocks of 10 epochs the network labels the
    standardized public test set; predictions are shifted back to one-based
    classes and written to 'nn_predictions.csv'.
    """
    images, labels = load_labeled_training(flatten=True)
    public_test = load_public_test(flatten=True)
    images = standardize(images)
    # Alternative input: images, labels = load_pca_proj(K=100)
    shuffle_in_unison(images, labels)

    dataset = ClassificationDataSet(images.shape[1], 1, nb_classes=7)
    testset = ClassificationDataSet(public_test.shape[1])

    public_test = standardize(public_test)
    for image in public_test:
        # The public test set has no labels; a placeholder target is stored.
        testset.addSample(image, [0])
    for image, label in zip(images, labels):
        dataset.addSample(image, [label - 1])

    test, train = dataset.splitWithProportion(0.2)
    for subset in (test, train):
        subset._convertToOneOfMany()

    net = shortcuts.buildNetwork(train.indim,
                                 500,
                                 1000,
                                 train.outdim,
                                 outclass=SoftmaxLayer)
    trainer = BackpropTrainer(net,
                              dataset=train,
                              learningrate=0.005,
                              weightdecay=0.01)
    net.randomize()

    # Baseline errors of the freshly randomized network.
    initial_train_labels = net.activateOnDataset(train).argmax(axis=1)
    train_err = percentError(initial_train_labels, train['class'])
    test_err = percentError(trainer.testOnClassData(dataset=test),
                            test['class'])
    print("Training error: %.10f, Test error: %.10f" % (train_err, test_err))
    print("Iters: %d" % trainer.totalepochs)

    for _ in range(10):
        trainer.trainEpochs(10)
        train_err = percentError(trainer.testOnClassData(dataset=train),
                                 train['class'])
        test_err = percentError(trainer.testOnClassData(dataset=test),
                                test['class'])
        train_mse = trainer.testOnData(dataset=train)
        test_mse = trainer.testOnData(dataset=test)
        print("Iteration: %d, Training error: %.5f, Test error: %.5f" %
              (trainer.totalepochs, train_err, test_err))
        print("Training MSE: %.5f, Test MSE: %.5f" % (train_mse, test_mse))

    out = trainer.testOnClassData(dataset=testset)
    # Shift the zero-based predictions back to one-based labels, in place.
    for position, _ in enumerate(out):
        out[position] += 1
    write_results(out, 'nn_predictions.csv')
def batch_classify(self, sentences: [str]):
    """Classify a batch of sentences with the logistic-regression head.

    Each sentence is encoded with `self.tokenizer`, truncated to
    `self.max_len` tokens and right-padded with 0. The padded batch is turned
    into sentence vectors by `get_bert_sentence_vectors`, scaled with
    `self.scaler` and fed to `self.lr`.

    The scaled vectors are computed once and reused for both the label and
    the log-probability prediction (previously the scaler ran twice).

    Returns:
        One `(predicted_label, score)` tuple per sentence, where score is the
        maximum of `standardize` applied to that sentence's log-probabilities.
    """
    sentences = pd.Series(data=sentences)

    # Turn the dataset's sentences into BERT tokens. Truncate if too long
    tokens = sentences.apply(
        lambda x: self.tokenizer.encode(x, add_special_tokens=True))
    tokens = pd.Series([i[:self.max_len] for i in tokens.values])

    # Pad with 0s
    padded_tokens = np.array(
        [i + [0] * (self.max_len - len(i)) for i in tokens.values])
    vectors = get_bert_sentence_vectors(self.model, padded_tokens)

    # Run the vectors through LR model
    scaled_vectors = self.scaler.transform(vectors)
    prediction = self.lr.predict(scaled_vectors)
    prediction_prob = self.lr.predict_log_proba(scaled_vectors)

    return [(label, max(standardize(log_probs)))
            for label, log_probs in zip(prediction, prediction_prob)]
def extract_candidate_nodules_3d(img_arr, mask):
    """
    Extract suspicious nodules from masked image.

    The image is standardized, then filtered with a scale-normalized
    Laplacian of Gaussian at 11 sigmas evenly spaced in [2, 22]. Local maxima
    across neighbouring scales are kept when they fall inside `mask`.

    :param img_arr: Image array as 3D numpy array.
    :param mask: Image mask as 3D numpy array (must have same shape as img_arr).
    :return: List of candidate nodules. Each candidate is the peak row reversed
             with an average intensity appended, i.e. 5 values: the three
             spatial coordinates in reversed axis order, the size value
             `2 * sigma * 3**0.5` of the scale it was found at, and the result
             of `util.average_intensity`.
             NOTE(review): this docstring previously read [x, y, z, radius],
             but the code labels the size as a diameter and appends a fifth
             element -- confirm which is intended.
    """
    standardized = util.standardize(img_arr)
    # Sliding window holding the responses of the most recent scales.
    log_buffer = []
    maxima = []
    sigmas = np.linspace(2, 22, 11)
    for i in range(len(sigmas)):
        sigma = sigmas[i]
        gaussian = skimage.filters.gaussian(standardized, sigma=sigma)
        laplace = skimage.filters.laplace(
            gaussian
        )  # Important note: scikit-image returns negative second derivative for Laplacian.
        scale_normalized = laplace * sigma**2  # scale normalized
        log_buffer.append(scale_normalized)
        if len(log_buffer) >= 2:
            # Peaks are searched in the stacked scale-space volume but only
            # those lying on the previous scale's slice are kept.
            prev_sigma = sigmas[i - 1]
            target_slice = len(log_buffer) - 2
            peaks = skimage.feature.peak_local_max(np.asarray(log_buffer), min_distance=prev_sigma, threshold_abs=0.2, exclude_border=False)
            peaks = peaks[
                peaks[:, 0] == target_slice]  # If we have 3 logs, pick index 1. If we have 2 logs (only occurs during the second iteration), pick index 0.
            # The scale-index column is overwritten with the blob size.
            # NOTE(review): if `peaks` is an integer array this assignment
            # truncates the value -- confirm.
            peaks[:, 0] = 2 * prev_sigma * 3**0.5  # Diameter
            maxima.extend(peaks)
        # Keep at most the two most recent responses for the next iteration.
        log_buffer = log_buffer[-2:]
    # TODO get last sigma value as well.
    candidates = []
    # slice_index = 278
    # plt.imshow(img_arr[slice_index])
    for point in maxima:
        # point[0] is the size; point[1:] are the spatial indices into mask.
        if mask[tuple(point[1:].astype(int))]:
            # if abs(point[1] - slice_index) < 2:
            #     circle = plt.Circle((point[3], point[2]), point[0] * 3 ** 0.5, color='r', fill=False)
            #     plt.gca().add_artist(circle)
            candidate = list(point[::-1])
            # point[:0:-1] is the spatial part in reversed order (size excluded).
            candidate.append(
                util.average_intensity(img_arr, point[:0:-1], point[0]))
            candidates.append(candidate)
    print('Total of', len(candidates), 'candidates found.')
    # plt.show()
    return candidates
def naive_bayes_classify(self, sentence):
    """Classify `sentence` with a two-class unigram naive Bayes model.

    Each class score is the log prior plus, for every token, the log of its
    relative frequency in that class, or log(OUT_OF_VOCAB_PROB) when the
    token was never seen for the class.

    Returns:
        (index of the best class, its score after `standardize`).
    """
    words = tokenize(sentence)
    probs = []
    for i in (0, 1):
        # Start from the class prior.
        score = math.log(self.sentiment_counts[i])
        score -= math.log(self.total_examples)
        for token in words:
            if token not in self.word_counts[i]:
                score += math.log(OUT_OF_VOCAB_PROB)
                continue
            score += math.log(self.word_counts[i].get(token))
            score -= math.log(self.total_words[i])
        probs.append(score)

    probs = standardize(probs)
    return probs.index(max(probs)), max(probs)
def run_train_cv(self, fold_splits):
    """Train and evaluate one copy of the model per K-fold split.

    For every fold a deep copy of `self._model` is trained on the training
    part and evaluated on the validation part; the score, the feature
    importances and the Graphviz export are produced per fold.
    """
    # Load the data.
    self._load_data()

    # Create one model per cross-validation split.
    self._cv_models = []
    for _ in range(fold_splits):
        self._cv_models.append(copy.deepcopy(self._model))

    splitter = KFold(n_splits=fold_splits, shuffle=False)
    splits = splitter.split(self._whole_x, self._whole_y)
    for fold_no, (train_index, test_index) in enumerate(splits):
        model = self._cv_models[fold_no]

        # Extract the training data.
        tx, ty = self._whole_x.iloc[train_index], self._whole_y.iloc[train_index]
        # Extract the validation data.
        vx, vy = self._whole_x.iloc[test_index], self._whole_y.iloc[test_index]

        # Normalize the data when the model is a DNN.
        if type(self._model) is ModelDnn:
            # Standardization (max-min scaling via util.max_min_scale is the
            # disabled alternative).
            _scaler, tx, vx = util.standardize(tx, vx)

        # Train.
        run_fold_name = '{0:s}_fold_{1:02d}'.format(self._run_name, fold_no)
        model.train(tx, ty, vx, vy)

        # Predict.
        pred_y = model.predict(vx)

        # Output the evaluation result.
        self._print_evaluation_score(model, run_fold_name, vy, pred_y)
        # Show the feature importances.
        self._show_importance_of_feature(model, run_fold_name)
        # Write the Graphviz graph to a file.
        self._export_graphviz(model, run_fold_name)
def variable_effect(pheno, var, regressors, conn):
    """
    Test effect of continuous variable.

    One OLS model per edge is fitted with the standardized connectivity
    values as the dependent variable and a design matrix built from
    `regressors + [var]`.

    pheno = dataframe:
        -filtered to be only relevant subjects (use mask_var)
    var = column from pheno
    regressors = list of strings, formatted for patsy
    conn = n_subjects x n_edges array

    Returns:
    table = n_edges
        - betas_std = coefficient of `var` on the standardized data; the mask
          passed to `standardize` selects every subject
        - pvalues = pvalues
        - qvalues = fdr corrected pvalues alpha = 0.05
        - reject = fdr_bh rejection flag from `multipletests`
    """
    n_edges = conn.shape[1]
    betas_std = np.zeros(n_edges)
    pvalues = np.zeros(n_edges)

    formula = ' + '.join((regressors + [var]))
    dmat = pat.dmatrix(formula, pheno, return_type='dataframe',
                       NA_action='raise')

    # All-True mask: every subject is handed to `standardize`.
    mask_std = np.ones(pheno.shape[0]).astype(bool)
    conn_std = standardize(mask_std, conn)

    for edge in range(n_edges):
        model_std = sm.OLS(conn_std[:, edge], dmat)
        results_std = model_std.fit()
        betas_std[edge] = results_std.params[var]
        pvalues[edge] = results_std.pvalues[var]

    mt = multipletests(pvalues, method='fdr_bh')
    reject = mt[0]
    qvalues = mt[1]

    table = pd.DataFrame(
        np.array([betas_std, pvalues, qvalues, reject]).transpose(),
        columns=['betas_std', 'pvalues', 'qvalues', 'reject'])
    return table
def run_train_all(self):
    """Train the model on the full training set and save it.

    When the model is a `ModelDnn` the features are standardized first and
    the fitted scaler is kept in `self._train_all_scaler`.
    """
    # Load the data.
    self._load_data()
    features = self._train_x
    targets = self._train_y

    # Normalize the data when the model is a DNN.
    if type(self._model) is ModelDnn:
        # Standardization (max-min scaling via util.max_min_scale is the
        # disabled alternative).
        scaler, features, _ = util.standardize(features, None)
        self._train_all_scaler = scaler

    self._model.train(features, targets)
    self.is_trained_all = True

    # Save the trained model.
    self._model.save_model()
def main():
    """Fit a Gaussian naive Bayes model on 'nba_stats.csv' and plot its PR curve.

    The rows are shuffled, the last column is taken as the binary label and
    the remaining columns are standardized. Roughly the first two thirds are
    used to estimate a per-feature (mean, std) for each class; the rest is
    scored and a precision/recall curve over thresholds 0.00..0.95 is shown.

    Fix: the class priors were written as `len(a) / len(a) + len(b)`, which
    evaluates to `1 + len(b)` because of operator precedence. They are now
    `len(class) / (len(positive) + len(negative))`.
    """
    # Read in Data
    data = readData("nba_stats.csv")

    # Randomizes the data
    X = randomize(data)
    Y = X[:, -1]   # Only the last column
    X = X[:, :-1]  # All but the last column
    D = len(X[0])

    # Standardize
    # NOTE(review): statistics are computed over all rows, including the ones
    # later used for testing -- confirm this is intended.
    standardized = standardize(X)

    # Select first 2/3 for training
    index = int(math.ceil((2.0 / 3.0) * len(X)))
    training = standardized[:index + 1]
    testing = standardized[index + 1:]
    Y_testing = Y[index + 1:]

    # Divide training data into two groups
    positive = []
    negative = []
    for i in range(0, len(training)):
        if Y[i] == 1:  # positive class
            positive.append(training[i])
        else:
            negative.append(training[i])
    positive = numpy.array(positive).astype(float)
    negative = numpy.array(negative).astype(float)

    # Per-feature (mean, std) of each class
    positive_model = [(numpy.mean(positive[:, k]), numpy.std(positive[:, k]))
                      for k in range(0, D)]
    negative_model = [(numpy.mean(negative[:, k]), numpy.std(negative[:, k]))
                      for k in range(0, D)]

    # Class priors; they do not depend on the sample, so compute them once.
    n_training = len(positive) + len(negative)
    prior_positive = float(len(positive)) / n_training
    prior_negative = float(len(negative)) / n_training

    # Classify testing samples
    testing_probabilities = []
    for sample in testing:
        p_positive = prior_positive
        p_negative = prior_negative
        for k in range(0, D):
            p_positive *= likelihood(positive_model[k][0],
                                     positive_model[k][1], sample[k])
            p_negative *= likelihood(negative_model[k][0],
                                     negative_model[k][1], sample[k])
        testing_probabilities.append(
            normalize_probabilities([p_positive, p_negative]))

    precisions = []
    recalls = []
    for threshold in range(0, 100, 5):
        threshold = float(threshold) / 100
        TruePositives = 0.0
        TrueNegatives = 0.0
        FalsePositives = 0.0
        FalseNegatives = 0.0
        for i in range(0, len(testing_probabilities)):
            predicted_positive = testing_probabilities[i][0] > threshold
            if Y_testing[i] == 1:  # Positive example
                if predicted_positive:
                    TruePositives += 1
                else:
                    FalseNegatives += 1
            elif Y_testing[i] == 0:  # Negative example
                if predicted_positive:
                    FalsePositives += 1
                else:
                    TrueNegatives += 1

        # With nothing predicted positive (or no positive examples) the ratio
        # is 0/0; as before, it is reported as 1 in that case.
        predicted = TruePositives + FalsePositives
        precision = TruePositives / predicted if predicted else 1
        actual = TruePositives + FalseNegatives
        recall = TruePositives / actual if actual else 1

        precisions.append(precision)
        recalls.append(recall)

    plt.plot(recalls, precisions, 'r-o')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.show()
def get_dictionary_data(n_comp=20, zero_index=False):
    """Reconstruct the labeled and test images through a learned patch dictionary.

    A dictionary with `n_comp` atoms is learned on normalized 8x8 patches
    sampled from the first 10000 shuffled unlabeled images. Every labeled and
    test image is then split into patches, sparse-coded, rebuilt from the
    dictionary and flattened.

    Changes over the previous version: per-image results are collected in
    lists and concatenated once instead of calling `np.concatenate` inside
    every loop iteration (quadratic copying), and the duplicated train/test
    reconstruction loops share one local helper. Results are unchanged.

    Args:
        n_comp: number of dictionary atoms.
        zero_index: NOTE(review): currently ignored -- the labels are always
            loaded with `zero_index=True`. Left as is because callers may
            rely on it; confirm the intent.

    Returns:
        (training_images, labels, test_images) with the images flattened to
        one row per image.
    """
    unlabeled = util.load_unlabeled_training(flatten=False)
    height, width = 32, 32
    n_images = 10000
    patch_size = (8, 8)

    def _flatten_and_normalize(patch_stack):
        """Flatten each patch and standardize every pixel position in place."""
        data = patch_stack.reshape(patch_stack.shape[0], -1)
        data -= np.mean(data, axis=0)
        data /= np.std(data, axis=0) + 1e-20
        return data

    unlabeled = util.standardize(unlabeled)
    np.random.shuffle(unlabeled)

    print('Extracting reference patches...')
    # The empty float array is kept as the first chunk so the dtype of the
    # concatenated result is promoted exactly as before.
    patch_chunks = [np.empty((0, 64))]
    t0 = time()
    for image in unlabeled[:n_images, :, :]:
        sampled = np.array(
            extract_patches_2d(image, patch_size, max_patches=0.01))
        patch_chunks.append(_flatten_and_normalize(sampled))
    patches = np.concatenate(patch_chunks)
    print('done in %.2fs.' % (time() - t0))

    # whiten the patches
    z = zca.ZCA()
    z.fit(patches)
    # NOTE(review): the return value is discarded here and below; unless
    # ZCA.transform works in place the whitening has no effect -- confirm.
    z.transform(patches)

    print('Learning the dictionary...')
    t0 = time()
    dico = MiniBatchDictionaryLearning(n_components=n_comp, alpha=1)
    V = dico.fit(patches).components_
    print('done in %.2fs.' % (time() - t0))

    def _reconstruct_flat(images):
        """Rebuild every image from its sparse code; return one flat row each."""
        rebuilt = [np.empty((0, 32, 32))]
        for image in images:
            data = _flatten_and_normalize(
                extract_patches_2d(image, patch_size))
            code = dico.transform(data)
            approx = np.dot(code, V)
            z.transform(approx)
            approx = approx.reshape(len(data), *patch_size)
            restored = reconstruct_from_patches_2d(approx, (width, height))
            rebuilt.append(restored.reshape(1, 32, 32))
        stacked = np.concatenate(rebuilt)
        n, rows, cols = stacked.shape
        return stacked.reshape(n, rows * cols)

    labeled_data, labels = util.load_labeled_training(flatten=False,
                                                      zero_index=True)
    labeled_data = util.standardize(labeled_data)
    test_data = util.load_all_test(flatten=False)
    test_data = util.standardize(test_data)

    print('Reconstructing the training images...')
    t0 = time()
    training_images = _reconstruct_flat(labeled_data)
    print('done in %.2fs.' % (time() - t0))

    print('Reconstructing the test images...')
    t0 = time()
    test_images = _reconstruct_flat(test_data)
    print('done in %.2fs.' % (time() - t0))

    return (training_images, labels, test_images)
- qvalues = fdr corrected pvalues alpha = 0.05 """ n_edges = conn.shape[1] betas = np.zeros(n_edges) betas_std = np.zeros(n_edges) pvalues = np.zeros(n_edges) formula = ' + '.join((regressors + [case])) dmat = pat.dmatrix(formula, pheno, return_type='dataframe', NA_action='raise') mask_std = ~pheno[case].to_numpy(dtype=bool) conn_std = standardize(mask_std, conn) for edge in range(n_edges): model_std = sm.OLS(conn_std[:, edge], dmat) results_std = model_std.fit() betas_std[edge] = results_std.params[case] pvalues[edge] = results_std.pvalues[case] mt = multipletests(pvalues, method='fdr_bh') reject = mt[0] qvalues = mt[1] table = pd.DataFrame(np.array([betas_std, pvalues, qvalues, reject]).transpose(), columns=['betas_std', 'pvalues', 'qvalues', 'reject']) return table
from pylearn2.datasets import preprocessing
from pylearn2.format.target_format import convert_to_one_hot
# `serial.save` is used below but the module was never imported.
from pylearn2.utils import serial
import pylab as plt
import cPickle as pickle
import numpy as np
import util
import dictionary_learning

if __name__ == "__main__":
    # Script entry point: standardize the full test set and pickle it in the
    # form the pylearn2 model expects. The disabled lines show how the
    # training data, the dictionary-learning variants and the one-hot labels
    # were exported.
    #train_data, train_labels = util.load_labeled_training(flatten=True, zero_index=True)
    #train_data = util.standardize(train_data)
    test_data = util.load_all_test(flatten=True)
    test_data = util.standardize(test_data)
    #train_data_20, _, test_data_20 = dictionary_learning.get_dictionary_data(n_comp=20, zero_index=True)
    #train_data_100, _, test_data_100 = dictionary_learning.get_dictionary_data(n_comp=100, zero_index=True)

    # convert the training labels into one-hot format, as required by the pylearn2 model
    #train_labels = convert_to_one_hot(train_labels, dtype='int64', max_labels=7, mode='stack')

    # pickle the data
    #serial.save('training_data_for_pylearn2.pkl', train_data)
    #serial.save('training_data_20_components_for_pylearn2.pkl', train_data_20)
    #serial.save('training_data_100_components_for_pylearn2.pkl', train_data_100)
    #serial.save('training_labels_for_pylearn2.pkl', train_labels)
    serial.save('test_data_for_pylearn2.pkl', test_data)
def surf_multiple(phasor, position, Nx: int, Ny: int, prefix='', filename=None):
    """Draw one surface plot per entry in `labels` from the first phasor column.

    Plot 0 shows the amplitude on a linear z-axis, plot 1 the amplitude on a
    manual log10 z-axis and plot 2 the squared amplitude ('Irradiance') on a
    manual log10 z-axis. A log plot is skipped when its clipped value range
    is degenerate.

    Args:
        phasor: 2D-indexable array; only column 0 is read.
        position: 2D-indexable array; columns 0 and 1 are used as coordinates.
        Nx, Ny: forwarded to `surf`.
        prefix: prepended to each plot title.
        filename: when None the figure is shown, otherwise it is saved as
            '<filename>_<label>-<index>' with ext='png'.
    """
    labels = [
        'Amplitude', 'Amplitude', 'Irradiance'
        # , 'Log Irradiance'
    ]
    for i, label in enumerate(labels):
        amp = phasor[:, 0]
        if label == 'Amplitude':
            z = amp
        else:
            z = amp**2
        # NOTE(review): unreachable while `labels` has three entries (the
        # fourth, 'Log Irradiance', is commented out above).
        if i == 3:
            log_irradiance = np.log(np.clip(amp**2, 1e-9, None))
            # z = log_irradiance
            # log_irradiance = np.log(util.irradiance(
            #     util.to_polar(a, phi), normalize=False))
            z = util.standardize(log_irradiance)
            assert abs(z.min()) < 1e-3
            assert abs(1 - z.max()) < 1e-3

        # Plots 1 and 2 use a logarithmic z-axis.
        z_log = i in [1, 2]
        lower_bound = 1e-6  # assume max = 1
        if z_log:
            # manual log zscale
            z = np.clip(z, lower_bound, None)
            mini, maxi = z.min(), z.max()
            if mini == maxi or mini <= 0:
                continue
            else:
                # Work with whole decades from here on.
                mini = round(np.log10(mini))
                maxi = round(np.log10(maxi))
                if mini == maxi:
                    continue
            z = np.log10(z)

        # ignore third dimension in position
        ax, _ = surf(position[:, 0], position[:, 1], z, Nx, Ny)
        if z_log and mini != maxi:
            # One tick per decade, halved when there would be more than 8.
            n_ticks = int(maxi - mini) + 1
            if n_ticks > 8 and n_ticks % 2 == 1:
                n_ticks = round(n_ticks / 2.)
            assert (n_ticks > 1)
            ticks = np.linspace(mini, maxi, n_ticks, endpoint=True).round().astype(int)
            # NOTE(review): this rebinds `labels`, the list being enumerated.
            # The running loop keeps iterating the original list object, but
            # the shadowing is easy to misread.
            labels = [f'$10^{{{v}}}$' for v in ticks]
            # ax.set_zticks(ticks)
            # auto
            ax.set_zticklabels(labels)

        plt.xlabel('Space')
        plt.ylabel('Space')
        formatter = tck.EngFormatter(places=1, sep=u"\N{THIN SPACE}", unit='m')
        ax.xaxis.set_major_formatter(formatter)
        ax.yaxis.set_major_formatter(formatter)
        plt.xticks(rotation=ANGLES[0] / 2, rotation_mode='anchor')
        plt.yticks(rotation=-ANGLES[1] / 4, rotation_mode='anchor')
        plot.set_num_xyticks(3)
        plt.title(f'{prefix}{label}')
        plt.tight_layout()
        if filename is None:
            plt.show()
        else:
            # The index keeps the two 'Amplitude' plots from overwriting
            # each other.
            suffix = label.replace(' ', '') + f'-{i}'
            plot.save_fig(f'{filename}_{suffix}', ext='png')
        plt.close()
def predict(obs, mean, std):
    """Run the prediction op on `obs` and return the session result.

    When the module flag STANDARDIZE is set, `obs` is first standardized
    with the given `mean` and `std`. The network is evaluated with
    `keep_prob` fed as 1.
    """
    model_input = util.standardize(obs, mean, std) if STANDARDIZE else obs
    feed = {x: model_input, keep_prob: 1}
    return sess.run([preds], feed)
ax.scatter(x_ok, y_ok, color='b', label='Admitted')
plt.show()

# Print sigmoid of matrix
arr = np.array([[0, 1], [2, 3]])
print(util.sigmoid(arr))

# Implement vectorised version of logistic cost function
beta = np.array([0, 0, 0])
# Test value should be 0.6931
print("Cost for [0,0,0]: ", util.costLogistic(Xe_norm, y, beta))

# Fit beta by gradient descent, starting from the all-zero vector above.
alpha = 0.5
iterations = 1000
beta = util.GDLogistic(iterations, alpha, beta, Xe_norm, y)

# Score one new sample: it is standardized with the stored feature mean/std
# and extended before being multiplied with beta.
student = np.array([[45, 85]])
student_n = util.standardize(student, feature_mean, feature_std)
student_ne = util.extendMatrix(student_n)
print("Probability of admission: ", util.sigmoid(np.dot(student_ne, beta)))
util.trainingErrs(Xe_norm, beta, y)

# PLOT MESH GRID
h = .01  # step size in the mesh
# Grid bounds: the normalized feature ranges padded by 0.1 on each side.
x_min, x_max = X_norm[:, 0].min() - 0.1, X_norm[:, 0].max() + 0.1
y_min, y_max = X_norm[:, 1].min() - 0.1, X_norm[:, 1].max() + 0.1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))  # Mesh Grid
x1, x2 = xx.ravel(), yy.ravel()  # Turn to two Nx1 arrays
# NOTE(review): the trailing comment says degree 2 but the call passes 1 --
# confirm which is intended.
XXe = util.mapFeature(x1, x2, 1)  # Extend matrix for degree 2
def get_dictionary_data(n_comp=20, zero_index=True):
    """Classify test images by majority vote of an SVC trained on image patches.

    A dictionary with `n_comp` atoms is learned on normalized 8x8 patches
    sampled from the first 10000 shuffled unlabeled images. Patches sampled
    from every labeled image are sparse-coded, rebuilt from the dictionary
    and used, each with its image's label, to fit an SVC. For every test
    image the mode of its patch predictions is taken, shifted to one-based
    labels and written to 'svm_patches_25_percent_20_comp.csv'.

    Fixes: `predictions += 1` was applied to a Python list, which raises
    TypeError; the list is now converted to an array before shifting. The
    training patches and labels are collected in lists and concatenated once
    instead of inside the loop (quadratic copying); the shape diagnostic is
    printed once after the loop.

    Args:
        n_comp: number of dictionary atoms.
        zero_index: NOTE(review): currently ignored -- the labels are always
            loaded with `zero_index=True`; confirm the intent.

    Returns:
        None; the predictions are written to disk.
    """
    unlabeled = util.load_unlabeled_training(flatten=False)
    n_images = 10000
    patch_size = (8, 8)

    def _flatten_and_normalize(patch_stack):
        """Flatten each patch and standardize every pixel position in place."""
        data = patch_stack.reshape(patch_stack.shape[0], -1)
        data -= np.mean(data, axis=0)
        data /= np.std(data, axis=0) + 1e-20
        return data

    unlabeled = util.standardize(unlabeled)
    np.random.shuffle(unlabeled)

    print('Extracting reference patches...')
    # The empty float array is kept as the first chunk so the dtype of the
    # concatenated result is promoted exactly as before.
    patch_chunks = [np.empty((0, 64))]
    t0 = time()
    for image in unlabeled[:n_images, :, :]:
        sampled = np.array(
            extract_patches_2d(image, patch_size, max_patches=0.10))
        patch_chunks.append(_flatten_and_normalize(sampled))
    patches = np.concatenate(patch_chunks)
    print('done in %.2fs.' % (time() - t0))

    # whiten the patches
    z = zca.ZCA()
    z.fit(patches)
    # NOTE(review): the return value is discarded here and below; unless
    # ZCA.transform works in place the whitening has no effect -- confirm.
    z.transform(patches)

    print('Learning the dictionary...')
    t0 = time()
    dico = MiniBatchDictionaryLearning(n_components=n_comp, alpha=1)
    V = dico.fit(patches).components_
    print('done in %.2fs.' % (time() - t0))

    def _dictionary_patches(image, max_patches):
        """Sample patches from one image and rebuild them from the dictionary."""
        data = _flatten_and_normalize(
            extract_patches_2d(image, patch_size, max_patches=max_patches))
        code = dico.transform(data)
        approx = np.dot(code, V)
        z.transform(approx)
        return approx

    labeled_data, labels = util.load_labeled_training(flatten=False,
                                                      zero_index=True)
    labeled_data = util.standardize(labeled_data)
    test_data = util.load_all_test(flatten=False)
    test_data = util.standardize(test_data)

    print('Training SVM with the training images...')
    t0 = time()
    image_chunks = [np.empty((0, 64))]
    label_chunks = [np.empty((0))]
    for i in range(len(labeled_data)):
        approx = _dictionary_patches(labeled_data[i, :, :], 0.50)
        image_chunks.append(approx)
        # Every patch inherits the label of the image it came from.
        label_chunks.append(np.asarray([labels[i]] * len(approx)))
    reconstructed_images = np.concatenate(image_chunks)
    multiplied_labels = np.concatenate(label_chunks)
    print(reconstructed_images.shape, multiplied_labels.shape)

    svc = SVC()
    print('done in %.2fs.' % (time() - t0))
    svc.fit(reconstructed_images, multiplied_labels)

    print('Reconstructing the test images...')
    t0 = time()
    predictions = []
    for image in test_data:
        pred = svc.predict(_dictionary_patches(image, 0.25))
        print('Variance in the predictions:', np.var(pred))
        # NOTE(review): assumes `mode` returns a single label per image; if
        # it is scipy.stats.mode the result also carries counts -- confirm.
        predictions.append(mode(pred))
    print('done in %.2fs.' % (time() - t0))

    # Labels were loaded zero-based; shift back to one-based for the output.
    predictions = np.asarray(predictions) + 1
    util.write_results(predictions, 'svm_patches_25_percent_20_comp.csv')
def main():
    """Fit a Gaussian naive Bayes spam classifier on 'spambase.data' and print metrics.

    The rows are shuffled, the last column is taken as the binary label and
    the remaining columns are standardized. Roughly the first two thirds are
    used to estimate a per-feature (mean, std) for each class; the rest is
    classified and precision, recall, F-measure and accuracy are printed.

    Fixes: the class priors were written as `len(a) / len(a) + len(b)`, which
    evaluates to `1 + len(b)` because of operator precedence; they are now
    `len(class) / (len(positive) + len(negative))`. The bare `except: pass`
    around the metrics is narrowed to ZeroDivisionError and reports that the
    metrics are undefined instead of printing nothing.
    """
    # Read in Data
    data = readData("spambase.data")

    # Randomizes the data
    X = randomize(data)
    Y = X[:, -1]   # Only the last column
    X = X[:, :-1]  # All but the last column
    D = len(X[0])

    # Standardize
    # NOTE(review): statistics are computed over all rows, including the ones
    # later used for testing -- confirm this is intended.
    standardized = standardize(X)

    # Select first 2/3 for training
    index = int(math.ceil((2.0 / 3.0) * len(X)))
    training = standardized[:index + 1]
    testing = standardized[index + 1:]
    Y_testing = Y[index + 1:]

    # Divide training data into two groups
    positive = []
    negative = []
    for i in range(0, len(training)):
        if Y[i] == 1:  # spam
            positive.append(training[i])
        else:
            negative.append(training[i])
    positive = numpy.array(positive).astype(float)
    negative = numpy.array(negative).astype(float)

    # Per-feature (mean, std) for spam and non-spam
    positive_model = [(numpy.mean(positive[:, k]), numpy.std(positive[:, k]))
                      for k in range(0, D)]
    negative_model = [(numpy.mean(negative[:, k]), numpy.std(negative[:, k]))
                      for k in range(0, D)]

    # Class priors; they do not depend on the sample, so compute them once.
    n_training = len(positive) + len(negative)
    prior_positive = float(len(positive)) / n_training
    prior_negative = float(len(negative)) / n_training

    # Classify testing samples
    result = []
    for sample in testing:
        p_positive = prior_positive
        p_negative = prior_negative
        for k in range(0, D):
            p_positive *= likelihood(positive_model[k][0],
                                     positive_model[k][1], sample[k])
            p_negative *= likelihood(negative_model[k][0],
                                     negative_model[k][1], sample[k])
        result.append(1 if p_positive > p_negative else 0)

    # Compute statistics
    TruePositives = 0.0
    TrueNegatives = 0.0
    FalsePositives = 0.0
    FalseNegatives = 0.0
    for i in range(0, len(result)):
        if Y_testing[i] == 1:  # Positive example
            if result[i] == 1:  # Predicted positive
                TruePositives += 1
            elif result[i] == 0:  # Predicted negative
                FalseNegatives += 1
        elif Y_testing[i] == 0:  # Negative example
            if result[i] == 1:  # Predicted positive
                FalsePositives += 1
            elif result[i] == 0:  # Predicted negative
                TrueNegatives += 1

    try:
        precision = TruePositives / (TruePositives + FalsePositives)
        recall = TruePositives / (TruePositives + FalseNegatives)
        f_measure = (2 * precision * recall) / (precision + recall)
        accuracy = (TruePositives + TrueNegatives) / (
            TruePositives + TrueNegatives + FalsePositives + FalseNegatives)
    except ZeroDivisionError:
        # Happens when a denominator is empty, e.g. nothing was predicted
        # positive or the test split is empty.
        print('Metrics undefined: a denominator was zero')
    else:
        print('Precision: ' + str(precision))
        print('Recall: ' + str(recall))
        print('F-measure: ' + str(f_measure))
        print('Accuracy: ' + str(accuracy))
# Normalize X_norm, feature_mean, feature_std = util.standardizeSet(X) # Extend normalized data Xe_norm = util.extendMatrix(X_norm) for i in range(1, 7): plt.xlim(-3, 3) util.createSubScatterPlot(util.stdFeature(Xe_norm[:, i]), y, f'Feature {i}', 'Y', 2, 3, i) plt.xlim(-3, 3) plt.show() gpu = np.array([2432, 1607, 1683, 8, 8, 256]) gpu_norm = util.standardize(gpu, feature_mean, feature_std) gpu_norm_e = np.array([1, gpu_norm[0], gpu_norm[1], gpu_norm[2], gpu_norm[3], gpu_norm[4], gpu_norm[5]]) gpu = np.array([1, 2432, 1607, 1683, 8, 8, 256]) beta = util.calcBeta(Xe, y) print("Benchmark using normal eq: ", util.normalEq(Xe, y, gpu)) print("Cost function: ", util.cost(Xe, y, beta)) # 12.3964 beta2 = util.calcBeta(Xe_norm, y) print("Cost function normalized: ", util.cost(Xe_norm, y, beta2)) print("Benchmark on normalized data: ", util.normalEq(Xe_norm, y, gpu_norm_e)) # Implement vectorized version of gradient descent iterations = 10000 alpha = 0.02
def get_dictionary_data(n_comp=20, zero_index=False):
    """Reconstruct the labeled and test images through a learned patch dictionary.

    A dictionary with `n_comp` atoms is learned on normalized 8x8 patches
    sampled from the first 10000 shuffled unlabeled images. Every labeled and
    test image is then split into patches, sparse-coded, rebuilt from the
    dictionary and flattened.

    Changes over the previous version: per-image results are collected in
    lists and concatenated once instead of calling `np.concatenate` inside
    every loop iteration (quadratic copying), and the duplicated train/test
    reconstruction loops share one local helper. Results are unchanged.

    Args:
        n_comp: number of dictionary atoms.
        zero_index: NOTE(review): currently ignored -- the labels are always
            loaded with `zero_index=True`. Left as is because callers may
            rely on it; confirm the intent.

    Returns:
        (training_images, labels, test_images) with the images flattened to
        one row per image.
    """
    unlabeled = util.load_unlabeled_training(flatten=False)
    height, width = 32, 32
    n_images = 10000
    patch_size = (8, 8)

    def _flatten_and_normalize(patch_stack):
        """Flatten each patch and standardize every pixel position in place."""
        data = patch_stack.reshape(patch_stack.shape[0], -1)
        data -= np.mean(data, axis=0)
        data /= np.std(data, axis=0) + 1e-20
        return data

    unlabeled = util.standardize(unlabeled)
    np.random.shuffle(unlabeled)

    print('Extracting reference patches...')
    # The empty float array is kept as the first chunk so the dtype of the
    # concatenated result is promoted exactly as before.
    patch_chunks = [np.empty((0, 64))]
    t0 = time()
    for image in unlabeled[:n_images, :, :]:
        sampled = np.array(
            extract_patches_2d(image, patch_size, max_patches=0.01))
        patch_chunks.append(_flatten_and_normalize(sampled))
    patches = np.concatenate(patch_chunks)
    print('done in %.2fs.' % (time() - t0))

    # whiten the patches
    z = zca.ZCA()
    z.fit(patches)
    # NOTE(review): the return value is discarded here and below; unless
    # ZCA.transform works in place the whitening has no effect -- confirm.
    z.transform(patches)

    print('Learning the dictionary...')
    t0 = time()
    dico = MiniBatchDictionaryLearning(n_components=n_comp, alpha=1)
    V = dico.fit(patches).components_
    print('done in %.2fs.' % (time() - t0))

    def _reconstruct_flat(images):
        """Rebuild every image from its sparse code; return one flat row each."""
        rebuilt = [np.empty((0, 32, 32))]
        for image in images:
            data = _flatten_and_normalize(
                extract_patches_2d(image, patch_size))
            code = dico.transform(data)
            approx = np.dot(code, V)
            z.transform(approx)
            approx = approx.reshape(len(data), *patch_size)
            restored = reconstruct_from_patches_2d(approx, (width, height))
            rebuilt.append(restored.reshape(1, 32, 32))
        stacked = np.concatenate(rebuilt)
        n, rows, cols = stacked.shape
        return stacked.reshape(n, rows * cols)

    labeled_data, labels = util.load_labeled_training(flatten=False,
                                                      zero_index=True)
    labeled_data = util.standardize(labeled_data)
    test_data = util.load_all_test(flatten=False)
    test_data = util.standardize(test_data)

    print('Reconstructing the training images...')
    t0 = time()
    training_images = _reconstruct_flat(labeled_data)
    print('done in %.2fs.' % (time() - t0))

    print('Reconstructing the test images...')
    t0 = time()
    test_images = _reconstruct_flat(test_data)
    print('done in %.2fs.' % (time() - t0))

    return (training_images, labels, test_images)