def main():
    images, labels = load_labeled_training(flatten=True)
    images = standardize(images)
    unl = load_unlabeled_training(flatten=True)
    unl = standardize(unl)
    test = load_public_test(flatten=True)
    test = standardize(test)
    shuffle_in_unison(images, labels)
    #d = DictionaryLearning().fit(images)
    d = MiniBatchDictionaryLearning(n_components=500, n_iter=500, verbose=True).fit(images)
    s = SparseCoder(d.components_)
    proj_train = s.transform(images)  # sparse codes for the training images
    pt = s.transform(test)
    #kpca = KernelPCA(kernel="rbf")
    #kpca.fit(unl)
    #test_proj = kpca.transform(images)
    #pt = kpca.transform(test)
    #spca = SparsePCA().fit(unl)
    #test_proj = spca.transform(images)
    #pt = spca.transform(test)
    svc = SVC()
    scores = cross_validation.cross_val_score(svc, proj_train, labels, cv=10)
    print scores
    print np.mean(scores)
    print np.var(scores)
    svc.fit(proj_train, labels)
    pred = svc.predict(pt)
    write_results(pred, '../svm_res.csv')
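
standardize and shuffle_in_unison come from a local util module that is not shown here. A minimal sketch of what they are assumed to do (per-feature z-scoring and an in-unison shuffle of images and labels):

import numpy as np

def standardize(data, eps=1e-20):
    # Assumed behaviour: zero mean and unit variance per feature/pixel column.
    data = np.asarray(data, dtype=float)
    return (data - data.mean(axis=0)) / (data.std(axis=0) + eps)

def shuffle_in_unison(a, b, seed=None):
    # Assumed behaviour: shuffle both arrays in place with the same permutation.
    rng = np.random.RandomState(seed)
    state = rng.get_state()
    rng.shuffle(a)
    rng.set_state(state)
    rng.shuffle(b)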
    def sample(self, n, seed=55):
        ps = self.ps
        pdata = ps.sample(n, seed=seed)
        X, Y = pdata.xy()

        Zx = util.standardize(X)
        Zy = util.standardize(Y)
        assert np.all(np.isfinite(Zx))
        assert np.all(np.isfinite(Zy))
        new_label = None if pdata.label is None else pdata.label + '_std'
        return PairedData(Zx, Zy, label=new_label)
def main():
    images, labels = load_labeled_training(flatten=True)
    images = standardize(images)
    # images, labels = load_pca_proj(K=100)
    shuffle_in_unison(images, labels)
    ds = ClassificationDataSet(images.shape[1], 1, nb_classes=7)
    for i, l in zip(images, labels):
        ds.addSample(i, [l - 1])
    # ds._convertToOneOfMany()
    test, train = ds.splitWithProportion(0.2)
    test._convertToOneOfMany()
    train._convertToOneOfMany()
    net = shortcuts.buildNetwork(train.indim, 1000, train.outdim, outclass=SoftmaxLayer)

    trainer = BackpropTrainer(net, dataset=train, momentum=0.1, learningrate=0.01, weightdecay=0.05)
    # trainer = RPropMinusTrainer(net, dataset=train)
    # cv = validation.CrossValidator(trainer, ds)
    # print cv.validate()
    net.randomize()
    tr_labels_2 = net.activateOnDataset(train).argmax(axis=1)
    trnres = percentError(tr_labels_2, train["class"])
    # trnres = percentError(trainer.testOnClassData(dataset=train), train['class'])
    testres = percentError(trainer.testOnClassData(dataset=test), test["class"])
    print "Training error: %.10f, Test error: %.10f" % (trnres, testres)
    print "Iters: %d" % trainer.totalepochs

    for i in range(100):
        trainer.trainEpochs(10)
        trnres = percentError(trainer.testOnClassData(dataset=train), train["class"])
        testres = percentError(trainer.testOnClassData(dataset=test), test["class"])
        trnmse = trainer.testOnData(dataset=train)
        testmse = trainer.testOnData(dataset=test)
        print "Iteration: %d, Training error: %.5f, Test error: %.5f" % (trainer.totalepochs, trnres, testres)
        print "Training MSE: %.5f, Test MSE: %.5f" % (trnmse, testmse)
 def markov_model_classify(self, sentence):
     probs = []
     words = tokenize(sentence)
     for i in range(0, 2):
         # Set initial value to prior
         prob = math.log(self.sentiment_counts[i])
         prob -= math.log(self.total_examples)
         # Handle first word special case
         prev = words[0]
         if prev in self.word_counts[i]:
             prob += math.log(self.word_counts[i].get(prev))
             prob -= math.log(self.total_words[i])
         else:
             prob += math.log(OUT_OF_VOCAB_PROB)
         # Iterate over rest as bigrams
         for word in words[1:]:
             bigram = (prev, word)
             if bigram in self.bigram_counts[i]:
                 prob += math.log(self.bigram_counts[i].get(bigram))
                 prob -= math.log(self.bigram_denoms[i].get(prev))
             else:
                 prob += math.log(OUT_OF_VOCAB_PROB)
             prev = word
         probs.append(prob)
     probs = standardize(probs)
     return probs.index(max(probs)), max(probs)
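
The counts read above (sentiment_counts, word_counts, total_words, bigram_counts, bigram_denoms) are built during training, which is not included in this excerpt. A minimal training sketch consistent with how the classifier reads them (the method name and exact structure are assumptions):

from collections import defaultdict

def train_counts(self, examples):
    # examples: iterable of (sentence, label) pairs with label in {0, 1}.
    self.sentiment_counts = [0, 0]                               # examples per class
    self.word_counts = [defaultdict(int), defaultdict(int)]     # unigram counts per class
    self.total_words = [0, 0]
    self.bigram_counts = [defaultdict(int), defaultdict(int)]   # (prev, word) counts per class
    self.bigram_denoms = [defaultdict(int), defaultdict(int)]   # times `prev` starts a bigram
    self.total_examples = 0

    for sentence, label in examples:
        words = tokenize(sentence)
        self.total_examples += 1
        self.sentiment_counts[label] += 1
        for word in words:
            self.word_counts[label][word] += 1
            self.total_words[label] += 1
        for prev, word in zip(words, words[1:]):
            self.bigram_counts[label][(prev, word)] += 1
            self.bigram_denoms[label][prev] += 1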
def _hist2d_wrapper(x, y, z, density=True, bins=10, range=None, **kwargs):
    # Bin the z-values into a weighted 2D histogram over (x, y).
    hist = np.histogram2d(x, y, weights=z, density=density, bins=bins)[0]
    if range is None:
        range = [z.min(), z.max()]
    # Rescale the standardized histogram back onto the requested value range.
    hist = util.standardize(hist) * (range[1] - range[0]) + range[0]
    # Supply the bin edges as positions so that the axis range equals the bins range
    # (this indexing assumes `bins` was passed as a pair of edge arrays, not a scalar).
    return _imshow_wrapper(bins[0], bins[1], hist, **kwargs)
Example #7
def main():
    images, labels = load_labeled_training(flatten=True)
    public_test = load_public_test(flatten=True)
    images = standardize(images)
    #images, labels = load_pca_proj(K=100)
    shuffle_in_unison(images, labels)
    ds = ClassificationDataSet(images.shape[1], 1, nb_classes=7)
    testset = ClassificationDataSet(public_test.shape[1])
    public_test = standardize(public_test)
    for i in public_test:
        testset.addSample(i, [0])
    for i, l in zip(images, labels):
        ds.addSample(i, [l - 1])
    # ds._convertToOneOfMany()
    test, train = ds.splitWithProportion(0.2)
    test._convertToOneOfMany()
    train._convertToOneOfMany()
    net = shortcuts.buildNetwork(train.indim, 500, 1000, train.outdim, outclass=SoftmaxLayer)

    trainer = BackpropTrainer(net, dataset=train, learningrate=0.005, weightdecay=0.01)
    #trainer = RPropMinusTrainer(net, dataset=train)
    #cv = validation.CrossValidator(trainer, ds)
    #print cv.validate()
    net.randomize()
    tr_labels_2 = net.activateOnDataset(train).argmax(axis=1)
    trnres = percentError(tr_labels_2, train['class'])
    #trnres = percentError(trainer.testOnClassData(dataset=train), train['class'])
    testres = percentError(trainer.testOnClassData(dataset=test), test['class'])
    print "Training error: %.10f, Test error: %.10f" % (trnres, testres)
    print "Iters: %d" % trainer.totalepochs
    for i in range(10):
        trainer.trainEpochs(10)
        trnres = percentError(trainer.testOnClassData(dataset=train), train['class'])
        testres = percentError(trainer.testOnClassData(dataset=test), test['class'])
        trnmse = trainer.testOnData(dataset=train)
        testmse = trainer.testOnData(dataset=test)
        print "Iteration: %d, Training error: %.5f, Test error: %.5f" % (trainer.totalepochs, trnres, testres)
        print "Training MSE: %.5f, Test MSE: %.5f" % (trnmse, testmse)
    out = trainer.testOnClassData(dataset=testset)
    for i in range(len(out)):
        out[i] += 1
    write_results(out, 'nn_predictions.csv')
 def batch_classify(self, sentences: [str]):
     sentences = pd.Series(data=sentences)
     # Turn the dataset's sentences into BERT tokens. Truncate if too long
     tokens = sentences.apply((lambda x: self.tokenizer.encode(x, add_special_tokens=True)))
     tokens = pd.Series([i[:self.max_len] for i in tokens.values])
     # Pad with 0s
     padded_tokens = np.array([i + [0] * (self.max_len - len(i)) for i in tokens.values])
     vectors = get_bert_sentence_vectors(self.model, padded_tokens)
     # Run the vectors through LR model
     prediction = self.lr.predict(self.scaler.transform(vectors))
     prediction_prob = self.lr.predict_log_proba(self.scaler.transform(vectors))
     evaluations = [(prediction[i], max(standardize(prediction_prob[i]))) for i in range(0, len(prediction))]
     return evaluations
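
get_bert_sentence_vectors is not shown in this excerpt. A plausible sketch, assuming it runs the padded token ids through a Hugging Face BERT model and takes the [CLS] embedding from the last hidden layer as the sentence vector (the name comes from the call above; the body is an assumption):

import torch

def get_bert_sentence_vectors(model, padded_tokens):
    # Mask out the zero padding so attention ignores it.
    attention_mask = torch.tensor((padded_tokens != 0).astype(int))
    input_ids = torch.tensor(padded_tokens)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    # outputs[0] is the last hidden state; position 0 is the [CLS] token.
    return outputs[0][:, 0, :].numpy()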
Example #9
def extract_candidate_nodules_3d(img_arr, mask):
    """
    Extract suspicious nodules from masked image.

    :param img_arr: Image array as 3D numpy array.
    :param mask: Image mask as 3D numpy array (must have same shape as img_arr).
    :return: List of candidate nodules. Each candidate nodule is in the format:
                [x, y, z, diameter, average_intensity]
    """
    standardized = util.standardize(img_arr)
    log_buffer = []
    maxima = []
    sigmas = np.linspace(2, 22, 11)
    for i in range(len(sigmas)):
        sigma = sigmas[i]
        gaussian = skimage.filters.gaussian(standardized, sigma=sigma)
        laplace = skimage.filters.laplace(
            gaussian
        )  # Important note: scikit-image returns negative second derivative for Laplacian.
        scale_normalized = laplace * sigma**2  # scale normalized
        log_buffer.append(scale_normalized)
        if len(log_buffer) >= 2:
            prev_sigma = sigmas[i - 1]
            target_slice = len(log_buffer) - 2
            peaks = skimage.feature.peak_local_max(np.asarray(log_buffer),
                                                   min_distance=int(prev_sigma),
                                                   threshold_abs=0.2,
                                                   exclude_border=False)
            # If we have 3 LoGs, pick index 1; if only 2 (second iteration only), pick index 0.
            peaks = peaks[peaks[:, 0] == target_slice].astype(float)
            peaks[:, 0] = 2 * prev_sigma * 3**0.5  # Replace the scale index with the blob diameter (2 * sigma * sqrt(3))
            maxima.extend(peaks)
            log_buffer = log_buffer[-2:]
        # TODO get last sigma value as well.
    candidates = []
    # slice_index = 278
    # plt.imshow(img_arr[slice_index])
    for point in maxima:
        if mask[tuple(point[1:].astype(int))]:
            # if abs(point[1] - slice_index) < 2:
            # circle = plt.Circle((point[3], point[2]), point[0] * 3 ** 0.5, color='r', fill=False)
            # plt.gca().add_artist(circle)
            candidate = list(point[::-1])
            candidate.append(
                util.average_intensity(img_arr, point[:0:-1], point[0]))
            candidates.append(candidate)
    print('Total of', len(candidates), 'candidates found.')
    # plt.show()
    return candidates
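
util.average_intensity is referenced above but not defined in this excerpt. A minimal sketch of what it is assumed to compute, i.e. the mean image intensity inside a ball of the given radius around a candidate point (the axis-order and radius-versus-diameter conventions are assumptions):

import numpy as np

def average_intensity(img_arr, center, radius):
    # Mean intensity of img_arr within `radius` voxels of `center`.
    c0, c1, c2 = [float(c) for c in center]
    r = max(float(radius), 1.0)
    zz, yy, xx = np.ogrid[:img_arr.shape[0], :img_arr.shape[1], :img_arr.shape[2]]
    ball = (zz - c0) ** 2 + (yy - c1) ** 2 + (xx - c2) ** 2 <= r ** 2
    return float(img_arr[ball].mean())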
 def naive_bayes_classify(self, sentence):
     probs = []
     words = tokenize(sentence)
     for i in range(0, 2):
         # Set initial value to prior
         prob = math.log(self.sentiment_counts[i])
         prob -= math.log(self.total_examples)
         for word in words:
             if word in self.word_counts[i]:
                 prob += math.log(self.word_counts[i].get(word))
                 prob -= math.log(self.total_words[i])
             else:
                 prob += math.log(OUT_OF_VOCAB_PROB)
         probs.append(prob)
     probs = standardize(probs)
     return probs.index(max(probs)), max(probs)
Example #11
    def run_train_cv(self, fold_splits):

        # Load the data
        self._load_data()

        # Create one model for each cross-validation fold
        self._cv_models = []
        for i in range(fold_splits):
            model = copy.deepcopy(self._model)
            self._cv_models.append(model)

        fold = KFold(n_splits=fold_splits, shuffle=False)
        for i, (train_index, test_index) in enumerate(
                fold.split(self._whole_x, self._whole_y)):

            model = self._cv_models[i]

            # Extract the training data
            tx = self._whole_x.iloc[train_index]
            ty = self._whole_y.iloc[train_index]

            # Extract the validation data
            vx = self._whole_x.iloc[test_index]
            vy = self._whole_y.iloc[test_index]

            # If the model is a DNN, normalize the data
            if type(self._model) is ModelDnn:
                # Min-max scaling
                #scaler, tx, vx = util.max_min_scale(tx, vx)
                # Standardization
                scaler, tx, vx = util.standardize(tx, vx)

            # Train the model
            run_fold_name = '{0:s}_fold_{1:02d}'.format(self._run_name, i)
            model.train(tx, ty, vx, vy)

            # Make predictions
            pred_y = model.predict(vx)

            # Print the evaluation results
            self._print_evaluation_score(model, run_fold_name, vy, pred_y)

            # Show the feature importances
            self._show_importance_of_feature(model, run_fold_name)

            # Export the Graphviz graph to a file
            self._export_graphviz(model, run_fold_name)
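
Here util.standardize(tx, vx) returns a fitted scaler together with the transformed training and validation frames. A minimal sketch of such a helper built on scikit-learn's StandardScaler (only the call signature above is known; the body is an assumption):

import pandas as pd
from sklearn.preprocessing import StandardScaler

def standardize(train_x, valid_x=None):
    # Fit the scaler on the training fold only, then apply it to the validation fold.
    scaler = StandardScaler()
    train_std = pd.DataFrame(scaler.fit_transform(train_x),
                             columns=train_x.columns, index=train_x.index)
    valid_std = None
    if valid_x is not None:
        valid_std = pd.DataFrame(scaler.transform(valid_x),
                                 columns=valid_x.columns, index=valid_x.index)
    return scaler, train_std, valid_std

The same helper also covers the run_train_all case further below, which passes None for the validation set and keeps only the scaler and the transformed training data.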
Example #12
def variable_effect(pheno,var,regressors,conn):
    """
    Test effect of continuous variable.
    
    pheno = dataframe:
        -filtered to be only relevant subjects (use mask_var)
    var = column from pheno
    regressors = list of strings, formatted for patsy
    connectomes = n_subjects x n_edges array
    
    Returns:
    table = n_edges
        - betas_std = including standardization on controls
        - pvalues = pvalues
        - qvalues = fdr corrected pvalues alpha = 0.05
    """
    
    n_edges = conn.shape[1]
    contrast = np.zeros(1 + len(regressors))
    contrast[0] = 1
    
    betas_std = np.zeros(n_edges)
    pvalues = np.zeros(n_edges)
        
    formula = ' + '.join((regressors + [var]))
    dmat = pat.dmatrix(formula, pheno, return_type='dataframe',NA_action='raise')
    
    mask_std = np.ones(pheno.shape[0]).astype(bool)
    conn_std = standardize(mask_std, conn)
    
    for edge in range(n_edges):
        model_std = sm.OLS(conn_std[:,edge],dmat)
        results_std = model_std.fit()
        betas_std[edge] = results_std.params[var]
        pvalues[edge] = results_std.pvalues[var]
        
    mt = multipletests(pvalues,method='fdr_bh')
    reject = mt[0]
    qvalues = mt[1]
    
    table = pd.DataFrame(np.array([betas_std,pvalues,qvalues,reject]).transpose(),
                         columns=['betas_std','pvalues','qvalues','reject'])
    return table
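
The standardize(mask, conn) helper used above is not shown. It is assumed to z-score each edge using the mean and standard deviation computed over the masked subjects only (all subjects here; controls only in the case/control variant below), then apply that scaling to everyone. A minimal sketch under that assumption:

import numpy as np

def standardize(mask, conn, eps=1e-20):
    # z-score every edge using statistics from the masked rows only (e.g. controls).
    ref = conn[mask]
    return (conn - ref.mean(axis=0)) / (ref.std(axis=0) + eps)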
Example #13
    def run_train_all(self):

        # Load the data
        self._load_data()
        train_x, train_y = self._train_x, self._train_y

        # If the model is a DNN, normalize the data
        if type(self._model) is ModelDnn:
            # Min-max scaling
            #self._train_all_scaler, train_x, _ = util.max_min_scale(train_x, None)
            # Standardization
            self._train_all_scaler, train_x, _ = util.standardize(
                train_x, None)

        self._model.train(train_x, train_y)
        self.is_trained_all = True

        # Save the trained model
        self._model.save_model()
Example #14
def main():
    # Read in Data
    data = readData("nba_stats.csv")
    
    # Randomizes the data
    X = randomize(data)
    Y = X[:,-1] # Only the last column
    X = X[:,:-1] # All but the last column
    D = len(X[0])

    # Standardize
    standardized = standardize(X)

    # Select first 2/3 for training
    index = int(math.ceil((2.0/3.0) * len(X)))
    training = standardized[:index+1]
    testing = standardized[index+1:]
    Y_testing = Y[index+1:]

    # Divide training data into two groups
    positive = []
    negative = []
    for i in range(0, len(training)):
        if Y[i] == 1:  # positive class
            positive.append(training[i])
        else:
            negative.append(training[i])
    positive = numpy.array(positive).astype(float)
    negative = numpy.array(negative).astype(float)

    # Compute per-feature Gaussian models for the positive class
    positive_model = []
    for k in range(0, D):
        positive_model.append((numpy.mean(positive[:,k]), numpy.std(positive[:,k])))

    # Compute per-feature Gaussian models for the negative class
    negative_model = []
    for k in range(0,D):
        negative_model.append((numpy.mean(negative[:, k]), numpy.std(negative[:, k])))

    # Classify testing samples
    result = []
    testing_probabilities = []
    for sample in testing:
        p_positive = float(len(positive)) / (len(positive) + len(negative))
        p_negative = float(len(negative)) / (len(positive) + len(negative))
        for k in range(0, D):
            p_positive *= likelihood(positive_model[k][0], positive_model[k][1], sample[k])
            p_negative *= likelihood(negative_model[k][0], negative_model[k][1], sample[k])
        
        testing_probabilities.append(normalize_probabilities([p_positive, p_negative]))
        
        if p_positive > p_negative:
            result.append(1)
        else:
            result.append(0)
    
    precisions = []
    recalls = []
    for threshold in range(0, 100, 5):
        threshold = float(threshold) / 100

        TruePositives = 0.0
        TrueNegatives = 0.0
        FalsePositives = 0.0
        FalseNegatives = 0.0
        for i in range(0, len(testing_probabilities)):
            if Y_testing[i] == 1: # Positive example
                if testing_probabilities[i][0] > threshold: # Predicted positive
                    TruePositives += 1
                else: # Predicted negative
                    FalseNegatives += 1
            elif Y_testing[i] == 0: # Negative example
                if testing_probabilities[i][0] > threshold: # Predicted positive
                    FalsePositives += 1
                else: # Predicted negative
                    TrueNegatives += 1

        try:
            precision = TruePositives / (TruePositives + FalsePositives)
        except ZeroDivisionError:
            if TruePositives == 0:
                precision = 1
            else:
                precision = 0
        
        try:
            recall = TruePositives / (TruePositives + FalseNegatives)
        except ZeroDivisionError:
            if TruePositives == 0:
                recall = 1
            else:
                recall = 0
        
        precisions.append(precision)
        recalls.append(recall)

    plt.plot(recalls, precisions, 'r-o')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.show()
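
likelihood and normalize_probabilities are used above but not defined in this excerpt. They are assumed to be a univariate Gaussian density and a simple sum-to-one normalization; a minimal sketch under those assumptions:

import math

def likelihood(mean, std, x):
    # Univariate Gaussian density N(x; mean, std^2), guarding against zero variance.
    std = max(std, 1e-12)
    return math.exp(-((x - mean) ** 2) / (2 * std ** 2)) / (std * math.sqrt(2 * math.pi))

def normalize_probabilities(probs):
    # Scale non-negative scores so they sum to one.
    total = float(sum(probs))
    if total == 0:
        return [1.0 / len(probs)] * len(probs)
    return [p / total for p in probs]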
Example #15
def get_dictionary_data(n_comp=20, zero_index=False):
    unlabeled = util.load_unlabeled_training(flatten=False)
    height, width = 32, 32
    n_images = 10000
    patch_size = (8, 8)

    unlabeled = util.standardize(unlabeled)
    np.random.shuffle(unlabeled)

    print('Extracting reference patches...')

    patches = np.empty((0, 64))
    t0 = time()

    for image in unlabeled[:n_images, :, :]:
        data = np.array(extract_patches_2d(image, patch_size,
                                           max_patches=0.01))
        data = data.reshape(data.shape[0], -1)
        data -= np.mean(data, axis=0)
        data /= np.std(data, axis=0) + 1e-20
        patches = np.concatenate([patches, data])

    print('done in %.2fs.' % (time() - t0))

    # whiten the patches
    z = zca.ZCA()
    z.fit(patches)
    z.transform(patches)

    print('Learning the dictionary...')
    t0 = time()
    dico = MiniBatchDictionaryLearning(n_components=n_comp, alpha=1)
    V = dico.fit(patches).components_
    dt = time() - t0
    print('done in %.2fs.' % dt)

    #plt.figure(figsize=(4.2, 4))
    #for i, comp in enumerate(V[:100]):
    #    plt.subplot(10, 10, i + 1)
    #    plt.imshow(comp.reshape(patch_size), cmap=plt.cm.gray_r,
    #               interpolation='nearest')
    #    plt.xticks(())
    #    plt.yticks(())
    #plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)
    #plt.show()

    labeled_data, labels = util.load_labeled_training(flatten=False,
                                                      zero_index=True)
    labeled_data = util.standardize(labeled_data)

    test_data = util.load_all_test(flatten=False)
    test_data = util.standardize(test_data)

    #util.render_matrix(test_data, flattened=False)

    print('Reconstructing the training images...')
    t0 = time()
    reconstructed_images = np.empty((0, 32, 32))

    for i, image in enumerate(labeled_data):
        data = extract_patches_2d(image, patch_size)
        data = data.reshape(data.shape[0], -1)
        data -= np.mean(data, axis=0)
        data /= np.std(data, axis=0) + 1e-20

        code = dico.transform(data)
        patches = np.dot(code, V)
        z.transform(patches)
        patches = patches.reshape(len(data), *patch_size)

        data = reconstruct_from_patches_2d(patches, (width, height))
        data = data.reshape(1, 32, 32)
        reconstructed_images = np.concatenate([reconstructed_images, data])

    print('done in %.2fs.' % (time() - t0))

    # flatten
    n, x, y = reconstructed_images.shape
    training_images = reconstructed_images.reshape(
        reconstructed_images.shape[0],
        reconstructed_images.shape[1] * reconstructed_images.shape[2])
    assert training_images.shape == (n, x * y)

    print('Reconstructing the test images...')
    t0 = time()
    reconstructed_test_images = np.empty((0, 32, 32))

    for image in test_data:
        data = extract_patches_2d(image, patch_size)
        data = data.reshape(data.shape[0], -1)
        data -= np.mean(data, axis=0)
        data /= np.std(data, axis=0) + 1e-20

        code = dico.transform(data)
        patches = np.dot(code, V)
        z.transform(patches)
        patches = patches.reshape(len(data), *patch_size)

        data = reconstruct_from_patches_2d(patches, (width, height))
        data = data.reshape(1, 32, 32)
        reconstructed_test_images = np.concatenate(
            [reconstructed_test_images, data])

    print('done in %.2fs.' % (time() - t0))

    # flatten
    n, x, y = reconstructed_test_images.shape
    test_images = reconstructed_test_images.reshape(
        reconstructed_test_images.shape[0],
        reconstructed_test_images.shape[1] *
        reconstructed_test_images.shape[2])
    assert test_images.shape == (n, x * y)

    return (training_images, labels, test_images)
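
The zca.ZCA whitener is a project-local class that is not shown. A minimal fit/transform ZCA sketch consistent with how it is called above (an assumption, not the project's actual implementation):

import numpy as np

class ZCA(object):
    def __init__(self, regularization=1e-5):
        self.regularization = regularization

    def fit(self, X):
        # Whitening matrix from the eigendecomposition of the patch covariance.
        self.mean_ = X.mean(axis=0)
        Xc = X - self.mean_
        cov = np.dot(Xc.T, Xc) / Xc.shape[0]
        eigvals, eigvecs = np.linalg.eigh(cov)
        inv_sqrt = np.diag(1.0 / np.sqrt(eigvals + self.regularization))
        self.whiten_ = eigvecs.dot(inv_sqrt).dot(eigvecs.T)
        return self

    def transform(self, X):
        return np.dot(X - self.mean_, self.whiten_)

If the project's ZCA behaves like this sketch (returning a new array rather than whitening in place), the bare z.transform(patches) calls above discard the whitened result; assigning it back with patches = z.transform(patches) would be the safer pattern.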
def case_effect(pheno, case, regressors, conn):
    """
    Test the effect of a binary case/control variable on each edge.

    (The function header was truncated in this excerpt; the name and signature
    are reconstructed from how pheno, case, regressors, and conn are used below.)

    Returns:
    table = dataframe with one row per edge
        - betas_std = betas with standardization on controls
        - pvalues = p-values
        - qvalues = FDR-corrected p-values, alpha = 0.05
        - reject = whether the edge survives FDR correction
    """
    n_edges = conn.shape[1]

    betas = np.zeros(n_edges)
    betas_std = np.zeros(n_edges)
    pvalues = np.zeros(n_edges)

    formula = ' + '.join((regressors + [case]))
    dmat = pat.dmatrix(formula,
                       pheno,
                       return_type='dataframe',
                       NA_action='raise')

    mask_std = ~pheno[case].to_numpy(dtype=bool)
    conn_std = standardize(mask_std, conn)

    for edge in range(n_edges):
        model_std = sm.OLS(conn_std[:, edge], dmat)
        results_std = model_std.fit()
        betas_std[edge] = results_std.params[case]
        pvalues[edge] = results_std.pvalues[case]
    mt = multipletests(pvalues, method='fdr_bh')
    reject = mt[0]
    qvalues = mt[1]

    table = pd.DataFrame(np.array([betas_std, pvalues, qvalues,
                                   reject]).transpose(),
                         columns=['betas_std', 'pvalues', 'qvalues', 'reject'])
    return table
Example #17
from pylearn2.datasets import preprocessing
from pylearn2.format.target_format import convert_to_one_hot
from pylearn2.utils import serial

import pylab as plt
import cPickle as pickle
import numpy as np

import util
import dictionary_learning

if __name__ == "__main__":

    #train_data, train_labels = util.load_labeled_training(flatten=True, zero_index=True)
    #train_data = util.standardize(train_data)
    test_data = util.load_all_test(flatten=True)
    test_data = util.standardize(test_data)

    #train_data_20, _, test_data_20 = dictionary_learning.get_dictionary_data(n_comp=20, zero_index=True)
    #train_data_100, _, test_data_100 = dictionary_learning.get_dictionary_data(n_comp=100, zero_index=True)

    # convert the training labels into one-hot format, as required by the pylearn2 model
    #train_labels = convert_to_one_hot(train_labels, dtype='int64', max_labels=7, mode='stack')

    # pickle the data
    #serial.save('training_data_for_pylearn2.pkl', train_data)
    #serial.save('training_data_20_components_for_pylearn2.pkl', train_data_20)
    #serial.save('training_data_100_components_for_pylearn2.pkl', train_data_100)

    #serial.save('training_labels_for_pylearn2.pkl', train_labels)

    serial.save('test_data_for_pylearn2.pkl', test_data)
Example #18
def surf_multiple(phasor,
                  position,
                  Nx: int,
                  Ny: int,
                  prefix='',
                  filename=None):
    labels = [
        'Amplitude', 'Amplitude', 'Irradiance'
        # , 'Log Irradiance'
    ]
    for i, label in enumerate(labels):
        amp = phasor[:, 0]
        if label == 'Amplitude':
            z = amp
        else:
            z = amp**2
        if i == 3:
            log_irradiance = np.log(np.clip(amp**2, 1e-9, None))
            # z = log_irradiance
            # log_irradiance = np.log(util.irradiance(
            #     util.to_polar(a, phi), normalize=False))
            z = util.standardize(log_irradiance)
            assert abs(z.min()) < 1e-3
            assert abs(1 - z.max()) < 1e-3

        z_log = i in [1, 2]
        lower_bound = 1e-6  # assume max = 1
        if z_log:
            # manual log zscale
            z = np.clip(z, lower_bound, None)
            mini, maxi = z.min(), z.max()
            if mini == maxi or mini <= 0:
                continue
            else:
                mini = round(np.log10(mini))
                maxi = round(np.log10(maxi))
                if mini == maxi:
                    continue
                z = np.log10(z)

        # ignore third dimension in position
        ax, _ = surf(position[:, 0], position[:, 1], z, Nx, Ny)
        if z_log and mini != maxi:
            n_ticks = int(maxi - mini) + 1
            if n_ticks > 8 and n_ticks % 2 == 1:
                n_ticks = round(n_ticks / 2.)

            assert (n_ticks > 1)
            ticks = np.linspace(mini, maxi, n_ticks,
                                endpoint=True).round().astype(int)
            labels = [f'$10^{{{v}}}$' for v in ticks]
            # ax.set_zticks(ticks) # auto
            ax.set_zticklabels(labels)

        plt.xlabel('Space')
        plt.ylabel('Space')
        formatter = tck.EngFormatter(places=1, sep=u"\N{THIN SPACE}", unit='m')
        ax.xaxis.set_major_formatter(formatter)
        ax.yaxis.set_major_formatter(formatter)
        plt.xticks(rotation=ANGLES[0] / 2, rotation_mode='anchor')
        plt.yticks(rotation=-ANGLES[1] / 4, rotation_mode='anchor')
        plot.set_num_xyticks(3)

        plt.title(f'{prefix}{label}')
        plt.tight_layout()
        if filename is None:
            plt.show()
        else:
            suffix = label.replace(' ', '') + f'-{i}'
            plot.save_fig(f'{filename}_{suffix}', ext='png')

        plt.close()
Example #19
def predict(obs, mean, std):
    if STANDARDIZE:
      obs = util.standardize(obs, mean, std)
    actions = sess.run([preds], {x: obs, keep_prob: 1})
    return actions
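
Here util.standardize takes precomputed statistics rather than computing its own. A minimal sketch under that assumption:

import numpy as np

def standardize(obs, mean, std, eps=1e-8):
    # z-score observations with mean/std computed on the training set.
    return (np.asarray(obs, dtype=float) - mean) / (np.asarray(std, dtype=float) + eps)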
ax.scatter(x_ok, y_ok, color='b', label='Admitted')
plt.show()

# Print sigmoid of matrix
arr = np.array([[0, 1], [2, 3]])
print(util.sigmoid(arr))

# Implement vectorised version of logistic cost function
beta = np.array([0, 0, 0])  # Test value should be 0.6931
print("Cost for [0,0,0]: ", util.costLogistic(Xe_norm, y, beta))

alpha = 0.5
iterations = 1000
beta = util.GDLogistic(iterations, alpha, beta, Xe_norm, y)
student = np.array([[45, 85]])
student_n = util.standardize(student, feature_mean, feature_std)

student_ne = util.extendMatrix(student_n)
print("Probability of admission: ", util.sigmoid(np.dot(student_ne, beta)))
util.trainingErrs(Xe_norm, beta, y)

# PLOT MESH GRID
h = .01  # step size in the mesh

x_min, x_max = X_norm[:, 0].min() - 0.1, X_norm[:, 0].max() + 0.1
y_min, y_max = X_norm[:, 1].min() - 0.1, X_norm[:, 1].max() + 0.1

xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max,
                                                           h))  # Mesh Grid
x1, x2 = xx.ravel(), yy.ravel()  # Turn to two Nx1 arrays
XXe = util.mapFeature(x1, x2, 1)  # Extend matrix for degree 2
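
The mesh-grid block stops here, before the plot it was building. A plausible continuation, assuming XXe carries the same columns beta was trained on, drawing the 0.5-probability decision boundary over the standardized feature space (a sketch, not the original code):

# Evaluate the fitted model on every grid point and draw the decision boundary.
probs = util.sigmoid(np.dot(XXe, beta)).reshape(xx.shape)
plt.contour(xx, yy, probs, levels=[0.5], colors='k')
plt.scatter(X_norm[:, 0], X_norm[:, 1], c=y, cmap='bwr', s=10)
plt.xlabel('Feature 1 (standardized)')
plt.ylabel('Feature 2 (standardized)')
plt.show()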
Example #21
def get_dictionary_data(n_comp=20, zero_index=True):
    unlabeled = util.load_unlabeled_training(flatten=False)
    height, width = 32, 32
    n_images = 10000
    patch_size = (8, 8)

    unlabeled = util.standardize(unlabeled)
    np.random.shuffle(unlabeled)

    print('Extracting reference patches...')

    patches = np.empty((0, 64))
    t0 = time()

    for image in unlabeled[:n_images, :, :]:
        data = np.array(extract_patches_2d(image, patch_size, max_patches=0.10))
        data = data.reshape(data.shape[0], -1)
        data -= np.mean(data, axis=0)
        data /= np.std(data, axis=0) + 1e-20
        patches = np.concatenate([patches, data])

    print('done in %.2fs.' % (time() - t0))

    # whiten the patches
    z = zca.ZCA()
    z.fit(patches)
    z.transform(patches)

    print('Learning the dictionary...')
    t0 = time()
    dico = MiniBatchDictionaryLearning(n_components=n_comp, alpha=1)
    V = dico.fit(patches).components_
    dt = time() - t0
    print('done in %.2fs.' % dt)

    #plt.figure(figsize=(4.2, 4))
    #for i, comp in enumerate(V[:100]):
    #    plt.subplot(10, 10, i + 1)
    #    plt.imshow(comp.reshape(patch_size), cmap=plt.cm.gray_r,
    #               interpolation='nearest')
    #    plt.xticks(())
    #    plt.yticks(())
    #plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)
    #plt.show()

    labeled_data, labels = util.load_labeled_training(flatten=False, zero_index=True)
    labeled_data = util.standardize(labeled_data)

    test_data = util.load_all_test(flatten=False)
    test_data = util.standardize(test_data)

    #util.render_matrix(test_data, flattened=False)

    print('Training SVM with the training images...')
    t0 = time()
    reconstructed_images = np.empty((0, 64))
    multiplied_labels = np.empty((0))

    for i in range(len(labeled_data)):
        image = labeled_data[i, :, :]
        label = labels[i]
        data = extract_patches_2d(image, patch_size, max_patches=0.50)
        data = data.reshape(data.shape[0], -1)
        data -= np.mean(data, axis=0)
        data /= np.std(data, axis=0) + 1e-20

        code = dico.transform(data)
        patches = np.dot(code, V)
        z.transform(patches)

        reconstructed_images = np.concatenate([reconstructed_images, patches])
        extended_labels = np.asarray([label] * len(patches))
        multiplied_labels = np.concatenate([multiplied_labels, extended_labels])

    print(reconstructed_images.shape, multiplied_labels.shape)
    svc = SVC()
    #print('Getting cross-val scores...')
    #scores = cross_validation.cross_val_score(svc, reconstructed_images, multiplied_labels, cv=10)
    #print('cross-val scores:', scores)
    #print('cross-val mean:', np.mean(scores))
    #print('cross-val variance:', np.var(scores))

    print('done in %.2fs.' % (time() - t0))

    svc.fit(reconstructed_images, multiplied_labels)

    print('Reconstructing the test images...')
    t0 = time()

    predictions = []

    for i, image in enumerate(test_data):
        data = extract_patches_2d(image, patch_size, max_patches=0.25)
        data = data.reshape(data.shape[0], -1)
        data -= np.mean(data, axis=0)
        data /= np.std(data, axis=0) + 1e-20

        code = dico.transform(data)
        patches = np.dot(code, V)
        z.transform(patches)

        pred = svc.predict(patches)
        print('Variance in the predictions:', np.var(pred))
        predictions.append(mode(pred))

    print('done in %.2fs.' % (time() - t0))

    predictions = np.asarray(predictions) + 1  # labels were zero-indexed during training
    util.write_results(predictions, 'svm_patches_25_percent_20_comp.csv')
Example #22
def main():
    # Read in Data
    data = readData("spambase.data")

    # Randomizes the data
    X = randomize(data)
    Y = X[:, -1]  # Only the last column
    X = X[:, :-1]  # All but the last column
    D = len(X[0])

    # Standardize
    standardized = standardize(X)

    # Select first 2/3 for training
    index = int(math.ceil((2.0 / 3.0) * len(X)))
    training = standardized[:index + 1]
    testing = standardized[index + 1:]
    Y_testing = Y[index + 1:]

    # Divide training data into two groups
    positive = []
    negative = []
    for i in range(0, len(training)):
        if Y[i] == 1:  # spam
            positive.append(training[i])
        else:
            negative.append(training[i])
    positive = numpy.array(positive).astype(float)
    negative = numpy.array(negative).astype(float)

    # Compute models for spam
    positive_model = []
    for k in range(0, D):
        positive_model.append(
            (numpy.mean(positive[:, k]), numpy.std(positive[:, k])))

    # Compute models for non-spam
    negative_model = []
    for k in range(0, D):
        negative_model.append(
            (numpy.mean(negative[:, k]), numpy.std(negative[:, k])))

    # Classify testing samples
    result = []
    for sample in testing:
        p_positive = float(len(positive)) / (len(positive) + len(negative))
        p_negative = float(len(negative)) / (len(positive) + len(negative))
        for k in range(0, D):
            p_positive *= likelihood(positive_model[k][0],
                                     positive_model[k][1], sample[k])
            p_negative *= likelihood(negative_model[k][0],
                                     negative_model[k][1], sample[k])

        if p_positive > p_negative:
            result.append(1)
        else:
            result.append(0)

    # Compute statistics
    TruePositives = 0.0
    TrueNegatives = 0.0
    FalsePositives = 0.0
    FalseNegatives = 0.0
    for i in range(0, len(result)):
        if Y_testing[i] == 1:  # Positive example
            if result[i] == 1:  # Predicted positive
                TruePositives += 1
            elif result[i] == 0:  # Predicted negative
                FalseNegatives += 1
        elif Y_testing[i] == 0:  # Negative example
            if result[i] == 1:  # Predicted positive
                FalsePositives += 1
            elif result[i] == 0:  # Predicted negative
                TrueNegatives += 1

    try:
        precision = TruePositives / (TruePositives + FalsePositives)
        recall = TruePositives / (TruePositives + FalseNegatives)
        f_measure = (2 * precision * recall) / (precision + recall)
        accuracy = (TruePositives + TrueNegatives) / (
            TruePositives + TrueNegatives + FalsePositives + FalseNegatives)

        print 'Precision: ' + str(precision)
        print 'Recall: ' + str(recall)
        print 'F-measure: ' + str(f_measure)
        print 'Accuracy: ' + str(accuracy)
    except ZeroDivisionError:
        # Degenerate split (no predicted positives or no actual positives); skip the report.
        pass
Example #23
# Normalize
X_norm, feature_mean, feature_std = util.standardizeSet(X)

# Extend normalized data
Xe_norm = util.extendMatrix(X_norm)

for i in range(1, 7):
    plt.xlim(-3, 3)
    util.createSubScatterPlot(util.stdFeature(Xe_norm[:, i]), y, f'Feature {i}', 'Y', 2, 3, i)

plt.xlim(-3, 3)
plt.show()

gpu = np.array([2432, 1607, 1683, 8, 8, 256])
gpu_norm = util.standardize(gpu, feature_mean, feature_std)
gpu_norm_e = np.array([1, gpu_norm[0], gpu_norm[1], gpu_norm[2], gpu_norm[3], gpu_norm[4], gpu_norm[5]])

gpu = np.array([1, 2432, 1607, 1683, 8, 8, 256])
beta = util.calcBeta(Xe, y)
print("Benchmark using normal eq: ", util.normalEq(Xe, y, gpu))
print("Cost function: ", util.cost(Xe, y, beta))
# 12.3964
beta2 = util.calcBeta(Xe_norm, y)
print("Cost function normalized: ", util.cost(Xe_norm, y, beta2))
print("Benchmark on normalized data: ", util.normalEq(Xe_norm, y, gpu_norm_e))


# Implement vectorized version of gradient descent
iterations = 10000
alpha = 0.02
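
The gradient-descent step itself is cut off in this excerpt. A minimal vectorized sketch for linear regression on the extended, normalized matrix (an assumption about what follows; the original likely calls a util helper instead):

import numpy as np

beta_gd = np.zeros(Xe_norm.shape[1])
n = Xe_norm.shape[0]
for _ in range(iterations):
    gradient = Xe_norm.T.dot(Xe_norm.dot(beta_gd) - y) / n
    beta_gd = beta_gd - alpha * gradient

print("Cost after gradient descent: ", util.cost(Xe_norm, y, beta_gd))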