import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors


def EpsDBSCAN(D, k):
    nn = NearestNeighbors(n_neighbors=k + 1)
    nn.fit(D)
    distances, indices = nn.kneighbors(D)
    # Drop the first column: each point's nearest neighbour is itself.
    distances = np.delete(distances, 0, 1)
    Dist = distances.max(axis=1)
    Array = sorted(Dist)
    AvgDist = distances.sum(axis=1) / k
    Avg_Array = sorted(AvgDist)
    plt.plot(Avg_Array, 'b')

    num = len(Avg_Array)
    minArray = min(Avg_Array)
    maxArray = max(Avg_Array)

    # Min-max normalise the sorted average k-NN distances into [0, 1].
    n_Array = [(d - minArray) / (maxArray - minArray) for d in Avg_Array]

    # Bucket the normalised distances into ten bins; each bin holding at
    # least k points contributes its mean distance as a candidate Eps.
    bins = np.linspace(0, 1, 10)
    bin_indice = np.digitize(n_Array, bins)
    Eps = []
    Avg_Array = np.array(Avg_Array)

    for i in range(10):
        count = len(np.where(bin_indice == i)[0])
        if count >= k:
            e = np.sum(Avg_Array[bin_indice == i], axis=0) / count
            plt.hlines(e, xmin=0, xmax=len(Array), colors='r')
            Eps.append(e)

    # For each candidate Eps, record the index of the first point whose
    # average k-NN distance exceeds it.
    N = len(Eps)
    Eps_index = []

    for i in range(N):
        for j in range(num):
            if Avg_Array[j] > Eps[i]:
                Eps_index.append(j)
                break

    ave_slope = (maxArray - minArray) / num

    # Walk the candidates and stop just before the first steep jump
    # (slope more than twice the average slope of the curve).
    out = Eps[0]
    for i in range(N - 1):
        slope = (Eps[i + 1] - Eps[i]) / (Eps_index[i + 1] - Eps_index[i])
        if slope > ave_slope * 2:
            out = Eps[i]
            break
        else:
            out = Eps[i + 1]

    return out
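A minimal usage sketch for the estimator above: the make_blobs data, k = 4, and the DBSCAN call are illustrative assumptions, not part of the original snippet.

from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs

D, _ = make_blobs(n_samples=500, centers=3, random_state=0)
k = 4
eps = EpsDBSCAN(D, k)  # hypothetical invocation
labels = DBSCAN(eps=eps, min_samples=k).fit_predict(D)
print("estimated eps:", eps, "clusters found:", len(set(labels) - {-1}))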
def EpsValue(D, k):
    nn = NearestNeighbors(n_neighbors=k + 1)
    nn.fit(D)
    distances, indices = nn.kneighbors(D)
    # Drop the self-distance column.
    distances = np.delete(distances, 0, 1)
    Dist = distances.max(axis=1)
    AvgDist = distances.sum(axis=1) / k

    # 1/100 of the range between the smallest average k-NN distance and
    # the largest k-th-NN distance.
    out = (max(Dist) - min(AvgDist)) / 100

    return min(AvgDist), out
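EpsValue appears to return a lower bound plus a step size for sweeping candidate Eps values; the loop below is an assumed usage (reusing the DBSCAN import and D, k from the sketch above), not code from the original source.

start, step = EpsValue(D, k)
for eps in (start + i * step for i in range(1, 101)):
    labels = DBSCAN(eps=eps, min_samples=k).fit_predict(D)
    n_clusters = len(set(labels) - {-1})  # cluster count at this eps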
Example #3
    def nn_latentspace(self, verbose=False):
        data_train, _, labels_train = self.labelled_set.get_latent()
        data_test, _, labels_test = self.unlabelled_set.get_latent()
        nn = KNeighborsClassifier()
        nn.fit(data_train, labels_train)
        score = nn.score(data_test, labels_test)
        if verbose:
            print("NN classifier score:", score)
            print("NN classifier tuple:",
                  compute_accuracy_tuple(labels_test, nn.predict(data_test)))
        return score
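The same check outside the class, as a standalone sketch: the random latent codes stand in for the self.labelled_set / self.unlabelled_set outputs and are purely illustrative.

import numpy as np
from sklearn.neighbors import KNeighborsClassifier

rng = np.random.default_rng(0)
data_train, labels_train = rng.normal(size=(200, 10)), rng.integers(0, 3, 200)
data_test, labels_test = rng.normal(size=(50, 10)), rng.integers(0, 3, 50)

knn = KNeighborsClassifier()
knn.fit(data_train, labels_train)
print("NN classifier score:", knn.score(data_test, labels_test))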
Example #4
def uniformly_random_subsample(pairs_file, n_samples, out_file):
    # Draw uniform random points in similarity space, then keep the real
    # pairs nearest to them, approximating a uniform subsample.
    pairs = pd.read_csv(pairs_file, sep='\t')
    samples = np.random.uniform(size=(n_samples, pairs.shape[1] - 2))

    nn = NearestNeighbors(n_neighbors=1, n_jobs=-1)
    nn.fit(pairs[['vec_sim', 'jac_sim', 'len_sim', 'top_sim']])

    index = pd.DataFrame(nn.kneighbors(samples, return_distance=False),
                         columns=['index'])
    df = pairs.reset_index().merge(index).drop_duplicates()

    df.to_csv(out_file, sep='\t', index=False)
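A hypothetical invocation; the file names are placeholders, not from the original source.

uniformly_random_subsample('pairs.tsv', n_samples=1000, out_file='pairs_subsample.tsv')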
Example #5
def plot(X_tr, y_tr, a_tr, X_val, y_val, a_val):
    class_err = np.zeros((11, 1))
    mae = np.zeros((11, 1))
    count = 0

    for i in [0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10]:
        nn = NN(i, 20)
        nn.fit(X_tr, y_tr, a_tr)
        nn.predict(X_tr)
        [class_err_1, class_err_2, mae1,
         mae2] = nn.calc_error(X_tr, y_tr, a_tr, X_val, y_val, a_val)
        class_err[count] = class_err_2
        mae[count] = mae2
        count += 1

    fig1 = plt.gcf()
    plt.plot(np.arange(0.0, 1.1, 0.1).tolist(),
             class_err,
             marker='o',
             linestyle='dashed',
             label=r'Observed Classification Error Rate (Y-axis) at '
                   r'Trade-off parameter $\alpha$ (X-axis)')
    plt.legend(prop={'size': 22})
    plt.ylabel('Classification Error on Validation Data', fontsize=22)
    plt.yticks(np.arange(0.0, 1.1, 0.1).tolist())
    plt.xlabel(r'Trade-off parameter $\alpha$', fontsize=22)
    plt.xticks(np.arange(0.0, 1.1, 0.1).tolist())
    plt.title('Classification Error vs Trade-off Parameter', fontsize=22)
    mng = plt.get_current_fig_manager()
    mng.window.showMaximized()
    plt.show()
    fig1.savefig('C.png')

    fig2 = plt.gcf()
    plt.plot(np.arange(0.0, 1.1, 0.1).tolist(),
             mae,
             marker='o',
             linestyle='dashed',
             label=r'Observed Mean Absolute Error (Y-axis) at '
                   r'Trade-off parameter $\alpha$ (X-axis)')
    plt.legend(prop={'size': 22})
    plt.ylabel('Mean Absolute Error on Validation Data', fontsize=22)
    plt.xlabel(r'Trade-off parameter $\alpha$', fontsize=22)
    plt.xticks(np.arange(0.0, 1.1, 0.1).tolist())
    plt.title('Mean Absolute Error vs Trade-off Parameter', fontsize=22)
    mng = plt.get_current_fig_manager()
    mng.window.showMaximized()
    plt.show()
    fig2.savefig('M.png')
Example #6
def main():

    DATA_DIR = '../data'
    data = np.load(os.path.join(DATA_DIR, "mnist_rot_train.npz"))
    X_tr, y_tr, a_tr = data["X"], data["labels"], data["angles"]

    data = np.load(os.path.join(DATA_DIR, "mnist_rot_validation.npz"))
    X_val, y_val, a_val = data["X"], data["labels"], data["angles"]

    #Note: test class labels and angles are not provided
    #in the data set
    data = np.load(os.path.join(DATA_DIR, "mnist_rot_test.npz"))
    X_te, y_te, a_te = data["X"], data["labels"], data["angles"]

    nn = BestNN(0.2, 20)
    nn.fit(X_tr, y_tr, a_tr)
Example #7
def experiment_scikitlearn_baselines(train_X, train_Y, test_X, test_Y):
    train_X = train_X.numpy().reshape(316, 28*50)
    train_Y = train_Y.numpy()  # convert the torch tensor for scikit-learn

    sv = svm.SVC()
    sv.fit(train_X, train_Y)

    nn = NearestCentroid()
    nn.fit(train_X, train_Y)

    ga = GaussianNB()
    ga.fit(train_X, train_Y)

    dt = tree.DecisionTreeClassifier()
    dt.fit(train_X, train_Y)

    test_X = test_X.numpy().reshape(100, 28*50)
    test_Y = test_Y.numpy()
    print("SVM " + str(accuracy_score(test_Y, sv.predict(test_X))))
    print("NN " + str(accuracy_score(test_Y, nn.predict(test_X))))
    print("Gausian " + str(accuracy_score(test_Y, ga.predict(test_X))))
    print("DT " + str(accuracy_score(test_Y, dt.predict(test_X))))
    print("Warning: The following is taking approximately 1.5 hours in an average laptop.")
Example #8
def main():

    DATA_DIR = 'data'
    data = np.load(os.path.join(DATA_DIR, "mnist_rot_train.npz"))
    X_tr, y_tr, a_tr = data["X"], data["labels"], data["angles"]

    data = np.load(os.path.join(DATA_DIR, "mnist_rot_validation.npz"))
    X_val, y_val, a_val = data["X"], data["labels"], data["angles"]

    #Note: test class labels and angles are not provided
    #in the data set
    data = np.load(os.path.join(DATA_DIR, "mnist_rot_test.npz"))
    X_te, y_te, a_te = data["X"], data["labels"], data["angles"]

    #plt.imshow(X_tr[0].reshape((28,28)))
    #plt.show()
    nn = BestNN(1.0, 30)
    #[X_tr, X_val, X_te] = nn.preprocess(X_tr, X_val, X_te)

    nn.fit(X_tr, y_tr, a_tr)
    nn.predict(X_tr)
    nn.calc_error(X_tr, y_tr, a_tr, X_val, y_val, a_val)
    nn.savetestpred(X_te)
Example #9
    def update_unlabelled_knn(self):
        with torch.set_grad_enabled(False):
            self.model.eval()
            latent_train, _, labels_train = self.labelled_set.get_latent()
            latent_test, _, labels_test = \
                self.unlabelled_set.sequential().get_latent()  # need original counts as input
            counts_test, _ = self.unlabelled_set.sequential().raw_data()
            nn = KNeighborsClassifier()
            # on a subset of the train data restriction
            # latent_train = latent_train
            # labels_train = labels_train
            nn.fit(latent_train, labels_train)
            print("SCORE nn :", nn.score(latent_test, labels_test))

            proba_test = nn.predict_proba(latent_test)
            classification_ratio = np.zeros(
                (len(latent_test), self.gene_dataset.n_labels))
            classification_ratio[:, nn.classes_] = proba_test

            to_keep = proba_test.max(axis=1) >= 0.8  # Threshold of confidence

            # # 1 - Maxime's initial formulation
            # classification_ratio = np.log(classification_ratio + 1e-8) * self.knn_classification_ratio
            # # 2 - Alternative here classification_ratio > 0 implies the opposite sign somewhere
            # classification_ratio = -np.log(1 - classification_ratio)

            counts_test, labels_test, classification_ratio = \
                counts_test[to_keep], labels_test[to_keep], classification_ratio[to_keep]

            labelled_test_set = TensorDataset(
                torch.from_numpy(counts_test.astype(np.float32)),
                torch.from_numpy(labels_test.astype(np.int64)),
                torch.from_numpy(classification_ratio.astype(np.float32)))

            self.unlabelled_knn_set = self.create_posterior(
                gene_dataset=labelled_test_set, shuffle=True)
            self.model.train()
Example #10
def neuralNetworkSK(xTrain, yTrain):
    xTrainFlat = flattenComponents(xTrain)  # helper defined in the surrounding module
    nn = MLPClassifier(activation='logistic', solver='sag')
    nn.fit(xTrainFlat, yTrain)
    return nn
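A hedged usage sketch; xTrain, yTrain, and xTest are assumed inputs, and flattenComponents comes from the same module as neuralNetworkSK.

model = neuralNetworkSK(xTrain, yTrain)
test_preds = model.predict(flattenComponents(xTest))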
Example #11
X_tr = X_tr.reshape(X_tr.shape[0], -1)
X_t = X_t.reshape(X_t.shape[0], -1)

one_hot = OneHotEncoder()

y_tr = train_dataset.targets.numpy().reshape(-1, 1)
y_t = test_dataset.targets.numpy().reshape(-1, 1)

y_tr1 = one_hot.fit_transform(y_tr).toarray()
y_t1 = one_hot.transform(y_t).toarray()  # reuse the categories fitted on the training labels

nn.fit(X_tr,
       y_tr1,
       epochs=1200,
       batch_size=64,
       loss=BinaryCrossEntropy(),
       optimizer=Adam(lr=0.001),
       show_progress=TQDM_TERMINAL)
preds = np.round(nn.predict(X_t))

total = len(preds)

correct = 0

for pred, y in zip(preds, y_t1):
    if pred.argmax() == y.argmax():
        correct += 1

print(f"Accuracy: {float(correct) * 100 / total}%")
Example #12
def main():
    # use the GPU if one is available
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print("Use " + str(device))

    # create dataset
    file_list = None
    for path, dirs, files in os.walk(test_path, topdown=False):
        file_list = list(files)

    # preprocessing steps
    transform = transforms.Compose([
        transforms.Resize((512, 512)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    test_dataset = Leaf_test_Dataset(file_list, test_path, transform)
    test_loader = DataLoader(dataset=test_dataset, batch_size=batchSize)

    print("Start testing:")

    # net model
    eff_models = []
    for model_path in eff_model_paths:
        eff_net = EfficientNet.from_name('efficientnet-b4')
        eff_net._fc = nn.Linear(eff_net._fc.in_features, 5)
        eff_net.load_state_dict(torch.load(model_path))
        eff_net = eff_net.to(device)
        eff_net.eval()
        eff_models.append(eff_net)

    preds = []
    result = None

    with torch.no_grad():
        batch_num = len(test_loader)
        for index, image in enumerate(test_loader):
            image = image.to(device)

            eff_result = []
            for eff_net in eff_models:
                output = eff_net(image)
                output = output.to('cpu')
                pred = output.argmax(dim=1, keepdim=True).flatten()
                eff_result.append(pred)

            if len(preds) == 0:
                preds = np.dstack(eff_result)[0]
            else:
                preds = np.vstack([preds, np.dstack(eff_result)[0]])

        # start train combine model
        df = pd.read_csv(pred_train_csv)

        # Remove rows where every model's prediction is wrong.
        # get the pred acc for this line
        def get_acc(pred_csv, index):
            label = pred_csv.loc[index, 'label']
            acc = 0
            for col in ['pred_0', 'pred_1', 'pred_2', 'pred_3', 'pred_4']:
                if pred_csv.loc[index, col] == label:
                    acc += 0.2
            return round(acc, 1)

        delete_index = []
        for index in range(len(df)):
            acc = get_acc(df, index)
            # remove noise data
            if acc <= 0:
                delete_index.append(index)

        df = df.drop(delete_index)
        df = df.reset_index(drop=True)

        X = np.array(df[["pred_0", "pred_1", "pred_2", "pred_3", "pred_4"]])
        y = np.array(df[["label"]]).flatten()
        from sklearn.neural_network import MLPClassifier
        # Neural-network combiner (note: this rebinds `nn`, shadowing torch.nn)
        nn = MLPClassifier(max_iter=2000)
        nn.fit(X, y)
        result = nn.predict(preds)

    pred_result = pd.concat(
        [pd.DataFrame(file_list, columns=['image_id']),
         pd.DataFrame(result, columns=['label'])],
        axis=1)
    pred_result.to_csv(output_path + "submission.csv", index=False, sep=',')

    print("Done.")
Example #13
def EpsDBSCAN(D, k):
    nn = NearestNeighbors(n_neighbors=k+1)
    nn.fit(D)
    distances, indices = nn.kneighbors(D)
    # Drop the self-distance column.
    distances = np.delete(distances, 0, 1)
    Dist = distances.max(axis=1)
    Array = sorted(Dist)
    AvgDist = distances.sum(axis=1)/k
    Avg_Array = sorted(AvgDist)

    num = len(Avg_Array)
    n_Array = [0 for i in range(num)]
    minArray = min(Avg_Array)
    maxArray = max(Avg_Array)

    for i in range(num):
        n_Array[i] = (Avg_Array[i] - minArray) / (maxArray - minArray)

    bins = np.linspace(0, 1, 10)
    bin_indice = np.digitize(n_Array, bins)
    Eps = []
    Avg_Array = np.array(Avg_Array)

    for i in range(10):
        count = len(np.where(bin_indice == i)[0])
        if count >= k:
            e = np.sum(Avg_Array[bin_indice == i], axis=0)/count
            Eps.append(e)

    N = len(Eps)
    Eps_index = []

    for i in range(N):
        for j in range(num):
            if Avg_Array[j] > Eps[i]:
                Eps_index.append(j)
                break

    # Slope between consecutive candidates on the sorted-distance curve.
    Slopes = []
    for i in range(N-1):
        slope = (Eps[i+1] - Eps[i]) / (Eps_index[i+1] - Eps_index[i])
        Slopes.append(slope)

    ave_slope = sum(Slopes)/len(Slopes)

    # Stop just before the first above-average jump in slope.
    for i in range(N-1):
        if i > 0 and Slopes[i] > ave_slope:
            out = Eps[i]
            break
        else:
            out = Eps[i+1]

    # Note: the slope-based choice above is then discarded; the snippet
    # finally returns the second candidate directly.
    out = Eps[1]

    return out