Example #1
import re


def main(argv):

    training_set = argv[1]
    test_set = argv[2]
    algorithm = argv[3]

    training_set = read_csv(training_set)
    test_set = read_csv(test_set)

    algorithm = algorithm.upper()

    if algorithm == 'NB':

        nb = NaiveBayes()
        nb.calculate_nb(training_set, test_set)

    else:
        # extract an optional leading neighbour count from an argument such as "5NN"
        k = None
        int_match = re.findall(r'\d+', algorithm)
        if int_match:
            algorithm = algorithm.strip(int_match[0])
            k = int(int_match[0])
        if algorithm == 'NN' and k is not None:
            nn = KNearestNeighbor()
            nn.calculate_knn(training_set, test_set, k)

        exit()
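For reference, a hedged invocation sketch for the entry point above; the script name and CSV paths are placeholders, and it assumes read_csv, NaiveBayes and KNearestNeighbor are defined in the same module.

# Hypothetical invocation (file and data names are placeholders):
#   python knn_runner.py train.csv test.csv 5NN   -> k-NN with k = 5
#   python knn_runner.py train.csv test.csv NB    -> Naive Bayes
if __name__ == '__main__':
    import sys
    main(sys.argv)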
Example #2
def knn_prediction():
	
	
	X_train, X_test, y_train, y_test = train_test_split(numbers_of_dates[50:],
                                                        country_data[column].values,
                                                        test_size=0.2, shuffle=False)
	
	# st.write("Shape for " + column + " Cases")
	# st.write('X_train', X_train.shape)
	# st.write('y_train', y_train.shape)
	# st.write('X_test', X_test.shape)
	# st.write('y_test', y_test.shape)
	
	knn_reg = KNearestNeighbor(K=7)
	# knn = KNeighborsClassifier(n_neighbors=200)
	knn_reg.fit(X_train, y_train)
	#knn_pred = knn_reg.predict(numbers_start_to_futures[50:].reshape(-1, 1))
	y_pred = knn_reg.predict(X_test)
	
	st.write("Model Evaluation for " + country + " Prediction")
	mse = mean_squared_error(y_pred, y_test)
	rmse = math.sqrt(mse)
	st.write("Root Mean Square Error for KNN: ", round(rmse, 2))
	mae = mean_absolute_error(y_pred, y_test)
	st.write("Mean Absolute Error for KNN: ", round(mae, 2))
	
	# r2_score returns a fraction, so convert it to a percentage before displaying
	r2score = metrics.r2_score(y_test, y_pred) * 100
	st.write("R_squared = " + str(round(r2score, 2)) + "%")
	st.write("The predicted cases will be approximately: " + str(y_pred[-1:]) + " on ", dates_start_to_futures[-1:])
	return rmse, r2score
Example #3
    def __init__(self):
        self.trainNum = 5000
        self.testNum = 1000
        self.dataBaseUrl = "DataSet/"
        self.resultBasePath = "Assessment/"
        self.totalNum = self.trainNum + self.testNum
        self.knn = KNearestNeighbor()
        self.k = 12

Example #4
def runRegressionAlgorithms(dataset):
    k_values = [5, 10, 15]
    # run k-NN for each value of k
    for i in k_values:
        print("Running KNN with K of {}".format(i))
        dataset.runAlgorithm(KNearestNeighbor(i))
    k = math.ceil(len(dataset.data) / 4)
    # run k-means with k derived from the dataset size
    KMeans(dataset, k)
Example #5
def runClassificationAlgorithms(dataset):
    k_values = [5, 10, 15]
    # run each algorithm with respect to each k value
    for i in k_values:
        print("Running KNN with K of {}".format(i))
        dataset.runAlgorithm(KNearestNeighbor(i))
    for i in k_values:
        print("Running CNN with K of {}".format(i))
        dataset.runAlgorithm(CondensedNearestNeighbor(i))
    for i in k_values:
        print("Running ENN with K of {}".format(i))
        dataset.runAlgorithm(EditedNearestNeighbor(i))
    # run k-means after the last ENN run
    KMeans(dataset, 3)
Example #6
def recovered_with_knn():
	# Splitting the dataset related to Recovered cases of the world into training and test sets
	
	X_train_recovered, X_test_recovered, y_train_recovered, y_test_recovered = train_test_split(numbers_of_dates[50:],
	                                                                                            covid19_world['Recovered'][50:].values,
	                                                                                            test_size=0.2, shuffle=False)
	
	# st.write("Recovered case shape")
	# st.write('X_train', X_train_recovered.shape)
	# st.write('y_train', y_train_recovered.shape)
	# st.write('X_test', X_test_recovered.shape)
	# st.write('y_test', y_test_recovered.shape)
	
	knn_reg = KNearestNeighbor(K=30)
	# knn = KNeighborsClassifier(n_neighbors=200)
	knn_reg.fit(X_train_recovered, y_train_recovered)
	knn_pred = knn_reg.predict(numbers_start_to_futures[50:].reshape(-1, 1))
	y_pred = knn_reg.predict(X_test_recovered)
	
	# st.dataframe(X_test_recovered)
	# st.dataframe(y_pred)
	# plt.plot(y_test_recovered)
	# plt.plot(y_pred)
	# plt.legend(['Test Data', 'KNN Predictions'])
	# st.pyplot()
	
	mse = mean_squared_error(y_pred, y_test_recovered)
	rmse = math.sqrt(mse)
	st.write("Root Mean Square Error: ", round(rmse, 2))
	mae = mean_absolute_error(y_pred, y_test_recovered)
	st.write("Mean Absolute Error: ", round(mae, 2))
	r2score = metrics.r2_score(y_test_recovered, y_pred)
	# r2_score returns a fraction, so convert it to a percentage before displaying
	st.write("R_squared = " + str(round(r2score * 100, 2)) + "%")
	
	dates = dates_start_to_futures[50:-10]
	world_df = covid19_world.iloc[50:, :]
	
	st.write("The predicted cases will be approximately: " + str(knn_pred[-1:]) + " on ", dates_start_to_futures[-1:])
	## Plot the actual and KNN-predicted Recovered cases
	
	
	
	plt.figure(figsize=(12, 8))
	plt.xticks(rotation=60, fontsize=11)
	plt.yticks(fontsize=10)
	plt.xlabel("Dates", fontsize=20)
	plt.ylabel('World-Total Recovered cases', fontsize=20)
	plt.title("Predicted values of Recovered cases with KNN", fontsize=18)
	
	plt.plot_date(y=world_df['Recovered'].values, x=dates, label='Recovered', alpha=0.5, linestyle='-', color='cyan')
	plt.plot_date(y=knn_pred, x=dates_start_to_futures[50:], label='forecast', alpha=0.4, linestyle='-', color='orange')
	plt.legend()
	st.set_option('deprecation.showPyplotGlobalUse', False)
	st.pyplot()
	
	return rmse
Example #7
import time

import numpy as np


def time_function(f, *args):
    """Call function f with args and return the time (in seconds) it takes to execute."""
    tic = time.time()
    f(*args)
    toc = time.time()
    return toc - tic


'''
two_loop_time = time_function(classifier.compute_distances_two_loops, X_test)
print('Two loop version took %f seconds' % two_loop_time)

one_loop_time = time_function(classifier.compute_distances_one_loop, X_test)
print('One loop version took %f seconds' % one_loop_time)

no_loop_time = time_function(classifier.compute_distances_no_loops, X_test)
print('No loop version took %f seconds' % no_loop_time)
'''
classifier = KNearestNeighbor()
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

X_train_folds = []
y_train_folds = []
################################################################################
# TODO:                                                                        #
# Split up the training data into folds. After splitting, X_train_folds and    #
# y_train_folds should each be lists of length num_folds, where                #
# y_train_folds[i] is the label vector for the points in X_train_folds[i].     #
# Hint: Look up the numpy array_split function.                                #
################################################################################
X_train_folds = np.array_split(X_train, num_folds)
y_train_folds = np.array_split(y_train, num_folds)
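A hedged sketch of the cross-validation loop that typically follows this split, reusing the train(), compute_distances_no_loops() and predict_labels() methods already used elsewhere on this page; the variable names below are illustrative.

# Sketch only: run num_folds-fold cross-validation for every k in k_choices.
k_to_accuracies = {k: [] for k in k_choices}
for k in k_choices:
    for fold in range(num_folds):
        # Hold out fold `fold` for validation, train on the remaining folds.
        X_val = X_train_folds[fold]
        y_val = y_train_folds[fold]
        X_tr = np.concatenate(X_train_folds[:fold] + X_train_folds[fold + 1:])
        y_tr = np.concatenate(y_train_folds[:fold] + y_train_folds[fold + 1:])

        classifier.train(X_tr, y_tr)
        dists = classifier.compute_distances_no_loops(X_val)
        y_val_pred = classifier.predict_labels(dists, k=k)
        k_to_accuracies[k].append(np.mean(y_val_pred == y_val))

for k in sorted(k_to_accuracies):
    print('k = %d, mean accuracy = %f' % (k, np.mean(k_to_accuracies[k])))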
Example #8
class Assessment(object):
    def __init__(self):
        self.trainNum = 5000
        self.testNum = 1000
        self.dataBaseUrl = "DataSet/"
        self.resultBasePath = "Assessment/"
        self.totalNum = self.trainNum + self.testNum
        self.knn = KNearestNeighbor()
        self.k = 12


    def assess(self, k=None):
        if not k:
            k = self.k
        print "reading data..."
        pictures = readCsv(self.dataBaseUrl + "data", self.totalNum)

        dao = ImageDao()
        imgs = dao.getAll()
        typeDict = {}
        for img in imgs:
            typeDict[img.imgId] = img.imgType

        print "training..."
        trainSet = pictures[:self.trainNum]
        self.knn.train(trainSet)

        testSet = pictures[self.trainNum:self.totalNum]

        zerNp = np.zeros([k, self.testNum])
        testLabel = np.arange(self.trainNum, self.totalNum)
        for i in range(len(testLabel)):
            testLabel[i] = typeDict[str(testLabel[i]).zfill(5)]
        testLabel = (zerNp + testLabel).astype('int').T

        print "predicting..."
        accuracy, avgCriDist = self.knn.predictForManyWithK(
            testSet, testLabel, k, typeDict)

        print "accuracy:%f%%       averageCriticalDist:%f" % (accuracy * 100,
                                                              avgCriDist)

    def assessWithoutK(self):
        print "reading data..."
        pictures = readCsv(self.dataBaseUrl + "data", self.totalNum)

        dao = ImageDao()
        imgs = dao.getAll()
        typeDict = {}
        for img in imgs:
            typeDict[img.imgId] = img.imgType

        print "training..."
        trainSet = pictures[:self.trainNum]
        self.knn.train(trainSet)

        testSet = pictures[self.trainNum:self.totalNum]

        accuracyList = []
        heads = ['k', 'accuracy', 'averageCriticalDist']

        print "predicting..."
        for k in range(1, 101):
            zerNp = np.zeros([k, self.testNum])
            testLabel = np.arange(self.trainNum, self.totalNum)
            for i in range(len(testLabel)):
                testLabel[i] = typeDict[str(testLabel[i]).zfill(5)]
            testLabel = (zerNp + testLabel).astype('int').T

            accuracy, avgCriDist = self.knn.predictForManyWithK(
                testSet, testLabel, k, typeDict)

            item = [k, accuracy, avgCriDist]
            accuracyList.append(item)

            print "k:%d     accuracy:%f%%       averageCriticalDist:%f" % (
                k, accuracy * 100, avgCriDist)

        saveCsv(self.resultBasePath + 'assessKWithoutBlur.csv', heads,
                accuracyList)

    def assessWithoutDist(self):
        print "reading data..."
        pictures = readCsv(self.dataBaseUrl + "data", self.totalNum)

        dao = ImageDao()
        imgs = dao.getAll()
        typeDict = {}
        for img in imgs:
            typeDict[img.imgId] = img.imgType

        print "training..."
        trainSet = pictures[:self.trainNum]
        self.knn.train(trainSet)

        testSet = pictures[self.trainNum:self.totalNum]

        accuracyList = []
        heads = ['distance', 'accuracy', 'averageK']

        print "predicting..."
        for d in range(2000, 4000, 20):
            accuracy, avgK = self.knn.predictForManyWithDist(
                testSet, self.trainNum, d, typeDict)

            item = [d, accuracy, avgK]
            accuracyList.append(item)

            print "distance:%d     accuracy:%f%%       averageK:%f" % (
                d, accuracy * 100, avgK)

        saveCsv(self.resultBasePath + 'assessDist_Radius10_5000-1000.csv',
                heads, accuracyList)

    def assessWithoutRadius(self, k=None):
        if not k:
            k = self.k
        accuracyList = []
        heads = ['radius', 'accuracy', 'averageCriticalDist']

        for radius in range(5, 15):
            print radius, ':'
            print "blurring ..."
            data = MyData()
            data.saveCsvWithGaussianBlur(radius=radius)

            dao = ImageDao()
            imgs = dao.getAll()
            typeDict = {}
            for img in imgs:
                typeDict[img.imgId] = img.imgType

            zerNp = np.zeros([k, self.testNum])
            testLabel = np.arange(self.trainNum, self.totalNum)
            for i in range(len(testLabel)):
                testLabel[i] = typeDict[str(testLabel[i]).zfill(5)]
            testLabel = (zerNp + testLabel).astype('int').T

            pictures = readCsv(self.dataBaseUrl + "data", self.totalNum)

            print "training..."
            trainSet = pictures[:self.trainNum]
            self.knn.train(trainSet)

            testSet = pictures[self.trainNum:self.totalNum]

            print "predicting..."
            accuracy, avgCriDist = self.knn.predictForManyWithK(
                testSet, testLabel, k, typeDict)

            item = [radius, accuracy, avgCriDist]
            accuracyList.append(item)

            print "k:%d     radius:%f    accuracy:%f%%   averageCriticalDist:%f" % (
                k, radius, accuracy * 100, avgCriDist)

        saveCsv(self.resultBasePath + 'assessRadiusK' + str(k) + '.csv', heads,
                accuracyList)

    def assessWithoutRadiusAndK(self):
        accuracyList = []
        heads = ['radius', 'k', 'accuracy', 'averageCriticalDist']

        for radius in range(15, 25):
            print radius, ':'
            print "blurring ..."
            data = MyData()
            data.saveCsvWithGaussianBlur(radius=radius)

            dao = ImageDao()
            imgs = dao.getAll()
            typeDict = {}
            for img in imgs:
                typeDict[img.imgId] = img.imgType

            pictures = readCsv(self.dataBaseUrl + "data", self.totalNum)

            print "training..."
            trainSet = pictures[:self.trainNum]
            self.knn.train(trainSet)

            testSet = pictures[self.trainNum:self.totalNum]

            for k in range(5, 20):
                zerNp = np.zeros([k, self.testNum])
                testLabel = np.arange(self.trainNum, self.totalNum)
                for i in range(len(testLabel)):
                    testLabel[i] = typeDict[str(testLabel[i]).zfill(5)]
                testLabel = (zerNp + testLabel).astype('int').T

                print "predicting..."
                accuracy, avgCriDist = self.knn.predictForManyWithK(
                    testSet, testLabel, k, typeDict)

                item = [radius, k, accuracy, avgCriDist]
                accuracyList.append(item)

                print "radius:%f    k:%d     accuracy:%f%%   averageCriticalDist:%f" % \
                      (radius, k, accuracy * 100, avgCriDist)

        saveCsv(self.resultBasePath + 'assessRadius15-25AndK.csv', heads,
                accuracyList)

    def assessFeaWithoutK(self):
        print "reading data..."
        pictures = readCsv(self.dataBaseUrl + "netFea",
                           self.totalNum) * 256 * 40

        dao = ImageDao()
        imgs = dao.getAll()
        typeDict = {}
        for img in imgs:
            typeDict[img.imgId] = img.imgType

        print "training..."
        trainSet = pictures[:self.trainNum]
        self.knn.train(trainSet)

        testSet = pictures[self.trainNum:self.totalNum]

        accuracyList = []
        heads = ['k', 'accuracy', 'averageCriticalDist']

        print "predicting..."
        for k in range(1, 101):
            zerNp = np.zeros([k, self.testNum])
            testLabel = np.arange(self.trainNum, self.totalNum)
            for i in range(len(testLabel)):
                testLabel[i] = typeDict[str(testLabel[i]).zfill(5)]
            testLabel = (zerNp + testLabel).astype('int').T

            accuracy, avgCriDist = self.knn.predictForManyWithK(
                testSet, testLabel, k, typeDict)

            item = [k, accuracy, avgCriDist]
            accuracyList.append(item)

            print "k:%d     accuracy:%f%%       averageCriticalDist:%f" % (
                k, accuracy * 100, avgCriDist)

        saveCsv(self.resultBasePath + 'assessFeaWithoutK.csv', heads,
                accuracyList)

    def assessFeaWithBlurWithoutK(self):
        print "reading neaFeaData..."
        netFea = readCsv(self.dataBaseUrl + "netFea", self.totalNum) * 256 * 40
        print "reading data..."
        pictures = readCsv(self.dataBaseUrl + "data", self.totalNum)
        pictures = np.append(pictures, netFea, axis=1)
        print pictures.shape
        dao = ImageDao()
        imgs = dao.getAll()
        typeDict = {}
        for img in imgs:
            typeDict[img.imgId] = img.imgType

        print "training..."
        trainSet = pictures[:self.trainNum]
        self.knn.train(trainSet)

        testSet = pictures[self.trainNum:self.totalNum]

        accuracyList = []
        heads = ['k', 'accuracy', 'averageCriticalDist']

        print "predicting..."
        for k in range(1, 101):
            zerNp = np.zeros([k, self.testNum])
            testLabel = np.arange(self.trainNum, self.totalNum)
            for i in range(len(testLabel)):
                testLabel[i] = typeDict[str(testLabel[i]).zfill(5)]
            testLabel = (zerNp + testLabel).astype('int').T

            accuracy, avgCriDist = self.knn.predictForManyWithK(
                testSet, testLabel, k, typeDict)

            item = [k, accuracy, avgCriDist]
            accuracyList.append(item)

            print "k:%d     accuracy:%f%%       averageCriticalDist:%f" % (
                k, accuracy * 100, avgCriDist)

        saveCsv(self.resultBasePath + 'assessFeaWithBlurWithoutK.csv', heads,
                accuracyList)
Example #9
# using the function train_test_split from sklearn library
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Standardize the values using the StandardScaler class
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Now transform the values in standard form
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# using KNearestNeighbor class
knn = KNearestNeighbor(k=17)

# fit the model on the training data
knn.fit(X_train, y_train)

# predict the values using the predict() function (X_test is already a 2-D NumPy array)
y_pred = knn.predict(X_test)
# calculate the accuracy score
from sklearn.metrics import accuracy_score

print("Accuracy: ", accuracy_score(y_test, y_pred))


# create a function that reports whether the person will purchase or not
def predict_new():
    age = (int(input("Enter the age: ")))
Example #10
# Random forests
RF_classifier = RandomForestClassifier()
RF_classifier.fit(X, y)

#predicted = RF_classifier.predict(T)
#predicted_categories = labelEncoder.inverse_transform(predicted) # to write to file

# Naive Bayes
NB_classifier = GaussianNB()
NB_classifier.fit(X, y)

#predicted = NB_classifier.predict(T)
#predicted_categories = labelEncoder.inverse_transform(predicted) # to write to file

KNN_classifier = KNearestNeighbor()
KNN_classifier.fit(X, y)

#predictions = KNN_classifier.predict(T)

BM_classifier = BeatTheBenchmark(X, y)
BM_classifier.fit(X, y)
predicted = BM_classifier.predict(T)
predicted_categories = labelEncoder.inverse_transform(predicted)

result1 = zip(test_data['Id'], predicted_categories)

col = ["Accuracy", "Precision", "Recall", "F-Measure"]
bm_metrics = get_metrics(BM_classifier.classfier, X, y)
knn_metrics = get_metrics(KNN_classifier, X, y)
svm_metrics = get_metrics(SVM_classifier, X, y)
Example #11
 
################################## start: select the size of training data and test data #############################
numTraining = 5000   # training size
mask = list(range(numTraining))
xTrainTemp = xTrain[mask]
yTrainTemp = yTrain[mask]

numTest = 500
mask = list(range(numTest))
xTestTemp = xTest[mask]
yTestTemp = yTest[mask]
################################## end: select the size of training data and test data #############################

np.set_printoptions(precision=2, threshold=20000000000)

knnClassifier = KNearestNeighbor(xTrainTemp, yTrainTemp, xTestTemp)
#knnClassifier = KNearestNeighbor(xTrain, yTrain, xTest)

################################## start: compute the distance using 3 methods #############################
startTime = time.perf_counter()  # time.clock() was removed in Python 3.8
distanceNoLoop = knnClassifier.computerDistanceNoLoop()
#knnClassifier.saveDistanceToFile(distanceNoLoop, filePath+'/' + 'distanceNoLoop')
print(distanceNoLoop.shape)
print("No loop completed.")
endTime = time.perf_counter()
print("cost time: %f" % (endTime - startTime))

#startTime = time.perf_counter()
#distance1Loop = knnClassifier.computerDistanceWith1Loop()
#knnClassifier.saveDistanceToFile(distance1Loop, filePath+'/' + 'distance1Loop')
#print(distance1Loop.shape)
Example #12
##a = np.arange(15).reshape(3, 5)
##b = np.arange(10).reshape(2, 5)
##tr = np.square(a).sum(axis = 1) #per instance
##te = np.square(b).sum(axis = 1)
##M = np.dot(b,a.T)

X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
# print(X_train.shape, X_test.shape)

# CV: Just saves the data

# Create a kNN classifier instance.
# Remember that training a kNN classifier is a noop:
# the Classifier simply remembers the data and does no further processing
classifier = KNearestNeighbor()
classifier.train(X_train, y_train)
dists = classifier.compute_distances_no_loops(X_test)
#plt.imshow(dists, interpolation='none')
#plt.savefig('sample.png')

y_test_pred = classifier.predict_labels(dists, k=5)
num_test = X_test.shape[0]
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))

# Now lets speed up distance matrix computation by using partial vectorization
# with one loop. Implement the function compute_distances_one_loop and run the
# code below:
##dists_one = classifier.compute_distances_one_loop(X_test)
##
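For the commented-out call above, a hedged sketch of a partially vectorized compute_distances_one_loop method (one Python loop over test points, NumPy broadcasting over the training set); it assumes train() stored the training data in self.X_train, as the snippet above does.

import numpy as np

# Sketch of a method intended for the KNearestNeighbor class, not the class's own code.
def compute_distances_one_loop(self, X):
    num_test = X.shape[0]
    num_train = self.X_train.shape[0]
    dists = np.zeros((num_test, num_train))
    for i in range(num_test):
        # Broadcast (num_train, D) - (D,) and reduce over the feature axis.
        dists[i, :] = np.sqrt(np.sum((self.X_train - X[i]) ** 2, axis=1))
    return dists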
Example #13
def test_nearest_neighbor(test_x, test_y, model_file):
    knn = KNearestNeighbor()
    knn.load(model_file)
    print('model loaded successfully')
    score = knn.score(test_x, test_y)
    print('Testing Accuracy: ', score)
Example #14
def train_nearest_neighbor(train_x, train_y, model_file):
    knn = KNearestNeighbor(no_of_neighbors=55)
    knn.fit(train_x, train_y, batch_size=1000)
    print('training complete')
    knn.save(model_file)
    print('model saved successfully')
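Paired with test_nearest_neighbor from Example #13, a minimal usage sketch; the model path and data variables are placeholders.

model_file = 'knn_model.pkl'  # placeholder path
train_nearest_neighbor(train_x, train_y, model_file)
test_nearest_neighbor(test_x, test_y, model_file)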
Example #15
upperK = 50
select = -1
k = 0

optimalDist = 2200
optimalLeastK = 12
dataNum = -1
radius = 10

print "reading data..."
data = MyData()
pictures = readCsv(dataBaseUrl + "data", dataNum)

print "training..."
knn = KNearestNeighbor()
knn.train(pictures)

dao = ImageDao()
imgs = dao.getAll()
typeDict = {}
for img in imgs:
    typeDict[img.imgId] = img.imgUrl

# server.config['UPLOAD_FOLDER'] = os.getcwd()


@server.route('/', methods=['GET', 'POST'])
def home():
    print "start..."
    pictureURL = url_for('static', filename=imgBaseURL + "empty.jpg")