Example no. 1
def recommend_movies(movie_query, k_recommendations):
    raw_movies_data = []
    with open('Movie_Recommender/movies_recommendation_data.csv', 'r') as md:
        # Discard the first line (headings)
        next(md)
        # Read the data into memory
        for line in md.readlines():
            data_row = line.strip().split(',')
            raw_movies_data.append(data_row)

    # Prepare the data for use in the knn algorithm by picking
    # the relevant columns and converting the numeric columns
    # to numbers since they were read in as strings
    movies_recommendation_data = []
    for row in raw_movies_data:
        data_row = list(map(float, row[2:]))
        movies_recommendation_data.append(data_row)
    # Use the KNN algorithm to get the k_recommendations movies
    # that are most similar to the query movie.
    recommendation_indices, _ = knn(movies_recommendation_data,
                                    movie_query,
                                    k=k_recommendations,
                                    distance_fn=euclidean_distance,
                                    choice_fn=lambda x: None)  # no aggregation needed
    movie_recommendations = []
    for _, index in recommendation_indices:
        movie_recommendations.append(raw_movies_data[index])

    return movie_recommendations
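
Example no. 1 assumes a knn helper with the signature knn(data, query, k, distance_fn, choice_fn) and a euclidean_distance function, neither of which is shown. A minimal sketch consistent with the call site above (the original project's implementation may differ):

import math

def euclidean_distance(point1, point2):
    # Straight-line distance between two equal-length feature vectors
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(point1, point2)))

def knn(data, query, k, distance_fn, choice_fn):
    # Rank every example by its distance to the query, keep the k nearest,
    # and let choice_fn aggregate them (the recommender passes a no-op)
    distances_and_indices = sorted(
        (distance_fn(example, query), index)
        for index, example in enumerate(data))
    k_nearest = distances_and_indices[:k]
    return k_nearest, choice_fn([d for d, _ in k_nearest])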
Example no. 2
def main():
    x, y = preprocess('student-mat.csv')
    print(x.shape, y.shape)

    # Use 70% of the data for training
    num_training = int(0.7 * len(x))
    num_test = len(x) - num_training
    x_train = x[:num_training]
    y_train = y[:num_training]
    x_test = x[num_training:]
    y_test = y[num_training:]
    # Normalize to the [0, 1] range
    # normalizex(x_train, num_training)
    normalizey(y_train, num_training)
    # normalizex(x_test, num_test)
    normalizey(y_test, num_test)
    '''
    import matplotlib.pyplot as plt
    temp0x0 = []
    temp0x1 = []
    temp1x0 = []
    temp1x1 = []
    for i in range(len(x)):
        if y[i]:
            temp1x0.append(x[i][0])
            temp1x1.append(x[i][1])
        else:
            temp0x0.append(x[i][0])
            temp0x1.append(x[i][1])
    plt.plot(temp1x0, temp1x1, 'g^', temp0x0, temp0x1, 'bs')
    plt.axis([0, 21, 0, 21])
    plt.xlabel('x0')
    plt.ylabel('x1')
    plt.show()
    '''
    # Choose the algorithm
    if sys.argv[1] == 'knn':
        print('KNN')
        for k in range(1, 21):
            hit_cnt, f1score, y_test_pre = knn(k, 2, x_test, y_test, x_train,
                                               y_train)
            print('K = ', k, ', hit rate = ', hit_cnt / num_test, 'f1score = ',
                  f1score)
    elif sys.argv[1] == 'svm':
        print('SVM')
        normalize0(y_train)
        normalize0(y_test)
        for C in range(8):
            # Note: C is printed but never passed to testRbf, so every
            # iteration runs with the same parameters
            hit_cnt, f1score, y_test_pre = testRbf(200, x_train, y_train,
                                                   x_test, y_test)
            print('C = ', C, ', hit rate = ', hit_cnt / num_test, 'f1score = ',
                  f1score)
    else:
        print('Naive Bayes')
        hit_cnt, f1score, y_test_pre = bayes(x_train, y_train, x_test, y_test)
        print('hit rate = ', hit_cnt / num_test, 'f1score = ', f1score)
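
The knn, testRbf, and bayes helpers above each return a hit count and an F1 score but are not shown. Assuming binary 0/1 labels, the pair could be computed with a hypothetical helper along these lines:

def hit_and_f1(y_true, y_pred):
    # Hypothetical helper: hit count plus binary F1 from raw predictions
    tp = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 1)
    fp = sum(1 for t, p in zip(y_true, y_pred) if t == 0 and p == 1)
    fn = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 0)
    hit_cnt = sum(1 for t, p in zip(y_true, y_pred) if t == p)
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1score = (2 * precision * recall / (precision + recall)
               if precision + recall else 0.0)
    return hit_cnt, f1score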
Example no. 3
def main():
    train_file = 'train.txt'
    test_file = 'test.txt'
    bayes_accuracy = naive_bayes(train_file, test_file)
    knn_accuracy = knn(train_file, test_file, k=5)
    dt_accuracy, tree = decision_tree(train_file, test_file)
    with open('output.txt', 'w') as f:
        print_tree(tree, f)
        f.write('\n{}\t{}\t{}\n'.format(round(dt_accuracy, 2),
                                        round(knn_accuracy, 2),
                                        round(bayes_accuracy, 2)))
#  Creating model objects
model = args.model
if (model == "baseline"):
    model_obj = BaseLine(reviews, categories)

elif (model == "logreg"):
    model_obj = LogReg(reviews)

elif (model == "multinomialNB"):
    model_obj = NaiveBayes(reviews, "multinomial")

elif (model == "lda"):
    model_obj = TopicModel(reviews)

elif (model == "kNearestNeighbors"):
    model_obj = knn(reviews, target)

else:  # put additional models here.
    print("Argument Error: invalid model specified")
    sys.exit()

model_classified = []  #  classifications stored here
reviews = []  #  resetting reviews list to save memory

#  Reading test data into reviews list
if args.invert == "False":
    for classifier in categories:
        with open("spring-" + classifier + ".json") as json_file:
            for line in json_file:
                json_obj = json.loads(line)
                reviews += [(classifier, json_obj)]
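
As a design note, the if/elif chain above can be collapsed into a table-driven dispatch. A hypothetical equivalent using the same names from the snippet:

# Hypothetical alternative: table-driven dispatch over the same model names
MODEL_FACTORIES = {
    "baseline": lambda: BaseLine(reviews, categories),
    "logreg": lambda: LogReg(reviews),
    "multinomialNB": lambda: NaiveBayes(reviews, "multinomial"),
    "lda": lambda: TopicModel(reviews),
    "kNearestNeighbors": lambda: knn(reviews, target),
}
try:
    model_obj = MODEL_FACTORIES[args.model]()
except KeyError:
    sys.exit("Argument Error: invalid model specified")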
Example no. 5
KNN_TRAIN = "_SUBMIT_KNN.csv"
KNN_TRAIN_BN = "_SUBMIT_KNN_BN.csv"

RESULTADO = "Prediccion.csv"

bn_transformar(TRAIN, TRAIN_BN)
bn_transformar(TEST, TEST_BN)

ampliar_set(TRAIN, TRAIN_AMPLIADO)
ampliar_set(TRAIN_BN, TRAIN_AMPLIADO_BN)

rf(TRAIN_AMPLIADO, TEST, RF_TRAIN_AMPLIADO)
rf(TRAIN_AMPLIADO_BN, TEST_BN, RF_TRAIN_AMPLIADO_BN)

knn(TRAIN, TEST, KNN_TRAIN)
knn(TRAIN_BN, TEST_BN, KNN_TRAIN_BN)

submits = [KNN_TRAIN, KNN_TRAIN_BN, RF_TRAIN_AMPLIADO_BN, RF_TRAIN_AMPLIADO]
i_mejorPredictor = 0

democratizar(submits, RESULTADO, i_mejorPredictor)

b = timeit.default_timer()
secs = b - a
m, s = divmod(secs, 60)
m = int(m)
s = int(s)

print("Fin digit_recognizer (" + str(m) + ":" + str(s) + ")")
Example no. 6
            negFeatureFolders.append(
                [negFeature[j] for j in sequence[:posNum]])
    #print(np.array(negFeatureFolders).shape)
    for i in range(folderNum):
        subTrainFeature = negFeatureFolders[i]
        subTrainFeature.extend(posFeature)
        subTrainFeature = np.array(subTrainFeature)
        subTrainLabel = list(np.zeros(posNum))
        subTrainLabel.extend(list(np.ones(posNum)))
        subTrainLabel = np.array(subTrainLabel)
        print("=====%dst Bagging=====") % (i + 1)
        print("Positive: %d, Negative: %d") % (list(subTrainLabel).count(1),
                                               list(subTrainLabel).count(0))
        #print(subTrainFeature.shape)
        #print(subTrainLabel)
        predictedLabel_temp1 = knn(subTrainFeature, subTrainLabel, testFeature,
                                   5)
        predictedLabel_temp2 = decision_Tree(subTrainFeature, subTrainLabel,
                                             testFeature)
        predictedLabel_temp3 = adboostDT(subTrainFeature, subTrainLabel,
                                         testFeature)
        predictedLabel_temp4 = RandomForest_Classifer(subTrainFeature,
                                                      subTrainLabel,
                                                      testFeature)
        predictedLabel_temp5 = svmclassifier(subTrainFeature, subTrainLabel,
                                             testFeature, 1.0, 0.015625)
        predictedLabel_temp6 = logistic_regression(subTrainFeature,
                                                   subTrainLabel, testFeature)

        predictedLabel_voting1.append(predictedLabel_temp1)
        predictedLabel_voting2.append(predictedLabel_temp2)
        predictedLabel_voting3.append(predictedLabel_temp3)
# Set k, tolerance and max iterations
k = 6
tolerance = 0.0001  # only for k means
max_iterations = 300  # only for k means
# k-means clustering
# get the optimal number of clusters
dataset = np.delete(dataset, -1,
                    axis=1)  # drop the last column to run k-means clustering
#dataset = dataset[:,:-2]

km = K_Means(k, tolerance, max_iterations)
km.fit(dataset)
create_kmeans_csv(km.classes.items())
print(
    len(km.classes.items()), "clusters created from the k-means algorithm. \n"
    "Check k-means.csv for more information")

# KNN
flag = False
while not flag:
    k = input_k("Enter k to run the KNN algorithm!").response

    if k == 0 or k == "":
        flag = True
        print("Didn't execute algorithm")
    else:
        k = int(k)
        print("returned value is:", k)
        predictions = knn(X_train, X_test, k)
        create_knn_csv(X_test)
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from KNN import knn

iris = datasets.load_iris()

X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=0.75,
                                                    random_state=42)

model = knn()
model.fit(X_train, y_train)
y_preds = model.predict(X_test)

# Model accuracy score
print(f"Model accuracy score is {accuracy_score(y_test, y_preds)}")
        for i in range(folderNum):
            random.shuffle(sequence)
            negFeatureFolders.append([negFeature[j] for j in sequence[:posNum]])
    #print(np.array(negFeatureFolders).shape)
    for i in range(folderNum):
        subTrainFeature = negFeatureFolders[i]
        subTrainFeature.extend(posFeature)
        subTrainFeature = np.array(subTrainFeature)
        subTrainLabel = list(np.zeros(posNum))
        subTrainLabel.extend(list(np.ones(posNum)))
        subTrainLabel = np.array(subTrainLabel)
        print("=====%dst Bagging=====") % (i+1)
        print("Positive: %d, Negative: %d") % (list(subTrainLabel).count(1), list(subTrainLabel).count(0))
        #print(subTrainFeature.shape)
        #print(subTrainLabel)
        predictedLabel_temp1 = knn(subTrainFeature, subTrainLabel, testFeature, 5)
        predictedLabel_temp2 = decision_Tree(subTrainFeature, subTrainLabel, testFeature)
        predictedLabel_temp3 = adboostDT(subTrainFeature, subTrainLabel, testFeature)
        predictedLabel_temp4 = RandomForest_Classifer(subTrainFeature, subTrainLabel, testFeature)
        predictedLabel_temp5 = svmclassifier(subTrainFeature, subTrainLabel, testFeature, 1.0, 0.015625)
        predictedLabel_temp6 = logistic_regression(subTrainFeature, subTrainLabel, testFeature)

        predictedLabel_voting1.append(predictedLabel_temp1)
        predictedLabel_voting2.append(predictedLabel_temp2)
        predictedLabel_voting3.append(predictedLabel_temp3)
        predictedLabel_voting4.append(predictedLabel_temp4)
        predictedLabel_voting5.append(predictedLabel_temp5)
        predictedLabel_voting6.append(predictedLabel_temp6)
        print("KNN=====%dst predicted labels:") % (i+1)
        print(predictedLabel_temp1)
        print("DT=====%dst predicted labels:") % (i+1)
Example no. 12
def orient(name, filename, model_file, model):

    if name == 'train':

        if model == 'nearest' or model == 'best':
            train = pd.read_csv(filename, sep=' ', header=None)
            filename_knn = model_file
            # Use a context manager so the pickle file is flushed and closed
            with open(filename_knn, 'wb') as file:
                pickle.dump(train, file)

        if model == 'nnet':
            train = pd.read_csv(filename, sep=' ', header=None)
            x_train = train.drop(columns=[0, 1], axis=1)
            y_train = train[1]
            y_train = pd.get_dummies(y_train)
            y_columns = y_train.columns
            x_train = x_train.to_numpy()
            y_train = y_train.to_numpy()
            print(x_train.shape[0], 'train samples')
            a = nn(25, 0.001, 0.9)
            (w1, w2, w3, b1, b2, b3) = a.fit(x_train, y_train)

            weights = {
                'w1': w1,
                'w2': w2,
                'w3': w3,
                'b1': b1,
                'b2': b2,
                'b3': b3,
                'y_columns': y_columns
            }
            filename_nn = model_file
            with open(filename_nn, 'wb') as file:
                pickle.dump(weights, file)

        if model == 'tree':
            dtreemain(name, filename, model_file)

    if name == 'test':

        if model == 'nearest' or model == 'best':
            with open(model_file, 'rb') as file:
                train = pickle.load(file)
            test = pd.read_csv(filename, sep=' ', header=None)
            X_test = test.drop(columns=[0, 1], axis=1)
            y_filenames = test[0]
            y_test = test[1]
            X_test = X_test.to_numpy()
            y_test = y_test.to_numpy()
            obj = knn(10)
            ypred = obj.predict(train, X_test)
            f = open("output.txt", "w")
            for i in range(len(X_test)):
                with open('output.txt', 'a') as f:
                    f.write(str(y_filenames[i]) + ' ' + str(ypred[i]) + '\n')
        if model == 'tree':
            dtreemain(name, filename, model_file)

        if model == 'nnet':

            test = pd.read_csv(filename, sep=' ', header=None)
            x_test = test.drop(columns=[0, 1], axis=1)
            y_test = test[1]
            y_filenames = test[0]
            y_test = pd.get_dummies(y_test)
            x_test = x_test.to_numpy()
            y_test = y_test.to_numpy()
            print(x_test.shape[0], 'test samples')
            with open(model_file, 'rb') as file:
                new_weights = pickle.load(file)
            w1f = new_weights['w1']
            w2f = new_weights['w2']
            w3f = new_weights['w3']
            b1f = new_weights['b1']
            b2f = new_weights['b2']
            b3f = new_weights['b3']
            y_columns = new_weights['y_columns']
            a = nn(25, 0.001, 0.9)
            y_test_predicted = a.predict(x_test, w1f, w2f, w3f, b1f, b2f, b3f)
            # Mark each row's maximum score to get a one-hot prediction
            zero_one = (y_test_predicted == y_test_predicted.max(
                axis=1)[:, None]).astype(int)
            # A row is correct when every one-hot position matches
            diff = (y_test == zero_one).sum(axis=1)
            accuracy = np.count_nonzero(diff == y_test.shape[1])
            print('accuracy ', accuracy / diff.shape[0] * 100)
            f = open("Output.txt", "w")
            for i in range(len(x_test)):
                with open('Output.txt', 'a') as f:
                    f.write(
                        str(y_filenames[i]) + ' ' +
                        str(y_columns[np.argmax(y_test_predicted[i])]) + '\n')
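
The one-hot accuracy computation at the end of the nnet branch is compact; a tiny standalone illustration of the same trick with made-up arrays:

import numpy as np

# Made-up scores: each row's max becomes the predicted one-hot class
y_true = np.array([[1, 0, 0], [0, 1, 0]])
scores = np.array([[0.7, 0.2, 0.1], [0.4, 0.5, 0.1]])
zero_one = (scores == scores.max(axis=1)[:, None]).astype(int)
diff = (y_true == zero_one).sum(axis=1)  # 3 wherever all positions match
accuracy = np.count_nonzero(diff == y_true.shape[1]) / diff.shape[0]
print(accuracy)  # 1.0: both rows predicted correctly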
Example no. 13
from KNN import knn


# Fill in these placeholders before running
trainData = "<path to train data>"
testData = "<path to test data>"
categoricalIndices = []  # indices of any non-numerical rows

test = knn()

test.trainDataProcess(trainData, categoricalIndices)

test.testDataProcess(testData, True)

test.predict(5)
Example no. 14
from KNN import knn
from RandomForest import randomforest
from SupportVector import supportvector
# Use a distinct name so the instance does not shadow the imported knn class
knn_model = knn()
knn_model.info()  # return value unused
a = knn_model.kn()
rf = randomforest()
b = rf.rf()
sv = supportvector()
c = sv.svc()
print("Accuracy of KNN is : " + str(a))
print("Accuracy of RandomForest is : " + str(b))
print("Accuracy of SupportVector is : " + str(c))


    def get_recommended_movies(self, data, fav_movie):
        """Run the k-nearest neighbors algorithm."""
        recommended_movies = knn(data, fav_movie, k=5)
        return recommended_movies
Example no. 16
        data_y.append(row[0])
    data_x = hp.normalize(data_x)  # Normalize

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # The THING itself
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    ratio = 0.40  # Fraction of the data used as the known (training) base
    k = 3  # Number of neighbors
    distance_func = 'euclidean'
    bound = int(ratio * len(data_y))

    # The classification algorithm
    my_output_labels = knn(x_train=data_x[:bound],
                           y_train=data_y[:bound],
                           x_test=data_x[bound + 1:],
                           distance=distance_func,
                           k=k)

    print('Objective: ' + str(data_y[bound + 1:]))
    print('Obtained:  ' + str(my_output_labels))

    # Confusion matrix
    i = 0
    tp, tn, fp, fn = 0, 0, 0, 0
    for label in data_y[bound + 1:]:
        if label == my_output_labels[i] and label == 'Iris-setosa':
            tp += 1  # True Positive
        elif label == my_output_labels[i] and label == 'Iris-versicolor':
            tn += 1  # True Negative
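        # ... (remaining branches elided; they would update fp, fn
        # and advance i)

    # Hypothetical continuation, not in the original snippet: derive
    # summary metrics from the confusion counts accumulated above
    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total else 0.0
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    print('Accuracy:  %.3f' % accuracy)
    print('Precision: %.3f' % precision)
    print('Recall:    %.3f' % recall)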
Example no. 17
while True:
	imageResp = urlopen(url)
	imageNp = np.array(bytearray(imageResp.read()), dtype=np.uint8)
	frame = cv2.imdecode(imageNp, -1)

	# Convert frame to grayscale
	gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

	# Detect multi faces in the image
	faces = face_cascade.detectMultiScale(gray, 1.3, 5)

	for face in faces:
		x, y, w, h = face

		# Get the face ROI
		offset = 7
		face_section = frame[y-offset:y+h+offset, x-offset:x+w+offset]
		face_section = cv2.resize(face_section, (100, 100))

		out = knn(trainset, face_section.flatten())

		# Draw rectangle in the original image
		cv2.putText(frame, names[int(out)], (x, y - 10), font, 1, (255, 0, 0), 2, lineType=cv2.LINE_AA)
		cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)

	cv2.imshow("Faces", frame)

	if cv2.waitKey(1) & 0xFF == ord('q'):
		break

cv2.destroyAllWindows()
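
The knn call inside the loop classifies the flattened 100x100 face crop against trainset, but the helper itself is not shown. A sketch under the assumption that each trainset row holds the flattened pixels followed by a numeric label in its last column:

import numpy as np

def knn(trainset, query, k=5):
    # Assumed layout: flattened face pixels, numeric label in the last column
    X, y = trainset[:, :-1], trainset[:, -1]
    dists = np.linalg.norm(X - query, axis=1)
    nearest = y[np.argsort(dists)[:k]]
    labels, counts = np.unique(nearest, return_counts=True)
    return labels[np.argmax(counts)]  # majority label among the k nearest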
Example no. 18
import pandas as pd
from DMC import dmc
from KNN import knn
from NN import nn

data = pd.read_csv("iris.csv")
testSet = [[7.2, 3.6, 5.1, 2.5]]
test = pd.DataFrame(testSet)

k = 5
result1, neighbor1 = nn(data, test)
result2, neighbor2 = knn(data, test, k)
result3, neighbor3 = dmc(data, test)

print("\nResultados: ")
print("NN\n\tResults: {} - Vizinho: {}".format(result1, neighbor1))
print("KNN\n\tResults: {} - Vizinho: {}".format(result2, neighbor2))
print("DMC\n\tResults: {} - Vizinho(Centroide): {}".format(result3, neighbor3))