def data(csv_file, vector_length, validation, word_vector_hash=None):
    """Load csv_file and build one vector per example row.

    Args:
        csv_file: Passed to kaggle.file_to_array (and to word_vectors when
            no hash is supplied).
        vector_length: Forwarded to word_vectors, sku_vectors and
            sku_vector_hash.
        validation: Forwarded to kaggle.file_to_array and word_vectors.
        word_vector_hash: Precomputed word-vector lookup. When None it is
            built with word_vectors(csv_file, vector_length, validation).

    Returns:
        A 4-tuple (vects, class_labels, word_vector_hash, sku_hash):
        vects holds element 2 of every item returned by sku_vectors,
        class_labels is kaggle.slice(array, 1), word_vector_hash is the
        lookup that was used, and sku_hash is the result of
        sku_vector_hash.
    """
    # Identity test, so only an omitted/None argument triggers a rebuild.
    if word_vector_hash is None:
        word_vector_hash = word_vectors(csv_file, vector_length, validation)

    array = kaggle.file_to_array(csv_file, validation)
    class_labels = kaggle.slice(array, 1)
    text = kaggle.slice(array, 3)

    # Generate the sku-words: one [label, word_count] pair per text entry.
    # class_labels is indexed by position rather than zipped so that a
    # length mismatch is not silently truncated.
    sku_words = [
        [class_labels[i], kaggle.string_to_hash(entry)]
        for i, entry in enumerate(text)
    ]

    # Keep only the vector (element 2) of each item from sku_vectors.
    vects = [
        triplet[2]
        for triplet in sku_vectors(sku_words, word_vector_hash, vector_length)
    ]

    sku_hash = sku_vector_hash(sku_words, word_vector_hash, vector_length)
    return vects, class_labels, word_vector_hash, sku_hash
def real_test():
    """Predict a label for every query loaded from `training`.

    The model is obtained from train(extra, 20, vector_length, False); the
    queries are kaggle.slice(..., 3) of the array loaded from `training`
    with the second argument of kaggle.file_to_array set to True.

    Returns:
        A list with one single-element list per query, each holding
        element 0 of model.predict(vect) for that query's vector.
    """
    neighbors = 20
    trained = train(extra, neighbors, vector_length, False)
    # train returns five values; only the model and the word-vector lookup
    # are used here.
    model, vectors_by_word = trained[0], trained[1]
    _, _, _, _, _ = trained  # keep the five-value unpacking check

    rows = kaggle.file_to_array(training, True)
    return [
        [
            model.predict(
                query_vector(
                    kaggle.string_to_hash(kaggle.format_string(query)),
                    vectors_by_word,
                    vector_length,
                )
            )[0]
        ]
        for query in kaggle.slice(rows, 3)
    ]
def validation_test():
    """Train on `training`, score the model on held-out queries, time the run.

    The model comes from train(training, 20, vector_length, False). The
    evaluation rows are loaded with kaggle.file_to_array(training, True);
    kaggle.slice(array, 1) supplies the labels and kaggle.slice(array, 3)
    the query strings. Prints the number of examples and the elapsed
    wall-clock time.

    Returns:
        The value returned by test(...) for the evaluation queries.
    """
    n = 20
    sample_size = 10591
    start = time.time()

    # train returns five values; the third and fourth are not used here
    # (the labels are re-read from the evaluation rows below).
    model, word_vector_hash, _, _, sku_hash = train(
        training, n, vector_length, False)

    array = kaggle.file_to_array(training, True)
    labels = kaggle.slice(array, 1)
    # Single-argument parenthesised print: same output on Python 2, and
    # valid syntax on Python 3.
    print("Examples: " + str(len(labels)))

    queries = kaggle.slice(array, 3)
    test_data = [
        query_vector(
            kaggle.string_to_hash(kaggle.format_string(q)),
            word_vector_hash,
            vector_length,
        )
        for q in queries
    ]

    score = test(model, n, test_data, labels, sample_size, vector_length,
                 sku_hash)
    print("Duration: " + str(time.time() - start))
    return score
def validation_test():
    """Evaluate the models from train_tree() on the rows loaded from `training`.

    For every query, each model contributes the first five items of
    m.predictions(vect); element 0 of each item is collected into a set of
    distinct predictions for that query. Prints:

    * Total     - summed size of those per-query sets,
    * Precision - queries whose expected SKU is in its set, divided by Total,
    * Recall    - the same count divided by the number of queries.

    Precision and recall are reported as 0.0 when their denominator is
    zero (no predictions / no queries) instead of raising
    ZeroDivisionError.

    Returns:
        A list with one entry per model; entry i holds, in query order,
        the first five items of models[i].predictions(vect).
    """
    models, word_vectors = train_tree()
    correct_data = best_buy.sku_to_searches()

    # One prediction list per model, filled in query order.
    output = [[] for _ in models]

    file_array = kaggle.file_to_array(training, True)
    queries = kaggle.slice(file_array, 3)
    skus = kaggle.slice(file_array, 1)

    total = 0.
    correct = 0.
    correct_pop = 0.
    wrong_pop = 0.

    for index, q in enumerate(queries):
        word_hash = kaggle.string_to_hash(kaggle.format_string(q))
        vect = knn.query_vector(word_hash, word_vectors, vector_length)

        # Distinct predictions (element 0 of each item) across all models.
        sub_out = set()
        for i, m in enumerate(models):
            preds = m.predictions(vect)[0:5]
            output[i].append(preds)
            sub_out.update(p[0] for p in preds)

        total += len(sub_out)
        correct_sku = skus[index]
        # NOTE(review): the popularity totals are not reported anywhere in
        # this function. They are kept because the lookup into correct_data
        # may raise for a SKU it does not contain and callers may rely on
        # that - confirm before removing.
        pop = len(correct_data[correct_sku])
        if correct_sku in sub_out:
            correct += 1.
            correct_pop += pop
        else:
            wrong_pop += pop

    query_count = len(queries)
    # Guard both ratios: total is 0.0 when nothing was predicted, and
    # query_count is 0 when the loaded rows are empty.
    precision = correct / total if total else 0.
    recall = correct / query_count if query_count else 0.

    # Single-argument parenthesised print: same output on Python 2, and
    # valid syntax on Python 3.
    print("Total: " + str(total))
    print("Precision: " + str(precision))
    print("Recall: " + str(recall))
    return output