Esempio n. 1
0
def word_vectors(csv_file, vector_length, validation, word_to_skus=None, generated_sku_vectors=None):
	"""Give every distinct query token its own freshly generated random vector.

	Args:
		csv_file: Data source handed to ``kaggle.file_to_array``.
		vector_length: Size argument handed to ``vector.random_vector``.
		validation: Forwarded unchanged to ``kaggle.file_to_array``.
		word_to_skus: Accepted for signature compatibility; not used here.
		generated_sku_vectors: Accepted for signature compatibility; not used here.

	Returns:
		A dict mapping each token to the vector created the first time that
		token was encountered.
	"""
	query_column = 3
	rows = kaggle.file_to_array(csv_file, validation)
	vectors_by_token = {}
	for raw_query in kaggle.slice(rows, query_column):
		for token in kaggle.tokenize(kaggle.format_string(raw_query)):
			if token in vectors_by_token:
				continue
			# Generate a vector only on first sight so a repeated token keeps
			# its original vector and triggers no extra random_vector call.
			vectors_by_token[token] = vector.random_vector(vector_length)
	return vectors_by_token
Esempio n. 2
0
def test_data(csv_file, class_labels_index, input_data_index, validation, items_count, ngram=1):
	"""Load class labels and tokenized inputs from two columns of a CSV file.

	Args:
		csv_file: Data source handed to ``kaggle.file_to_array``.
		class_labels_index: Column index passed to ``kaggle.slice`` for the labels.
		input_data_index: Column index passed to ``kaggle.slice`` for the inputs.
		validation: Forwarded unchanged to ``kaggle.file_to_array``.
		items_count: The string ``'All'`` to keep everything, otherwise the
			number of leading items to keep from both results.
		ngram: Second argument forwarded to ``kaggle.tokenize``.

	Returns:
		A ``(class_labels, tokenized_inputs)`` pair; ``tokenized_inputs`` holds
		one ``kaggle.tokenize`` result per input row.
	"""
	rows = kaggle.file_to_array(csv_file, validation)
	labels = kaggle.slice(rows, class_labels_index)
	raw_inputs = kaggle.slice(rows, input_data_index)
	tokenized_inputs = [
		kaggle.tokenize(kaggle.format_string(raw_input), ngram)
		for raw_input in raw_inputs
	]
	if items_count == 'All':
		return labels, tokenized_inputs
	# Any other value is used as a slice bound on both sequences.
	return labels[0:items_count], tokenized_inputs[0:items_count]
Esempio n. 3
0
def real_test():
	"""Train on the ``extra`` data set and predict one value per query.

	The model comes from ``train(extra, 20, vector_length, False)``. Queries
	are read from column 3 of ``kaggle.file_to_array(training, True)``.

	Returns:
		A list with one single-element list per query, holding the first
		element of ``model.predict`` for that query's vector.
	"""
	neighbor_count = 20
	# train() yields five values; only the model and the word vectors are needed.
	model, vectors_by_word, _, _, _ = train(extra, neighbor_count, vector_length, False)

	def predict_one(raw_query):
		# Turn the raw query text into a vector and keep the top prediction.
		word_hash = kaggle.string_to_hash(kaggle.format_string(raw_query))
		query_vec = query_vector(word_hash, vectors_by_word, vector_length)
		return [model.predict(query_vec)[0]]

	rows = kaggle.file_to_array(training, True)
	return [predict_one(raw_query) for raw_query in kaggle.slice(rows, 3)]
Esempio n. 4
0
def validation_test():
	"""Train the model, score it on the validation rows, and report timing.

	Labels are read from column 1 and queries from column 3 of
	``kaggle.file_to_array(training, True)``. The number of examples and the
	elapsed wall-clock time (training included) are printed.

	Returns:
		Whatever ``test`` returns for the trained model and the query vectors.
	"""
	neighbor_count = 20
	sample_size = 10591
	started_at = time.time()

	# train() yields five values; the labels it returns are not used because
	# the labels are re-read from the validation rows below.
	model, vectors_by_word, _, _, sku_hash = train(training, neighbor_count, vector_length, False)
	rows = kaggle.file_to_array(training, True)
	labels = kaggle.slice(rows, 1)
	# Single-argument parenthesised print emits the same text on Python 2 and 3.
	print("Examples: " + str(len(labels)))
	query_vectors = [
		query_vector(
			kaggle.string_to_hash(kaggle.format_string(raw_query)),
			vectors_by_word,
			vector_length,
		)
		for raw_query in kaggle.slice(rows, 3)
	]
	score = test(model, neighbor_count, query_vectors, labels, sample_size, vector_length, sku_hash)
	print("Duration: " + str(time.time() - started_at))
	return score
Esempio n. 5
0
def validation_test():
	"""Run every tree model over the validation queries and print precision/recall.

	Models and word vectors come from ``train_tree()``. Queries are read from
	column 3 and the expected values from column 1 of
	``kaggle.file_to_array(training, True)``. For each query, the first five
	predictions of every model are collected; the query counts as correct when
	the expected value equals the first element of any of those predictions.

	Printed metrics:
		Total: number of distinct predicted values, summed over all queries.
		Precision: correct queries divided by Total.
		Recall: correct queries divided by the number of queries.
		Precision and Recall are reported as 0.0 when their denominator is
		zero (no queries, or no model produced any prediction).

	Returns:
		A list with one entry per model; each entry lists, in query order,
		that model's ``predictions(vect)[0:5]`` result.
	"""
	models, word_vectors = train_tree()
	# One prediction log per model, filled in query order.
	output = [[] for _ in models]

	file_array = kaggle.file_to_array(training, True)
	queries = kaggle.slice(file_array, 3)
	skus = kaggle.slice(file_array, 1)

	# Floats keep the ratios below in true division on Python 2 as well.
	total = 0.
	correct = 0.

	for query, correct_sku in zip(queries, skus):
		word_hash = kaggle.string_to_hash(kaggle.format_string(query))
		vect = knn.query_vector(word_hash, word_vectors, vector_length)

		# Union of the top predictions across models; duplicates count once.
		predicted = set()
		for model_log, m in zip(output, models):
			preds = m.predictions(vect)[0:5]
			model_log.append(preds)
			predicted.update(p[0] for p in preds)

		total += len(predicted)
		if correct_sku in predicted:
			correct += 1.

	query_count = len(queries)
	# Guard both ratios: an empty validation set or models that return no
	# predictions would otherwise raise ZeroDivisionError.
	precision = correct / total if total else 0.0
	recall = correct / query_count if query_count else 0.0
	# Single-argument parenthesised print emits the same text on Python 2 and 3.
	print("Total: " + str(total))
	print("Precision: " + str(precision))
	print("Recall: " + str(recall))
	return output