Esempio n. 1
0
def data(csv_file, vector_length, validation, word_vector_hash=None):
	"""Build SKU vectors, class labels and lookup hashes for ``csv_file``.

	Args:
		csv_file: path handed to ``kaggle.file_to_array`` (and to
			``word_vectors`` when no hash is supplied).
		vector_length: forwarded to ``word_vectors``, ``sku_vectors`` and
			``sku_vector_hash``.
		validation: forwarded unchanged to ``kaggle.file_to_array`` and
			``word_vectors``; presumably selects the data split -- confirm
			against ``kaggle``.
		word_vector_hash: optional precomputed result of ``word_vectors``;
			it is computed here when ``None``.

	Returns:
		``(vects, class_labels, word_vector_hash, sku_hash)`` where ``vects``
		holds element 2 of every entry returned by ``sku_vectors`` and
		``class_labels`` is column 1 of the file.
	"""
	# Identity check: ``== None`` dispatches to the argument's ``__eq__``,
	# which is not a reliable "was it supplied?" test for arbitrary objects.
	if word_vector_hash is None:
		word_vector_hash = word_vectors(csv_file, vector_length, validation)

	# Pair every label (column 1) with the hashed text (column 3) of its row.
	array = kaggle.file_to_array(csv_file, validation)
	class_labels = kaggle.slice(array, 1)
	text = kaggle.slice(array, 3)
	sku_words = [
		[class_labels[i], kaggle.string_to_hash(query)]
		for i, query in enumerate(text)
	]

	# Keep only the vector component (index 2) of each sku_vectors entry.
	sku_vects = sku_vectors(sku_words, word_vector_hash, vector_length)
	vects = [entry[2] for entry in sku_vects]

	sku_hash = sku_vector_hash(sku_words, word_vector_hash, vector_length)
	return vects, class_labels, word_vector_hash, sku_hash
Esempio n. 2
0
def test_data(csv_file, class_labels_index, input_data_index, validation, items_count, ngram=1):
	"""Load labels and tokenized inputs from ``csv_file``.

	Every value of the ``input_data_index`` column is passed through
	``kaggle.format_string`` and then ``kaggle.tokenize`` with ``ngram``.

	Args:
		csv_file: path handed to ``kaggle.file_to_array``.
		class_labels_index: column index of the class labels.
		input_data_index: column index of the raw input strings.
		validation: forwarded unchanged to ``kaggle.file_to_array``.
		items_count: ``'All'`` to keep every row, otherwise the number of
			leading rows to keep.
		ngram: forwarded to ``kaggle.tokenize``.

	Returns:
		``(class_labels, formatted_test_data)``, both truncated to
		``items_count`` entries unless it is ``'All'``.
	"""
	rows = kaggle.file_to_array(csv_file, validation)
	class_labels = kaggle.slice(rows, class_labels_index)
	raw_inputs = kaggle.slice(rows, input_data_index)
	formatted_test_data = [
		kaggle.tokenize(kaggle.format_string(raw), ngram)
		for raw in raw_inputs
	]
	if items_count == 'All':
		return class_labels, formatted_test_data
	return class_labels[0:items_count], formatted_test_data[0:items_count]
Esempio n. 3
0
def csv_with_more_data():
	"""Write the training rows plus synthetic rows to ``../data/extra.csv``.

	Each existing training row is written back comma-joined.  Then, for every
	product from ``xml_to_array()`` whose SKU (element 0) occurs in column 1
	of the training data, 15 identical fake rows carrying the product name
	(element 1) are written.  Only the first product per SKU is used.

	Prints the number of SKUs that received fake rows, the number of distinct
	training SKUs, and the added count again (``count`` always equals the
	first number).  Returns ``None``.
	"""
	new_path = "../data/extra.csv"
	more_data = xml_to_array()
	old = kaggle.file_to_array(training)
	skus = set(kaggle.slice(old, 1))
	added_skus = set()
	count = 0

	# NOTE(review): the file is opened in append mode, so running this twice
	# duplicates every row -- confirm whether "w" was intended.
	with open(new_path, "a") as out:
		for row in old:
			out.write(",".join(row).strip() + "\n")

		for product in more_data:
			sku, name = product[0], product[1]

			# Skip SKUs absent from the training data or already handled.
			if sku not in skus or sku in added_skus:
				continue

			count += 1
			added_skus.add(sku)
			fields = ['fakeuser', sku, 'fakecategory', name, 'fake_time', 'fake_time']
			fake_line = ",".join(fields).encode('utf-8').strip()
			entry = fake_line + "\n"
			for _ in range(15):
				out.write(entry)

	print(len(added_skus))
	print(len(skus))
	print(count)
Esempio n. 4
0
def validation_test():
	"""Train on ``training`` and score the model on its validation rows.

	Trains with 20 neighbours, turns each validation query (column 3) into a
	query vector, and passes the vectors plus the validation labels
	(column 1) to ``test``.  Prints the example count and the elapsed time.

	Returns:
		Whatever ``test`` returns for the validation set.
	"""
	n = 20
	sample_size = 10591
	start = time.time()

	# Only the model, the word vectors and the sku hash are needed below.
	model, word_vectors, _, _, sku_hash = train(training, n, vector_length, False)

	array = kaggle.file_to_array(training, True)
	labels = kaggle.slice(array, 1)
	print("Examples: " + str(len(labels)))

	test_data = [
		query_vector(
			kaggle.string_to_hash(kaggle.format_string(query)),
			word_vectors,
			vector_length,
		)
		for query in kaggle.slice(array, 3)
	]
	score = test(model, n, test_data, labels, sample_size, vector_length, sku_hash)
	print("Duration: " + str(time.time() - start))
	return score
Esempio n. 5
0
def train_model(csv_file, ngram=1, validation=False):
	"""Train a model on ``csv_file`` and compute a label popularity hash.

	Args:
		csv_file: path handed to ``kaggle.file_to_hash`` and
			``kaggle.file_to_array``.
		ngram: forwarded to ``train``.
		validation: forwarded unchanged to both ``kaggle`` loaders.

	Returns:
		``(model, popularity)`` -- the result of ``train`` and the result of
		``popular.popularity_hash`` over the class labels (column 1).
	"""
	class_labels_index = 1
	input_data_index = 3

	training_hash = kaggle.file_to_hash(csv_file, class_labels_index, input_data_index, validation)
	model = train(training_hash, ngram)

	rows = kaggle.file_to_array(csv_file, validation)
	labels = kaggle.slice(rows, class_labels_index)
	return model, popular.popularity_hash(labels, rows)
Esempio n. 6
0
def word_vectors(csv_file, vector_length, validation, word_to_skus=None, generated_sku_vectors=None):
	"""Assign a random vector to every distinct token in the query column.

	Args:
		csv_file: path handed to ``kaggle.file_to_array``.
		vector_length: forwarded to ``vector.random_vector``.
		validation: forwarded unchanged to ``kaggle.file_to_array``.
		word_to_skus: unused.
		generated_sku_vectors: unused.

	Returns:
		dict mapping each token of column 3 to the vector drawn the first
		time that token was seen.
	"""
	words_index = 3
	rows = kaggle.file_to_array(csv_file, validation)

	word_vects = {}
	for query in kaggle.slice(rows, words_index):
		for word in kaggle.tokenize(kaggle.format_string(query)):
			# Draw a vector only once per token so repeats keep the first one.
			if word in word_vects:
				continue
			word_vects[word] = vector.random_vector(vector_length)
	return word_vects
Esempio n. 7
0
def real_test():
	"""Predict a label for every validation query of ``training``.

	The model is trained on ``extra`` with 20 neighbours; each query
	(column 3) is formatted, hashed and converted to a query vector before
	being passed to ``model.predict``.

	Returns:
		A list with one single-element list per query, holding the first
		item of the model's prediction.
	"""
	neighbors = 20
	model, word_vectors, _, labels, sku_hash = train(extra, neighbors, vector_length, False)

	output = []
	for query in kaggle.slice(kaggle.file_to_array(training, True), 3):
		hashed = kaggle.string_to_hash(kaggle.format_string(query))
		vect = query_vector(hashed, word_vectors, vector_length)
		output.append([model.predict(vect)[0]])
	return output
Esempio n. 8
0
def validation_test():
	"""Score the voted predictions of the decision-tree models.

	Loads the expected answers (column 1 of the validation rows of
	``training``), merges the output of ``decision_tree.real_test()`` with
	``vote.vote`` and scores the result.  Prints the elapsed time.

	Earlier variants that also merged ``time_rank`` / ``tf_idf`` predictions
	and wrote the result to a CSV file are disabled.

	Returns:
		The value of ``score(merged, answers)``.
	"""
	start = time.time()
	validation_rows = kaggle.file_to_array(training, True)
	answers = kaggle.slice(validation_rows, 1)

	merged = vote.vote(decision_tree.real_test())
	accuracy = score(merged, answers)

	print("Duration: " + str(time.time() - start))
	return accuracy
Esempio n. 9
0
def sku_to_searches():
	"""Group the cleaned search strings of ``training`` by SKU.

	Each search (column 3) is lower-cased, tokenized with
	``kaggle.tokenize``, re-joined with single spaces and stripped of double
	quotes before being filed under its SKU (column 1).

	Returns:
		dict mapping every SKU to the list of its cleaned searches, in file
		order.
	"""
	array = kaggle.file_to_array(training, "all")

	# Start every known SKU with an empty bucket.
	skus_searches = {sku: [] for sku in set(kaggle.slice(array, 1))}

	for line in array:
		sku = line[1]
		cleaned = " ".join(kaggle.tokenize(line[3].lower()))
		skus_searches[sku].append(re.sub("\"", '', cleaned))
	return skus_searches
Esempio n. 10
0
def validation_test():
	"""Evaluate the tree models on the validation rows of ``training``.

	For every validation query (column 3) each model contributes its first
	five predictions.  A query counts as correct when its true SKU (column 1)
	appears among the distinct predicted SKUs (element 0 of each prediction)
	of all models combined.

	Prints the total number of distinct predictions, the precision
	(correct / total predictions) and the recall (correct / queries).  Both
	ratios are reported as 0.0 when their denominator is zero instead of
	raising ``ZeroDivisionError``.

	Returns:
		A list with one entry per model; each entry is the list of that
		model's top-five predictions for every query, in query order.
	"""
	models, word_vectors = train_tree()

	# One prediction list per model, filled in query order.
	output = [[] for _ in models]

	file_array = kaggle.file_to_array(training, True)
	queries = kaggle.slice(file_array, 3)
	skus = kaggle.slice(file_array, 1)

	# Floats keep the ratios below in true division under Python 2.
	total = 0.
	correct = 0.

	for index, q in enumerate(queries):
		word_hash = kaggle.string_to_hash(kaggle.format_string(q))
		vect = knn.query_vector(word_hash, word_vectors, vector_length)

		# Distinct SKUs proposed by any model for this query.
		candidates = set()
		for model_output, m in zip(output, models):
			preds = m.predictions(vect)[0:5]
			model_output.append(preds)
			candidates.update(p[0] for p in preds)

		total += len(candidates)
		if skus[index] in candidates:
			correct += 1.

	query_count = len(queries)
	# Guard the ratios: an empty validation set or models that return no
	# predictions would otherwise divide by zero.
	precision = correct / total if total else 0.
	recall = correct / query_count if query_count else 0.

	print("Total: " + str(total))
	print("Precision: " + str(precision))
	print("Recall: " + str(recall))
	return output
Esempio n. 11
0
def query_times(csv, column):
	"""Convert one column of ``csv`` to unix times.

	Args:
		csv: rows handed to ``kaggle.slice``.
		column: index of the column holding the time values.

	Returns:
		A list of single-element lists, each holding ``unix_time`` of the
		corresponding column value.
	"""
	return [[unix_time(stamp)] for stamp in kaggle.slice(csv, column)]
Esempio n. 12
0
def skus(csv):
	"""Return column 1 of ``csv`` as produced by ``kaggle.slice``."""
	sku_column = 1
	return kaggle.slice(csv, sku_column)
Esempio n. 13
0
 def skus(cls):
     """Return the ``class_labels_index`` column of ``csv_file``.

     ``csv_file`` and ``class_labels_index`` are resolved from the enclosing
     scope, not from ``cls``.
     """
     rows = kaggle.file_to_array(csv_file)
     return kaggle.slice(rows, class_labels_index)