Example #1
0
def word_vectors(csv_file, vector_length, validation, word_to_skus=None, generated_sku_vectors=None):
	"""Assign a random vector to every distinct token found in the query data.

	Rows are loaded with ``kaggle.file_to_array(csv_file, validation)`` and the
	queries are taken with ``kaggle.slice(rows, 3)`` (presumably column index 3
	of each row -- confirm against ``kaggle.slice``). Each query goes through
	``kaggle.format_string`` and ``kaggle.tokenize``; the first time a token is
	seen it receives ``vector.random_vector(vector_length)``.

	``word_to_skus`` and ``generated_sku_vectors`` are accepted but not used by
	this function.

	Returns a dict mapping token -> vector.
	"""
	query_column = 3
	rows = kaggle.file_to_array(csv_file, validation)
	vectors_by_word = {}
	for query in kaggle.slice(rows, query_column):
		for token in kaggle.tokenize(kaggle.format_string(query)):
			# Only the first occurrence of a token gets a vector; later
			# occurrences must keep the one already assigned.
			if token in vectors_by_word:
				continue
			vectors_by_word[token] = vector.random_vector(vector_length)

	return vectors_by_word
Example #2
0
def test_data(csv_file, class_labels_index, input_data_index, validation, items_count, ngram=1):
	"""Load labels and tokenized inputs from a CSV file.

	Rows come from ``kaggle.file_to_array(csv_file, validation)``. Labels and
	raw inputs are extracted with ``kaggle.slice`` using ``class_labels_index``
	and ``input_data_index`` respectively. Every raw input is passed through
	``kaggle.format_string`` and then ``kaggle.tokenize(..., ngram)``.

	``items_count`` is either the string ``'All'`` (return everything) or a
	value used as the upper bound of a ``[0:items_count]`` slice applied to
	both results.

	Returns a ``(class_labels, tokenized_inputs)`` tuple.
	"""
	rows = kaggle.file_to_array(csv_file, validation)
	labels = kaggle.slice(rows, class_labels_index)
	raw_inputs = kaggle.slice(rows, input_data_index)
	tokenized = [
		kaggle.tokenize(kaggle.format_string(raw), ngram)
		for raw in raw_inputs
	]
	if items_count != 'All':
		# Truncate both sequences to the same leading window.
		return labels[0:items_count], tokenized[0:items_count]
	return labels, tokenized
Example #3
0
def sku_to_searches():
	"""Group normalized search strings by SKU.

	Reads every row via ``kaggle.file_to_array(training, "all")`` (``training``
	is a module-level name defined outside this function). For each row,
	element 1 is used as the SKU and element 3 as the search text. The search
	text is lower-cased, tokenized with ``kaggle.tokenize``, re-joined with
	single spaces, and stripped of double-quote characters.

	Returns a dict mapping SKU -> list of normalized searches, in row order.
	"""
	rows = kaggle.file_to_array(training, "all")
	# Seed one empty list per distinct SKU before filling them in.
	searches_by_sku = {sku: [] for sku in set(kaggle.slice(rows, 1))}

	for row in rows:
		row_sku = row[1]
		raw_search = row[3]
		normalized = " ".join(kaggle.tokenize(raw_search.lower()))
		normalized = re.sub("\"", '', normalized)
		searches_by_sku[row_sku].append(normalized)
	return searches_by_sku