def word_vectors(csv_file, vector_length, validation, word_to_skus=None, generated_sku_vectors=None):
    """Assign a random vector to every distinct token in the slice at index 3.

    The rows come from ``kaggle.file_to_array(csv_file, validation)``; the
    values taken with ``kaggle.slice(rows, 3)`` are each passed through
    ``kaggle.format_string`` and ``kaggle.tokenize``.

    Args:
        csv_file: Data source handed to ``kaggle.file_to_array``.
        vector_length: Argument forwarded to ``vector.random_vector``.
        validation: Forwarded unchanged to ``kaggle.file_to_array``.
        word_to_skus: Accepted for interface compatibility; not used here.
        generated_sku_vectors: Accepted for interface compatibility; not
            used here.

    Returns:
        A dict mapping each token to the value returned by
        ``vector.random_vector(vector_length)`` the first time the token
        was encountered.
    """
    # Index handed to kaggle.slice; presumably the search-query column.
    query_index = 3
    rows = kaggle.file_to_array(csv_file, validation)
    vectors_by_word = {}
    for raw_query in kaggle.slice(rows, query_index):
        cleaned = kaggle.format_string(raw_query)
        for token in kaggle.tokenize(cleaned):
            # Only the first occurrence of a token creates a vector, so
            # random_vector is never called for an already-known word.
            if token in vectors_by_word:
                continue
            vectors_by_word[token] = vector.random_vector(vector_length)
    return vectors_by_word
def test_data(csv_file, class_labels_index, input_data_index, validation, items_count, ngram=1):
    """Load class labels and tokenized inputs from a CSV file.

    Args:
        csv_file: Data source handed to ``kaggle.file_to_array``.
        class_labels_index: Index passed to ``kaggle.slice`` to obtain the
            class labels.
        input_data_index: Index passed to ``kaggle.slice`` to obtain the raw
            input values.
        validation: Forwarded unchanged to ``kaggle.file_to_array``.
        items_count: The string ``'All'`` to keep everything, otherwise the
            upper bound used to slice both results (``[0:items_count]``).
        ngram: Second argument forwarded to ``kaggle.tokenize``.

    Returns:
        A ``(class_labels, tokenized_inputs)`` tuple, where each entry of
        ``tokenized_inputs`` is the result of ``kaggle.tokenize`` applied to
        the ``kaggle.format_string`` output for one input value.
    """
    rows = kaggle.file_to_array(csv_file, validation)
    labels = kaggle.slice(rows, class_labels_index)
    raw_inputs = kaggle.slice(rows, input_data_index)

    tokenized_inputs = [
        kaggle.tokenize(kaggle.format_string(item), ngram)
        for item in raw_inputs
    ]

    if items_count == 'All':
        return labels, tokenized_inputs
    # Truncate both sequences to the same leading window.
    return labels[0:items_count], tokenized_inputs[0:items_count]
def sku_to_searches():
    """Group normalized search strings by SKU from the training data.

    Reads rows with ``kaggle.file_to_array(training, "all")``. Every distinct
    value from ``kaggle.slice(rows, 1)`` gets an empty list up front; then,
    for each row, element 3 is lower-cased, tokenized with
    ``kaggle.tokenize``, re-joined with single spaces, stripped of
    double-quote characters, and appended to the list keyed by element 1.

    Returns:
        A dict mapping each SKU to the list of its normalized searches, in
        row order.
    """
    rows = kaggle.file_to_array(training, "all")

    # Pre-create one bucket per distinct SKU before filling them.
    searches_by_sku = {sku: [] for sku in set(kaggle.slice(rows, 1))}

    for row in rows:
        sku = row[1]
        tokens = kaggle.tokenize(row[3].lower())
        normalized = re.sub("\"", '', " ".join(tokens))
        searches_by_sku[sku].append(normalized)

    return searches_by_sku