def data(csv_file, vector_length, validation, word_vector_hash=None):
    """Build the SKU vectors and lookup tables for the rows of ``csv_file``.

    Args:
        csv_file: path handed to ``kaggle.file_to_array``.
        vector_length: length forwarded to the vector-building helpers.
        validation: flag forwarded unchanged to ``kaggle.file_to_array`` and
            ``word_vectors``.
        word_vector_hash: optional pre-built word vector mapping.  When it is
            ``None`` a new one is generated with ``word_vectors``.

    Returns:
        A 4-tuple ``(vects, class_labels, word_vector_hash, sku_hash)``:
        ``vects`` holds element 2 of every entry returned by ``sku_vectors``,
        ``class_labels`` is column 1 of the file, ``word_vector_hash`` is the
        mapping that was used (given or generated) and ``sku_hash`` is the
        result of ``sku_vector_hash``.
    """
    # Identity test: only a genuinely missing argument triggers regeneration,
    # regardless of how the supplied object implements ``__eq__``.
    if word_vector_hash is None:
        word_vector_hash = word_vectors(csv_file, vector_length, validation)

    array = kaggle.file_to_array(csv_file, validation)
    class_labels = kaggle.slice(array, 1)
    text = kaggle.slice(array, 3)

    # generate the sku-words: one [label, word-count hash] pair per row.
    # Labels are looked up by index so a length mismatch still fails loudly.
    sku_words = [
        [class_labels[i], kaggle.string_to_hash(query)]
        for i, query in enumerate(text)
    ]

    # get a list of only the vectors (third element of each entry)
    sku_vects = sku_vectors(sku_words, word_vector_hash, vector_length)
    vects = [triplet[2] for triplet in sku_vects]

    sku_hash = sku_vector_hash(sku_words, word_vector_hash, vector_length)
    return vects, class_labels, word_vector_hash, sku_hash
def test_data(csv_file, class_labels_index, input_data_index, validation, items_count, ngram=1):
    """Load labels and tokenized inputs from ``csv_file``.

    Every value in column ``input_data_index`` is passed through
    ``kaggle.format_string`` and then ``kaggle.tokenize`` (with ``ngram``).
    Unless ``items_count`` equals the string ``'All'``, both returned
    sequences are cut down to their first ``items_count`` entries.

    Returns:
        ``(class_labels, formatted_test_data)``.
    """
    rows = kaggle.file_to_array(csv_file, validation)
    labels = kaggle.slice(rows, class_labels_index)
    raw_inputs = kaggle.slice(rows, input_data_index)

    tokenized = [
        kaggle.tokenize(kaggle.format_string(raw), ngram)
        for raw in raw_inputs
    ]

    if items_count == 'All':
        return labels, tokenized
    return labels[0:items_count], tokenized[0:items_count]
def csv_with_more_data():
    """Append the training rows plus synthetic rows to ``../data/extra.csv``.

    Every row of the training file is written first.  Then, for each product
    returned by ``xml_to_array`` whose SKU (element 0) occurs in column 1 of
    the training data and has not been handled yet, a fake row carrying the
    product name (element 1) is written 15 times.

    The file is opened in append mode, so existing content is kept.  Prints
    the number of SKUs added, the number of distinct training SKUs and the
    counter of added products.  Returns ``None``.
    """
    new_path = "../data/extra.csv"
    products = xml_to_array()
    training_rows = kaggle.file_to_array(training)
    known_skus = set(kaggle.slice(training_rows, 1))
    added_skus = set()
    count = 0

    with open(new_path, "a") as out:
        # Copy of the original training rows.
        for row in training_rows:
            out.write(",".join(row).strip() + "\n")

        for product in products:
            sku, name = product[0], product[1]
            if sku not in known_skus or sku in added_skus:
                continue
            count += 1
            added_skus.add(sku)
            fields = ['fakeuser', sku, 'fakecategory', name, 'fake_time', 'fake_time']
            fake_line = ",".join(fields).encode('utf-8').strip()
            for _ in range(15):
                out.write(fake_line + "\n")

    print(len(added_skus))
    print(len(known_skus))
    print(count)
    return None
def validation_test():
    """Train on the training file and score the model on the validation rows.

    Trains with ``train`` (20 neighbours), turns every validation query
    (column 3) into a query vector, and hands those vectors together with the
    validation labels (column 1) to ``test``.  Prints the number of examples
    and the elapsed wall-clock time.

    Returns:
        Whatever ``test`` returns.
    """
    n = 20
    sample_size = 10591
    started_at = time.time()

    # Only the model, the word vectors and the sku hash of the training
    # result are used; the labels come from the validation rows below.
    model, word_vecs, _sku_vecs, _train_labels, sku_hash = train(
        training, n, vector_length, False)

    validation_rows = kaggle.file_to_array(training, True)
    labels = kaggle.slice(validation_rows, 1)
    print("Examples: " + str(len(labels)))

    query_vectors = []
    for query in kaggle.slice(validation_rows, 3):
        counts = kaggle.string_to_hash(kaggle.format_string(query))
        query_vectors.append(query_vector(counts, word_vecs, vector_length))

    result = test(model, n, query_vectors, labels, sample_size, vector_length, sku_hash)
    print("Duration: " + str(time.time() - started_at))
    return result
def train_model(csv_file, ngram=1, validation=False):
    """Train a model on ``csv_file`` and build the label popularity table.

    Column 1 supplies the class labels and column 3 the input text.

    Returns:
        ``(model, popularity)`` where ``model`` comes from ``train`` and
        ``popularity`` from ``popular.popularity_hash``.
    """
    label_column, text_column = 1, 3

    training_hash = kaggle.file_to_hash(csv_file, label_column, text_column, validation)
    model = train(training_hash, ngram)

    rows = kaggle.file_to_array(csv_file, validation)
    labels = kaggle.slice(rows, label_column)
    return model, popular.popularity_hash(labels, rows)
def word_vectors(csv_file, vector_length, validation, word_to_skus=None, generated_sku_vectors=None):
    """Assign a random vector to every distinct token in column 3 of ``csv_file``.

    ``word_to_skus`` and ``generated_sku_vectors`` are accepted but not used.

    Returns:
        A dict mapping each token to the result of
        ``vector.random_vector(vector_length)``.
    """
    query_column = 3
    rows = kaggle.file_to_array(csv_file, validation)

    vectors_by_word = {}
    for query in kaggle.slice(rows, query_column):
        tokens = kaggle.tokenize(kaggle.format_string(query))
        for token in tokens:
            # A vector is drawn only the first time a token is seen.
            if token in vectors_by_word:
                continue
            vectors_by_word[token] = vector.random_vector(vector_length)
    return vectors_by_word
def real_test():
    """Train on ``extra`` and predict a label for every validation query.

    Returns:
        A list with one single-element list ``[prediction]`` per query, where
        the prediction is element 0 of ``model.predict`` for that query's
        vector.
    """
    neighbors = 20
    # Only the model and the word vectors of the training result are needed.
    model, word_vecs, _, _labels, _sku_hash = train(extra, neighbors, vector_length, False)

    validation_rows = kaggle.file_to_array(training, True)
    predictions = []
    for query in kaggle.slice(validation_rows, 3):
        counts = kaggle.string_to_hash(kaggle.format_string(query))
        features = query_vector(counts, word_vecs, vector_length)
        predictions.append([model.predict(features)[0]])
    return predictions
def validation_test():
    """Score the voted predictions against the validation answers.

    The output of ``decision_tree.real_test`` is passed to ``vote.vote`` and
    the merged result is compared with column 1 of the validation rows via
    ``score``.  Prints the elapsed wall-clock time.

    Returns:
        The value returned by ``score``.
    """
    started_at = time.time()

    validation_rows = kaggle.file_to_array(training, True)
    expected = kaggle.slice(validation_rows, 1)

    model_predictions = decision_tree.real_test()
    combined = vote.vote(model_predictions)
    accuracy = score(combined, expected)

    elapsed = time.time() - started_at
    print("Duration: " + str(elapsed))
    return accuracy
def sku_to_searches():
    """Group the normalised search strings of the training file by SKU.

    Each search (column 3) is lower-cased, tokenized with ``kaggle.tokenize``,
    re-joined with single spaces and stripped of double quotes before being
    appended to the list of its SKU (column 1).

    Returns:
        A dict mapping every SKU to the list of its cleaned searches, in file
        order.
    """
    rows = kaggle.file_to_array(training, "all")

    # Start every known SKU with an empty list.
    searches_by_sku = {}
    for sku in set(kaggle.slice(rows, 1)):
        searches_by_sku[sku] = []

    for row in rows:
        tokens = kaggle.tokenize(row[3].lower())
        cleaned = re.sub("\"", '', " ".join(tokens))
        searches_by_sku[row[1]].append(cleaned)
    return searches_by_sku
def validation_test():
    """Evaluate the trained tree models on the validation rows.

    For every validation query (column 3) each model contributes its first
    five predictions.  A query counts as correct when its true SKU (column 1)
    is among the distinct predicted SKUs (element 0 of each prediction) of all
    models combined.

    Prints:
        ``Total``     - number of distinct predicted SKUs summed over queries.
        ``Precision`` - correct queries divided by ``Total``.
        ``Recall``    - correct queries divided by the number of queries.
        Both ratios are reported as 0.0 when their denominator is zero instead
        of raising ``ZeroDivisionError``.

    Returns:
        A list with one entry per model; each entry lists, in query order,
        that model's (up to five) predictions for the query.
    """
    models, word_vectors = train_tree()

    # Create an array for each model to store predictions.
    output = [[] for _ in models]

    file_array = kaggle.file_to_array(training, True)
    queries = kaggle.slice(file_array, 3)
    skus = kaggle.slice(file_array, 1)

    # Floats keep the ratios in true division under Python 2 and keep the
    # printed totals in their "N.0" form.
    total = 0.
    correct = 0.
    for index, q in enumerate(queries):
        word_hash = kaggle.string_to_hash(kaggle.format_string(q))
        vect = knn.query_vector(word_hash, word_vectors, vector_length)

        # Distinct SKUs predicted for this query across all models.
        candidates = set()
        for i, m in enumerate(models):
            preds = m.predictions(vect)[0:5]
            output[i].append(preds)
            candidates.update(p[0] for p in preds)

        total += len(candidates)
        if skus[index] in candidates:
            correct += 1.

    query_count = len(queries)
    # Guard the ratios: an empty validation set or models that return no
    # predictions would otherwise divide by zero.
    precision = correct / total if total else 0.
    recall = correct / query_count if query_count else 0.

    print("Total: " + str(total))
    print("Precision: " + str(precision))
    print("Recall: " + str(recall))
    return output
def query_times(csv, column):
    """Convert the values in ``column`` of ``csv`` with ``unix_time``.

    Returns:
        A list of single-element lists ``[unix_time(value)]``, one per value
        returned by ``kaggle.slice(csv, column)``.
    """
    return [[unix_time(stamp)] for stamp in kaggle.slice(csv, column)]
def skus(csv):
    """Return column 1 of ``csv`` as extracted by ``kaggle.slice``."""
    sku_column = 1
    return kaggle.slice(csv, sku_column)
def skus(cls):
    """Return the class-label column of ``csv_file``.

    ``cls`` is not used; ``csv_file`` and ``class_labels_index`` are resolved
    from the enclosing scope.
    """
    rows = kaggle.file_to_array(csv_file)
    return kaggle.slice(rows, class_labels_index)