def tune(train_fn, param_vals, train_feats, train_labels, val_feats, val_labels):
    """Train once per hyperparameter value and record train/validation accuracy.

    Args:
        train_fn: Callable invoked as ``train_fn(train_feats, train_labels, val)``
            for each ``val`` in ``param_vals``; must return a
            ``(theta, theta_0)`` pair.
        param_vals: Sized iterable of hyperparameter values to try.
        train_feats: Training features, passed to ``train_fn`` and
            ``p1.classify``.
        train_labels: Training labels, passed to ``train_fn`` and
            ``p1.accuracy``.
        val_feats: Validation features, passed to ``p1.classify``.
        val_labels: Validation labels, passed to ``p1.accuracy``.

    Returns:
        A ``(train_accs, val_accs)`` tuple of 1-D NumPy arrays holding one
        accuracy per entry of ``param_vals``, in the same order.
    """
    n_vals = len(param_vals)

    # np.empty is the documented way to allocate an uninitialised buffer
    # (the np.ndarray constructor is a low-level API); every slot is
    # overwritten in the loop below, so no initial fill is needed.
    train_accs = np.empty(n_vals)
    val_accs = np.empty(n_vals)

    for i, val in enumerate(param_vals):
        theta, theta_0 = train_fn(train_feats, train_labels, val)

        train_preds = p1.classify(train_feats, theta, theta_0)
        train_accs[i] = p1.accuracy(train_preds, train_labels)

        val_preds = p1.classify(val_feats, theta, theta_0)
        val_accs[i] = p1.accuracy(val_preds, val_labels)

    return train_accs, val_accs
def check_extract_bow_feature_vectors():
    """Sanity-check ``p1.extract_bow_feature_vectors`` on a two-sentence fixture.

    Calls the function with a fixed four-word dictionary and reports the
    outcome through ``log``:

    * FAIL if it raises ``NotImplementedError``, does not return a NumPy
      array, or returns an array whose shape differs from the expected
      ``(2, 4)`` matrix.
    * PASS once the type and shape are right, followed by
      - WARN if the matrix equals the binary-indicator expectation,
      - PASS if it equals the count-based expectation,
      - FAIL for any other content.

    Returns:
        None. Results are only reported via ``log``.
    """
    ex_name = "Extract bow feature vectors"
    texts = ["He loves her ", "He really really loves her"]
    keys = ["he", "loves", "her", "really"]
    dictionary = {k: i for i, k in enumerate(keys)}
    # Binary indicators: "really" appears at least once in the second text.
    exp_res = np.array([[1, 1, 1, 0], [1, 1, 1, 1]])
    # Counts: "really" appears twice in the second text.
    non_bin_res = np.array([[1, 1, 1, 0], [1, 1, 1, 2]])

    try:
        res = p1.extract_bow_feature_vectors(texts, dictionary)
    except NotImplementedError:
        log(red("FAIL"), ex_name, ": not implemented")
        return

    if not isinstance(res, np.ndarray):
        log(red("FAIL"), ex_name, ": does not return a numpy array, type: ", type(res))
        return

    # Compare the full shape, not just the row count: a result with the
    # wrong number of columns cannot be compared element-wise against the
    # expected matrices below and must be reported as a shape failure.
    if res.shape != exp_res.shape:
        log(red("FAIL"), ex_name, ": expected an array of shape ", exp_res.shape, " but got array of shape", res.shape)
        return

    log(green("PASS"), ex_name)

    if (res == exp_res).all():
        log(yellow("WARN"), ex_name, ": uses binary indicators as features")
    elif (res == non_bin_res).all():
        log(green("PASS"), ex_name, ": correct non binary features")
    else:
        log(red("FAIL"), ex_name, ": unexpected feature matrix")
def check_bag_of_words():
    """Sanity-check ``p1.bag_of_words`` on a two-sentence fixture.

    Reports the outcome through ``log``:

    * FAIL if the function raises ``NotImplementedError``, does not return
      a dict, or its values are not exactly the indices ``0 .. len-1``.
    * PASS once the index set is right, followed by
      - WARN if the keys are the full vocabulary (stopwords kept),
      - PASS if the keys are the vocabulary with stopwords removed,
      - FAIL otherwise, listing missing and unexpected keys.

    Returns:
        None. Results are only reported via ``log``.
    """
    ex_name = "Bag of words"
    texts = ["He loves to walk on the beach", "There is nothing better"]

    try:
        res = p1.bag_of_words(texts)
    except NotImplementedError:
        log(red("FAIL"), ex_name, ": not implemented")
        return

    if not isinstance(res, dict):
        # The check is for a dict, so the message must say so.
        log(red("FAIL"), ex_name, ": does not return a dict, type: ", type(res))
        return

    # Every word must map to a distinct index in 0 .. len(res) - 1.
    vals = sorted(res.values())
    exp_vals = list(range(len(res)))
    if vals != exp_vals:
        log(red("FAIL"), ex_name, ": wrong set of indices. Expected: ", exp_vals, " got ", vals)
        return

    log(green("PASS"), ex_name, "")

    keys = sorted(res)
    exp_keys = [
        'beach', 'better', 'he', 'is', 'loves', 'nothing',
        'on', 'the', 'there', 'to', 'walk',
    ]
    stop_keys = ['beach', 'better', 'loves', 'nothing', 'walk']

    if keys == exp_keys:
        log(yellow("WARN"), ex_name, ": does not remove stopwords:", [k for k in keys if k not in stop_keys])
    elif keys == stop_keys:
        log(green("PASS"), ex_name, " stopwords removed")
    else:
        log(red("FAIL"), ex_name, ": keys are missing:", [k for k in stop_keys if k not in keys], " or are unexpected:", [k for k in keys if k not in stop_keys])
#-------------------------------------------------------------------------------
# Data loading
#-------------------------------------------------------------------------------

# Read the three review splits from their TSV files.
train_data = utils.load_data('reviews_train.tsv')
val_data = utils.load_data('reviews_val.tsv')
test_data = utils.load_data('reviews_test.tsv')

# Separate each split into parallel tuples: the 'text' field of every sample
# and its 'sentiment' field.
train_texts, train_labels = zip(*[(row['text'], row['sentiment']) for row in train_data])
val_texts, val_labels = zip(*[(row['text'], row['sentiment']) for row in val_data])
test_texts, test_labels = zip(*[(row['text'], row['sentiment']) for row in test_data])

# The dictionary is built from the training texts only and then reused to
# featurize all three splits.
dictionary = p1.bag_of_words(train_texts)

train_bow_features = p1.extract_bow_feature_vectors(train_texts, dictionary)
val_bow_features = p1.extract_bow_feature_vectors(val_texts, dictionary)
test_bow_features = p1.extract_bow_feature_vectors(test_texts, dictionary)

#-------------------------------------------------------------------------------
# Calculate theta & theta_0 using each algorithm
#-------------------------------------------------------------------------------

# Keep the loaded pair under one name and also unpack it into its two parts.
toy_data = utils.load_toy_data('toy_data.tsv')
toy_features, toy_labels = toy_data

# NOTE(review): T is passed to p1.perceptron below; L is not used in this
# section -- presumably consumed by a later algorithm call, confirm.
T, L = 10, 0.2

thetas_perceptron = p1.perceptron(toy_features, toy_labels, T)
def train_fn(features, labels, L):
    """Three-argument adapter around ``p1.pegasos`` that varies only ``L``.

    ``best_T`` is not a parameter: it is looked up from the enclosing scope
    each time this function is called and forwarded as the third argument.

    Args:
        features: Forwarded unchanged as the first argument.
        labels: Forwarded unchanged as the second argument.
        L: Forwarded as the fourth argument.

    Returns:
        Whatever ``p1.pegasos`` returns.
    """
    trained = p1.pegasos(features, labels, best_T, L)
    return trained
def train_fn(features, labels, T):
    """Three-argument adapter around ``p1.pegasos`` that varies only ``T``.

    ``best_L`` is not a parameter: it is looked up from the enclosing scope
    each time this function is called and forwarded as the fourth argument.

    NOTE(review): another ``train_fn`` with a different third parameter is
    defined in this file; if both live in the same scope, the one defined
    last is the one in effect -- confirm this is intended.

    Args:
        features: Forwarded unchanged as the first argument.
        labels: Forwarded unchanged as the second argument.
        T: Forwarded as the third argument.

    Returns:
        Whatever ``p1.pegasos`` returns.
    """
    trained = p1.pegasos(features, labels, T, best_L)
    return trained