def closed_form_extra_features(self):
    """Run the model on the base features plus two engineered columns.

    The base matrix is built from ``self.data`` with
    ``Preprocess.matrixify(self.data, 60)``. Two extra columns are then
    appended through ``Preprocess.add_features``:

    * ``len(datapoint['text']) * datapoint['children']`` (interaction term)
    * ``math.log(datapoint['children'])``, with ``0`` substituted when the
      count is ``0`` because ``log(0)`` is undefined.

    The widened matrix is passed through
    ``feature_selector.backwardElimination`` with a third argument of ``0.1``
    (presumably the significance threshold -- confirm against
    ``feature_selector``) before being handed to ``self.run_model``.

    Returns:
        Whatever ``self.run_model(x_set, y_set)`` returns.
    """
    preprocess1 = Preprocess()
    # The return value is not needed here: the add_features() calls below
    # hand back the extended matrix (presumably matrixify stores its result
    # on preprocess1 -- confirm against Preprocess).
    preprocess1.matrixify(self.data, 60)
    y_set = Preprocess.get_y(self.data)

    lengths = [len(datapoint['text']) for datapoint in self.data]
    children_list = [datapoint['children'] for datapoint in self.data]

    children_length_inter = [
        length * children for length, children in zip(lengths, children_list)
    ]
    # Guard the zero case explicitly: math.log(0) raises ValueError.
    log_children_list = [
        math.log(children) if children != 0 else 0
        for children in children_list
    ]

    # NOTE(review): Preprocess appears to expose children_length_interaction()
    # and log_children() helpers elsewhere in the project; if they compute the
    # same columns, this method could call them instead -- confirm first.
    preprocess1.add_features(children_length_inter)
    x_set = preprocess1.add_features(log_children_list)
    x_set = feature_selector.backwardElimination(x_set, y_set, 0.1)
    return self.run_model(x_set, y_set)
def test_add_features(self):
    """Verify the matrix shape after appending two constant feature columns.

    Preprocesses ``self.data``, builds the base matrix with
    ``matrixify(self.data)``, then calls ``add_features`` twice -- once with a
    column of 5s and once with a column of 3s, each with one entry per item in
    ``self.data``. The matrix returned by the second call must have shape
    ``(self.test_size, 165)``.
    """
    prep = Preprocess()
    prep.preprocess(self.data)
    prep.matrixify(self.data)

    # One constant value per datapoint; only the column count is under test.
    fives = [5 for _ in self.data]
    threes = [3 for _ in self.data]

    prep.add_features(fives)
    widened = prep.add_features(threes)

    # NOTE(review): 165 presumably equals matrixify's default width plus the
    # two columns added above -- confirm against Preprocess.matrixify.
    expected_shape = (self.test_size, 165)
    self.assertEqual(widened.shape, expected_shape)
import json

from Evaluator import Evaluator
from preprocess import Preprocess
import feature_selector

# Script: load the project data, build the feature matrix with two extra
# engineered columns, prune it with backward elimination, then evaluate the
# closed-form solution and print its MSE followed by its reported time.

# JSON text is UTF-8 by specification; name the encoding explicitly so the
# load does not depend on the platform's default locale encoding.
with open("../src/proj1_data.json", encoding="utf-8") as fp:
    data = json.load(fp)

preprocess1 = Preprocess()
Preprocess.preprocess(data)

# Second argument to matrixify (presumably the number of word-count features
# to keep -- confirm against Preprocess.matrixify).
num_words = 60
preprocess1.matrixify(data, num_words)
y_set = Preprocess.get_y(data)

# Extra feature columns computed by Preprocess helpers and appended to the
# matrix held on preprocess1.
children_length_inter = preprocess1.children_length_interaction(data)
log_children_list = preprocess1.log_children(data)
preprocess1.add_features(children_length_inter)
preprocess1.add_features(log_children_list)
x_set = preprocess1.feature_set

# Third argument 0.15 is presumably the significance threshold used for
# elimination -- confirm against feature_selector.backwardElimination.
x_optimal = feature_selector.backwardElimination(x_set, y_set, 0.15)
time, mse = Evaluator.evaluate_closed_form(x_optimal, y_set)
print(mse)
print(time)