def __init__(self, save):
    """Load the main table and split it into train/test features and prices.

    The table returned by ``MainTable().getDB()`` is kept on ``self.data``
    and split with ``train_test_split`` using the module-level ``TEST_SIZE``.
    The ``SQR_FEET_PRICE`` column is treated as the target: it is removed
    from the feature frames and selected on its own for the price frames.

    Args:
        save: Stored unchanged on ``self.save``; not otherwise used here.
    """
    price_col = 'SQR_FEET_PRICE'

    self.save = save
    self.data = MainTable().getDB()

    train_part, test_part = train_test_split(self.data, test_size=TEST_SIZE)

    # Apartment features: everything except the price column.
    self.training_features = removeCols(train_part, [price_col])
    self.test_features = removeCols(test_part, [price_col])

    # Apartment prices: only the price column.
    self.training_prices = selectCols(train_part, [price_col])
    self.test_prices = selectCols(test_part, [price_col])

    # The same feature/price separation over the complete, unsplit table.
    self.all_data_without_prices = removeCols(self.data, [price_col])
    self.all_data_only_prices = selectCols(self.data, [price_col])
def pushMuseumsDB(self, radius):
    """Compute the museum count per apartment address and save it as CSV.

    Museum data from ``self._extractMuseumsData()`` is stored on
    ``self.museums``. The apartments table is reduced to ``ADDRESS``,
    ``LAT`` and ``LON``, and ``self._countMuseumsInRadius`` is applied to
    every row with ``radius`` as its extra argument to fill a ``MUSEUMS``
    column. The de-duplicated ``ADDRESS``/``MUSEUMS`` pairs are kept on
    ``self.data`` and written to
    ``DATASETS_PATH + "/museums_db" + str(radius) + ".csv"``.

    Args:
        radius: Passed through to ``_countMuseumsInRadius`` and embedded in
            the output file name. Units are not visible here - TODO confirm.
    """
    self.museums = self._extractMuseumsData()

    self.data = Apartments.getInstance().getData()
    self.data = selectCols(self.data, ['ADDRESS', 'LAT', 'LON'])

    # Row-wise apply: the counting helper is called once per apartment row.
    museum_counts = self.data.apply(
        self._countMuseumsInRadius, axis=1, args=(radius, ))
    self.data['MUSEUMS'] = museum_counts

    per_address = selectCols(self.data, ['ADDRESS', 'MUSEUMS'])
    self.data = per_address.drop_duplicates()

    out_path = DATASETS_PATH + "/museums_db" + str(radius) + ".csv"
    self.data.to_csv(path_or_buf=out_path, index=False)
def parksParamTuning():
    """Compare decision-tree scores across parks radius/area combinations.

    For each (radius, area) pair the table
    ``MainTable(extra="_parksRadius<radius>_area<area>")`` is loaded, split
    80/20 with a fixed ``random_state``, and a ``DecisionTreeRegressor`` is
    fitted ``repeats`` times. The mean train and test ``score`` values are
    stored under a "radius ...\\narea ..." label, and both dictionaries are
    handed to ``graph_paramTuning`` once every combination has been scored.
    """
    repeats = 5
    train_scores_dict = {}
    test_scores_dict = {}

    for radius in [0.5, 1]:
        for area in [100, 200]:
            file_name = "_parksRadius" + str(radius) + "_area" + str(area)
            df = MainTable(extra=file_name).getDB()

            # Feature matrix (module-level `features`) and target column.
            X = selectCols(df, features)
            y = df['SQR_FEET_PRICE']
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42)

            train_total = 0
            test_total = 0
            for _ in range(repeats):
                tree = DecisionTreeRegressor(min_impurity_decrease=200)
                tree.fit(X_train, y_train)
                train_total += tree.score(X_train, y_train)
                test_total += tree.score(X_test, y_test)

            # One label per combination; the newline stacks it on the axis.
            label = "radius " + str(radius) + "\narea " + str(area)
            train_scores_dict[label] = train_total / repeats
            test_scores_dict[label] = test_total / repeats

    graph_paramTuning(
        train_scores_dict,
        test_scores_dict,
        'Tuning parks radius and area with Desicion Trees',
        'Parks radius and area')
def paramTuning(file_name, param_values_list, param_name):
    """Score a decision tree on one table per parameter value and plot them.

    For every value ``p`` in ``param_values_list`` the table
    ``MainTable(extra=file_name + str(p))`` is loaded, split 75/25 with a
    fixed ``random_state``, and a ``DecisionTreeRegressor`` is fitted ``n``
    times. The mean train and test ``score`` values are stored under ``p``
    and both dictionaries are passed to ``graph_paramTuning``.

    Args:
        file_name: Prefix of the ``extra`` argument given to ``MainTable``;
            ``str(p)`` is appended to it for each value.
        param_values_list: Iterable of parameter values to evaluate. Each
            value is also used as a dictionary key, so it must be hashable.
        param_name: Name of the tuned parameter, used in the plot title and
            as the second label argument of ``graph_paramTuning``.
    """
    n = 5
    train_scores_dict = {}
    test_scores_dict = {}

    for p in param_values_list:
        # Load the table variant built for this parameter value.
        df = MainTable(extra=file_name + str(p)).getDB()

        # Split into feature matrix (module-level `features`) and target.
        X = selectCols(df, features)
        y = df['SQR_FEET_PRICE']
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=42)

        tot_train_score = 0
        tot_test_score = 0
        for _ in range(n):
            regressor = DecisionTreeRegressor(min_impurity_decrease=200)
            regressor.fit(X_train, y_train)
            tot_train_score += regressor.score(X_train, y_train)
            tot_test_score += regressor.score(X_test, y_test)

        train_scores_dict[p] = tot_train_score / n
        test_scores_dict[p] = tot_test_score / n

    # The separating space before "with" was missing, which glued the
    # parameter name onto the rest of the title.
    title = 'Tuning ' + param_name + ' with Desicion Trees'
    graph_paramTuning(train_scores_dict, test_scores_dict, title, param_name)
def loadMuseumsDB(self, radius):
    """Load the cached museum counts for ``radius``, building them if absent.

    Reads ``DATASETS_PATH + "/museums_db" + str(radius) + ".csv"`` into
    ``self.data``, keeps the ``ADDRESS`` and ``MUSEUMS`` columns, and drops
    repeated addresses, keeping the first row for each. If a
    ``FileNotFoundError`` is raised, ``self.pushMuseumsDB(radius)`` is
    called instead.

    Args:
        radius: Radius value embedded in the cache file name and forwarded
            to ``pushMuseumsDB`` on a cache miss.
    """
    csv_path = DATASETS_PATH + "/museums_db" + str(radius) + ".csv"
    try:
        self.data = pd.read_csv(csv_path)
        address_counts = selectCols(self.data, ['ADDRESS', 'MUSEUMS'])
        self.data = address_counts.drop_duplicates(
            subset='ADDRESS', keep='first')
    except FileNotFoundError:
        # No cached file for this radius: compute and persist it.
        self.pushMuseumsDB(radius)
def compareFeatures():
    """Score each external feature on top of the base features and plot it.

    The base features are scored once via ``getBaseFeatsScores``. Then, for
    every external feature, a ``DecisionTreeRegressor`` is fitted ``n`` times
    on the base features plus that single feature (75/25 split with a fixed
    ``random_state``). The mean train and test ``score`` values are stored
    under the feature name, and everything is passed to
    ``graph_barsForFeatures`` together with the base scores.
    """
    df = MainTable().getDB()

    base_feats = ['BOROUGH', 'BUILDING_AGE']
    external_feats = [
        'CRIMES', 'HI_ED', 'HIGH_SCHOOLS', 'BUS_STOPS', 'SUBWAY_STOPS',
        'NUM_OF_PARKS', 'AREA_OF_PARKS', 'NOISE', 'HEALTH', 'GALLERIES',
        'MUSEUMS',
    ]

    mean_train_score_b, mean_test_score_b = getBaseFeatsScores(df, base_feats)

    n = 5
    train_scores_dict = {}
    test_scores_dict = {}

    for feat in external_feats:
        # Build a fresh list each time. Appending to an alias of base_feats
        # would grow the base list, so later features would be scored
        # together with every feature handled before them.
        curr_feats = base_feats + [feat]

        # Split into feature matrix and target column.
        X = selectCols(df, curr_feats)
        y = df['SQR_FEET_PRICE']
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=42)

        tot_train_score = 0
        tot_test_score = 0
        for _ in range(n):
            regressor = DecisionTreeRegressor(min_impurity_decrease=200)
            regressor.fit(X_train, y_train)
            tot_train_score += regressor.score(X_train, y_train)
            tot_test_score += regressor.score(X_test, y_test)

        train_scores_dict[feat] = tot_train_score / n
        test_scores_dict[feat] = tot_test_score / n

    graph_barsForFeatures(
        train_scores_dict,
        test_scores_dict,
        'Comparing features using Desicion Trees',
        'Feature Name',
        mean_train_score_b,
        mean_test_score_b)
def getBaseFeatsScores(df, base_feats):
    """Return the mean train and test scores using only the base features.

    The ``base_feats`` columns of ``df`` are used to predict
    ``SQR_FEET_PRICE`` on a 75/25 split with a fixed ``random_state``. A
    ``DecisionTreeRegressor`` is fitted ``repeats`` times and its ``score``
    on the train and test parts is averaged.

    Args:
        df: Table containing the ``base_feats`` columns and
            ``SQR_FEET_PRICE``.
        base_feats: Column names passed to ``selectCols`` as the features.

    Returns:
        A ``(mean_train_score, mean_test_score)`` tuple.
    """
    repeats = 5

    # Feature matrix and target column.
    inputs = selectCols(df, base_feats)
    target = df['SQR_FEET_PRICE']
    in_train, in_test, out_train, out_test = train_test_split(
        inputs, target, test_size=0.25, random_state=42)

    train_total = 0
    test_total = 0
    for _ in range(repeats):
        tree = DecisionTreeRegressor(min_impurity_decrease=200)
        tree.fit(in_train, out_train)
        train_total += tree.score(in_train, out_train)
        test_total += tree.score(in_test, out_test)

    return train_total / repeats, test_total / repeats