# --- Paths for data, figures, and movie output (relative to base_dir, which
# --- is assumed to be defined earlier in this file). ---
data_dir = os.path.join(base_dir, 'static/data/')
fig_dir = os.path.join(base_dir, 'static/images/')
movie_dir = os.path.join(base_dir, 'static/movies/')

data_name = 'all_loans_proc'
# Load processed loan data; parse the 'issue_d' column as datetimes.
# (os.path.join instead of string concatenation, consistent with the dir paths above.)
LD = pd.read_csv(os.path.join(data_dir, data_name), parse_dates=['issue_d'])

# Location lookup tables keyed by county FIPS code and 3-digit ZIP prefix,
# plus a FIPS -> ZIP mapping (semantics per LCL helpers; defined elsewhere).
fips_data = LCL.load_location_data(data_dir, group_by='fips')
zip3_data = LCL.load_location_data(data_dir, group_by='zip3')
fips_to_zip = LCL.make_fips_to_zip_dict(data_dir, group_by='zip')

#%% make a k-tree for doing nearest neighbor imputation of missing data
base_map = LCL.load_base_map(os.path.join(fig_dir, 'USA_Counties_text.svg'), ax_xml=True)
(county_paths, state_paths) = LCL.get_map_paths(base_map, fips_to_zip)
# First <text> element of the SVG is the map title.
title_path = base_map.findAll('text')[0]
map_coords = LCH.extract_fips_coords(county_paths)
ktree = KDTree(map_coords.values)  # make nearest neighbor tree

#%% make sequence of decision trees and build a movie
X = LD[['longitude', 'latitude']]
y = LD['ROI']  # plot average return by area, not portfolio return
max_levels = 16          # fit trees of depth 1 .. max_levels
min_samples_leaf = 50    # regularization: minimum loans per leaf

# One column of predictions per tree depth: pred_arr[i, n] is the predicted
# ROI for county i from the depth-(n+1) tree, evaluated at county centroids.
pred_arr = np.zeros((len(fips_data), max_levels))
for n in range(max_levels):  # range (not Python-2-only xrange) for Py3 compatibility
    clf = tree.DecisionTreeRegressor(max_depth=n + 1,
                                     min_samples_leaf=min_samples_leaf,
                                     random_state=0)
    clf.fit(X, y)
    pred_arr[:, n] = clf.predict(fips_data[['longitude', 'latitude']].values)

#%% generate pngs for reg-tree at each depth value