def kfold_train(gamma=5):
    """Fit a GraphClassifier on ``train_df.pkl`` and pickle the fitted model.

    The street graph is rebuilt with ``build_graph()``, the intersection
    table is read from ``../data/all_intersections_df.pkl``, and the
    classifier is fit on the ``X``/``Y`` columns against ``Category``.

    Args:
        gamma: Forwarded unchanged to ``GraphClassifier(gamma=...)``; it is
            also embedded in the output file name.

    Side effects:
        Writes ``../data/graph_class_gamma-<gamma>.pkl``.
    """
    # NOTE: the graph is rebuilt on every call rather than loaded from the
    # '../data/node_dict.pkl' cache.
    graph_nodes = build_graph()
    lat_lng_df = pd.read_pickle('../data/all_intersections_df.pkl')
    df_train = pd.read_pickle('../data/train_df.pkl')

    X = df_train.loc[:, ['X', 'Y']].values
    y = df_train.loc[:, 'Category'].values

    gc = GraphClassifier(graph_nodes, lat_lng_df, gamma=gamma)
    gc.fit(X, y)

    # Protocol -1 selects the highest (binary) pickle protocol, so the file
    # must be opened in binary mode to avoid newline translation corrupting
    # the stream.
    model_path = '../data/graph_class_gamma-{0}.pkl'.format(gamma)
    with open(model_path, 'wb') as f:
        cPickle.dump(gc, f, -1)
def yst_train(gamma=20):
    """Fit a GraphClassifier on ``train_df_yst.pkl`` with extra date columns.

    In addition to the ``X``/``Y`` features and the ``Category`` target, the
    ``year``, ``month`` and ``DayOfWeek`` columns are passed to
    ``GraphClassifier.fit`` as a dict mapping column name to value array.

    Args:
        gamma: Forwarded unchanged to ``GraphClassifier(gamma=...)``; it is
            also embedded in the output file name.

    Side effects:
        Writes ``../data/graph_class_yst_gamma-<gamma>.pkl``.
    """
    # NOTE: the graph is rebuilt on every call rather than loaded from the
    # '../data/node_dict.pkl' cache.
    graph_nodes = build_graph()
    lat_lng_df = pd.read_pickle('../data/all_intersections_df.pkl')
    df_train = pd.read_pickle('../data/train_df_yst.pkl')

    X = df_train.loc[:, ['X', 'Y']].values
    y = df_train.loc[:, 'Category'].values

    # Alternative column set previously tried:
    # ['year_group', 'season', 'time_of_day', 'weekend']
    yst_columns = ['year', 'month', 'DayOfWeek']
    yst_dict = {col: df_train[col].values for col in yst_columns}

    gc = GraphClassifier(graph_nodes, lat_lng_df, gamma=gamma)
    gc.fit(X, y, yst_dict)

    # Protocol -1 selects the highest (binary) pickle protocol, so the file
    # must be opened in binary mode to avoid newline translation corrupting
    # the stream.
    model_path = '../data/graph_class_yst_gamma-{0}.pkl'.format(gamma)
    with open(model_path, 'wb') as f:
        cPickle.dump(gc, f, -1)
def train_model(predict=True, use_resampling=False, num_resamples=10):
    """Fit a GraphClassifier on ``train.csv``, pickle it, optionally predict.

    Args:
        predict: When true, run ``gc.predict`` on the training features and
            write the result as a gzipped CSV submission file.
        use_resampling: When true, ``num_resamples`` is passed to the
            classifier; otherwise ``num_resamples=None`` is passed.
        num_resamples: Value forwarded to ``GraphClassifier`` when
            ``use_resampling`` is set; also embedded in the model file name.

    Side effects:
        Writes ``../data/node_dict.pkl`` (the built graph), then either
        ``../data/graph_class_resample_<num_resamples>.pkl`` or
        ``../data/graph_class_normal.pkl``, and, when ``predict`` is true,
        ``../data/submissions/submission_training_data.csv.gz``.
    """
    graph_nodes = build_graph()
    # Binary pickle protocol (-1) requires a binary-mode file handle.
    with open('../data/node_dict.pkl', 'wb') as f:
        cPickle.dump(graph_nodes, f, -1)

    lat_lng_df = pd.read_pickle('../data/all_intersections_df.pkl')
    df_train = pd.read_csv('../data/train.csv', parse_dates=['Dates'])

    X = df_train.loc[:, ['X', 'Y']].values
    y = df_train.loc[:, 'Category'].values

    resamples = num_resamples if use_resampling else None

    gc = GraphClassifier(graph_nodes, lat_lng_df, num_resamples=resamples)
    gc.fit(X, y)

    # Save the fitted model; the file name records which mode was used.
    if use_resampling:
        model_path = '../data/graph_class_resample_{0}.pkl'.format(
            num_resamples)
    else:
        model_path = '../data/graph_class_normal.pkl'
    with open(model_path, 'wb') as f:
        cPickle.dump(gc, f, -1)

    if predict:
        predict_df = gc.predict(X)
        # Parenthesized single-argument print emits the same text whether
        # print is a statement or a function.
        print("\nsaving submission...")
        submission_path = '../data/submissions/submission_training_data.csv.gz'
        with gzip.open(submission_path, 'w') as f:
            predict_df.to_csv(f, index=False)
        print("Done!")