# Cross-validate two baseline regressors (linear model, KNN) on the training
# split using 5-fold CV scored by negated MSE.
# NOTE(review): reconstructed from a collapsed notebook export — on the
# original single physical line the "# In[...]:" cell markers commented out
# everything after them. Logic is unchanged.

# Sanity-check the target type (should report a continuous target for
# regression) — presumably `utils` is sklearn.utils; confirm against imports.
print(utils.multiclass.type_of_target(train_y))

# In[260]:
# Fixed random_state so the folds are reproducible across runs.
fold_generator = KFold(n_splits=5, shuffle=True, random_state=111)

# In[261]:
## use mse to score the model — sklearn maximizes scores, hence "neg_".
use_metrics = ["neg_mean_squared_error"]

# - Linear Model
# In[262]:
lm_scores = cross_validate(LM(), train_X, train_y,
                           cv=fold_generator, scoring=use_metrics)

# - KNN
# In[263]:
knn_scores = cross_validate(KNN(), train_X, train_y,
                            cv=fold_generator, scoring=use_metrics)
# NOTE(review): mangled fragment — this is the interior of a per-direction
# loop (see the matching driver code elsewhere in the project) collapsed onto
# one physical line, and it STARTS mid-call: "df=gold_lex, cols=..." is the
# tail of a call whose head (presumably something like
# framework.util.get_cols(...) assigning target_lexicon) is outside this view.
# Left byte-identical; restoring it requires the missing call head.
#
# What the fragment does, read left to right:
#   * derives input/output widths (n_in, n_out) from the source/target lexica;
#   * builds four models: an MLP_Ensemble baseline, a Mapping_Model configured
#     from the MY_MODEL hyperparameter dict, and two SKlearn_Mapping_Model
#     references wrapping LM() and KNN(n_neighbors=20);
#   * hands all four to an Evaluator and runs k-fold cross-validation
#     (k_splits=KFOLD), writing results under 'results/<direction>/<setting>'.
df=gold_lex, cols=DIRECTIONS[curr_dir]['target']) n_in = source_lexicon.shape[1] n_out = target_lexicon.shape[1] baseline = MLP_Ensemble(embeddings=embeddings) my_model = framework.models.Mapping_Model( layers=[n_in] + MY_MODEL['hidden_layers'] + [n_out], activation=MY_MODEL['activation'], dropout_hidden=MY_MODEL['dropout_hidden'], train_steps=MY_MODEL['train_steps'], batch_size=MY_MODEL['batch_size'], optimizer=MY_MODEL['optimizer'], source_lexicon=source_lexicon) reference_LM = framework.models.SKlearn_Mapping_Model( base_model=LM(), source_lexicon=source_lexicon) reference_KNN = framework.models.SKlearn_Mapping_Model( base_model=KNN(n_neighbors=20), source_lexicon=source_lexicon) ev = framework.models.Evaluator( models={ 'baseline': baseline, 'Reference_KNN': reference_KNN, 'reference_LM': reference_LM, 'my_model': my_model }) ev.crossvalidate(words=gold_lex.index, labels=target_lexicon, k_splits=KFOLD, outpath='results/{}/{}'.format( curr_dir, setting.name))
# NOTE(review): mangled fragment — a script driver collapsed onto one physical
# line. Two consequences of the collapse: (a) the inline '#' comments inside
# the MODELS dict comment out everything after them on this line, and (b) the
# fragment ENDS mid-loop ("for base_model_name, base_model in MODELS.items():")
# whose body is outside this view. Left byte-identical; a safe reconstruction
# needs the missing loop body.
#
# Intended structure, as far as visible:
#   * imports (keras backend, project data/settings, util helper);
#   * INFO docstring: "Ablation experiment in Section 4.1 (Figure 2)";
#   * k_fold = 10; MODELS holds only a linear model ('lm': LM()) — the KNN
#     entry is deliberately commented out ("only use linreg because no
#     hyperparas");
#   * for each setting with a 'Dominance' column in its gold lexicon, iterate
#     over DIRECTIONS, slice source/target lexica, then loop over MODELS
#     (body not visible here).
import keras.backend as K from main.data import SETTINGS, VAD, BE5, DIRECTIONS from framework.util import average_subdirs INFO = ''' Ablation experiment in Section 4.1 (Figure 2). ''' k_fold = 10 MODELS = { #'knn':KNN(n_neighbors=20), #only use linreg because no hyperparas 'lm': LM() } for setting in SETTINGS: print(setting.name) gold_lex = setting.load_data() #this experiment will only be performed for those gold lexicons which #also have dominance. if 'Dominance' in gold_lex.columns: for curr_dir in list(DIRECTIONS): print(curr_dir) source_lexicon = gold_lex[DIRECTIONS[curr_dir]['source']] target_lexicon = gold_lex[DIRECTIONS[curr_dir]['target']] for base_model_name, base_model in MODELS.items():
# Age-based revenue baseline: fit log(restaurant age) -> log(revenue) with a
# linear model and report 3-fold cross-validated RMSE on the original scale.
# NOTE(review): reconstructed from a collapsed Python 2 script and ported to
# Python 3 — the original used the print statement, shuffled a bare zip()
# (a one-shot iterator in Py3), and the pre-0.18 sklearn KFold API
# (KFold(n, n_folds=...)). Behavior is otherwise unchanged.

# Age in years relative to 2015, derived from the opening date.
train['Age'] = 2015 - pd.DatetimeIndex(train['Open Date']).year
test['Age'] = 2015 - pd.DatetimeIndex(test['Open Date']).year

# Extract the age and log-transform it (ages are positive, so log is safe).
X = np.log(train[['Age']].values.reshape((train.shape[0], 1)))
Xt = np.log(test[['Age']].values.reshape((test.shape[0], 1)))
y = train['revenue'].values

# Randomize the order for cross validation; zip must be materialized in Py3.
combined = list(zip(y, X))
shuffle(combined)
y[:], X[:] = zip(*combined)

# Model setup: train on log(revenue), evaluate back on the original scale.
clf = LM()
scores = []
ss = KFold(n_splits=3, shuffle=True)
for trainCV, testCV in ss.split(X):
    X_train, X_test, y_train, y_test = X[trainCV], X[testCV], y[trainCV], y[testCV]
    clf.fit(X_train, np.log(y_train))
    y_pred = np.exp(clf.predict(X_test))
    scores.append(mean_squared_error(y_test, y_pred))

# Average RMSE from cross validation.
scores = np.array(scores)
print("CV Score:", np.mean(scores ** 0.5))
# Compare a linear model against a random forest on synthetic data: both are
# fit on a training sample, then scored (MSE) in-sample and on a fresh sample
# to show the RF's in-sample advantage versus its generalization behavior.
# NOTE(review): reconstructed from a collapsed cell script — on one physical
# line the '# %%' cell markers and inline comments commented out all
# following code. Logic is unchanged.

# %% -----------------------------------------
# Generate the sample of the data.
X, y = gen_data(N=250)

# Plot the synthetic data: each feature against the response.
D = pd.DataFrame(dict(y=y, x1=X[:, 0], x2=X[:, 1], x3=X[:, 2]))
(ggplot(D.melt(id_vars='y'), aes(x="value", y='y'))
 + geom_point(alpha=.25)
 + facet_wrap('variable')
 + theme(figure_size=(7, 3)))

# %% Build Model -----------------------------------------
# Linear model
mod_lm = LM()
mod_lm.fit(X, y)

# Random Forest model
mod_rf = RF()
mod_rf.fit(X, y)

# RF > LM (in-sample fit)
m.mean_squared_error(y, mod_lm.predict(X))
m.mean_squared_error(y, mod_rf.predict(X))

# Performance on future data (different seed => independent sample).
test_X, test_y = gen_data(N=250, seed=2000)
m.mean_squared_error(test_y, mod_lm.predict(test_X))
m.mean_squared_error(test_y, mod_rf.predict(test_X))