def main():
    """Email recently starred feed items to their recipients.

    Scans Star items created between --start and --end minutes ago and
    mails a notification for each one (unless the starrer is also the
    recipient), appending a line per sent mail to the log file.
    """
    log_path = os.path.dirname(os.path.realpath(__file__)) + '/log/email_star.log'
    parser = optparse.OptionParser()
    parser.add_option("-s", "--start", dest="start", type="int",
                      help="how far back to look for star feed items, in minutes")
    parser.add_option("-e", "--end", dest="end", type="int",
                      help="delay for star feed items, in minutes from present")
    parser.add_option("-D", "--dry-run", action="store_true", dest="dry_run")
    (options, args) = parser.parse_args()

    # now() presumably returns epoch seconds; options are minutes — TODO confirm.
    items = wsgi.Star.search(created={"$gt": now() - 60 * options.start,
                                      "$lt": now() - 60 * options.end})

    # FIX: use a context manager so the log file is closed even when
    # wsgi.mail_feed raises (the original leaked the handle on error).
    with open(log_path, 'a') as logfile:
        for item in items:
            if item.get('entity_class') == "User":
                recipient = item.entity
            elif item.get('entity_class') == "Expr":
                recipient = item.entity.owner
            else:
                # FIX: an unknown entity_class previously reused the recipient
                # from the prior iteration (or raised NameError on the first
                # item) — skip such items instead of mailing the wrong user.
                continue
            if item.initiator.id != recipient.id:  # don't notify self-stars
                headers = wsgi.mail_feed(item, recipient, options.dry_run)
                logfile.write('\n'
                              + time.strftime("%a, %d %b %Y %H:%M:%S +0000",
                                              time.localtime(time.time()))
                              + " " * 4
                              + headers['To']
                              + ' ' * (50 - len(headers['To']))
                              + headers['Subject'])
def run(train, y, test, v, z):
    """Hyperopt-tune XGBoost, then train over stratified CV folds.

    Out-of-fold predictions accumulate into v[cname+seed]; fold-averaged
    test predictions into z[cname+seed], mirrored to z['y'].
    Returns (fold log-loss scores, None).  Mutates train/test in place
    (drops the 'id' column) and the v/z frames.
    """
    #cname = sys._getframe().f_code.co_name
    cname = 'p'
    train.drop('id', axis=1, inplace=True)
    test.drop('id', axis=1, inplace=True)
    from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
    dtrain = xgb.DMatrix(train, y)

    def step_xgb(params):
        # One hyperopt evaluation: 10-fold xgb.cv with early stopping.
        cv = xgb.cv(params=params, dtrain=dtrain,
                    num_boost_round=10000,
                    early_stopping_rounds=50,
                    nfold=10,
                    seed=params['seed'])
        # FIX: DataFrame.ix was deprecated in pandas 0.20 and removed in
        # 1.0 — use positional .iloc for the last row's first column.
        score = cv.iloc[len(cv) - 1, 0]
        print(cname, score, len(cv), params)
        return dict(loss=score, status=STATUS_OK)

    space_xgb = dict(
        max_depth=hp.choice('max_depth', range(2, 8)),
        subsample=hp.quniform('subsample', 0.6, 1, 0.05),
        colsample_bytree=hp.quniform('colsample_bytree', 0.6, 1, 0.05),
        learning_rate=hp.quniform('learning_rate', 0.005, 0.03, 0.005),
        min_child_weight=hp.quniform('min_child_weight', 1, 6, 1),
        gamma=hp.quniform('gamma', 0.5, 10, 0.05),
        objective='binary:logistic',
        eval_metric='logloss',
        seed=1,
        silent=1,
    )
    trs = state.load('xgb_trials')
    # FIX: compare to None with 'is', not '==' (PEP 8).
    if trs is None:
        tr = Trials()
    else:
        tr, _ = trs
    if len(tr.trials) > 0:
        print('reusing %d trials, best was:' % (len(tr.trials)),
              space_eval(space_xgb, tr.argmin))
        best = tr.argmin
    # Resume tuning until 15 trials exist; persist after every evaluation.
    while len(tr.trials) < 15:
        best = fmin(step_xgb, space_xgb, algo=tpe.suggest,
                    max_evals=len(tr.trials) + 1, trials=tr)
        state.save('xgb_trials', (tr, space_xgb))
    xgb_params = space_eval(space_xgb, best)
    print(xgb_params)

    N_splits = 9
    N_seeds = 1
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test)
    for s in range(N_seeds):
        scores = []
        cname2 = cname + str(s)
        v[cname2], z[cname2] = 0, 0
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train, y)):
            # FIX: .ix removed in pandas 1.0; .iloc selects rows by position,
            # matching the positional indices StratifiedKFold yields.
            dtrain = xgb.DMatrix(train.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname2] += p
            score = metrics.log_loss(y[ival], p)
            z[cname2] += clf.predict(dtest)
            print(cname,
                  'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits),
                  score, state.now())
            scores.append(score)
        z[cname2] /= N_splits  # average the accumulated fold predictions
        cv = scores
    z['y'] = z[cname2]
    print('validation loss: ', cv, np.mean(cv), np.std(cv))
    return cv, None
train, y, test, _ = data.get() z = pd.DataFrame() z['id'] = test.id z['y'] = 0 v = pd.DataFrame() v['id'] = train.id v['y'] = y cv, _ = run(train, y, test, v, z) state.save('model', (v, z, cv, None)) else: v, z, cv, _ = saved return v, z, cv, _ if '__main__' == __name__: print('starting', state.now()) v, z, cv, _ = predict() state.save_model(v, z, cv) if public_score == None: # если есть public score - перезаписывать отправленное уже не стоит state.save_predicts(z) else: import os if os.path.exists('../model_scores.csv'): mdf = pd.read_csv('../model_scores.csv') else: mdf = pd.DataFrame(columns=['timestamp', 'model', 'cv', 'cv std', 'public score']) idx = mdf.model == state.base_name_ if np.sum(idx) == 0: mdf.loc[len(mdf), 'model'] = state.base_name_ idx = mdf.model == state.base_name_
def run(train, y, test, v, z):
    """Fit a 256-64-16 selu MLP over shuffled CV folds.

    Accumulates out-of-fold predictions into v['p'] and fold-averaged
    test predictions into z['p'] (mirrored to z['y']).  Drops the 'id'
    column from train/test in place.  Returns (mean fold log-loss, None).
    """
    #cname = sys._getframe().f_code.co_name
    from keras import layers
    from keras import models
    from keras import optimizers
    cname = 'p'
    train.drop('id', axis=1, inplace=True)
    test.drop('id', axis=1, inplace=True)

    num_splits = 9
    scaler = preprocessing.RobustScaler()
    train = scaler.fit_transform(train)
    test = scaler.transform(test)
    input_dims = train.shape[1]

    def build_model():
        # Orthogonally-initialized selu stack with a sigmoid head.
        inp = layers.Input(shape=(input_dims,))
        head = inp
        for width in (256, 64, 16):
            head = layers.Dense(width, kernel_initializer='Orthogonal')(head)
            head = layers.Activation('selu')(head)
        head = layers.Dense(1, activation='sigmoid')(head)
        mlp = models.Model(inp, head)
        mlp.compile(loss='binary_crossentropy', optimizer=optimizers.Nadam())
        return mlp

    batch_size = 128
    np.random.seed(1234)
    build_model().summary(line_length=120)

    splitter = model_selection.ShuffleSplit(n_splits=num_splits,
                                            random_state=11,
                                            test_size=1 / num_splits)
    fold_scores = list()
    model_path = state.temp_name('keras_mlp_weights')
    v[cname] = 0
    z[cname] = 0
    for fold_no, (fit_idx, val_idx) in enumerate(splitter.split(train, y)):
        x_fit, x_val = train[fit_idx], train[val_idx]
        y_fit, y_val = y[fit_idx], y[val_idx]
        net = build_model()
        net.fit(x_fit, y_fit,
                batch_size=batch_size,
                epochs=10000,
                validation_data=(x_val, y_val),
                verbose=0,
                callbacks=build_keras_fit_callbacks(model_path),
                shuffle=True)
        # Restore the best checkpoint written by the fit callbacks.
        net.load_weights(model_path)
        pred = net.predict(x_val)
        v.loc[val_idx, cname] += pred.ravel()
        fold_loss = metrics.log_loss(y[val_idx], pred)
        print(cname, 'fold %d: ' % (fold_no + 1), fold_loss, state.now())
        fold_scores.append(fold_loss)
        z[cname] += net.predict(test).ravel()
        # Free the model aggressively before the next fold.
        del net
        for gen in range(3):
            gc.collect(gen)
    state.drop_temp(model_path)
    cv = np.mean(fold_scores)
    z[cname] /= num_splits
    z['y'] = z[cname]
    return cv, None
def predict():
    """Return (v, z, cv, extra), training the model if no cached result.

    Loads a previously saved (v, z, cv, None) tuple from state; when
    absent (or in debug_mode), assembles the feature frames, runs the
    model via run(), and caches the result.
    """
    saved = state.load('model')
    #saved = None
    if debug_mode:
        saved = None
    # FIX: identity comparison with None uses 'is', not '==' (PEP 8 E711).
    if saved is None:
        train, y, test, _ = data.get()
        ftrain, ftest, _ = fea_1.get()
        ftrain2, ftest2, _ = fea_2.get()
        train = pd.concat([train, ftrain, ftrain2], axis=1)
        test = pd.concat([test, ftest, ftest2], axis=1)
        print(train.shape, test.shape)
        z = pd.DataFrame()
        z['id'] = test.id
        z['y'] = 0
        v = pd.DataFrame()
        v['id'] = train.id
        v['y'] = y
        cv, _ = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, _ = saved
    return v, z, cv, _

if '__main__' == __name__:
    print('starting', state.now())
    state.run_predict(predict, debug_mode, public_score)
    print('done.', state.now())
def run(train, y, test, v, z):
    """Hyperopt-tune XGBoost, then train over 3 seeded CV rounds.

    For each seed, out-of-fold predictions accumulate into v[cname+seed]
    and fold-averaged test predictions into z[cname+seed]; z['y'] keeps
    the last seed's prediction.  Mutates train/test (drops 'id') and the
    v/z frames in place.  Returns (per-seed mean log-losses, None).
    """
    #cname = sys._getframe().f_code.co_name
    cname = 'p'
    train.drop('id', axis=1, inplace=True)
    test.drop('id', axis=1, inplace=True)
    from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
    dtrain = xgb.DMatrix(train, y)

    def step_xgb(params):
        # One hyperopt evaluation: 10-fold xgb.cv with early stopping.
        cv = xgb.cv(params=params, dtrain=dtrain,
                    num_boost_round=10000,
                    early_stopping_rounds=50,
                    nfold=10,
                    seed=params['seed'])
        # FIX: DataFrame.ix was removed in pandas 1.0 — use positional .iloc.
        score = cv.iloc[len(cv) - 1, 0]
        print(cname, score, len(cv), params)
        return dict(loss=score, status=STATUS_OK)

    space_xgb = dict(max_depth=hp.choice('max_depth', range(2, 9)),
                     subsample=hp.quniform('subsample', 0.6, 1, 0.05),
                     colsample_bytree=hp.quniform('colsample_bytree', 0.6, 1, 0.05),
                     learning_rate=hp.quniform('learning_rate', 0.005, 0.1, 0.005),
                     min_child_weight=hp.quniform('min_child_weight', 1, 6, 1),
                     gamma=hp.quniform('gamma', 0.5, 10, 0.05),
                     reg_alpha=hp.quniform('reg_alpha', 0, 1, 0.001),
                     objective='binary:logistic',
                     eval_metric='logloss',
                     seed=1,
                     silent=1)
    trs = state.load('xgb_trials')
    # FIX: 'is None' identity check instead of '== None' (PEP 8).
    if trs is None or debug_mode:
        tr = Trials()
    else:
        tr, _ = trs
    if len(tr.trials) > 0:
        print('reusing %d trials, best was:' % (len(tr.trials)),
              space_eval(space_xgb, tr.argmin))
        best = tr.argmin
    # Resume tuning until 15 trials exist; persist after every evaluation.
    while len(tr.trials) < 15:
        best = fmin(step_xgb, space_xgb, algo=tpe.suggest,
                    max_evals=len(tr.trials) + 1, trials=tr)
        state.save('xgb_trials', (tr, space_xgb))
    xgb_params = space_eval(space_xgb, best)
    print(xgb_params)

    N_splits = 9
    N_seeds = 3
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test)
    cv = []
    for s in range(N_seeds):
        scores = []
        cname2 = cname + str(s)
        v[cname2], z[cname2] = 0, 0
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train, y)):
            # FIX: .ix removed in pandas 1.0; .iloc matches the positional
            # indices produced by StratifiedKFold.
            dtrain = xgb.DMatrix(train.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname2] += p
            score = metrics.log_loss(y[ival], p)
            z[cname2] += clf.predict(dtest)
            print(cname,
                  'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits),
                  score, state.now())
            scores.append(score)
        z[cname2] /= N_splits  # average the accumulated fold predictions
        cv.append(np.mean(scores))
        print('seed %d loss: ' % (xgb_params['seed']), scores,
              np.mean(scores), np.std(scores))
    z['y'] = z[cname2]
    print('cv:', cv, np.mean(cv), np.std(cv))
    return cv, None
saved = state.load('model') #saved = None if debug_mode: saved = None if saved == None: train, y, test, _ = data.get() ftrain, ftest, _ = fea_1.get() ftrain2, ftest2, _ = fea_2.get() train = pd.concat([train, ftrain, ftrain2], axis=1) test = pd.concat([test, ftest, ftest2], axis=1) print(train.shape, test.shape) z = pd.DataFrame() z['id'] = test.id z['y'] = 0 v = pd.DataFrame() v['id'] = train.id v['y'] = y cv, _ = run(train, y, test, v, z) state.save('model', (v, z, cv, None)) else: v, z, cv, _ = saved return v, z, cv, _ if '__main__' == __name__: print('starting', state.now()) state.run_predict(predict, debug_mode, public_score) print('done.', state.now())
def run(state, train, y, test, v, z):
    """Hyperopt-tune an ExtraTreesRegressor, then train over 3 seeded CV rounds.

    For each seed, out-of-fold predictions accumulate into v[cname+seed]
    and fold-averaged test predictions into z[cname+seed]; z['y'] keeps
    the last seed's prediction.  Mutates train/test (drops 'id') and the
    v/z frames in place.  Returns (per-seed mean log-losses, None).
    """
    #cname = sys._getframe().f_code.co_name
    cname = 'p'
    train.drop('id', axis=1, inplace=True)
    test.drop('id', axis=1, inplace=True)
    from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval

    def step_et(params):
        # One hyperopt evaluation: 10-fold CV log-loss of an ExtraTrees fit.
        clf = ensemble.ExtraTreesRegressor(**params)
        cv = model_selection.cross_val_score(clf, train, y,
                                             scoring=metrics.make_scorer(metrics.log_loss),
                                             cv=10,
                                             n_jobs=-2)
        score = np.mean(cv)
        print(cname, score, params)
        return dict(loss=score, status=STATUS_OK)

    space_et = dict(
        n_estimators=hp.choice('n_estimators', range(50, 1500)),
        #criterion = hp.choice('criterion', ["gini", "entropy"]),
        min_samples_split=hp.choice('min_samples_split', range(2, 10)),
        min_samples_leaf=hp.choice('min_samples_leaf', range(1, 10)),
        max_features=hp.choice('max_features', range(1, 16)),
        random_state=1)
    trs = state.load('et_trials')
    # FIX: identity comparison with None uses 'is', not '==' (PEP 8).
    if trs is None or debug_mode:
        tr = Trials()
    else:
        tr, _ = trs
    if len(tr.trials) > 0:
        print('reusing %d trials, best was:' % (len(tr.trials)),
              space_eval(space_et, tr.argmin))
        best = tr.argmin
    # Resume tuning until 15 trials exist; persist after every evaluation.
    while len(tr.trials) < 15:
        best = fmin(step_et, space_et, algo=tpe.suggest,
                    max_evals=len(tr.trials) + 1, trials=tr)
        state.save('et_trials', (tr, space_et))
    et_params = space_eval(space_et, best)
    print(et_params)

    N_splits = 9
    N_seeds = 3
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    cv = []
    for s in range(N_seeds):
        scores = []
        cname2 = cname + str(s)
        v[cname2], z[cname2] = 0, 0
        et_params['random_state'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train, y)):
            clf = ensemble.ExtraTreesRegressor(**et_params)
            # FIX: .ix removed in pandas 1.0; .iloc matches the positional
            # indices produced by StratifiedKFold.
            clf.fit(train.iloc[itrain], y[itrain])
            p = clf.predict(train.iloc[ival])
            v.loc[ival, cname2] += p
            score = metrics.log_loss(y[ival], p)
            z[cname2] += clf.predict(test)
            print(cname,
                  'seed %d step %d of %d: ' % (et_params['random_state'], n + 1, skf.n_splits),
                  score, state.now())
            scores.append(score)
        z[cname2] /= N_splits  # average the accumulated fold predictions
        cv.append(np.mean(scores))
        print('seed %d loss: ' % (et_params['random_state']), scores,
              np.mean(scores), np.std(scores))
    z['y'] = z[cname2]
    print('cv:', cv, np.mean(cv), np.std(cv))
    return cv, None
def run(train, y, test, v, z):
    """Fit a 512-256-256-16 selu MLP over shuffled CV folds.

    Accumulates out-of-fold predictions into v['p'] and fold-averaged
    test predictions into z['p'] (mirrored to z['y']).  Drops the 'id'
    column from train/test in place.  Returns (mean fold log-loss, None).
    """
    #cname = sys._getframe().f_code.co_name
    from keras import layers
    from keras import models
    from keras import optimizers
    cname = 'p'
    train.drop('id', axis=1, inplace=True)
    test.drop('id', axis=1, inplace=True)

    num_splits = 9
    scaler = preprocessing.RobustScaler()
    train = scaler.fit_transform(train)
    test = scaler.transform(test)
    input_dims = train.shape[1]

    def build_model():
        # Orthogonally-initialized selu stack; batch-norm sits only on the
        # third hidden layer.  Sigmoid head for binary cross-entropy.
        inp = layers.Input(shape=(input_dims,))
        head = layers.Dense(512, kernel_initializer='Orthogonal')(inp)
        head = layers.Activation('selu')(head)
        head = layers.Dense(256, kernel_initializer='Orthogonal')(head)
        head = layers.Activation('selu')(head)
        head = layers.Dense(256, kernel_initializer='Orthogonal')(head)
        head = layers.BatchNormalization()(head)
        head = layers.Activation('selu')(head)
        head = layers.Dense(16, kernel_initializer='Orthogonal')(head)
        head = layers.Activation('selu')(head)
        head = layers.Dense(1, activation='sigmoid')(head)
        mlp = models.Model(inp, head)
        mlp.compile(loss='binary_crossentropy', optimizer=optimizers.Nadam())
        return mlp

    batch_size = 128
    np.random.seed(1234)
    build_model().summary(line_length=120)

    splitter = model_selection.ShuffleSplit(n_splits=num_splits,
                                            random_state=11,
                                            test_size=1 / num_splits)
    fold_scores = list()
    model_path = state.temp_name('keras_mlp_weights')
    v[cname] = 0
    z[cname] = 0
    for fold_no, (fit_idx, val_idx) in enumerate(splitter.split(train, y)):
        x_fit, x_val = train[fit_idx], train[val_idx]
        y_fit, y_val = y[fit_idx], y[val_idx]
        net = build_model()
        net.fit(x_fit, y_fit,
                batch_size=batch_size,
                epochs=10000,
                validation_data=(x_val, y_val),
                verbose=0,
                callbacks=build_keras_fit_callbacks(model_path),
                shuffle=True)
        # Restore the best checkpoint written by the fit callbacks.
        net.load_weights(model_path)
        pred = net.predict(x_val)
        v.loc[val_idx, cname] += pred.ravel()
        fold_loss = metrics.log_loss(y[val_idx], pred)
        print(cname, 'fold %d: ' % (fold_no + 1), fold_loss, state.now())
        fold_scores.append(fold_loss)
        z[cname] += net.predict(test).ravel()
        # Free the model aggressively before the next fold.
        del net
        for gen in range(3):
            gc.collect(gen)
    print('scores:', fold_scores, np.mean(fold_scores), np.std(fold_scores))
    state.drop_temp(model_path)
    cv = np.mean(fold_scores)
    z[cname] /= num_splits
    z['y'] = z[cname]
    return cv, None
z = pd.DataFrame() z['id'] = test.id z['y'] = 0 v = pd.DataFrame() v['id'] = train.id v['y'] = y cv, _ = run(train, y, test, v, z) state.save('model', (v, z, cv, None)) else: v, z, cv, _ = saved return v, z, cv, _ if '__main__' == __name__: print('starting', state.now()) v, z, cv, _ = predict() if not debug_mode: state.save_model(v, z, cv) if public_score == None: # если есть public score - перезаписывать отправленное уже не стоит state.save_predicts(z) else: import os if os.path.exists('../model_scores.csv'): mdf = pd.read_csv('../model_scores.csv') else: mdf = pd.DataFrame( columns=['timestamp', 'model', 'cv', 'cv std', 'public score']) idx = mdf.model == state.base_name_ if np.sum(idx) == 0:
def run(state, train, y, test, v, z):
    """Hyperopt-tune a RandomForestRegressor, then train over 3 seeded CV rounds.

    For each seed, out-of-fold predictions accumulate into v[cname+seed]
    and fold-averaged test predictions into z[cname+seed]; z['y'] keeps
    the last seed's prediction.  Mutates train/test (drops 'id') and the
    v/z frames in place.  Returns (per-seed mean log-losses, None).
    """
    #cname = sys._getframe().f_code.co_name
    cname = 'p'
    train.drop('id', axis=1, inplace=True)
    test.drop('id', axis=1, inplace=True)
    from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval

    def step_rf(params):
        # One hyperopt evaluation: 10-fold CV log-loss of a forest fit.
        clf = ensemble.RandomForestRegressor(**params)
        cv = model_selection.cross_val_score(clf, train, y,
                                             scoring=metrics.make_scorer(metrics.log_loss),
                                             cv=10,
                                             n_jobs=-2)
        score = np.mean(cv)
        print(cname, score, params)
        return dict(loss=score, status=STATUS_OK)

    space_rf = dict(
        n_estimators=hp.choice('n_estimators', range(50, 1500)),
        #criterion = hp.choice('criterion', ["gini", "entropy"]),
        min_samples_split=hp.choice('min_samples_split', range(2, 10)),
        min_samples_leaf=hp.choice('min_samples_leaf', range(1, 10)),
        max_features=hp.choice('max_features', range(1, 16)),
        random_state=1)
    trs = state.load('rf_trials')
    # FIX: identity comparison with None uses 'is', not '==' (PEP 8).
    if trs is None or debug_mode:
        tr = Trials()
    else:
        tr, _ = trs
    if len(tr.trials) > 0:
        print('reusing %d trials, best was:' % (len(tr.trials)),
              space_eval(space_rf, tr.argmin))
        best = tr.argmin
    while len(tr.trials) < 15:
        best = fmin(step_rf, space_rf, algo=tpe.suggest,
                    max_evals=len(tr.trials) + 1, trials=tr)
        # BUG FIX: trials were saved under 'et_trials' (copy-paste from the
        # ExtraTrees model) while being loaded from 'rf_trials', so tuning
        # progress was never reused and clobbered the ET model's cache.
        state.save('rf_trials', (tr, space_rf))
    rf_params = space_eval(space_rf, best)
    print(rf_params)

    N_splits = 9
    N_seeds = 3
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    cv = []
    for s in range(N_seeds):
        scores = []
        cname2 = cname + str(s)
        v[cname2], z[cname2] = 0, 0
        rf_params['random_state'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train, y)):
            clf = ensemble.RandomForestRegressor(**rf_params)
            # FIX: .ix removed in pandas 1.0; .iloc matches the positional
            # indices produced by StratifiedKFold.
            clf.fit(train.iloc[itrain], y[itrain])
            p = clf.predict(train.iloc[ival])
            v.loc[ival, cname2] += p
            score = metrics.log_loss(y[ival], p)
            z[cname2] += clf.predict(test)
            print(cname,
                  'seed %d step %d of %d: ' % (rf_params['random_state'], n + 1, skf.n_splits),
                  score, state.now())
            scores.append(score)
        z[cname2] /= N_splits  # average the accumulated fold predictions
        cv.append(np.mean(scores))
        print('seed %d loss: ' % (rf_params['random_state']), scores,
              np.mean(scores), np.std(scores))
    z['y'] = z[cname2]
    print('cv:', cv, np.mean(cv), np.std(cv))
    return cv, None