Example #1
import optparse
import os
import time

# `wsgi` (the application module) and `now()` (an epoch-seconds helper) come
# from the surrounding application and are not shown in this example.

def main():
    logfile = open(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'log', 'email_star.log'), 'a')
    parser = optparse.OptionParser()
    parser.add_option("-s", "--start", dest="start", type="int", help="how far back to look for star feed items, in minutes")
    parser.add_option("-e", "--end", dest="end", type="int", help="delay for star feed items, in minutes from present")
    parser.add_option("-D", "--dry-run", action="store_true", dest="dry_run")
    (options, args) = parser.parse_args()
    items = wsgi.Star.search(created={"$gt": now() - 60*options.start, "$lt": now() - 60*options.end})
    for item in items:
      if item.get('entity_class') == "User":
        recipient = item.entity
      elif item.get('entity_class') == "Expr":
        recipient = item.entity.owner
      if not item.initiator.id == recipient.id:
        headers = wsgi.mail_feed(item, recipient, options.dry_run)
        timestamp = time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime(time.time()))
        logfile.write('\n' + timestamp + ' ' * 4 + headers['To'].ljust(50) + headers['Subject'])
    logfile.close()
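# Example invocation (values are hypothetical): look 60 minutes back, stop 5
# minutes before now, and log what would be sent without mailing anything:
#
#   python email_star.py --start 60 --end 5 --dry-run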
def run(train, y, test, v, z):
    #cname = sys._getframe().f_code.co_name
    cname = 'p'
    train.drop('id', axis=1, inplace=True)
    test.drop('id', axis=1, inplace=True)
    from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
    dtrain = xgb.DMatrix(train, y)
    def step_xgb(params):
        cv = xgb.cv(params=params,
                    dtrain=dtrain,
                    num_boost_round=10000,
                    early_stopping_rounds=50,
                    nfold=10,
                    seed=params['seed'])
        score = cv.iloc[-1, 0]  # last row's test metric (`.ix` is deprecated)
        print(cname, score, len(cv), params)
        return dict(loss=score, status=STATUS_OK)
    space_xgb = dict(
            max_depth = hp.choice('max_depth', range(2, 8)),
            subsample = hp.quniform('subsample', 0.6, 1, 0.05),
            colsample_bytree = hp.quniform('colsample_bytree', 0.6, 1, 0.05),
            learning_rate = hp.quniform('learning_rate', 0.005, 0.03, 0.005),
            min_child_weight = hp.quniform('min_child_weight', 1, 6, 1),
            gamma = hp.quniform('gamma', 0.5, 10, 0.05),

            objective = 'binary:logistic',
            eval_metric = 'logloss',
            seed = 1,
            silent = 1
        )
    trs = state.load('xgb_trials')
    if trs is None:
        tr = Trials()
    else:
        tr, _ = trs
    if len(tr.trials) > 0:
        print('reusing %d trials, best was:'%(len(tr.trials)), space_eval(space_xgb, tr.argmin))
        best = tr.argmin
    while len(tr.trials) < 15:
        best = fmin(step_xgb, space_xgb, algo=tpe.suggest, max_evals=len(tr.trials) + 1, trials = tr)
        state.save('xgb_trials', (tr, space_xgb))
    xgb_params = space_eval(space_xgb, best)
    print(xgb_params)

    N_splits = 9
    N_seeds = 1

    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test)
    for s in range(N_seeds):
        scores = []
        cname2 = cname + str(s)
        v[cname2], z[cname2] = 0, 0
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train, y)):
            dtrain = xgb.DMatrix(train.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname2] += p
            score = metrics.log_loss(y[ival], p)
            z[cname2] += clf.predict(dtest)
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, state.now())
            scores.append(score)
        z[cname2] /= N_splits

    cv = scores  # relies on N_seeds == 1: `scores` and `cname2` come from the single seed's loop
    z['y'] = z[cname2]
    print('validation loss: ', cv, np.mean(cv), np.std(cv))

    return cv, None
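# The `state` helper used throughout these examples is never shown. A minimal
# sketch of the load/save contract the call sites imply (the class name and the
# pickle storage format are assumptions, not the original implementation; the
# other helpers — temp_name, drop_temp, save_predicts, run_predict — would
# follow the same pattern):
import os
import pickle
import time

class StateSketch:
    def load(self, key):
        # callers treat a missing artifact as None
        path = key + '.pkl'
        if not os.path.exists(path):
            return None
        with open(path, 'rb') as f:
            return pickle.load(f)

    def save(self, key, value):
        with open(key + '.pkl', 'wb') as f:
            pickle.dump(value, f)

    def now(self):
        # used only for progress logging
        return time.strftime('%Y-%m-%d %H:%M:%S')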
def predict():
    # (the start of this function was truncated; header reconstructed from the
    # parallel predict() definitions later in this file)
    saved = state.load('model')
    if saved is None:
        train, y, test, _ = data.get()
        z = pd.DataFrame()
        z['id'] = test.id
        z['y'] = 0

        v = pd.DataFrame()
        v['id'] = train.id
        v['y'] = y
        cv, _ = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, _ = saved
    return v, z, cv, _

if '__main__' == __name__:
    print('starting', state.now())
    v, z, cv, _ = predict()
    state.save_model(v, z, cv)
    if public_score is None:
        # once there is a public score, it's better not to overwrite what has already been submitted
        state.save_predicts(z)
    else:
        import os
        if os.path.exists('../model_scores.csv'):
            mdf = pd.read_csv('../model_scores.csv')
        else:
            mdf = pd.DataFrame(columns=['timestamp', 'model', 'cv', 'cv std', 'public score'])
        idx = mdf.model == state.base_name_
        if np.sum(idx) == 0:
            mdf.loc[len(mdf), 'model'] = state.base_name_
            idx = mdf.model == state.base_name_
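# The example is cut off above; a plausible continuation (an assumption, not
# the original code) would fill in the matched row and rewrite the CSV:
#
#   mdf.loc[idx, 'timestamp'] = state.now()
#   mdf.loc[idx, ['cv', 'cv std', 'public score']] = np.mean(cv), np.std(cv), public_score
#   mdf.to_csv('../model_scores.csv', index=False)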
def run(train, y, test, v, z):
    #cname = sys._getframe().f_code.co_name
    from keras import layers
    from keras import models
    from keras import optimizers
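    # relies on module-level imports not shown in the example: numpy as np, gc,
    # and sklearn's preprocessing / model_selection / metrics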
    cname = 'p'
    train.drop('id', axis=1, inplace=True)
    test.drop('id', axis=1, inplace=True)
    num_splits = 9
    scaler = preprocessing.RobustScaler()
    train = scaler.fit_transform(train)
    test = scaler.transform(test)
    input_dims = train.shape[1]
    def build_model():
        input_ = layers.Input(shape=(input_dims,))
        model = layers.Dense(256, kernel_initializer='Orthogonal')(input_)
        #model = layers.BatchNormalization()(model)
        #model = layers.advanced_activations.PReLU()(model)
        model = layers.Activation('selu')(model)
        #model = layers.Dropout(0.7)(model)

        model = layers.Dense(64, kernel_initializer='Orthogonal')(model)
        #model = layers.BatchNormalization()(model)
        model = layers.Activation('selu')(model)
        #model = layers.advanced_activations.PReLU()(model)
        #model = layers.Dropout(0.9)(model)

        model = layers.Dense(16, kernel_initializer='Orthogonal')(model)
        #model = layers.BatchNormalization()(model)
        model = layers.Activation('selu')(model)
        #model = layers.advanced_activations.PReLU()(model)

        model = layers.Dense(1, activation='sigmoid')(model)

        model = models.Model(input_, model)
        model.compile(loss = 'binary_crossentropy', optimizer = optimizers.Nadam())
        #print(model.summary(line_length=120))
        return model
    batch_size = 128
    np.random.seed(1234)
    build_model().summary(line_length=120)
    ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=11, test_size=1/num_splits)
    scores = list()
    model_path = state.temp_name('keras_mlp_weights')
    v[cname] = 0
    z[cname] = 0
    for n, (itrain, ival) in enumerate(ss.split(train, y)):
        xtrain, xval = train[itrain], train[ival]
        ytrain, yval = y[itrain], y[ival]
        model = build_model()
        model.fit(
                xtrain, ytrain,
                batch_size = batch_size,
                epochs = 10000,
                validation_data = (xval, yval),
                verbose = 0,
                callbacks = build_keras_fit_callbacks(model_path),
                shuffle = True
            )
        model.load_weights(model_path)
        p = model.predict(xval)
        v.loc[ival, cname] += p.ravel()
        score = metrics.log_loss(y[ival], p)
        print(cname, 'fold %d: '%(n+1), score, state.now())
        scores.append(score)
        z[cname] += model.predict(test).ravel()
        del model
        for i in range(3):
            gc.collect(i)
    state.drop_temp(model_path)
    cv = np.mean(scores)
    z[cname] /= num_splits
    z['y'] = z[cname]

    return cv, None
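# `build_keras_fit_callbacks` is used above but never defined in these
# examples. Since the loop fits with epochs=10000 and then reloads weights
# from `model_path`, it almost certainly combines early stopping with
# best-weights checkpointing. A minimal sketch (an assumption, not the
# original helper; the patience value is made up):
from keras import callbacks

def build_keras_fit_callbacks(model_path):
    return [
        # stop once validation loss stops improving
        callbacks.EarlyStopping(monitor='val_loss', patience=20),
        # keep only the best weights seen so far at `model_path`
        callbacks.ModelCheckpoint(model_path,
                                  monitor='val_loss',
                                  save_best_only=True,
                                  save_weights_only=True),
    ]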
def predict():
    saved = state.load('model')
    #saved = None
    if debug_mode:
        saved = None
    if saved is None:
        train, y, test, _ = data.get()
        ftrain, ftest, _ = fea_1.get()
        ftrain2, ftest2, _ = fea_2.get()
        train = pd.concat([train, ftrain, ftrain2], axis=1)
        test = pd.concat([test, ftest, ftest2], axis=1)
        print(train.shape, test.shape)

        z = pd.DataFrame()
        z['id'] = test.id
        z['y'] = 0

        v = pd.DataFrame()
        v['id'] = train.id
        v['y'] = y
        cv, _ = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, _ = saved
    return v, z, cv, _

if '__main__' == __name__:
    print('starting', state.now())
    state.run_predict(predict, debug_mode, public_score)
    print('done.', state.now())
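# `data`, `fea_1` and `fea_2` are project modules that are not shown. From the
# call sites, the inferred contract (not their actual implementation) is:
#   data.get()  -> (train_df, y, test_df, extra)   with 'id' columns on the frames
#   fea_*.get() -> (train_features_df, test_features_df, extra)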
def run(train, y, test, v, z):
    #cname = sys._getframe().f_code.co_name
    cname = 'p'
    train.drop('id', axis=1, inplace=True)
    test.drop('id', axis=1, inplace=True)
    from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
    dtrain = xgb.DMatrix(train, y)

    def step_xgb(params):
        cv = xgb.cv(params=params,
                    dtrain=dtrain,
                    num_boost_round=10000,
                    early_stopping_rounds=50,
                    nfold=10,
                    seed=params['seed'])
        score = cv.iloc[-1, 0]  # last row's test metric (`.ix` is deprecated)
        print(cname, score, len(cv), params)
        return dict(loss=score, status=STATUS_OK)

    space_xgb = dict(max_depth=hp.choice('max_depth', range(2, 9)),
                     subsample=hp.quniform('subsample', 0.6, 1, 0.05),
                     colsample_bytree=hp.quniform('colsample_bytree', 0.6, 1,
                                                  0.05),
                     learning_rate=hp.quniform('learning_rate', 0.005, 0.1,
                                               0.005),
                     min_child_weight=hp.quniform('min_child_weight', 1, 6, 1),
                     gamma=hp.quniform('gamma', 0.5, 10, 0.05),
                     reg_alpha=hp.quniform('reg_alpha', 0, 1, 0.001),
                     objective='binary:logistic',
                     eval_metric='logloss',
                     seed=1,
                     silent=1)
    trs = state.load('xgb_trials')
    if trs is None or debug_mode:
        tr = Trials()
    else:
        tr, _ = trs
    if len(tr.trials) > 0:
        print('reusing %d trials, best was:' % (len(tr.trials)),
              space_eval(space_xgb, tr.argmin))
        best = tr.argmin
    while len(tr.trials) < 15:
        best = fmin(step_xgb,
                    space_xgb,
                    algo=tpe.suggest,
                    max_evals=len(tr.trials) + 1,
                    trials=tr)
        state.save('xgb_trials', (tr, space_xgb))
    xgb_params = space_eval(space_xgb, best)
    print(xgb_params)

    N_splits = 9
    N_seeds = 3

    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test)
    cv = []
    for s in range(N_seeds):
        scores = []
        cname2 = cname + str(s)
        v[cname2], z[cname2] = 0, 0
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train, y)):
            dtrain = xgb.DMatrix(train.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params,
                            dtrain,
                            10000,
                            watch,
                            early_stopping_rounds=100,
                            verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname2] += p
            score = metrics.log_loss(y[ival], p)
            z[cname2] += clf.predict(dtest)
            print(
                cname, 'seed %d step %d of %d: ' %
                (xgb_params['seed'], n + 1, skf.n_splits), score, state.now())
            scores.append(score)
        z[cname2] /= N_splits
        cv.append(np.mean(scores))
        print('seed %d loss: ' % (xgb_params['seed']), scores, np.mean(scores),
              np.std(scores))
        z['y'] = z[cname2]  # after the seed loop, 'y' keeps the last seed's predictions only

    print('cv:', cv, np.mean(cv), np.std(cv))
    return cv, None
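# hyperopt records hp.choice parameters as indices into the option list;
# space_eval maps tr.argmin back to concrete values. A toy illustration
# (standalone, values assumed):
#
#   from hyperopt import hp, space_eval
#   space = dict(max_depth=hp.choice('max_depth', range(2, 9)))
#   space_eval(space, {'max_depth': 0})   # -> {'max_depth': 2}, the 0th option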
def predict():
    saved = state.load('model')
    #saved = None
    if debug_mode:
        saved = None
    if saved is None:
        train, y, test, _ = data.get()
        ftrain, ftest, _ = fea_1.get()
        ftrain2, ftest2, _ = fea_2.get()
        train = pd.concat([train, ftrain, ftrain2], axis=1)
        test = pd.concat([test, ftest, ftest2], axis=1)
        print(train.shape, test.shape)

        z = pd.DataFrame()
        z['id'] = test.id
        z['y'] = 0

        v = pd.DataFrame()
        v['id'] = train.id
        v['y'] = y
        cv, _ = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, _ = saved
    return v, z, cv, _


if '__main__' == __name__:
    print('starting', state.now())
    state.run_predict(predict, debug_mode, public_score)
    print('done.', state.now())
def run(state, train, y, test, v, z):
    #cname = sys._getframe().f_code.co_name
    cname = 'p'
    train.drop('id', axis=1, inplace=True)
    test.drop('id', axis=1, inplace=True)
    from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval

    def step_et(params):
        clf = ensemble.ExtraTreesRegressor(**params)
        cv = model_selection.cross_val_score(clf,
                                             train,
                                             y,
                                             scoring=metrics.make_scorer(
                                                 metrics.log_loss),
                                             cv=10,
                                             n_jobs=-2)
        score = np.mean(cv)
        print(cname, score, params)
        return dict(loss=score, status=STATUS_OK)

    space_et = dict(
        n_estimators=hp.choice('n_estimators', range(50, 1500)),
        #criterion = hp.choice('criterion', ["gini", "entropy"]),
        min_samples_split=hp.choice('min_samples_split', range(2, 10)),
        min_samples_leaf=hp.choice('min_samples_leaf', range(1, 10)),
        max_features=hp.choice('max_features', range(1, 16)),
        random_state=1)
    trs = state.load('et_trials')
    if trs is None or debug_mode:
        tr = Trials()
    else:
        tr, _ = trs
    if len(tr.trials) > 0:
        print('reusing %d trials, best was:' % (len(tr.trials)),
              space_eval(space_et, tr.argmin))
        best = tr.argmin
    while len(tr.trials) < 15:
        best = fmin(step_et,
                    space_et,
                    algo=tpe.suggest,
                    max_evals=len(tr.trials) + 1,
                    trials=tr)
        state.save('et_trials', (tr, space_et))
    et_params = space_eval(space_et, best)
    print(et_params)

    N_splits = 9
    N_seeds = 3

    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    cv = []
    for s in range(N_seeds):
        scores = []
        cname2 = cname + str(s)
        v[cname2], z[cname2] = 0, 0
        et_params['random_state'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train, y)):
            clf = ensemble.ExtraTreesRegressor(**et_params)
            clf.fit(train.iloc[itrain], y[itrain])
            p = clf.predict(train.iloc[ival])
            v.loc[ival, cname2] += p
            score = metrics.log_loss(y[ival], p)
            z[cname2] += clf.predict(test)
            print(
                cname, 'seed %d step %d of %d: ' %
                (et_params['random_state'], n + 1, skf.n_splits), score,
                state.now())
            scores.append(score)
        z[cname2] /= N_splits
        cv.append(np.mean(scores))
        print('seed %d loss: ' % (et_params['random_state']), scores,
              np.mean(scores), np.std(scores))
        z['y'] = z[cname2]

    print('cv:', cv, np.mean(cv), np.std(cv))
    return cv, None
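# Note on the scorer used in step_et: make_scorer(metrics.log_loss) keeps the
# default greater_is_better=True, so cross_val_score returns the raw (positive)
# log-loss values; passing their mean to hyperopt as `loss` therefore minimizes
# log loss directly. With greater_is_better=False the values would come back
# negated and the sign would have to be flipped before minimizing.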
def run(train, y, test, v, z):
    #cname = sys._getframe().f_code.co_name
    from keras import layers
    from keras import models
    from keras import optimizers
    cname = 'p'
    train.drop('id', axis=1, inplace=True)
    test.drop('id', axis=1, inplace=True)
    num_splits = 9
    scaler = preprocessing.RobustScaler()
    train = scaler.fit_transform(train)
    test = scaler.transform(test)
    input_dims = train.shape[1]

    def build_model():
        input_ = layers.Input(shape=(input_dims, ))
        model = layers.Dense(512, kernel_initializer='Orthogonal')(input_)
        #model = layers.BatchNormalization()(model)
        #model = layers.advanced_activations.PReLU()(model)
        model = layers.Activation('selu')(model)
        #model = layers.Dropout(0.7)(model)

        model = layers.Dense(256, kernel_initializer='Orthogonal')(model)
        #model = layers.BatchNormalization()(model)
        model = layers.Activation('selu')(model)
        #model = layers.advanced_activations.PReLU()(model)
        #model = layers.Dropout(0.9)(model)

        model = layers.Dense(256, kernel_initializer='Orthogonal')(model)
        model = layers.BatchNormalization()(model)
        model = layers.Activation('selu')(model)
        #model = layers.advanced_activations.PReLU()(model)
        #model = layers.Dropout(0.9)(model)

        model = layers.Dense(16, kernel_initializer='Orthogonal')(model)
        #model = layers.BatchNormalization()(model)
        model = layers.Activation('selu')(model)
        #model = layers.advanced_activations.PReLU()(model)

        model = layers.Dense(1, activation='sigmoid')(model)

        model = models.Model(input_, model)
        model.compile(loss='binary_crossentropy', optimizer=optimizers.Nadam())
        #print(model.summary(line_length=120))
        return model

    batch_size = 128
    np.random.seed(1234)
    build_model().summary(line_length=120)
    ss = model_selection.ShuffleSplit(n_splits=num_splits,
                                      random_state=11,
                                      test_size=1 / num_splits)
    scores = list()
    model_path = state.temp_name('keras_mlp_weights')
    v[cname] = 0
    z[cname] = 0
    for n, (itrain, ival) in enumerate(ss.split(train, y)):
        xtrain, xval = train[itrain], train[ival]
        ytrain, yval = y[itrain], y[ival]
        model = build_model()
        model.fit(xtrain,
                  ytrain,
                  batch_size=batch_size,
                  epochs=10000,
                  validation_data=(xval, yval),
                  verbose=0,
                  callbacks=build_keras_fit_callbacks(model_path),
                  shuffle=True)
        model.load_weights(model_path)
        p = model.predict(xval)
        v.loc[ival, cname] += p.ravel()
        score = metrics.log_loss(y[ival], p)
        print(cname, 'fold %d: ' % (n + 1), score, state.now())
        scores.append(score)
        z[cname] += model.predict(test).ravel()
        del model
        for i in range(3):
            gc.collect(i)
    print('scores:', scores, np.mean(scores), np.std(scores))
    state.drop_temp(model_path)
    cv = np.mean(scores)
    z[cname] /= num_splits
    z['y'] = z[cname]

    return cv, None
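# Note: ShuffleSplit, unlike the StratifiedKFold used in the tree examples,
# draws each validation set independently at random, so some training rows can
# land in several folds' `ival` and others in none; the accumulation
# v.loc[ival, cname] += p.ravel() is therefore an overlapping out-of-fold
# estimate rather than a clean one-prediction-per-row partition.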
def predict():
    # (the start of this function was truncated; skeleton reconstructed from
    # the parallel predict() definitions in this file — whether extra feature
    # frames were concatenated here is not recoverable)
    saved = state.load('model')
    if debug_mode:
        saved = None
    if saved is None:
        train, y, test, _ = data.get()

        z = pd.DataFrame()
        z['id'] = test.id
        z['y'] = 0

        v = pd.DataFrame()
        v['id'] = train.id
        v['y'] = y
        cv, _ = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, _ = saved
    return v, z, cv, _


if '__main__' == __name__:
    print('starting', state.now())
    v, z, cv, _ = predict()
    if not debug_mode:
        state.save_model(v, z, cv)
    if public_score is None:
        # once there is a public score, it's better not to overwrite what has already been submitted
        state.save_predicts(z)
    else:
        import os
        if os.path.exists('../model_scores.csv'):
            mdf = pd.read_csv('../model_scores.csv')
        else:
            mdf = pd.DataFrame(
                columns=['timestamp', 'model', 'cv', 'cv std', 'public score'])
        idx = mdf.model == state.base_name_
        if np.sum(idx) == 0:
            mdf.loc[len(mdf), 'model'] = state.base_name_
            idx = mdf.model == state.base_name_
def run(state, train, y, test, v, z):
    #cname = sys._getframe().f_code.co_name
    cname = 'p'
    train.drop('id', axis=1, inplace=True)
    test.drop('id', axis=1, inplace=True)
    from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
    def step_rf(params):
        clf = ensemble.RandomForestRegressor(**params)
        cv = model_selection.cross_val_score(clf,
                                             train, y,
                                             scoring=metrics.make_scorer(metrics.log_loss),
                                             cv=10,
                                             n_jobs=-2)
        score = np.mean(cv)
        print(cname, score, params)
        return dict(loss=score, status=STATUS_OK)
    space_rf = dict(
        n_estimators = hp.choice('n_estimators', range(50, 1500)),
        #criterion = hp.choice('criterion', ["gini", "entropy"]),
        min_samples_split = hp.choice('min_samples_split', range(2, 10)),
        min_samples_leaf = hp.choice('min_samples_leaf', range(1, 10)),
        max_features = hp.choice('max_features', range(1, 16)),
        random_state = 1
        )
    trs = state.load('rf_trials')
    if trs is None or debug_mode:
        tr = Trials()
    else:
        tr, _ = trs
    if len(tr.trials) > 0:
        print('reusing %d trials, best was:'%(len(tr.trials)), space_eval(space_rf, tr.argmin))
        best = tr.argmin
    while len(tr.trials) < 15:
        best = fmin(step_rf, space_rf, algo=tpe.suggest, max_evals=len(tr.trials) + 1, trials = tr)
        state.save('rf_trials', (tr, space_rf))  # was 'et_trials', which would clobber the ExtraTrees trials
    rf_params = space_eval(space_rf, best)
    print(rf_params)

    N_splits = 9
    N_seeds = 3

    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    cv = []
    for s in range(N_seeds):
        scores = []
        cname2 = cname + str(s)
        v[cname2], z[cname2] = 0, 0
        rf_params['random_state'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train, y)):
            clf = ensemble.RandomForestRegressor(**rf_params)
            clf.fit(train.iloc[itrain], y[itrain])
            p = clf.predict(train.iloc[ival])
            v.loc[ival, cname2] += p
            score = metrics.log_loss(y[ival], p)
            z[cname2] += clf.predict(test)
            print(cname, 'seed %d step %d of %d: ' % (rf_params['random_state'], n + 1, skf.n_splits), score, state.now())
            scores.append(score)
        z[cname2] /= N_splits
        cv.append(np.mean(scores))
        print('seed %d loss: '%(rf_params['random_state']), scores, np.mean(scores), np.std(scores))
        z['y'] = z[cname2]

    print('cv:', cv, np.mean(cv), np.std(cv))
    return cv, None
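# Both tree examples fit regressors (ExtraTreesRegressor, RandomForestRegressor)
# to a binary target and score with log_loss, treating the regression output as
# a probability. Leaf averages of a 0/1 target stay within [0, 1], and sklearn's
# log_loss clips predictions away from exact 0 and 1 internally, so boundary
# predictions do not produce infinities.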