def main(opt):
    """Entry point: train a language model, score with a 2-model ensemble, or score a single saved model.

    opt: parsed command-line options (mode, gpu id, hyper-parameters, model paths).
    """
    # Restrict TensorFlow/Keras to the requested GPU(s) before any model is built.
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    if opt.mode == "train":
        st = time.time()
        print('Loading data')
        x_train, y_train, x_valid, y_valid, vocabulary_size = load_data(
            "data", opt.debug)
        num_training_data = x_train.shape[0]
        sequence_length = x_train.shape[1]
        print(num_training_data)
        print('Vocab Size', vocabulary_size)
        model = build_model(opt.embedding_dim, opt.hidden_size, opt.drop,
                            sequence_length, vocabulary_size)
        print("Training Model...")  # fixed typo: was "Traning Model..."
        # The History object returned by fit() was never used, so it is no
        # longer bound to a name.
        model.fit(
            x_train, y_train,
            batch_size=opt.batch_size,
            epochs=opt.epochs,
            verbose=1,
            callbacks=[TestCallback((x_valid, y_valid), model=model)])
        model.save(opt.saved_model)
        print("Training cost time: ", time.time() - st)
    elif opt.mode == "ensemble":
        # Load two saved models and rename each model and its layers so the
        # names do not collide when both are used together.
        model1 = load_model(opt.saved_model1)
        model1.name = 'model1'
        for layer in model1.layers:
            layer.name = layer.name + "_1"
        model2 = load_model(opt.saved_model2)
        model2.name = 'model2'
        for layer in model2.layers:
            layer.name = layer.name + "_2"
        models = [model1, model2]
        vocabulary = json.load(open(os.path.join("data", "vocab.json")))
        predict_dict = predict_final_word_models(models, vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        if opt.score:
            scoring(sub_file, os.path.join("data"), type="valid")
    else:
        # Inference / scoring with a single saved model.
        model = load_model(opt.saved_model)
        vocabulary = json.load(open(os.path.join("data", "vocab.json")))
        predict_dict = predict_final_word(model, vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        if opt.score:
            scoring(sub_file, os.path.join("data"), type="valid")
def main(opt):
    """Entry point: train a model, score a single saved model, or score a fixed 10-model ensemble.

    opt: parsed command-line options (mode, gpu id, hyper-parameters, model paths).
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    if opt.mode == "train":
        st = time.time()
        print('Loading data')
        x_train, y_train, x_valid, y_valid, vocabulary_size = load_data(
            "data", opt.debug)
        num_training_data = x_train.shape[0]
        sequence_length = x_train.shape[1]
        print(num_training_data)
        print('Vocab Size', vocabulary_size)
        model = build_model(opt.embedding_dim, opt.hidden_size, opt.drop1,
                            opt.drop2, sequence_length, vocabulary_size)
        print("Training Model...")
        model.fit(x_train, y_train,
                  batch_size=opt.batch_size,
                  epochs=opt.epochs,
                  verbose=2,
                  callbacks=[TestCallback((x_valid, y_valid), model=model)])
        model.save(opt.saved_model)
        print("Training cost time: ", time.time() - st)
    elif opt.mode == "score_valid":  # flattened from `else: if ...` for readability
        # Score a single saved model on the validation split.
        model = load_model(opt.saved_model)
        vocabulary = json.load(open(os.path.join("data", "vocab.json")))
        predict_dict = predict_final_word([model], vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        scoring(sub_file, os.path.join("data"), type="valid")
    else:
        # Score a fixed ensemble of ten saved models; the ten copy-pasted
        # load_model calls were replaced by a loop over the file names.
        model_list = [load_model('models/model{}.h5'.format(i))
                      for i in range(10)]
        vocabulary = json.load(open(os.path.join("data", "vocab.json")))
        predict_dict = predict_final_word(model_list, vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        scoring(sub_file, os.path.join("data"), type="valid")
def main(opt):
    """Entry point: train a language model (optimizer selectable) or score a saved model.

    opt: parsed command-line options (mode, gpu id, hyper-parameters including
    optimizer name, model paths).
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    if opt.mode == "train":
        st = time.time()
        print('Loading data')
        x_train, y_train, x_valid, y_valid, vocabulary_size = load_data(
            "data", opt.debug)
        num_training_data = x_train.shape[0]
        sequence_length = x_train.shape[1]
        print(num_training_data)
        print('Vocab Size', vocabulary_size)
        model = build_model(opt.embedding_dim, opt.hidden_size, opt.drop,
                            sequence_length, vocabulary_size, opt.optimizer)
        print("Training Model...")  # fixed typo: was "Traning Model..."
        # The History object returned by fit() was never used, so it is no
        # longer bound to a name.
        model.fit(
            x_train, y_train,
            batch_size=opt.batch_size,
            epochs=opt.epochs,
            verbose=1,
            callbacks=[TestCallback((x_valid, y_valid), model=model)])
        model.save(opt.saved_model)
        print("Training cost time: ", time.time() - st)
    else:
        # Inference / scoring with a single saved model.
        model = load_model(opt.saved_model)
        vocabulary = json.load(open(os.path.join("data", "vocab.json")))
        predict_dict = predict_final_word(model, vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        if opt.score:
            scoring(sub_file, os.path.join("data"), type="valid")
def main():
    # Train an NN model under a timed, logged pipeline and build a submission.
    # Relies on module-level config/objects: cfg, logger_path, run_name, comment.
    t = Timer()
    seed_everything(cfg.common.seed)
    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log', level=logging.DEBUG)
    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        train_df = pd.read_csv(const.TRAIN_PATH)
        test_df = pd.read_csv(const.TEST_PATH)

    with t.timer('make folds'):
        fold_df = factory.get_fold(cfg.validation, train_df)
        if cfg.validation.single:
            # Single-fold validation: keep only fold_0 and normalise its
            # weights so the maximum weight is 1.
            fold_df = fold_df[['fold_0']]
            fold_df /= fold_df['fold_0'].max()

    with t.timer('drop index'):
        # Optionally remove configured rows from both the data and fold table
        # so their indices stay aligned.
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_df = train_df.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('train model'):
        trainer = NNTrainer(run_name, fold_df, cfg)
        cv = trainer.train(train_df=train_df,
                           target_df=train_df[const.TARGET_COL])
        preds = trainer.predict(test_df)
        trainer.save()
        # Rename the log directory to embed the CV score, then silence logging.
        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')
        logging.disable(logging.FATAL)

    with t.timer('make submission'):
        make_submission(run_name=run_name_cv,
                        y_pred=preds,
                        target_name='Label',
                        comp=False)
        if cfg.common.kaggle.submit:
            kaggle = Kaggle(cfg.compe.name, run_name_cv)
            kaggle.submit(comment)
def main():
    # Run a 5-fold NN cross-validation experiment and write a gzip submission.
    # Relies on module-level objects: now, copy_script, Logger, conf, NN_cv, ...
    experiment_name = now()
    cv_path = Path(f"result/{experiment_name}")
    cv_path.mkdir(parents=True)
    copy_script(cv_path)  # snapshot the script alongside the results
    log = Logger(experiment_name, cv_path / "exp.log")
    log.info("load data")
    with log.interval_timer("load data"):
        train_X = load_fs_tosh('all_snap', conf)
        train_y = feather.read_dataframe("features/HasDetections.ftr")
        train_y = train_y.HasDetections
        test = load_fs_tosh('all_snap', conf, test=True)
    log.info(pformat(list(train_X.columns)))
    # NOTE(review): random_state has no effect on StratifiedKFold unless
    # shuffle=True is also passed — confirm whether shuffling was intended.
    cv = StratifiedKFold(n_splits=5, random_state=conf.seed)
    cv = cv.split(train_X, train_y)
    log.info("learning start")
    log.double_kiritori()
    # Embedding configuration for the NN (pickled dict of column settings).
    with open('features/NN/conf_tosh_all_snap.pkl', 'rb') as p:
        embedd_conf = pickle.load(p)
    log.info(pformat(embedd_conf))
    score, pred, meta = NN_cv(train_X, train_y, cv, log, cv_path,
                              X_test=test, split_conf=embedd_conf)
    log.info(score)
    log.double_kiritori()
    log.info("done")
    # Free the training frames before saving the (potentially large) arrays.
    del train_X, train_y
    np.save(cv_path / "test_preds.npy", pred)   # test-set predictions
    np.save(cv_path / "oof_preds.npy", meta)    # out-of-fold predictions
    make_submission(pred, f"submissions/{experiment_name}.csv.gz")
def main(opt):
    """Entry point: train a model containing custom LayerNormalization layers, or score a saved one.

    opt: parsed command-line options (mode, gpu id, hyper-parameters, model paths).
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    if opt.mode == "train":
        st = time.time()
        print('Loading data')
        x_train, y_train, x_valid, y_valid, vocabulary_size = load_data(
            "data", opt.debug)
        num_training_data = x_train.shape[0]
        sequence_length = x_train.shape[1]
        print(num_training_data)
        print('Vocab Size', vocabulary_size)
        model = build_model(opt.embedding_dim, opt.hidden_size, opt.drop,
                            sequence_length, vocabulary_size)
        print("Training Model...")  # fixed typo: was "Traning Model..."
        # The History object returned by fit() was never used; dead
        # commented-out YAML save/load code was removed.
        model.fit(
            x_train, y_train,
            batch_size=opt.batch_size,
            epochs=opt.epochs,
            verbose=1,
            callbacks=[TestCallback((x_valid, y_valid), model=model)])
        model.save(opt.saved_model)
        print("Training cost time: ", time.time() - st)
    else:
        # The custom LayerNormalization layer must be registered via
        # custom_objects for Keras to deserialize the saved model.
        model = load_model(
            opt.saved_model,
            custom_objects={'LayerNormalization': LayerNormalization})
        vocabulary = json.load(open(os.path.join("data", "vocab.json")))
        predict_dict = predict_final_word(model, vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        if opt.score:
            scoring(sub_file, os.path.join("data"), type="valid")
X_fold = np.hstack((X_fold, X_pred)) all_X.append(X_fold) all_y.append(y_fold) all_w.append(w_fold) X = np.vstack(all_X) y = np.concatenate(all_y) w = np.concatenate(all_w) clf = Classifier(**params) w = rescale(w) w = rebalance(y, w) try: clf.fit(X, y, sample_weight=w) except: clf.fit(X, y) # And make a submussion print "Making submission..." X_test, _, _, _ = load_test() X_pred = load_predictions("stack/*-test.npy") X_test = np.hstack((X_test, X_pred)) make_submission(clf, threshold, "output-stacking.csv", X_test=X_test) import IPython; IPython.embed()
"bootstrap": False, "max_features": 27 } # Train on the whole training set def train(Classifier, params, X, y, w, verbose=1): if verbose > 0: print "[Start]" w = rescale(w) w = rebalance(y, w) clf = Classifier(**params) clf.fit(X, y, sample_weight=w) if verbose > 0: print "[End]" return clf clf = train(Classifier, params, X, y, w) # Make submission threshold = -2.74420523643 make_submission(clf, threshold, "output-rs.csv") import IPython IPython.embed()
def main(args):
    """Train (and evaluate) a GRU-based model for classifying toxic content
    in wikipedia comments.

    Takes preprocessed (cleaned, tokenized, and padded) comments as input and
    outputs the probability of six different types of toxicity being contained
    in the comment. Execution is modified by a number of call arguments,
    described below.

    Parameters
    ----------
    --train (-t) : (Re)train the model. Leave this out if only doing inference
        or only evaluating on the test set.
    --auxilliary_input (-a) : Use auxilliary input to the model for training
        and testing. Auxilliary input consists of class probabilities
        calculated using ridge regression. Requires that said auxilliary input
        is already generated for a given input sentence.
    --combine_data (-c) : Combine training and test data with additional
        figshare comments when fitting the tokenizer to data.
    --submit (-s) : Turn test predictions into a submission for Kaggle.
    --visualise (-v) : Visualise attention activations for a sentence.
    --fasttext (-f) : Use word embeddings trained using fasttext instead of
        pre-trained GloVe embeddings.
    """
    # --- flags from the command line ---
    TRAIN = args.train
    USE_AUXILLIARY_INPUT = args.auxilliary_input
    COMBINE_DATA = args.combine_data
    MAKE_SUBMISSION = args.submit
    VISUALISE_FULL_ATTENTION = args.visualise
    USE_FASTTEXT = args.fasttext

    # --- model / tokenizer hyper-parameters ---
    MAX_NUM_WORDS = None          # no cap on the tokenizer vocabulary
    MAX_LENGTH = 150              # padded sequence length
    EMBEDDING_DIM = 300
    SKIPGRAM = True
    MAX_EPOCHS = 50
    BATCH_SIZE = 512
    VAL_SPLIT = 0.2
    SENTENCE_NUM = 51             # index of the sample sentence to inspect
    TOXICITY_THRESHOLD = 0.6
    AVERAGE_ATTENTION = False

    # --- cyclic learning-rate schedule ---
    BASE_LR = 0.0001
    MAX_LR = 0.005
    STEP_SIZE = 30000
    CLR_MODE = 'triangular'

    # --- output paths stamped with the current time ---
    now = datetime.datetime.now()
    now = now.strftime('%Y%m%d%H%M')
    LOG_PATH = './logs/' + now
    WEIGHT_SAVE_PATH = 'weights_base.best.hdf5'
    SUBMISSION_SAVE_PATH = './submissions/submission_' + now + '.csv'
    ES_PATIENCE = 6
    TB_HIST_FREQ = 0
    TB_WRITE_GRAPH = True

    # Parameter dicts for the callback factory (CLR, checkpoint,
    # early-stopping, TensorBoard).
    clr_params = {
        'base_lr': BASE_LR,
        'max_lr': MAX_LR,
        'step_size': STEP_SIZE,
        'mode': CLR_MODE
    }
    ckpt_params = {
        'filepath': WEIGHT_SAVE_PATH,
        'verbose': 1,
        'save_best_only': True,
        'save_weights_only': True
    }
    es_params = {'patience': ES_PATIENCE}
    tb_params = {
        'log_dir': LOG_PATH,
        'histogram_freq': TB_HIST_FREQ,
        'write_graph': TB_WRITE_GRAPH,
        'batch_size': BATCH_SIZE,
        # embeddings_freq > MAX_EPOCHS effectively disables embedding dumps
        'embeddings_freq': MAX_EPOCHS + 1
    }
    callbacks = get_callbacks(clr_params, ckpt_params, es_params, tb_params)

    CLASS_LIST = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]

    # Tokenize/pad the data; the returned tuple shape depends on whether
    # auxilliary (ridge-regression probability) features are requested.
    txt_prep = TextPreprocessor(max_nb_words=MAX_NUM_WORDS,
                                max_padding_length=MAX_LENGTH,
                                combine_data=COMBINE_DATA,
                                use_auxilliary_features=USE_AUXILLIARY_INPUT)
    if USE_AUXILLIARY_INPUT:
        X_train, X_aux, y_train, X_test, test_aux, word_index, sample_text, sample_target = \
            txt_prep.load_and_tokenize(class_list=CLASS_LIST,
                                       sample_index=SENTENCE_NUM)
    else:
        X_train, y_train, X_test, word_index, sample_text, sample_target = \
            txt_prep.load_and_tokenize(class_list=CLASS_LIST,
                                       sample_index=SENTENCE_NUM)

    tc = ToxicClassifier(embedding_dim=EMBEDDING_DIM,
                         num_timesteps=MAX_LENGTH,
                         word_index=word_index,
                         weight_path=WEIGHT_SAVE_PATH,
                         use_aux_input=USE_AUXILLIARY_INPUT,
                         average_attention=AVERAGE_ATTENTION,
                         use_ft=USE_FASTTEXT,
                         visualize=VISUALISE_FULL_ATTENTION)

    # Register training inputs and the single sample sentence used for the
    # qualitative report below.
    if USE_AUXILLIARY_INPUT:
        tc.set_input_and_labels(X_train, y_train, X_aux)
        tc.set_sample_sentence(sample_text, X_train[SENTENCE_NUM],
                               y_train[SENTENCE_NUM], X_aux[SENTENCE_NUM])
    else:
        tc.set_input_and_labels(X_train, y_train)
        tc.set_sample_sentence(sample_text, X_train[SENTENCE_NUM],
                               y_train[SENTENCE_NUM])

    tc.build_model(word_index=word_index, use_skipgram=SKIPGRAM)
    tc.model.summary()

    if TRAIN:
        tc.train(max_epochs=MAX_EPOCHS,
                 batch_size=BATCH_SIZE,
                 val_split=VAL_SPLIT,
                 callbacks=callbacks)

    # Qualitative check on the chosen sample sentence (runs with or without
    # retraining, since weights are restored from WEIGHT_SAVE_PATH).
    sample_pred = tc.predict_sample_output()
    print('Original sentence: ', sample_text)
    print('Actual label: ', sample_target)
    print('Model prediction :', sample_pred[0, :])
    present_toxicity = get_toxicity_classes(sample_pred[0, :],
                                            TOXICITY_THRESHOLD, CLASS_LIST)
    print_toxicity_report(sample_pred[0, :], TOXICITY_THRESHOLD, CLASS_LIST)

    if VISUALISE_FULL_ATTENTION:
        visualise_attention(tc.attention_history, sample_text)
    else:
        attention = tc.get_attention_output()
        attention /= sum(attention)  # Normalise to percentage
        label = tc.get_sample_labels()
        visualise_attention_with_text(attention, sample_text,
                                      sample_pred[0, :], present_toxicity,
                                      sample_target, label)

    if MAKE_SUBMISSION:
        print('Loading best weights and predicting on test data\n')
        if USE_AUXILLIARY_INPUT:
            make_aux_submission(tc.model, X_test, test_aux, CLASS_LIST,
                                WEIGHT_SAVE_PATH, SUBMISSION_SAVE_PATH,
                                post_process=True)
        else:
            make_submission(tc.model, X_test, CLASS_LIST, WEIGHT_SAVE_PATH,
                            SUBMISSION_SAVE_PATH)
# Batched test-set prediction: build a predict function from saved weights,
# stream test images through it, and write a submission CSV.
shape = (None, 3, cfg.WIDTH, cfg.HEIGHT)
predict_fn = models.get_predict_function(m_param, model_weights, file_fmt, shape);
# Image loader: resize only — no augmentation/crop/noise at test time.
load_and_process = ld.LoadAndProcess(
    size=(cfg.WIDTH, cfg.HEIGHT),
    augmentation_params=None,
    crop=None,
    color_noise=0,
    fill_size=cfg.pretrained);
batch_size = cfg.batch_size;
test_imgs, test_labels = ld.list_imgs_labels(cfg.data_dir, data='test');
# cycle=False: iterate the test set exactly once.
test_data = ld.ImgStream(test_imgs, test_labels, batch_size,
                         cycle=False,
                         file_dir_fmt=cfg.data_dir + '/test/{}',
                         load_and_process=load_and_process,
                         preload=None);
print("num of test cases: {}".format(len(test_data)));
res = [];
c = 0;
for imgs, labels in test_data:
    res.append(predict_fn(imgs));
    c += 1;
    if c % 50 == 0:
        # progress report every 50 batches
        print("{} processed ".format(c * batch_size));
res = np.concatenate(res);
filename = cfg.output_dir + "/submit_{}.csv".format(fname);
print(res[-1])
utils.make_submission(filename, test_imgs, res, 0.5e-3);
convert_type=config['data']['convert_type']) logging.disable(logging.FATAL) if OOF_PARAMS['save_oof']: np.save(f'../logs/{RUN_NAME}/oof.npy', oof) save_oof_plot(RUN_NAME, train_y, oof, type_='reg', dia=True) with t.timer('save features importances'): save_importances(RUN_NAME, models, FEATURES) with t.timer('make submission'): output_path = LOGGER_PATH / f'{METER_TYPE}.csv' make_submission(y_pred=np.mean(preds, axis=1), target_name=TARGET_NAME, sample_path=SAMPLE_SUB_PATH, output_path=str(output_path), comp=True) LOGGER_PATH.rename(f'../logs/{RUN_NAME}_{np.mean(scores):.3f}') process_minutes = t.get_processing_time() with t.timer('notify'): message = f'''{MODEL_NAME}\ncv: {np.mean(scores):.3f}\nscores: {scores}\ntime: {process_minutes:.2f}[min]''' send_line(NOTIFY_PARAMS['line']['token'], message) send_notion(token_v2=NOTIFY_PARAMS['notion']['token_v2'], url=NOTIFY_PARAMS['notion']['url'], name=RUN_NAME,
# Tail of an old-style Keras (pre-1.0 API: Dense(in, out), nb_epoch) MLP:
# last hidden block, softmax output, training, saving, and submission.
model.add(Dropout(0.5))
model.add(Dense(612, 612, init='glorot_uniform'))
model.add(PReLU((612,)))
model.add(BatchNormalization((612,)))
model.add(Dropout(0.5))
model.add(Dense(612, nb_classes, init='glorot_uniform'))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer="adam")
#model.compile(loss='categorical_crossentropy', optimizer="sgd")
print("Training model...")
ne = 17    # epochs
bs = 32    # batch size
vs = 0.15  # validation split
model.fit(X, y, nb_epoch=ne, batch_size=bs, validation_split=vs)
print ("Saving model (will overwrite existing one)")
# NOTE(review): "%d" % vs truncates the 0.15 float to 0 in the filename —
# probably "%g" or "%.2f" was intended; confirm before changing.
filename = "keras-nn-%d-%d-%d" % (ne, bs, vs)
ut.save(model, filename, verbose=True)
print("Generating submission...")
proba = model.predict_proba(X_test)
ut.make_submission(proba, ids, encoder, fname='keras-otto-proba-93.csv')
#print(type(proba))
#print(proba[0:10,])
l=0 i=0 while l<len(set_X_test): if(len(set_X_test[l])>0): set_X_test[l]['CSPL_RECEIVED_CALLS'] = listPred[i] i=i+1 l=l+1 """ #on réassemble les valeurs de prédiction resultPred= pd.concat(set_X_test) resultPred=resultPred.sort_index() incremental_prediction.append(resultPred) print "score global = ",score_global.mean() print("Merging incremental learning...") resultPred_final=pd.concat(incremental_prediction) resultPred_final=resultPred_final.sort_values(by=['DATE', 'cod_ASS_ASSIGNMENT']) print("Make every prediction positif, ceil it ...") resultPred_final['CSPL_RECEIVED_CALLS']=resultPred_final['CSPL_RECEIVED_CALLS'].apply(lambda x: x*(x>0)) #resultPred_final['CSPL_RECEIVED_CALLS']=resultPred_final['CSPL_RECEIVED_CALLS'].apply(lambda x: 2.5*x) resultPred_final['CSPL_RECEIVED_CALLS']=resultPred_final['CSPL_RECEIVED_CALLS'].apply(lambda x: math.ceil(x)) print "Write the submission ..." make_submission(dataTest,resultPred_final) print "End."
# Python 2 script tail: cross-validate a regressor, fit on the full training
# set, predict the test set, and write the submission.
Y_train = np.array(Y_train)
X_train = np.array(X_train)
X_test = np.array(X_test)
#### Creation of regressor
reg = Regressor()
#### Cross validation
print "Cross validation ..."
#loo = cross_validation.LeaveOneOut(len(y_df))
loo = 10  # 10-fold CV instead of leave-one-out
# NOTE(review): with scoring='mean_squared_error', older scikit-learn returns
# negated MSE, so scores.mean() is negative — confirm interpretation.
scores = cross_validation.cross_val_score(reg, X_train, Y_train,
                                          scoring='mean_squared_error',
                                          cv=loo,)
print "The score mean of cross validation : "
print scores.mean()
#### fit
print "Fit ..."
reg.fit(X_train, Y_train)
#### Prediction
print "Prediction ..."
Y_pred = reg.predict(X_test)
#### write the submission
print "Write the submission ..."
make_submission(dataTest, Y_pred)
print "End."
# pred.iloc[idx] = 0 with t.timer('replace with leak'): leak = pd.read_feather(DATA_PATH / 'input/leak.feather') leak['timestamp'] = leak['timestamp'].astype(str) leak.rename(columns={'meter_reading': 'leak_meter_reading'}, inplace=True) test_and_leak = pd.merge(test, leak, on=['building_id', 'meter', 'timestamp'], how='left') leak_idx = test_and_leak['leak_meter_reading'].dropna().index pred.iloc[leak_idx] = test_and_leak.loc[leak_idx, 'leak_meter_reading'] with t.timer('make submission'): output_path = str(DATA_PATH / f'output/sub_{RUN_NAME}_{cv}.csv') make_submission(y_pred=pred, target_name=TARGET_NAME, sample_path=SAMPLE_SUB_PATH, output_path=output_path, comp=True) # LOGGER_PATH.rename(f'../logs/{RUN_NAME}_{np.mean(scores):.3f}') process_minutes = t.get_processing_time() with t.timer('notify'): message = f'''{MODEL_NAME}\ncv: {cv:.3f}\nscores: \ntime: {process_minutes:.2f}[min]''' send_line(NOTIFY_PARAMS['line']['token'], message) send_notion(token_v2=NOTIFY_PARAMS['notion']['token_v2'], url=NOTIFY_PARAMS['notion']['url'],
# Otto-challenge ensembling script: loads train/test data (only the label
# encoder and test ids are actually used below), then blends two existing
# prediction files 40/60 into a final submission.
from __future__ import print_function
import numpy as np
import pandas as pd
import utils as ut
import os
import xgboost as xgb
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.utils import np_utils, generic_utils
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

np.random.seed(1337)  # for reproducibility

## check if raw data exist
print("Loading data...")
X, labels = ut.load_data('data/train.csv', train=True)
data, ids = ut.load_data('data/test.csv', train=False)
print("Preprocessing labels")
# encoder maps class labels to indices; reused when writing the submission
y, encoder = ut.preprocess_labels(labels)
# Blend the saved xgboost and Keras probability files with weights 0.4/0.6.
prediction_files = ["xgb-otto-proba-round-430-eta-0.csv",
                    "keras-otto-proba-93.csv"]
ensemble = ut.ensemble(prediction_files, weights=[0.4, 0.6])
ut.make_submission(ensemble, ids, encoder,
                   fname='ensemble-otto-selected-93.csv')
logging.disable(logging.FATAL) if 'nn' in MODEL_NAME: save_learning_curve(RUN_NAME, models) if SETTINGS_PARAMS['oof']['save']: np.save(f'../logs/{RUN_NAME}/oof.npy', oof) save_oof_plot(RUN_NAME, train_y, oof, type_='reg', dia=True) with t.timer('save features importances'): save_importances(RUN_NAME, models, FEATURES) with t.timer('make submission'): output_path = f'../data/output/{RUN_NAME}_{np.mean(scores):.3f}.csv' make_submission(y_pred=np.mean(preds, axis=1), target_name=COMPE_PARAMS['target_name'], sample_path=PATH_PARAMS['sample'], output_path=str(output_path), comp=False) LOGGER_PATH.rename(f'../logs/{RUN_NAME}_{np.mean(scores):.3f}') process_minutes = t.get_processing_time() with t.timer('notify'): message = f'''{MODEL_NAME}\ncv: {np.mean(scores):.3f}\nscores: {scores}\ntime: {process_minutes:.2f}[min]''' send_line(NOTIFY_PARAMS['line']['token'], message) send_notion(token_v2=NOTIFY_PARAMS['notion']['token_v2'], url=NOTIFY_PARAMS['notion']['url'], name=RUN_NAME, created=NOW, model=MODEL_NAME.split('_')[0],
X_pred = load_predictions("stack/*-fold%d.npy" % i) X_fold = np.hstack((X_fold, X_pred)) all_X.append(X_fold) all_y.append(y_fold) all_w.append(w_fold) X = np.vstack(all_X) y = np.concatenate(all_y) w = np.concatenate(all_w) clf = Classifier(**params) w = rescale(w) w = rebalance(y, w) try: clf.fit(X, y, sample_weight=w) except: clf.fit(X, y) # And make a submussion print "Making submission..." X_test, _, _, _ = load_test() X_pred = load_predictions("stack/*-test.npy") X_test = np.hstack((X_test, X_pred)) make_submission(clf, threshold, "output-stacking.csv", X_test=X_test) import IPython IPython.embed()
model.summary() # %% X, y = get_data(as_gray=False) batch_size = 128 ra = ROCAUC(batch_size) es = EarlyStopping(monitor='val_auc', patience=2, mode='max') mc = ModelCheckpoint(f'data/models/model.h5', monitor='val_auc', save_best_only=True, mode='max', verbose=1) model.fit(X, y, batch_size=batch_size, epochs=50, validation_split=.2, callbacks=[ra, es, mc]) # %% model.fit(X, y, batch_size=batch_size, epochs=5) # %% X_test, test_ids = get_data(test=True, as_gray=False) test_predictions = model.predict(X_test, batch_size=batch_size) test_predictions = test_predictions.flatten() make_submission(test_ids, test_predictions, 'submissions/first_transfer_cnn.csv')
def on_epoch_end(self, epoch, logs=None):
    """Keras callback hook: score the current model on the validation file
    after every epoch.

    epoch: current epoch index (unused here, required by the callback API).
    logs: metrics dict supplied by Keras (unused). The mutable default
        argument ``logs={}`` was replaced with ``None`` — a shared dict
        default is a classic Python pitfall; the value is never read here,
        so behavior is unchanged.
    """
    # Unpacked only to mirror the original; x and y are not used below.
    x, y = self.test_data
    predict_dict = predict_final_word(self.model, self.vocabulary,
                                      self.filename)
    # NOTE(review): relies on the module-level `opt` options object — confirm
    # it is always initialised before training starts.
    sub_file = make_submission(predict_dict, opt.student_id, opt.input)
    scoring(sub_file, os.path.join("data"), type="valid")
def main(opt):
    """Entry point: train a model with checkpoint/early-stopping, build an
    ensemble from all saved models in models/ensemble/, or score a saved model.

    opt: parsed command-line options (mode, gpu id, seed, model/data
    hyper-parameters, paths).
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    np.random.seed(opt.seed)  # set a seed for reproducibility (typo fixed)
    if opt.mode == "train":
        st = time.time()
        print('Loading data')
        x_train, y_train, x_valid, y_valid, vocabulary_size = load_data(
            "data", opt.debug)
        num_training_data = x_train.shape[0]
        sequence_length = x_train.shape[1]
        print(num_training_data)
        print('Vocab Size', vocabulary_size)
        model = build_model(opt.model, opt.embedding_dim, opt.hidden_size,
                            opt.drop, opt.filter, sequence_length,
                            vocabulary_size)
        adam = Adam()
        model.compile(loss='sparse_categorical_crossentropy', optimizer=adam)
        print("Training Model...")  # fixed typo: was "Traning Model..."
        # Keep only the best (lowest val_loss) weights and stop early after
        # 5 stagnant epochs; epochs=100 is an upper bound.
        checkpoint = ModelCheckpoint(opt.saved_model,
                                     monitor='val_loss',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='min')
        early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
        model.fit(x_train, y_train,
                  batch_size=opt.batch_size,
                  epochs=100,
                  verbose=1,
                  validation_data=(x_valid, y_valid),
                  callbacks=[
                      TestCallback((x_valid, y_valid), model=model),
                      checkpoint, early
                  ])
        model.save(opt.saved_model)
        print("Training cost time: ", time.time() - st)
    elif opt.mode == "ensemble":
        x_train, y_train, x_valid, y_valid, vocabulary_size = load_data(
            "data", opt.debug)
        num_training_data = x_train.shape[0]
        sequence_length = x_train.shape[1]
        print(num_training_data)
        print('Vocab Size', vocabulary_size)
        ENSEMBLE_DIR = "models/ensemble/"
        # Collect the file names in the top level of ENSEMBLE_DIR only.
        model_files = []
        for (dirpath, dirnames, filenames) in os.walk(ENSEMBLE_DIR):
            model_files.extend(filenames)
            break
        # enumerate replaces the manual model_count counter.
        models = []
        for model_count, filename in enumerate(model_files):
            model = load_model(ENSEMBLE_DIR + filename)
            model.name = "model" + str(model_count)
            models.append(model)
        build_save_ensemble_model(opt.saved_model, models, sequence_length)
    else:
        model = load_model(opt.saved_model)
        vocabulary = json.load(open(os.path.join("data", "vocab.json")))
        predict_dict = predict_final_word(model, vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        if opt.score:
            scoring(sub_file, os.path.join("data"), type="valid")
# Dispatch on the METHOD string: evaluate and/or fit a classifier on the
# learning set (X_LS, Y_LS) and write a submission from test fingerprints.
X_TS = utils.create_fingerprints(TS["SMILES"].values)
if METHOD == "DT":
    depths, scores = doDecisionTree(X_LS, Y_LS)
    print(scores)
elif METHOD == "KNN":
    depths, scores = doKNN(X_LS, Y_LS)
    print(scores)
    classifier_knn = KNeighborsClassifier(n_neighbors=50)
    classifier_knn.fit(X_LS, Y_LS)
    # predict_proba[:, 1]: probability of the positive class
    pred = classifier_knn.predict_proba(X_TS)
    auc_predicted = 0.7  # self-declared AUC estimate for the submission
    fname = utils.make_submission(pred[:, 1], auc_predicted, 'knn_50')
    print('Submission file "{}" successfully written'.format(fname))
elif METHOD == "RF":
    #ts, depths, scores = doRandomForest(X_LS, Y_LS)
    #print(scores)
    classifier_rf = RandomForestClassifier(n_estimators=800, max_depth=700)
    classifier_rf.fit(X_LS, Y_LS)
    pred = classifier_rf.predict_proba(X_TS)
    auc_predicted = 0.78  # self-declared AUC estimate for the submission
    fname = utils.make_submission(pred[:, 1], auc_predicted, 'final')
    print('Submission file "{}" successfully written'.format(fname))
elif METHOD == "MLP":
    layers, neurones, scores = doMLP(X_LS, Y_LS)
def main():
    # Full train/predict pipeline over the concatenated (original + 2019)
    # training data, with per-source fold assignment, feature loading from
    # feather files, duplicate-image post-processing, submission and
    # notifications. Relies on module-level objects: cfg, logger_path,
    # run_name, comment, now, model_name, notify_params.
    t = Timer()
    seed_everything(cfg.common.seed)
    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log',
                        level=logging.DEBUG)
    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        train_x = dh.load('../data/input/train_concated.csv')
        train_org_x = dh.load('../data/input/train.csv')
        train_2019_x = dh.load('../data/input/train_2019.csv')
        test_x = dh.load('../data/input/test.csv')

    with t.timer('make folds'):
        # Folds are assigned separately for each data source, then stacked in
        # the same order as train_concated.
        fold_org_df = factory.get_fold(cfg.validation.val1, train_org_x,
                                       train_org_x[[cfg.common.target]])
        fold2019_df = factory.get_fold(cfg.validation.val2, train_2019_x,
                                       train_2019_x[[cfg.common.target]])
        fold_df = pd.concat([fold_org_df, fold2019_df],
                            axis=0, sort=False, ignore_index=True)
        if cfg.validation.val1.single:
            fold_df = fold_df[['fold_0']]
            fold_df /= fold_df['fold_0'].max()

    with t.timer('load features'):
        # Each feature lives in its own feather file; NaNs become -1.
        features = dh.load('../configs/feature/all.yml')['features']
        for f in features:
            train_x[f] = dh.load(f'../features/{f}_train.feather')[f].fillna(-1)
            test_x[f] = dh.load(f'../features/{f}_test.feather')[f].fillna(-1)

    with t.timer('drop several rows'):
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_x = train_x.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('train model'):
        result = train_model(run_name, train_x, fold_df, cfg)

    # Embed the CV score in the log-dir name, then silence further logging.
    logging.disable(logging.FATAL)
    run_name_cv = f'{run_name}_{result["cv"]:.3f}'
    logger_path.rename(f'../logs/{run_name_cv}')

    with t.timer('predict'):
        preds = predict_test(run_name_cv, test_x, fold_df, cfg)

    with t.timer('post process'):
        # Known duplicate images whose labels are certain: overwrite the
        # predictions with the known targets.
        duplicates = {
            'ISIC_5224960': 1,
            'ISIC_9207777': 1,
            'ISIC_6457527': 1,
            'ISIC_8347588': 0,
            'ISIC_8372206': 1,
            'ISIC_9353360': 1,
            'ISIC_3689290': 0,
            'ISIC_3584949': 0,
        }
        for image_name, target in duplicates.items():
            idx = test_x[test_x['image_name'] == image_name].index[0]
            preds[idx] = target

    with t.timer('make submission'):
        sample_path = f'../data/input/sample_submission.csv'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=preds,
                        target_name=cfg.common.target,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)

    with t.timer('kaggle api'):
        kaggle = Kaggle(cfg.compe.compe_name, run_name_cv)
        if cfg.common.kaggle.submit:
            kaggle.submit(comment)

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{model_name}\ncv: {result["cv"]:.3f}\ntime: {process_minutes:.2f}[h]'''
        send_line(notify_params.line.token, message)
        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': cfg.model.name,
            'local_cv': round(result['cv'], 4),
            'time': process_minutes,
            'comment': comment
        })
def main():
    # Train/predict pipeline with optional OOF feature stacking and optional
    # adversarial validation. Relies on module-level objects: cfg, features,
    # features_params, logger_path, run_name, comment, now, options,
    # notify_params.
    t = Timer()
    seed_everything(cfg.common.seed)
    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log',
                        level=logging.DEBUG)
    dh.save(logger_path / 'config.yml', cfg)
    dh.save(logger_path / 'features.yml', features_params)

    with t.timer('load data'):
        train_df = dh.load('../data/input/train.csv')
        train2019_df = dh.load('../data/input/train_concated.csv')
        train_x = factory.get_features(features, cfg.data.loader.train)
        test_x = factory.get_features(features, cfg.data.loader.test)
        train_y = factory.get_target(cfg.data.target)

    with t.timer('add oof'):
        # Optionally stack a previous model's out-of-fold predictions as an
        # extra feature column.
        if cfg.data.features.oof.name is not None:
            oof, preds = factory.get_oof(cfg.data)
            train_x['oof'] = oof
            test_x['oof'] = preds
            features.append('oof')

    with t.timer('make folds'):
        fold_df = factory.get_fold(cfg.validation, train_df,
                                   train_df[['target']])
        # Pad with zero-weight rows for the appended 2019 data so fold_df
        # stays aligned with the concatenated training frame.
        fold_df = pd.concat([
            fold_df,
            pd.DataFrame(np.zeros((len(train2019_df), len(fold_df.columns))),
                         columns=fold_df.columns)
        ], axis=0, sort=False, ignore_index=True)
        if cfg.validation.single:
            fold_df = fold_df[['fold_0']]
            fold_df /= fold_df['fold_0'].max()

    with t.timer('drop index'):
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_x = train_x.drop(drop_idx, axis=0).reset_index(drop=True)
            train_y = train_y.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('prepare for ad'):
        # Adversarial validation: relabel train/test membership as the target.
        if cfg.data.adversarial_validation:
            train_x, train_y = factory.get_ad(cfg, train_x, test_x)

    with t.timer('train and predict'):
        trainer = Trainer(cfg)
        cv = trainer.train(train_df=train_x,
                           target_df=train_y,
                           fold_df=fold_df)
        preds = trainer.predict(test_x)
        trainer.save(run_name)
        # Embed CV score in the log-dir name, then silence logging.
        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')
        logging.disable(logging.FATAL)

    with t.timer('make submission'):
        sample_path = f'../data/input/sample_submission.csv'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=preds,
                        target_name=cfg.data.target.name,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)
        if cfg.common.kaggle.submit:
            kaggle = Kaggle(cfg.compe.name, run_name_cv)
            kaggle.submit(comment)

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{cfg.model.name}\ncv: {cv:.3f}\ntime: {process_minutes}[min]'''
        send_line(notify_params.line.token, message)
        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': options.model,
            'local_cv': round(cv, 4),
            'time': process_minutes,
            'comment': comment
        })
def main():
    """Blend the out-of-fold predictions of several trained base models.

    Loads each base model's OOF/test predictions, searches for convex blend
    weights with Optuna, writes the blended predictions and a submission,
    and sends notifications.

    NOTE(review): relies on module-level state (cfg, logger_path, run_name,
    comment, now, options, notify_params) prepared outside this function.
    """
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        train_df = dh.load('../data/input/train_data.csv')
        test_df = dh.load('../data/input/test_data.csv')

        # One column per base model: OOF predictions on train, raw preds on test.
        oof = np.zeros((len(train_df), len(cfg.models)))
        preds = np.zeros((len(test_df), len(cfg.models)))
        for i, m in enumerate(cfg.models):
            name = getattr(cfg.models, m).name
            log_dir = Path(f'../logs/{name}')

            model_oof = dh.load(log_dir / 'oof.npy')
            model_cfg = dh.load(log_dir / 'config.yml')

            if model_cfg.common.drop:
                drop_idxs = np.array([])
                for drop_name in model_cfg.common.drop:
                    drop_idx = dh.load(f'../pickle/{drop_name}.npy')
                    drop_idxs = np.append(drop_idxs, drop_idx)
                    # NOTE(review): fill_dropped is called once per drop file
                    # with that file's indices, and the accumulated drop_idxs
                    # is never used.  If the indices are relative to the full
                    # frame, a single call with the accumulated array may have
                    # been intended — confirm fill_dropped's contract.
                    model_oof = factory.fill_dropped(model_oof, drop_idx)

            model_preds = dh.load(f'../logs/{name}/raw_preds.npy')

            oof[:, i] = model_oof[:len(train_df)]
            preds[:, i] = model_preds

    with t.timer('drop index'):
        if cfg.common.drop is not None:
            drop_idxs = np.array([])
            # BUG FIX: this loop previously iterated model_cfg.common.drop —
            # the config of whichever base model happened to be loaded last —
            # instead of this run's own cfg.common.drop (the value the guard
            # above actually checks).
            for drop_name in cfg.common.drop:
                drop_idx = dh.load(f'../pickle/{drop_name}.npy')
                drop_idxs = np.append(drop_idxs, drop_idx)
            train_df = train_df.drop(drop_idxs, axis=0).reset_index(drop=True)

    with t.timer('optimize model weight'):
        metric = factory.get_metrics(cfg.common.metrics.name)
        y_true = train_df[cfg.common.target]

        def objective(trial):
            """Score one candidate convex combination of the model columns."""
            # Sample n-1 weights, each bounded so the running sum stays <= 1;
            # the last weight is whatever remains, so weights sum to 1.
            p_list = [0] * len(cfg.models)
            for i in range(len(cfg.models) - 1):
                p_list[i] = trial.suggest_discrete_uniform(
                    f'p{i}', 0.0, 1.0 - sum(p_list), 0.01)
            p_list[-1] = round(1 - sum(p_list[:-1]), 2)

            y_pred = np.zeros(len(train_df))
            for i in range(oof.shape[1]):
                y_pred += oof[:, i] * p_list[i]
            return metric(y_true, y_pred)

        study = optuna.create_study(direction='minimize')
        study.optimize(objective, timeout=10)
        # Recover the implied last weight the same way objective() does.
        best_params = list(study.best_params.values())
        best_weight = best_params + [round(1 - sum(best_params), 2)]

    with t.timer('ensemble'):
        ensemble_oof = np.zeros(len(train_df))
        ensemble_preds = np.zeros(len(test_df))
        for i in range(len(best_weight)):
            ensemble_oof += oof[:, i] * best_weight[i]
            ensemble_preds += preds[:, i] * best_weight[i]

        dh.save(f'../logs/{run_name}/oof.npy', ensemble_oof)
        dh.save(f'../logs/{run_name}/raw_preds.npy', ensemble_preds)

    # Embed the blended CV score in the run's log directory name.
    cv = metric(y_true, ensemble_oof)
    run_name_cv = f'{run_name}_{cv:.3f}'
    logger_path.rename(f'../logs/{run_name_cv}')

    print('\n\n===================================\n')
    print(f'CV: {cv:.4f}')
    print(f'BEST WEIGHT: {best_weight}')
    print('\n===================================\n\n')

    with t.timer('make submission'):
        sample_path = '../data/input/sample_submission.feather'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=ensemble_preds,
                        target_name=cfg.common.target,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)
        if cfg.common.kaggle.submit:
            kaggle = Kaggle(cfg.compe.compe_name, run_name_cv)
            kaggle.submit(comment)

    with t.timer('notify'):
        # Push the result summary to LINE and log the run in a Notion table.
        process_minutes = t.get_processing_time()
        message = f'''{options.model}\ncv: {cv:.3f}\ntime: {process_minutes}[min]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': options.model,
            'local_cv': round(cv, 4),
            'time': process_minutes,
            'comment': comment
        })
# Inspect how correlated the two base learners' predictions are before blending.
np.corrcoef(X_train_level_2.T)
sns.jointplot(X_train_level_2[:, 0], X_train_level_2[:, 1])
plt.show()

# Search a fine grid for the convex blend weight alpha that minimises RMSE
# on the level-2 training predictions.
alphas_to_try = np.linspace(0, 1, 1001)
alpha_best, rmse_best = None, np.Inf
for alpha in alphas_to_try:
    blended = alpha * X_train_level_2[:, 0] + (1 - alpha) * X_train_level_2[:, 1]
    rmse = np.sqrt(mean_squared_error(Y_train_level_2, blended))
    if rmse < rmse_best:
        alpha_best, rmse_best = alpha, rmse
score = round(rmse_best, 6)

# Apply the best weight to the level-2 test predictions and build a submission.
pred_test = alpha_best * X_test_level_2[:, 0] + (1 - alpha_best) * X_test_level_2[:, 1]
ids = np.array(df.loc[df['date_block_num'] == 34, 'ID'])
submission = make_submission(ids, np.array(pred_test).flatten())

# Export into a timestamped folder whose name embeds the validation score.
today = datetime.datetime.now()
sub_id = today.strftime('%y%m%d') + '_' + today.strftime("%H%M") + \
    '_score_' + str(score)
folder = OUT_FOLDER + '/' + sub_id
os.mkdir(folder)
print('\n---- ' + sub_id + ' ----')
submission.to_csv(os.path.join(folder, 'submission.csv'), index=False)
# 5 fold cross validation skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) prediction_scores = np.empty(y.shape[0], dtype='object') for train_idx, val_idx in tqdm(skf.split(X, y)): X_train, X_val = X[train_idx], X[val_idx] y_train = y[train_idx] clf = clf.fit(X_train, y_train) y_pred = clf.predict_proba(X_val)[:, 1] # Save the predictions for this fold prediction_scores[val_idx] = y_pred plt.title('SVM 5-fold cross validation ROC AUC') plot_roc(y, prediction_scores) plt.savefig('report/figures/svm_roc.png', dpi=300) plot_prediction_samples(imgs, y, prediction_scores, 'SVM Prediction Samples') plt.savefig('report/figures/svm_confmat.png', dpi=300) # %% # load and preprocess test data then create submission X_test, test_ids = get_data(test=True) X_test = np.stack([get_HOG(img, **hog_params) for img in X_test]) clf = clf.fit(X, y) test_predictions = clf.predict_proba(X_test)[:, 1] make_submission(test_ids, test_predictions, fname='submissions/svc_10_hog_16_4_fulltrain.csv')
# Load the test features and training labels.
# NOTE(review): X_train, is_trainable, NUM_EPOCHS, BATCH_SIZE, NUM_OUTLIERS and
# gen_Y_pred are assumed to be defined earlier in the file — confirm.
X_test = read_pickle('../audio_data/X_test4d.pkl')
Y_train = read_pickle('../audio_data/Y_train1d.pkl')
print("The shape of X_train/X_test/Y_train: ", X_train.shape, X_test.shape, Y_train.shape)

# Instantiate the model
bigan = BIGAN(X_train.shape[1], X_train.shape[2], X_train.shape[3])

if is_trainable:
    # Training the BiGAN
    bigan.train_by_batch(X_train, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE)
    #bilstm.train_all(X_train_, Y_train_, BATCH_SIZE, NUM_EPOCHS)
else:
    # Restore the checkpoint
    checkpoint_dir = './runs/checkpoint_bigan'
    # NOTE(review): Checkpoint() is constructed with no tracked objects, so
    # restore() has nothing to map the saved weights onto, and
    # expect_partial() suppresses the resulting warnings — verify the BiGAN
    # weights are actually restored here.
    checkpoint = tf.train.Checkpoint()
    checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()
    print("Checkpoint restored for Anomaly Detection!")

# Anomaly Detection
AS = bigan.compute_anomaly_score(X_train, Y_train, X_test)

# Prediction
# Threshold is the expected outlier fraction of the test set.
ts = NUM_OUTLIERS/len(X_test)  # Find out the best threshold
Y_pred_AS = bigan.predict_outlier(AS, ts)
#print("Y_pred_AS: ", Counter(Y_pred_AS))

# Generate final Y_pred and make submission
# Merge the anomaly flags into a previously saved prediction array.
Y_pred = np.load('Y_pred.npy')
Y_pred_new = gen_Y_pred(Y_pred, Y_pred_AS)
print("Y_pred_new.shape: ", Y_pred_new.shape)
make_submission(Y_pred_new, "submission")
# Per-fold CNN training; each fold's model is kept for test-time averaging.
# NOTE(review): skf, X, y, prediction_scores, imagen, batch_per_epoch,
# BATCH_SIZE, EPOCHS and get_model are assumed to be defined earlier in the
# file — confirm.
models = []
for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    model = get_model()
    # One-hot encode the fold's training labels (presumably for a 2-unit
    # softmax output — confirm against get_model()).
    y_train = to_categorical(y_train)
    model.fit_generator(imagen.flow(X_train, y_train, batch_size=BATCH_SIZE),
                        steps_per_epoch=batch_per_epoch,
                        epochs=EPOCHS,
                        verbose=0)
    # Column 1 of the prediction — presumably the positive class; confirm.
    prediction_scores[val_idx] = model.predict(X_val, batch_size=BATCH_SIZE)[:, 1]
    cur_auc = roc_auc_score(y_val, prediction_scores[val_idx])
    print(cur_auc)
    # NOTE(review): aborting on a weak fold leaves the remaining entries of
    # prediction_scores unfilled (or stale from a previous run), which makes
    # the overall AUC printed below unreliable — confirm this early exit is
    # intentional.
    if cur_auc < 0.8:
        break
    models.append(model)
print(roc_auc_score(y, prediction_scores))

# %%
X_test, test_ids = get_data(test=True, as_gray=False)
X_test = X_test / 255.  # rescale pixel values into [0, 1]
# Average the fold models' predictions over the test set.
test_predictions = np.mean(
    [m.predict(X_test, batch_size=BATCH_SIZE)[:, 1] for m in models], axis=0)
make_submission(test_ids, test_predictions, 'submissions/homebrew_cnn_CV.csv')

# %%
# Persist each fold model.  NOTE(review): a list comprehension used purely
# for its side effect — a plain for-loop would be clearer.
[m.save(f'data/models/model_fold_{i}.h5') for i, m in enumerate(models)]
# Encode the learning-set SMILES strings as fingerprint features.
X_LS = fingerprints.transform(LS['SMILES'].values, FINGERPRINT)
y_LS = LS['ACTIVE'].values

# Feature selection: remove zero-variance (constant) columns.
selector = VarianceThreshold()
X_LS = selector.fit_transform(X_LS)

# Estimate generalisation AUC over 5 shuffled 75/25 splits.
cv = ShuffleSplit(n_splits=5, test_size=0.25, random_state=0)
scores = cross_val_score(MODEL, X_LS, y_LS, cv=cv, scoring='roc_auc')
AUC = scores.mean()

# Refit on the entire learning set before scoring the test set.
MODEL.fit(X_LS, y_LS)

# Run the test set through the identical fingerprint + selection pipeline.
X_TS = fingerprints.transform(TS['SMILES'].values, FINGERPRINT)
X_TS = selector.transform(X_TS)

# Probability of the last (positive) class.
prob = MODEL.predict_proba(X_TS)[:, -1]

# Write the submission file, embedding the estimated AUC.
os.makedirs(DESTINATION, exist_ok=True)
fname = utils.make_submission(prob, AUC, DESTINATION + 'submission')
print('Submission file "{}" successfully written'.format(fname))