def main(opt):
    """Entry point: train a language model, score with a 2-model ensemble, or score a single saved model.

    opt: parsed command-line options (mode, gpu id, hyper-parameters, model paths).
    """
    # Restrict TensorFlow/Keras to the requested GPU(s) before any model is built.
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    if opt.mode == "train":
        st = time.time()
        print('Loading data')
        x_train, y_train, x_valid, y_valid, vocabulary_size = load_data(
            "data", opt.debug)
        num_training_data = x_train.shape[0]
        sequence_length = x_train.shape[1]
        print(num_training_data)
        print('Vocab Size', vocabulary_size)
        model = build_model(opt.embedding_dim, opt.hidden_size, opt.drop,
                            sequence_length, vocabulary_size)
        print("Training Model...")  # fixed typo: was "Traning Model..."
        # The History object returned by fit() was never used, so it is no
        # longer bound to a name.
        model.fit(
            x_train, y_train,
            batch_size=opt.batch_size,
            epochs=opt.epochs,
            verbose=1,
            callbacks=[TestCallback((x_valid, y_valid), model=model)])
        model.save(opt.saved_model)
        print("Training cost time: ", time.time() - st)
    elif opt.mode == "ensemble":
        # Load two saved models and rename each model and its layers so the
        # names do not collide when both are used together.
        model1 = load_model(opt.saved_model1)
        model1.name = 'model1'
        for layer in model1.layers:
            layer.name = layer.name + "_1"
        model2 = load_model(opt.saved_model2)
        model2.name = 'model2'
        for layer in model2.layers:
            layer.name = layer.name + "_2"
        models = [model1, model2]
        vocabulary = json.load(open(os.path.join("data", "vocab.json")))
        predict_dict = predict_final_word_models(models, vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        if opt.score:
            scoring(sub_file, os.path.join("data"), type="valid")
    else:
        # Inference / scoring with a single saved model.
        model = load_model(opt.saved_model)
        vocabulary = json.load(open(os.path.join("data", "vocab.json")))
        predict_dict = predict_final_word(model, vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        if opt.score:
            scoring(sub_file, os.path.join("data"), type="valid")
def main(opt):
    """Entry point: train a model, score a single saved model, or score a fixed 10-model ensemble.

    opt: parsed command-line options (mode, gpu id, hyper-parameters, model paths).
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    if opt.mode == "train":
        st = time.time()
        print('Loading data')
        x_train, y_train, x_valid, y_valid, vocabulary_size = load_data(
            "data", opt.debug)
        num_training_data = x_train.shape[0]
        sequence_length = x_train.shape[1]
        print(num_training_data)
        print('Vocab Size', vocabulary_size)
        model = build_model(opt.embedding_dim, opt.hidden_size, opt.drop1,
                            opt.drop2, sequence_length, vocabulary_size)
        print("Training Model...")
        model.fit(x_train, y_train,
                  batch_size=opt.batch_size,
                  epochs=opt.epochs,
                  verbose=2,
                  callbacks=[TestCallback((x_valid, y_valid), model=model)])
        model.save(opt.saved_model)
        print("Training cost time: ", time.time() - st)
    elif opt.mode == "score_valid":  # flattened from `else: if ...` for readability
        # Score a single saved model on the validation split.
        model = load_model(opt.saved_model)
        vocabulary = json.load(open(os.path.join("data", "vocab.json")))
        predict_dict = predict_final_word([model], vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        scoring(sub_file, os.path.join("data"), type="valid")
    else:
        # Score a fixed ensemble of ten saved models; the ten copy-pasted
        # load_model calls were replaced by a loop over the file names.
        model_list = [load_model('models/model{}.h5'.format(i))
                      for i in range(10)]
        vocabulary = json.load(open(os.path.join("data", "vocab.json")))
        predict_dict = predict_final_word(model_list, vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        scoring(sub_file, os.path.join("data"), type="valid")
def main(opt):
    """Entry point: train a language model (optimizer selectable) or score a saved model.

    opt: parsed command-line options (mode, gpu id, hyper-parameters including
    optimizer name, model paths).
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    if opt.mode == "train":
        st = time.time()
        print('Loading data')
        x_train, y_train, x_valid, y_valid, vocabulary_size = load_data(
            "data", opt.debug)
        num_training_data = x_train.shape[0]
        sequence_length = x_train.shape[1]
        print(num_training_data)
        print('Vocab Size', vocabulary_size)
        model = build_model(opt.embedding_dim, opt.hidden_size, opt.drop,
                            sequence_length, vocabulary_size, opt.optimizer)
        print("Training Model...")  # fixed typo: was "Traning Model..."
        # The History object returned by fit() was never used, so it is no
        # longer bound to a name.
        model.fit(
            x_train, y_train,
            batch_size=opt.batch_size,
            epochs=opt.epochs,
            verbose=1,
            callbacks=[TestCallback((x_valid, y_valid), model=model)])
        model.save(opt.saved_model)
        print("Training cost time: ", time.time() - st)
    else:
        # Inference / scoring with a single saved model.
        model = load_model(opt.saved_model)
        vocabulary = json.load(open(os.path.join("data", "vocab.json")))
        predict_dict = predict_final_word(model, vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        if opt.score:
            scoring(sub_file, os.path.join("data"), type="valid")
def main():
    # Train an NN model under a timed, logged pipeline and build a submission.
    # Relies on module-level config/objects: cfg, logger_path, run_name, comment.
    t = Timer()
    seed_everything(cfg.common.seed)
    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log', level=logging.DEBUG)
    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        train_df = pd.read_csv(const.TRAIN_PATH)
        test_df = pd.read_csv(const.TEST_PATH)

    with t.timer('make folds'):
        fold_df = factory.get_fold(cfg.validation, train_df)
        if cfg.validation.single:
            # Single-fold validation: keep only fold_0 and normalise its
            # weights so the maximum weight is 1.
            fold_df = fold_df[['fold_0']]
            fold_df /= fold_df['fold_0'].max()

    with t.timer('drop index'):
        # Optionally remove configured rows from both the data and fold table
        # so their indices stay aligned.
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_df = train_df.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('train model'):
        trainer = NNTrainer(run_name, fold_df, cfg)
        cv = trainer.train(train_df=train_df,
                           target_df=train_df[const.TARGET_COL])
        preds = trainer.predict(test_df)
        trainer.save()
        # Rename the log directory to embed the CV score, then silence logging.
        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')
        logging.disable(logging.FATAL)

    with t.timer('make submission'):
        make_submission(run_name=run_name_cv,
                        y_pred=preds,
                        target_name='Label',
                        comp=False)
        if cfg.common.kaggle.submit:
            kaggle = Kaggle(cfg.compe.name, run_name_cv)
            kaggle.submit(comment)
def main():
    # Run a 5-fold NN cross-validation experiment and write a gzip submission.
    # Relies on module-level objects: now, copy_script, Logger, conf, NN_cv, ...
    experiment_name = now()
    cv_path = Path(f"result/{experiment_name}")
    cv_path.mkdir(parents=True)
    copy_script(cv_path)  # snapshot the script alongside the results
    log = Logger(experiment_name, cv_path / "exp.log")
    log.info("load data")
    with log.interval_timer("load data"):
        train_X = load_fs_tosh('all_snap', conf)
        train_y = feather.read_dataframe("features/HasDetections.ftr")
        train_y = train_y.HasDetections
        test = load_fs_tosh('all_snap', conf, test=True)
    log.info(pformat(list(train_X.columns)))
    # NOTE(review): random_state has no effect on StratifiedKFold unless
    # shuffle=True is also passed — confirm whether shuffling was intended.
    cv = StratifiedKFold(n_splits=5, random_state=conf.seed)
    cv = cv.split(train_X, train_y)
    log.info("learning start")
    log.double_kiritori()
    # Embedding configuration for the NN (pickled dict of column settings).
    with open('features/NN/conf_tosh_all_snap.pkl', 'rb') as p:
        embedd_conf = pickle.load(p)
    log.info(pformat(embedd_conf))
    score, pred, meta = NN_cv(train_X, train_y, cv, log, cv_path,
                              X_test=test, split_conf=embedd_conf)
    log.info(score)
    log.double_kiritori()
    log.info("done")
    # Free the training frames before saving the (potentially large) arrays.
    del train_X, train_y
    np.save(cv_path / "test_preds.npy", pred)   # test-set predictions
    np.save(cv_path / "oof_preds.npy", meta)    # out-of-fold predictions
    make_submission(pred, f"submissions/{experiment_name}.csv.gz")
def main(opt):
    """Entry point: train a model containing custom LayerNormalization layers, or score a saved one.

    opt: parsed command-line options (mode, gpu id, hyper-parameters, model paths).
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    if opt.mode == "train":
        st = time.time()
        print('Loading data')
        x_train, y_train, x_valid, y_valid, vocabulary_size = load_data(
            "data", opt.debug)
        num_training_data = x_train.shape[0]
        sequence_length = x_train.shape[1]
        print(num_training_data)
        print('Vocab Size', vocabulary_size)
        model = build_model(opt.embedding_dim, opt.hidden_size, opt.drop,
                            sequence_length, vocabulary_size)
        print("Training Model...")  # fixed typo: was "Traning Model..."
        # The History object returned by fit() was never used; dead
        # commented-out YAML save/load code was removed.
        model.fit(
            x_train, y_train,
            batch_size=opt.batch_size,
            epochs=opt.epochs,
            verbose=1,
            callbacks=[TestCallback((x_valid, y_valid), model=model)])
        model.save(opt.saved_model)
        print("Training cost time: ", time.time() - st)
    else:
        # The custom LayerNormalization layer must be registered via
        # custom_objects for Keras to deserialize the saved model.
        model = load_model(
            opt.saved_model,
            custom_objects={'LayerNormalization': LayerNormalization})
        vocabulary = json.load(open(os.path.join("data", "vocab.json")))
        predict_dict = predict_final_word(model, vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        if opt.score:
            scoring(sub_file, os.path.join("data"), type="valid")
X_fold = np.hstack((X_fold, X_pred)) all_X.append(X_fold) all_y.append(y_fold) all_w.append(w_fold) X = np.vstack(all_X) y = np.concatenate(all_y) w = np.concatenate(all_w) clf = Classifier(**params) w = rescale(w) w = rebalance(y, w) try: clf.fit(X, y, sample_weight=w) except: clf.fit(X, y) # And make a submussion print "Making submission..." X_test, _, _, _ = load_test() X_pred = load_predictions("stack/*-test.npy") X_test = np.hstack((X_test, X_pred)) make_submission(clf, threshold, "output-stacking.csv", X_test=X_test) import IPython; IPython.embed()
"bootstrap": False, "max_features": 27 } # Train on the whole training set def train(Classifier, params, X, y, w, verbose=1): if verbose > 0: print "[Start]" w = rescale(w) w = rebalance(y, w) clf = Classifier(**params) clf.fit(X, y, sample_weight=w) if verbose > 0: print "[End]" return clf clf = train(Classifier, params, X, y, w) # Make submission threshold = -2.74420523643 make_submission(clf, threshold, "output-rs.csv") import IPython IPython.embed()
def main(args):
    """Train (and evaluate) a GRU-based model for classifying toxic content
    in wikipedia comments.

    Takes preprocessed (cleaned, tokenized, and padded) comments as input and
    outputs the probability of six different types of toxicity being contained
    in the comment. Execution is modified by a number of call arguments,
    described below.

    Parameters
    ----------
    --train (-t) : (Re)train the model. Leave this out if only doing inference
        or only evaluating on the test set.
    --auxilliary_input (-a) : Use auxilliary input to the model for training
        and testing. Auxilliary input consists of class probabilities
        calculated using ridge regression. Requires that said auxilliary input
        is already generated for a given input sentence.
    --combine_data (-c) : Combine training and test data with additional
        figshare comments when fitting the tokenizer to data.
    --submit (-s) : Turn test predictions into a submission for Kaggle.
    --visualise (-v) : Visualise attention activations for a sentence.
    --fasttext (-f) : Use word embeddings trained using fasttext instead of
        pre-trained GloVe embeddings.
    """
    # --- flags from the command line ---
    TRAIN = args.train
    USE_AUXILLIARY_INPUT = args.auxilliary_input
    COMBINE_DATA = args.combine_data
    MAKE_SUBMISSION = args.submit
    VISUALISE_FULL_ATTENTION = args.visualise
    USE_FASTTEXT = args.fasttext

    # --- model / tokenizer hyper-parameters ---
    MAX_NUM_WORDS = None          # no cap on the tokenizer vocabulary
    MAX_LENGTH = 150              # padded sequence length
    EMBEDDING_DIM = 300
    SKIPGRAM = True
    MAX_EPOCHS = 50
    BATCH_SIZE = 512
    VAL_SPLIT = 0.2
    SENTENCE_NUM = 51             # index of the sample sentence to inspect
    TOXICITY_THRESHOLD = 0.6
    AVERAGE_ATTENTION = False

    # --- cyclic learning-rate schedule ---
    BASE_LR = 0.0001
    MAX_LR = 0.005
    STEP_SIZE = 30000
    CLR_MODE = 'triangular'

    # --- output paths stamped with the current time ---
    now = datetime.datetime.now()
    now = now.strftime('%Y%m%d%H%M')
    LOG_PATH = './logs/' + now
    WEIGHT_SAVE_PATH = 'weights_base.best.hdf5'
    SUBMISSION_SAVE_PATH = './submissions/submission_' + now + '.csv'
    ES_PATIENCE = 6
    TB_HIST_FREQ = 0
    TB_WRITE_GRAPH = True

    # Parameter dicts for the callback factory (CLR, checkpoint,
    # early-stopping, TensorBoard).
    clr_params = {
        'base_lr': BASE_LR,
        'max_lr': MAX_LR,
        'step_size': STEP_SIZE,
        'mode': CLR_MODE
    }
    ckpt_params = {
        'filepath': WEIGHT_SAVE_PATH,
        'verbose': 1,
        'save_best_only': True,
        'save_weights_only': True
    }
    es_params = {'patience': ES_PATIENCE}
    tb_params = {
        'log_dir': LOG_PATH,
        'histogram_freq': TB_HIST_FREQ,
        'write_graph': TB_WRITE_GRAPH,
        'batch_size': BATCH_SIZE,
        # embeddings_freq > MAX_EPOCHS effectively disables embedding dumps
        'embeddings_freq': MAX_EPOCHS + 1
    }
    callbacks = get_callbacks(clr_params, ckpt_params, es_params, tb_params)

    CLASS_LIST = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]

    # Tokenize/pad the data; the returned tuple shape depends on whether
    # auxilliary (ridge-regression probability) features are requested.
    txt_prep = TextPreprocessor(max_nb_words=MAX_NUM_WORDS,
                                max_padding_length=MAX_LENGTH,
                                combine_data=COMBINE_DATA,
                                use_auxilliary_features=USE_AUXILLIARY_INPUT)
    if USE_AUXILLIARY_INPUT:
        X_train, X_aux, y_train, X_test, test_aux, word_index, sample_text, sample_target = \
            txt_prep.load_and_tokenize(class_list=CLASS_LIST,
                                       sample_index=SENTENCE_NUM)
    else:
        X_train, y_train, X_test, word_index, sample_text, sample_target = \
            txt_prep.load_and_tokenize(class_list=CLASS_LIST,
                                       sample_index=SENTENCE_NUM)

    tc = ToxicClassifier(embedding_dim=EMBEDDING_DIM,
                         num_timesteps=MAX_LENGTH,
                         word_index=word_index,
                         weight_path=WEIGHT_SAVE_PATH,
                         use_aux_input=USE_AUXILLIARY_INPUT,
                         average_attention=AVERAGE_ATTENTION,
                         use_ft=USE_FASTTEXT,
                         visualize=VISUALISE_FULL_ATTENTION)

    # Register training inputs and the single sample sentence used for the
    # qualitative report below.
    if USE_AUXILLIARY_INPUT:
        tc.set_input_and_labels(X_train, y_train, X_aux)
        tc.set_sample_sentence(sample_text, X_train[SENTENCE_NUM],
                               y_train[SENTENCE_NUM], X_aux[SENTENCE_NUM])
    else:
        tc.set_input_and_labels(X_train, y_train)
        tc.set_sample_sentence(sample_text, X_train[SENTENCE_NUM],
                               y_train[SENTENCE_NUM])

    tc.build_model(word_index=word_index, use_skipgram=SKIPGRAM)
    tc.model.summary()

    if TRAIN:
        tc.train(max_epochs=MAX_EPOCHS,
                 batch_size=BATCH_SIZE,
                 val_split=VAL_SPLIT,
                 callbacks=callbacks)

    # Qualitative check on the chosen sample sentence (runs with or without
    # retraining, since weights are restored from WEIGHT_SAVE_PATH).
    sample_pred = tc.predict_sample_output()
    print('Original sentence: ', sample_text)
    print('Actual label: ', sample_target)
    print('Model prediction :', sample_pred[0, :])
    present_toxicity = get_toxicity_classes(sample_pred[0, :],
                                            TOXICITY_THRESHOLD, CLASS_LIST)
    print_toxicity_report(sample_pred[0, :], TOXICITY_THRESHOLD, CLASS_LIST)

    if VISUALISE_FULL_ATTENTION:
        visualise_attention(tc.attention_history, sample_text)
    else:
        attention = tc.get_attention_output()
        attention /= sum(attention)  # Normalise to percentage
        label = tc.get_sample_labels()
        visualise_attention_with_text(attention, sample_text,
                                      sample_pred[0, :], present_toxicity,
                                      sample_target, label)

    if MAKE_SUBMISSION:
        print('Loading best weights and predicting on test data\n')
        if USE_AUXILLIARY_INPUT:
            make_aux_submission(tc.model, X_test, test_aux, CLASS_LIST,
                                WEIGHT_SAVE_PATH, SUBMISSION_SAVE_PATH,
                                post_process=True)
        else:
            make_submission(tc.model, X_test, CLASS_LIST, WEIGHT_SAVE_PATH,
                            SUBMISSION_SAVE_PATH)
# Batched test-set prediction: build a predict function from saved weights,
# stream test images through it, and write a submission CSV.
shape = (None, 3, cfg.WIDTH, cfg.HEIGHT)
predict_fn = models.get_predict_function(m_param, model_weights, file_fmt, shape);
# Image loader: resize only — no augmentation/crop/noise at test time.
load_and_process = ld.LoadAndProcess(
    size=(cfg.WIDTH, cfg.HEIGHT),
    augmentation_params=None,
    crop=None,
    color_noise=0,
    fill_size=cfg.pretrained);
batch_size = cfg.batch_size;
test_imgs, test_labels = ld.list_imgs_labels(cfg.data_dir, data='test');
# cycle=False: iterate the test set exactly once.
test_data = ld.ImgStream(test_imgs, test_labels, batch_size,
                         cycle=False,
                         file_dir_fmt=cfg.data_dir + '/test/{}',
                         load_and_process=load_and_process,
                         preload=None);
print("num of test cases: {}".format(len(test_data)));
res = [];
c = 0;
for imgs, labels in test_data:
    res.append(predict_fn(imgs));
    c += 1;
    if c % 50 == 0:
        # progress report every 50 batches
        print("{} processed ".format(c * batch_size));
res = np.concatenate(res);
filename = cfg.output_dir + "/submit_{}.csv".format(fname);
print(res[-1])
utils.make_submission(filename, test_imgs, res, 0.5e-3);
convert_type=config['data']['convert_type']) logging.disable(logging.FATAL) if OOF_PARAMS['save_oof']: np.save(f'../logs/{RUN_NAME}/oof.npy', oof) save_oof_plot(RUN_NAME, train_y, oof, type_='reg', dia=True) with t.timer('save features importances'): save_importances(RUN_NAME, models, FEATURES) with t.timer('make submission'): output_path = LOGGER_PATH / f'{METER_TYPE}.csv' make_submission(y_pred=np.mean(preds, axis=1), target_name=TARGET_NAME, sample_path=SAMPLE_SUB_PATH, output_path=str(output_path), comp=True) LOGGER_PATH.rename(f'../logs/{RUN_NAME}_{np.mean(scores):.3f}') process_minutes = t.get_processing_time() with t.timer('notify'): message = f'''{MODEL_NAME}\ncv: {np.mean(scores):.3f}\nscores: {scores}\ntime: {process_minutes:.2f}[min]''' send_line(NOTIFY_PARAMS['line']['token'], message) send_notion(token_v2=NOTIFY_PARAMS['notion']['token_v2'], url=NOTIFY_PARAMS['notion']['url'], name=RUN_NAME,
# Tail of an old-style Keras (pre-1.0 API: Dense(in, out), nb_epoch) MLP:
# last hidden block, softmax output, training, saving, and submission.
model.add(Dropout(0.5))
model.add(Dense(612, 612, init='glorot_uniform'))
model.add(PReLU((612,)))
model.add(BatchNormalization((612,)))
model.add(Dropout(0.5))
model.add(Dense(612, nb_classes, init='glorot_uniform'))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer="adam")
#model.compile(loss='categorical_crossentropy', optimizer="sgd")
print("Training model...")
ne = 17    # epochs
bs = 32    # batch size
vs = 0.15  # validation split
model.fit(X, y, nb_epoch=ne, batch_size=bs, validation_split=vs)
print ("Saving model (will overwrite existing one)")
# NOTE(review): "%d" % vs truncates the 0.15 float to 0 in the filename —
# probably "%g" or "%.2f" was intended; confirm before changing.
filename = "keras-nn-%d-%d-%d" % (ne, bs, vs)
ut.save(model, filename, verbose=True)
print("Generating submission...")
proba = model.predict_proba(X_test)
ut.make_submission(proba, ids, encoder, fname='keras-otto-proba-93.csv')
#print(type(proba))
#print(proba[0:10,])
l=0 i=0 while l<len(set_X_test): if(len(set_X_test[l])>0): set_X_test[l]['CSPL_RECEIVED_CALLS'] = listPred[i] i=i+1 l=l+1 """ #on réassemble les valeurs de prédiction resultPred= pd.concat(set_X_test) resultPred=resultPred.sort_index() incremental_prediction.append(resultPred) print "score global = ",score_global.mean() print("Merging incremental learning...") resultPred_final=pd.concat(incremental_prediction) resultPred_final=resultPred_final.sort_values(by=['DATE', 'cod_ASS_ASSIGNMENT']) print("Make every prediction positif, ceil it ...") resultPred_final['CSPL_RECEIVED_CALLS']=resultPred_final['CSPL_RECEIVED_CALLS'].apply(lambda x: x*(x>0)) #resultPred_final['CSPL_RECEIVED_CALLS']=resultPred_final['CSPL_RECEIVED_CALLS'].apply(lambda x: 2.5*x) resultPred_final['CSPL_RECEIVED_CALLS']=resultPred_final['CSPL_RECEIVED_CALLS'].apply(lambda x: math.ceil(x)) print "Write the submission ..." make_submission(dataTest,resultPred_final) print "End."
# Python 2 script tail: cross-validate a regressor, fit on the full training
# set, predict the test set, and write the submission.
Y_train = np.array(Y_train)
X_train = np.array(X_train)
X_test = np.array(X_test)
#### Creation of regressor
reg = Regressor()
#### Cross validation
print "Cross validation ..."
#loo = cross_validation.LeaveOneOut(len(y_df))
loo = 10  # 10-fold CV instead of leave-one-out
# NOTE(review): with scoring='mean_squared_error', older scikit-learn returns
# negated MSE, so scores.mean() is negative — confirm interpretation.
scores = cross_validation.cross_val_score(reg, X_train, Y_train,
                                          scoring='mean_squared_error',
                                          cv=loo,)
print "The score mean of cross validation : "
print scores.mean()
#### fit
print "Fit ..."
reg.fit(X_train, Y_train)
#### Prediction
print "Prediction ..."
Y_pred = reg.predict(X_test)
#### write the submission
print "Write the submission ..."
make_submission(dataTest, Y_pred)
print "End."
# pred.iloc[idx] = 0 with t.timer('replace with leak'): leak = pd.read_feather(DATA_PATH / 'input/leak.feather') leak['timestamp'] = leak['timestamp'].astype(str) leak.rename(columns={'meter_reading': 'leak_meter_reading'}, inplace=True) test_and_leak = pd.merge(test, leak, on=['building_id', 'meter', 'timestamp'], how='left') leak_idx = test_and_leak['leak_meter_reading'].dropna().index pred.iloc[leak_idx] = test_and_leak.loc[leak_idx, 'leak_meter_reading'] with t.timer('make submission'): output_path = str(DATA_PATH / f'output/sub_{RUN_NAME}_{cv}.csv') make_submission(y_pred=pred, target_name=TARGET_NAME, sample_path=SAMPLE_SUB_PATH, output_path=output_path, comp=True) # LOGGER_PATH.rename(f'../logs/{RUN_NAME}_{np.mean(scores):.3f}') process_minutes = t.get_processing_time() with t.timer('notify'): message = f'''{MODEL_NAME}\ncv: {cv:.3f}\nscores: \ntime: {process_minutes:.2f}[min]''' send_line(NOTIFY_PARAMS['line']['token'], message) send_notion(token_v2=NOTIFY_PARAMS['notion']['token_v2'], url=NOTIFY_PARAMS['notion']['url'],
# Otto-challenge ensembling script: loads train/test data (only the label
# encoder and test ids are actually used below), then blends two existing
# prediction files 40/60 into a final submission.
from __future__ import print_function
import numpy as np
import pandas as pd
import utils as ut
import os
import xgboost as xgb
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.utils import np_utils, generic_utils
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

np.random.seed(1337)  # for reproducibility

## check if raw data exist
print("Loading data...")
X, labels = ut.load_data('data/train.csv', train=True)
data, ids = ut.load_data('data/test.csv', train=False)
print("Preprocessing labels")
# encoder maps class labels to indices; reused when writing the submission
y, encoder = ut.preprocess_labels(labels)
# Blend the saved xgboost and Keras probability files with weights 0.4/0.6.
prediction_files = ["xgb-otto-proba-round-430-eta-0.csv",
                    "keras-otto-proba-93.csv"]
ensemble = ut.ensemble(prediction_files, weights=[0.4, 0.6])
ut.make_submission(ensemble, ids, encoder,
                   fname='ensemble-otto-selected-93.csv')
logging.disable(logging.FATAL) if 'nn' in MODEL_NAME: save_learning_curve(RUN_NAME, models) if SETTINGS_PARAMS['oof']['save']: np.save(f'../logs/{RUN_NAME}/oof.npy', oof) save_oof_plot(RUN_NAME, train_y, oof, type_='reg', dia=True) with t.timer('save features importances'): save_importances(RUN_NAME, models, FEATURES) with t.timer('make submission'): output_path = f'../data/output/{RUN_NAME}_{np.mean(scores):.3f}.csv' make_submission(y_pred=np.mean(preds, axis=1), target_name=COMPE_PARAMS['target_name'], sample_path=PATH_PARAMS['sample'], output_path=str(output_path), comp=False) LOGGER_PATH.rename(f'../logs/{RUN_NAME}_{np.mean(scores):.3f}') process_minutes = t.get_processing_time() with t.timer('notify'): message = f'''{MODEL_NAME}\ncv: {np.mean(scores):.3f}\nscores: {scores}\ntime: {process_minutes:.2f}[min]''' send_line(NOTIFY_PARAMS['line']['token'], message) send_notion(token_v2=NOTIFY_PARAMS['notion']['token_v2'], url=NOTIFY_PARAMS['notion']['url'], name=RUN_NAME, created=NOW, model=MODEL_NAME.split('_')[0],
X_pred = load_predictions("stack/*-fold%d.npy" % i) X_fold = np.hstack((X_fold, X_pred)) all_X.append(X_fold) all_y.append(y_fold) all_w.append(w_fold) X = np.vstack(all_X) y = np.concatenate(all_y) w = np.concatenate(all_w) clf = Classifier(**params) w = rescale(w) w = rebalance(y, w) try: clf.fit(X, y, sample_weight=w) except: clf.fit(X, y) # And make a submussion print "Making submission..." X_test, _, _, _ = load_test() X_pred = load_predictions("stack/*-test.npy") X_test = np.hstack((X_test, X_pred)) make_submission(clf, threshold, "output-stacking.csv", X_test=X_test) import IPython IPython.embed()
model.summary() # %% X, y = get_data(as_gray=False) batch_size = 128 ra = ROCAUC(batch_size) es = EarlyStopping(monitor='val_auc', patience=2, mode='max') mc = ModelCheckpoint(f'data/models/model.h5', monitor='val_auc', save_best_only=True, mode='max', verbose=1) model.fit(X, y, batch_size=batch_size, epochs=50, validation_split=.2, callbacks=[ra, es, mc]) # %% model.fit(X, y, batch_size=batch_size, epochs=5) # %% X_test, test_ids = get_data(test=True, as_gray=False) test_predictions = model.predict(X_test, batch_size=batch_size) test_predictions = test_predictions.flatten() make_submission(test_ids, test_predictions, 'submissions/first_transfer_cnn.csv')
def on_epoch_end(self, epoch, logs=None):
    """Keras callback hook: score the current model on the validation file
    after every epoch.

    epoch: current epoch index (unused here, required by the callback API).
    logs: metrics dict supplied by Keras (unused). The mutable default
        argument ``logs={}`` was replaced with ``None`` — a shared dict
        default is a classic Python pitfall; the value is never read here,
        so behavior is unchanged.
    """
    # Unpacked only to mirror the original; x and y are not used below.
    x, y = self.test_data
    predict_dict = predict_final_word(self.model, self.vocabulary,
                                      self.filename)
    # NOTE(review): relies on the module-level `opt` options object — confirm
    # it is always initialised before training starts.
    sub_file = make_submission(predict_dict, opt.student_id, opt.input)
    scoring(sub_file, os.path.join("data"), type="valid")
def main(opt):
    """Entry point: train a model with checkpoint/early-stopping, build an
    ensemble from all saved models in models/ensemble/, or score a saved model.

    opt: parsed command-line options (mode, gpu id, seed, model/data
    hyper-parameters, paths).
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    np.random.seed(opt.seed)  # set a seed for reproducibility (typo fixed)
    if opt.mode == "train":
        st = time.time()
        print('Loading data')
        x_train, y_train, x_valid, y_valid, vocabulary_size = load_data(
            "data", opt.debug)
        num_training_data = x_train.shape[0]
        sequence_length = x_train.shape[1]
        print(num_training_data)
        print('Vocab Size', vocabulary_size)
        model = build_model(opt.model, opt.embedding_dim, opt.hidden_size,
                            opt.drop, opt.filter, sequence_length,
                            vocabulary_size)
        adam = Adam()
        model.compile(loss='sparse_categorical_crossentropy', optimizer=adam)
        print("Training Model...")  # fixed typo: was "Traning Model..."
        # Keep only the best (lowest val_loss) weights and stop early after
        # 5 stagnant epochs; epochs=100 is an upper bound.
        checkpoint = ModelCheckpoint(opt.saved_model,
                                     monitor='val_loss',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='min')
        early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
        model.fit(x_train, y_train,
                  batch_size=opt.batch_size,
                  epochs=100,
                  verbose=1,
                  validation_data=(x_valid, y_valid),
                  callbacks=[
                      TestCallback((x_valid, y_valid), model=model),
                      checkpoint, early
                  ])
        model.save(opt.saved_model)
        print("Training cost time: ", time.time() - st)
    elif opt.mode == "ensemble":
        x_train, y_train, x_valid, y_valid, vocabulary_size = load_data(
            "data", opt.debug)
        num_training_data = x_train.shape[0]
        sequence_length = x_train.shape[1]
        print(num_training_data)
        print('Vocab Size', vocabulary_size)
        ENSEMBLE_DIR = "models/ensemble/"
        # Collect the file names in the top level of ENSEMBLE_DIR only.
        model_files = []
        for (dirpath, dirnames, filenames) in os.walk(ENSEMBLE_DIR):
            model_files.extend(filenames)
            break
        # enumerate replaces the manual model_count counter.
        models = []
        for model_count, filename in enumerate(model_files):
            model = load_model(ENSEMBLE_DIR + filename)
            model.name = "model" + str(model_count)
            models.append(model)
        build_save_ensemble_model(opt.saved_model, models, sequence_length)
    else:
        model = load_model(opt.saved_model)
        vocabulary = json.load(open(os.path.join("data", "vocab.json")))
        predict_dict = predict_final_word(model, vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        if opt.score:
            scoring(sub_file, os.path.join("data"), type="valid")
# Dispatch on the METHOD string: evaluate and/or fit a classifier on the
# learning set (X_LS, Y_LS) and write a submission from test fingerprints.
X_TS = utils.create_fingerprints(TS["SMILES"].values)
if METHOD == "DT":
    depths, scores = doDecisionTree(X_LS, Y_LS)
    print(scores)
elif METHOD == "KNN":
    depths, scores = doKNN(X_LS, Y_LS)
    print(scores)
    classifier_knn = KNeighborsClassifier(n_neighbors=50)
    classifier_knn.fit(X_LS, Y_LS)
    # predict_proba[:, 1]: probability of the positive class
    pred = classifier_knn.predict_proba(X_TS)
    auc_predicted = 0.7  # self-declared AUC estimate for the submission
    fname = utils.make_submission(pred[:, 1], auc_predicted, 'knn_50')
    print('Submission file "{}" successfully written'.format(fname))
elif METHOD == "RF":
    #ts, depths, scores = doRandomForest(X_LS, Y_LS)
    #print(scores)
    classifier_rf = RandomForestClassifier(n_estimators=800, max_depth=700)
    classifier_rf.fit(X_LS, Y_LS)
    pred = classifier_rf.predict_proba(X_TS)
    auc_predicted = 0.78  # self-declared AUC estimate for the submission
    fname = utils.make_submission(pred[:, 1], auc_predicted, 'final')
    print('Submission file "{}" successfully written'.format(fname))
elif METHOD == "MLP":
    layers, neurones, scores = doMLP(X_LS, Y_LS)
def main():
    # Full train/predict pipeline over the concatenated (original + 2019)
    # training data, with per-source fold assignment, feature loading from
    # feather files, duplicate-image post-processing, submission and
    # notifications. Relies on module-level objects: cfg, logger_path,
    # run_name, comment, now, model_name, notify_params.
    t = Timer()
    seed_everything(cfg.common.seed)
    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log',
                        level=logging.DEBUG)
    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        train_x = dh.load('../data/input/train_concated.csv')
        train_org_x = dh.load('../data/input/train.csv')
        train_2019_x = dh.load('../data/input/train_2019.csv')
        test_x = dh.load('../data/input/test.csv')

    with t.timer('make folds'):
        # Folds are assigned separately for each data source, then stacked in
        # the same order as train_concated.
        fold_org_df = factory.get_fold(cfg.validation.val1, train_org_x,
                                       train_org_x[[cfg.common.target]])
        fold2019_df = factory.get_fold(cfg.validation.val2, train_2019_x,
                                       train_2019_x[[cfg.common.target]])
        fold_df = pd.concat([fold_org_df, fold2019_df],
                            axis=0, sort=False, ignore_index=True)
        if cfg.validation.val1.single:
            fold_df = fold_df[['fold_0']]
            fold_df /= fold_df['fold_0'].max()

    with t.timer('load features'):
        # Each feature lives in its own feather file; NaNs become -1.
        features = dh.load('../configs/feature/all.yml')['features']
        for f in features:
            train_x[f] = dh.load(f'../features/{f}_train.feather')[f].fillna(-1)
            test_x[f] = dh.load(f'../features/{f}_test.feather')[f].fillna(-1)

    with t.timer('drop several rows'):
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_x = train_x.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('train model'):
        result = train_model(run_name, train_x, fold_df, cfg)

    # Embed the CV score in the log-dir name, then silence further logging.
    logging.disable(logging.FATAL)
    run_name_cv = f'{run_name}_{result["cv"]:.3f}'
    logger_path.rename(f'../logs/{run_name_cv}')

    with t.timer('predict'):
        preds = predict_test(run_name_cv, test_x, fold_df, cfg)

    with t.timer('post process'):
        # Known duplicate images whose labels are certain: overwrite the
        # predictions with the known targets.
        duplicates = {
            'ISIC_5224960': 1,
            'ISIC_9207777': 1,
            'ISIC_6457527': 1,
            'ISIC_8347588': 0,
            'ISIC_8372206': 1,
            'ISIC_9353360': 1,
            'ISIC_3689290': 0,
            'ISIC_3584949': 0,
        }
        for image_name, target in duplicates.items():
            idx = test_x[test_x['image_name'] == image_name].index[0]
            preds[idx] = target

    with t.timer('make submission'):
        sample_path = f'../data/input/sample_submission.csv'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=preds,
                        target_name=cfg.common.target,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)

    with t.timer('kaggle api'):
        kaggle = Kaggle(cfg.compe.compe_name, run_name_cv)
        if cfg.common.kaggle.submit:
            kaggle.submit(comment)

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{model_name}\ncv: {result["cv"]:.3f}\ntime: {process_minutes:.2f}[h]'''
        send_line(notify_params.line.token, message)
        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': cfg.model.name,
            'local_cv': round(result['cv'], 4),
            'time': process_minutes,
            'comment': comment
        })
def main():
    # Train/predict pipeline with optional OOF feature stacking and optional
    # adversarial validation. Relies on module-level objects: cfg, features,
    # features_params, logger_path, run_name, comment, now, options,
    # notify_params.
    t = Timer()
    seed_everything(cfg.common.seed)
    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log',
                        level=logging.DEBUG)
    dh.save(logger_path / 'config.yml', cfg)
    dh.save(logger_path / 'features.yml', features_params)

    with t.timer('load data'):
        train_df = dh.load('../data/input/train.csv')
        train2019_df = dh.load('../data/input/train_concated.csv')
        train_x = factory.get_features(features, cfg.data.loader.train)
        test_x = factory.get_features(features, cfg.data.loader.test)
        train_y = factory.get_target(cfg.data.target)

    with t.timer('add oof'):
        # Optionally stack a previous model's out-of-fold predictions as an
        # extra feature column.
        if cfg.data.features.oof.name is not None:
            oof, preds = factory.get_oof(cfg.data)
            train_x['oof'] = oof
            test_x['oof'] = preds
            features.append('oof')

    with t.timer('make folds'):
        fold_df = factory.get_fold(cfg.validation, train_df,
                                   train_df[['target']])
        # Pad with zero-weight rows for the appended 2019 data so fold_df
        # stays aligned with the concatenated training frame.
        fold_df = pd.concat([
            fold_df,
            pd.DataFrame(np.zeros((len(train2019_df), len(fold_df.columns))),
                         columns=fold_df.columns)
        ], axis=0, sort=False, ignore_index=True)
        if cfg.validation.single:
            fold_df = fold_df[['fold_0']]
            fold_df /= fold_df['fold_0'].max()

    with t.timer('drop index'):
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_x = train_x.drop(drop_idx, axis=0).reset_index(drop=True)
            train_y = train_y.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('prepare for ad'):
        # Adversarial validation: relabel train/test membership as the target.
        if cfg.data.adversarial_validation:
            train_x, train_y = factory.get_ad(cfg, train_x, test_x)

    with t.timer('train and predict'):
        trainer = Trainer(cfg)
        cv = trainer.train(train_df=train_x,
                           target_df=train_y,
                           fold_df=fold_df)
        preds = trainer.predict(test_x)
        trainer.save(run_name)
        # Embed CV score in the log-dir name, then silence logging.
        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')
        logging.disable(logging.FATAL)

    with t.timer('make submission'):
        sample_path = f'../data/input/sample_submission.csv'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=preds,
                        target_name=cfg.data.target.name,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)
        if cfg.common.kaggle.submit:
            kaggle = Kaggle(cfg.compe.name, run_name_cv)
            kaggle.submit(comment)

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{cfg.model.name}\ncv: {cv:.3f}\ntime: {process_minutes}[min]'''
        send_line(notify_params.line.token, message)
        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': options.model,
            'local_cv': round(cv, 4),
            'time': process_minutes,
            'comment': comment
        })
def main():
    """Blend the out-of-fold predictions of several trained base models.

    Loads each base model's OOF/test predictions, searches for convex blend
    weights with Optuna, writes the blended predictions and a submission,
    and sends notifications.

    NOTE(review): relies on module-level state (cfg, logger_path, run_name,
    comment, now, options, notify_params) prepared outside this function.
    """
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        train_df = dh.load('../data/input/train_data.csv')
        test_df = dh.load('../data/input/test_data.csv')

        # One column per base model: OOF predictions on train, raw preds on test.
        oof = np.zeros((len(train_df), len(cfg.models)))
        preds = np.zeros((len(test_df), len(cfg.models)))
        for i, m in enumerate(cfg.models):
            name = getattr(cfg.models, m).name
            log_dir = Path(f'../logs/{name}')

            model_oof = dh.load(log_dir / 'oof.npy')
            model_cfg = dh.load(log_dir / 'config.yml')

            if model_cfg.common.drop:
                drop_idxs = np.array([])
                for drop_name in model_cfg.common.drop:
                    drop_idx = dh.load(f'../pickle/{drop_name}.npy')
                    drop_idxs = np.append(drop_idxs, drop_idx)
                    # NOTE(review): fill_dropped is called once per drop file
                    # with that file's indices, and the accumulated drop_idxs
                    # is never used.  If the indices are relative to the full
                    # frame, a single call with the accumulated array may have
                    # been intended — confirm fill_dropped's contract.
                    model_oof = factory.fill_dropped(model_oof, drop_idx)

            model_preds = dh.load(f'../logs/{name}/raw_preds.npy')

            oof[:, i] = model_oof[:len(train_df)]
            preds[:, i] = model_preds

    with t.timer('drop index'):
        if cfg.common.drop is not None:
            drop_idxs = np.array([])
            # BUG FIX: this loop previously iterated model_cfg.common.drop —
            # the config of whichever base model happened to be loaded last —
            # instead of this run's own cfg.common.drop (the value the guard
            # above actually checks).
            for drop_name in cfg.common.drop:
                drop_idx = dh.load(f'../pickle/{drop_name}.npy')
                drop_idxs = np.append(drop_idxs, drop_idx)
            train_df = train_df.drop(drop_idxs, axis=0).reset_index(drop=True)

    with t.timer('optimize model weight'):
        metric = factory.get_metrics(cfg.common.metrics.name)
        y_true = train_df[cfg.common.target]

        def objective(trial):
            """Score one candidate convex combination of the model columns."""
            # Sample n-1 weights, each bounded so the running sum stays <= 1;
            # the last weight is whatever remains, so weights sum to 1.
            p_list = [0] * len(cfg.models)
            for i in range(len(cfg.models) - 1):
                p_list[i] = trial.suggest_discrete_uniform(
                    f'p{i}', 0.0, 1.0 - sum(p_list), 0.01)
            p_list[-1] = round(1 - sum(p_list[:-1]), 2)

            y_pred = np.zeros(len(train_df))
            for i in range(oof.shape[1]):
                y_pred += oof[:, i] * p_list[i]
            return metric(y_true, y_pred)

        study = optuna.create_study(direction='minimize')
        study.optimize(objective, timeout=10)
        # Recover the implied last weight the same way objective() does.
        best_params = list(study.best_params.values())
        best_weight = best_params + [round(1 - sum(best_params), 2)]

    with t.timer('ensemble'):
        ensemble_oof = np.zeros(len(train_df))
        ensemble_preds = np.zeros(len(test_df))
        for i in range(len(best_weight)):
            ensemble_oof += oof[:, i] * best_weight[i]
            ensemble_preds += preds[:, i] * best_weight[i]

        dh.save(f'../logs/{run_name}/oof.npy', ensemble_oof)
        dh.save(f'../logs/{run_name}/raw_preds.npy', ensemble_preds)

    # Embed the blended CV score in the run's log directory name.
    cv = metric(y_true, ensemble_oof)
    run_name_cv = f'{run_name}_{cv:.3f}'
    logger_path.rename(f'../logs/{run_name_cv}')

    print('\n\n===================================\n')
    print(f'CV: {cv:.4f}')
    print(f'BEST WEIGHT: {best_weight}')
    print('\n===================================\n\n')

    with t.timer('make submission'):
        sample_path = '../data/input/sample_submission.feather'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=ensemble_preds,
                        target_name=cfg.common.target,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)
        if cfg.common.kaggle.submit:
            kaggle = Kaggle(cfg.compe.compe_name, run_name_cv)
            kaggle.submit(comment)

    with t.timer('notify'):
        # Push the result summary to LINE and log the run in a Notion table.
        process_minutes = t.get_processing_time()
        message = f'''{options.model}\ncv: {cv:.3f}\ntime: {process_minutes}[min]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': options.model,
            'local_cv': round(cv, 4),
            'time': process_minutes,
            'comment': comment
        })
# Inspect how correlated the two base learners' predictions are before blending.
np.corrcoef(X_train_level_2.T)
sns.jointplot(X_train_level_2[:, 0], X_train_level_2[:, 1])
plt.show()

# Search a fine grid for the convex blend weight alpha that minimises RMSE
# on the level-2 training predictions.
alphas_to_try = np.linspace(0, 1, 1001)
alpha_best, rmse_best = None, np.Inf
for alpha in alphas_to_try:
    blended = alpha * X_train_level_2[:, 0] + (1 - alpha) * X_train_level_2[:, 1]
    rmse = np.sqrt(mean_squared_error(Y_train_level_2, blended))
    if rmse < rmse_best:
        alpha_best, rmse_best = alpha, rmse
score = round(rmse_best, 6)

# Apply the best weight to the level-2 test predictions and build a submission.
pred_test = alpha_best * X_test_level_2[:, 0] + (1 - alpha_best) * X_test_level_2[:, 1]
ids = np.array(df.loc[df['date_block_num'] == 34, 'ID'])
submission = make_submission(ids, np.array(pred_test).flatten())

# Export into a timestamped folder whose name embeds the validation score.
today = datetime.datetime.now()
sub_id = today.strftime('%y%m%d') + '_' + today.strftime("%H%M") + \
    '_score_' + str(score)
folder = OUT_FOLDER + '/' + sub_id
os.mkdir(folder)
print('\n---- ' + sub_id + ' ----')
submission.to_csv(os.path.join(folder, 'submission.csv'), index=False)
# 5 fold cross validation skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) prediction_scores = np.empty(y.shape[0], dtype='object') for train_idx, val_idx in tqdm(skf.split(X, y)): X_train, X_val = X[train_idx], X[val_idx] y_train = y[train_idx] clf = clf.fit(X_train, y_train) y_pred = clf.predict_proba(X_val)[:, 1] # Save the predictions for this fold prediction_scores[val_idx] = y_pred plt.title('SVM 5-fold cross validation ROC AUC') plot_roc(y, prediction_scores) plt.savefig('report/figures/svm_roc.png', dpi=300) plot_prediction_samples(imgs, y, prediction_scores, 'SVM Prediction Samples') plt.savefig('report/figures/svm_confmat.png', dpi=300) # %% # load and preprocess test data then create submission X_test, test_ids = get_data(test=True) X_test = np.stack([get_HOG(img, **hog_params) for img in X_test]) clf = clf.fit(X, y) test_predictions = clf.predict_proba(X_test)[:, 1] make_submission(test_ids, test_predictions, fname='submissions/svc_10_hog_16_4_fulltrain.csv')
# Load the test features and training labels.
# NOTE(review): X_train, is_trainable, NUM_EPOCHS, BATCH_SIZE, NUM_OUTLIERS and
# gen_Y_pred are assumed to be defined earlier in the file — confirm.
X_test = read_pickle('../audio_data/X_test4d.pkl')
Y_train = read_pickle('../audio_data/Y_train1d.pkl')
print("The shape of X_train/X_test/Y_train: ", X_train.shape, X_test.shape, Y_train.shape)

# Instantiate the model
bigan = BIGAN(X_train.shape[1], X_train.shape[2], X_train.shape[3])

if is_trainable:
    # Training the BiGAN
    bigan.train_by_batch(X_train, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE)
    #bilstm.train_all(X_train_, Y_train_, BATCH_SIZE, NUM_EPOCHS)
else:
    # Restore the checkpoint
    checkpoint_dir = './runs/checkpoint_bigan'
    # NOTE(review): Checkpoint() is constructed with no tracked objects, so
    # restore() has nothing to map the saved weights onto, and
    # expect_partial() suppresses the resulting warnings — verify the BiGAN
    # weights are actually restored here.
    checkpoint = tf.train.Checkpoint()
    checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()
    print("Checkpoint restored for Anomaly Detection!")

# Anomaly Detection
AS = bigan.compute_anomaly_score(X_train, Y_train, X_test)

# Prediction
# Threshold is the expected outlier fraction of the test set.
ts = NUM_OUTLIERS/len(X_test)  # Find out the best threshold
Y_pred_AS = bigan.predict_outlier(AS, ts)
#print("Y_pred_AS: ", Counter(Y_pred_AS))

# Generate final Y_pred and make submission
# Merge the anomaly flags into a previously saved prediction array.
Y_pred = np.load('Y_pred.npy')
Y_pred_new = gen_Y_pred(Y_pred, Y_pred_AS)
print("Y_pred_new.shape: ", Y_pred_new.shape)
make_submission(Y_pred_new, "submission")
# Per-fold CNN training; each fold's model is kept for test-time averaging.
# NOTE(review): skf, X, y, prediction_scores, imagen, batch_per_epoch,
# BATCH_SIZE, EPOCHS and get_model are assumed to be defined earlier in the
# file — confirm.
models = []
for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    model = get_model()
    # One-hot encode the fold's training labels (presumably for a 2-unit
    # softmax output — confirm against get_model()).
    y_train = to_categorical(y_train)
    model.fit_generator(imagen.flow(X_train, y_train, batch_size=BATCH_SIZE),
                        steps_per_epoch=batch_per_epoch,
                        epochs=EPOCHS,
                        verbose=0)
    # Column 1 of the prediction — presumably the positive class; confirm.
    prediction_scores[val_idx] = model.predict(X_val, batch_size=BATCH_SIZE)[:, 1]
    cur_auc = roc_auc_score(y_val, prediction_scores[val_idx])
    print(cur_auc)
    # NOTE(review): aborting on a weak fold leaves the remaining entries of
    # prediction_scores unfilled (or stale from a previous run), which makes
    # the overall AUC printed below unreliable — confirm this early exit is
    # intentional.
    if cur_auc < 0.8:
        break
    models.append(model)
print(roc_auc_score(y, prediction_scores))

# %%
X_test, test_ids = get_data(test=True, as_gray=False)
X_test = X_test / 255.  # rescale pixel values into [0, 1]
# Average the fold models' predictions over the test set.
test_predictions = np.mean(
    [m.predict(X_test, batch_size=BATCH_SIZE)[:, 1] for m in models], axis=0)
make_submission(test_ids, test_predictions, 'submissions/homebrew_cnn_CV.csv')

# %%
# Persist each fold model.  NOTE(review): a list comprehension used purely
# for its side effect — a plain for-loop would be clearer.
[m.save(f'data/models/model_fold_{i}.h5') for i, m in enumerate(models)]
# Encode the learning-set SMILES strings as fingerprint features.
X_LS = fingerprints.transform(LS['SMILES'].values, FINGERPRINT)
y_LS = LS['ACTIVE'].values

# Feature selection: remove zero-variance (constant) columns.
selector = VarianceThreshold()
X_LS = selector.fit_transform(X_LS)

# Estimate generalisation AUC over 5 shuffled 75/25 splits.
cv = ShuffleSplit(n_splits=5, test_size=0.25, random_state=0)
scores = cross_val_score(MODEL, X_LS, y_LS, cv=cv, scoring='roc_auc')
AUC = scores.mean()

# Refit on the entire learning set before scoring the test set.
MODEL.fit(X_LS, y_LS)

# Run the test set through the identical fingerprint + selection pipeline.
X_TS = fingerprints.transform(TS['SMILES'].values, FINGERPRINT)
X_TS = selector.transform(X_TS)

# Probability of the last (positive) class.
prob = MODEL.predict_proba(X_TS)[:, -1]

# Write the submission file, embedding the estimated AUC.
os.makedirs(DESTINATION, exist_ok=True)
fname = utils.make_submission(prob, AUC, DESTINATION + 'submission')
print('Submission file "{}" successfully written'.format(fname))