def _load_confident_test_predictions(path='submission/ensemble1_8.csv'):
    """Return the pool of pseudo-labelled test images used for semi-supervision.

    Reads the ensemble submission at *path* and keeps a row when either its
    ``c9`` value exceeds 0.45 or, among the remaining rows, the maximum over
    the first ten columns exceeds 0.90.  The returned frame has the columns
    ``subject`` (always ``'p099'``), ``classname`` (``'test_'`` + the name of
    the arg-max column) and ``img``.
    """
    preds_test = pd.read_csv(path)
    # c9 gets its own, lower threshold and is set aside first.
    tmp_c9 = preds_test.iloc[(preds_test.c9 > 0.450).values, :]
    preds_test = preds_test.iloc[(preds_test.c9 <= 0.450).values, :]
    # The remaining rows must be predicted with a maximum above 0.90.
    tmp = preds_test.iloc[:, :10].max(1)
    tmp_idx = tmp.iloc[(tmp > 0.90).values].index
    # tmp_idx holds index labels, so .loc replaces the deprecated .ix.
    preds_test = preds_test.loc[tmp_idx]
    preds_test = pd.concat([preds_test, tmp_c9], axis=0)
    preds_test.reset_index(drop=True, inplace=True)

    pool = pd.DataFrame()
    pool['subject'] = ['p099'] * len(preds_test)
    pool['classname'] = preds_test.iloc[:, :10].idxmax(1).values
    pool['img'] = preds_test['img'].values
    pool['classname'] = 'test_' + pool['classname']
    return pool


def fine_tuning():
    """Train one model per fold on a random split plus pseudo-labelled test data.

    For every fold produced by the stratified split over the shuffled
    (driver, class) combinations, the function:

    1. draws a sampling rate in [0.1, 0.3) and samples that share of the
       pseudo-labelled test pool,
    2. randomly holds out 5-15% of the labelled images for validation,
    3. writes both image lists to ``../input`` and trains ``final_training()``
       on them with the file-based generators,
    4. evaluates, predicts on the test images and saves the submission.

    A failure inside one fold is reported with its traceback and the loop
    moves on to the next fold.  The saved per-fold submissions are finally
    averaged via ``averaging``.

    Relies on module-level settings (``random_state``, ``nfolds``,
    ``nb_epoch_all``, ``img_rows``, ``img_cols``, ``model_name``).
    """
    import itertools
    import traceback

    drivers = pd.read_csv('../input/driver_imgs_list.csv')
    dlist = list(set(drivers['subject']))
    clist = ['c0', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9']
    dc_list = list(itertools.product(dlist, clist))
    # Seeding and shuffling here also fixes the state used by random.sample
    # in the per-fold split below.
    random.seed(random_state)
    random.shuffle(dc_list)
    # The folds are only used to drive the number of iterations; a list (not
    # a lazy map object) is passed so this also works on Python 3.
    class_labels = [name[1:] for name in np.array(dc_list)[:, 1]]
    kf = StratifiedKFold(class_labels, n_folds=nfolds, shuffle=False,
                         random_state=random_state)

    # The pseudo-label pool does not depend on the fold: build it once.
    pseudo_pool = _load_confident_test_predictions()

    all_pred_list = []
    for fold_number, _ in enumerate(kf):
        # add test images for training, i.e. semi-supervised learning
        np.random.seed(fold_number)
        sample_rate_test = np.random.uniform(0.1, 0.3)
        print('sample_rate_test: {}'.format(sample_rate_test))
        preds_test_semi = pseudo_pool.sample(
            int(len(pseudo_pool) * sample_rate_test),
            random_state=fold_number)

        ### Using all data with random split
        try:
            print('Fold{} training with all data'.format(fold_number))

            # random split
            np.random.seed(fold_number)
            split_data = np.random.uniform(0.05, 0.15)
            random_index = random.sample(
                range(len(drivers)), int(len(drivers) * (1 - split_data)))
            in_train = drivers.index.isin(random_index)
            alltrain_drivers = drivers.iloc[in_train, :]
            allvalid_drivers = drivers.iloc[~in_train, :]

            # add test images for training
            alltrain_drivers = pd.concat([alltrain_drivers, preds_test_semi])
            alltrain_drivers.to_csv(
                '../input/driver_imgs_list_alltrain.csv', index=False)
            allvalid_drivers.to_csv(
                '../input/driver_imgs_list_allvalid.csv', index=False)

            print('final training')
            model = final_training()

            samples_per_epoch = len(alltrain_drivers)
            nb_val_samples = len(allvalid_drivers)
            print('training data: {}'.format(samples_per_epoch))
            print('validation data: {}'.format(nb_val_samples))

            train_data_generator = generate_arrays_from_file(
                '../input/driver_imgs_list_alltrain.csv',
                isvalidation=False, usingalldata=False)
            valid_data_generator = generate_arrays_from_file(
                '../input/driver_imgs_list_allvalid.csv',
                isvalidation=True, usingalldata=False)

            # training (no early stopping: it runs for nb_epoch_all epochs)
            model.fit_generator(train_data_generator,
                                samples_per_epoch=samples_per_epoch,
                                nb_epoch=nb_epoch_all,
                                nb_val_samples=nb_val_samples,
                                validation_data=valid_data_generator,
                                max_q_size=10)

            predictions_valid = model.evaluate_generator(
                valid_data_generator, val_samples=nb_val_samples)
            print('Score log_loss: {}'.format(predictions_valid))

            info_string = 'all_loss_' + str(predictions_valid) \
                + '_r_' + str(img_rows) \
                + '_c_' + str(img_cols) \
                + '_folds_' + str(fold_number) \
                + '_ep_' + str(nb_epoch_all) \
                + 'test' + str(sample_rate_test)
            all_pred_list.append('submission/' + info_string + '.csv')

            # predictions with new version
            test_data_generator = test_prediction('../input/imgs/test/*.jpg')
            preds = model.predict_generator(test_data_generator,
                                            val_samples=79726)
            save_pred(preds, '../input/imgs/test/*.jpg',
                      submission_name=info_string)

            del model
            gc.collect()
        except Exception:
            # Best effort: report the failure with its traceback and go on
            # with the next fold.
            traceback.print_exc()
            continue

    averaging(all_pred_list, 'ensemble_{}_all'.format(model_name))
def afms(kcs, opps, actuals, stu, student_label, item_label, nfolds=3,
         seed=None):
    """Fit AFM+S on all the data and cross-validate it four different ways.

    The student, KC and opportunity dictionaries are vectorized and stacked
    into one design matrix; the KC matrix alone is passed as the second
    design matrix.  Student columns get an L2 weight of 1.0, opportunity
    coefficients are bounded below by 0.

    Returns a tuple ``(scores, kc_vals, coef_s)``:

    * ``scores`` - one value per CV scheme (KFold, StratifiedKFold, LabelKFold
      by student, LabelKFold by item): the mean over folds of the square root
      of the model's mean squared error.
    * ``kc_vals`` - ``[kc, intercept, invlogit(intercept), slope, slip]`` per
      KC, with 0.0 for a coefficient missing from the inverse transform.
    * ``coef_s`` - ``[student, coefficient, invlogit(coefficient)]`` rows.
    """
    student_vec = DictVectorizer()
    kc_vec = DictVectorizer()
    opp_vec = DictVectorizer()

    S = student_vec.fit_transform(stu)
    Q = kc_vec.fit_transform(kcs)
    O = opp_vec.fit_transform(opps)
    n_s, n_q, n_o = S.shape[1], Q.shape[1], O.shape[1]

    X = hstack((S, Q, O)).toarray()
    X2 = Q.toarray()
    y = np.array(actuals)

    # Column layout of X is [students | KC intercepts | KC slopes].
    l2 = [1.0] * n_s + [0.0] * n_q + [0.0] * n_o
    bounds = [(None, None)] * n_s + [(None, None)] * n_q + [(0, None)] * n_o

    model = BoundedLogistic(first_bounds=bounds, first_l2=l2)
    model.fit(X, X2, y)

    # Read the coefficients before the CV loop refits the same model object.
    student_coefs = student_vec.inverse_transform([model.coef1_[0:n_s]])[0]
    coef_s = [[name, value, invlogit(value)]
              for name, value in student_coefs.items()]

    coef_qint = kc_vec.inverse_transform([model.coef1_[n_s:n_s + n_q]])[0]
    coef_qslope = opp_vec.inverse_transform(
        [model.coef1_[n_s + n_q:n_s + n_q + n_o]])[0]
    coef_qslip = kc_vec.inverse_transform([model.coef2_])[0]

    kc_vals = []
    all_kcs = set(coef_qint).union(set(coef_qslope)).union(set(coef_qslip))
    for kc in all_kcs:
        intercept = coef_qint.get(kc, 0.0)
        kc_vals.append([
            kc,
            intercept,
            invlogit(intercept),
            coef_qslope.get(kc, 0.0),
            coef_qslip.get(kc, 0.0),
        ])

    splitters = [
        KFold(len(y), n_folds=nfolds, shuffle=True, random_state=seed),
        StratifiedKFold(y, n_folds=nfolds, shuffle=True, random_state=seed),
        LabelKFold(student_label, n_folds=nfolds),
        LabelKFold(item_label, n_folds=nfolds),
    ]

    scores = []
    for splitter in splitters:
        fold_mse = []
        for train_index, test_index in splitter:
            model.fit(X[train_index], X2[train_index], y[train_index])
            fold_mse.append(model.mean_squared_error(
                X[test_index], X2[test_index], y[test_index]))
        scores.append(np.mean(np.sqrt(fold_mse)))

    return scores, kc_vals, coef_s
DecisionTreeClassifier(criterion='entropy'), GaussianNB(), XGBClassifier(max_depth=5, n_estimators=500) ] # Mean scores in K-FOLD precisions = np.zeros((n_folds, len(classifiers))) recalls = np.zeros((n_folds, len(classifiers))) f_scores = np.zeros((n_folds, len(classifiers))) accuracies = np.zeros((n_folds, len(classifiers))) cv_s = np.zeros((n_folds * len(classifiers))) i = 0 """ Performing cross validation """ # print('Begin k-fold') for train_idx, test_idx in StratifiedKFold(labels, n_folds=n_folds, shuffle=True): print("FOLD: " + str(i + 1)) X_train = texts[train_idx] y_train = labels[train_idx] X_test = texts[test_idx] y_test = labels[test_idx] models = list() models.append( lsa(vectorizer=vectorizer, classifier=classifiers[0], k=k)) if chosen_ds == 'filatova' and args.star:
stop_words=None, max_features=None, decode_error='ignore')), #('tfidf', TfidfTransformer(use_idf=False)), ('clf', SVC(C=5.2, kernel='linear', probability=True)) ]) vot_clf = VotingClassifier(estimators=[('glove', glove_clf), ('linear', char_clf)], voting='soft') print char_clf.named_steps print "TRAIN" print 80 * '=' cv = StratifiedKFold(data.Stance, n_folds=10, shuffle=True, random_state=1) pred_stances = cross_val_predict(vot_clf, data.Abstract, data.Stance, cv=cv) print classification_report(data.Stance, pred_stances, digits=4) macro_f = fbeta_score(data.Stance, pred_stances, 1.0, labels=['AGAINST', 'FAVOR', 'NONE'], average='weighted') print 'macro-average of F-score(FAVOR) and F-score(AGAINST): {:.4f}\n'.format(
INPUT_MASK_PATH = '/neurospin/brainomics/2018_euaims_leap_predict_vbm/results/VBM/1.5mm/data/mask.nii'
NFOLDS_OUTER = 6
NFOLDS_INNER = 5

# Stage the inputs in the working directory.
for _input_path in (INPUT_DATA_X, INPUT_DATA_y, INPUT_MASK_PATH):
    shutil.copy(_input_path, WD)

#############################################################################
## Create config file
y = np.load(INPUT_DATA_y)
site = np.load(
    "/neurospin/brainomics/2018_euaims_leap_predict_vbm/results/VBM/1.5mm/by_age/data/adolescents/site.npy"
)

# Start from NFOLDS_OUTER stratified folds, then overwrite the first five
# with leave-one-site-out splits: fold k trains on every site except k + 1
# and tests on site k + 1.  Any later fold keeps its stratified split.
cv_outer = [[tr, te] for tr, te in StratifiedKFold(
    y.ravel(), n_folds=NFOLDS_OUTER, random_state=42)]
for _fold, _site_id in enumerate((1, 2, 3, 4, 5)):
    cv_outer[_fold][0] = np.transpose(np.where(site != _site_id)).ravel()
    cv_outer[_fold][1] = np.transpose(np.where(site == _site_id)).ravel()
def handle(self, *args, **options):
    """Crawl the category pages, cross-validate the classifier, optionally save it.

    Reads two options: ``options['n']`` (number of pages fetched per
    category) and ``options['save']`` (truthy to pickle the fitted
    statistics).  Articles are labelled with the position of their category
    in the list returned by ``get_categories``; the bag-of-words matrix is
    evaluated with a seeded 5-fold stratified split and the per-fold scores
    and their mean are printed.

    When saving, the dictionary, the class priors, the per-class word
    statistics and the label -> category-name mapping are pickled next to
    ``settings.BASE_DIR``.
    """
    print(settings.BASE_DIR, 'BASE_DIR')
    print(os.path.join(settings.BASE_DIR, '../'))

    page_num = options['n']
    save_flag = options['save']

    # [Category_obj1, Category_obj2, ...]
    categories = get_categories("https://gunosy.com/")

    all_contents = []
    all_labels = []
    for label, category in enumerate(categories):
        # A separate loop name keeps the upper bound `page_num` intact.
        for page in range(1, page_num + 1):
            url = category.url + '?page=%d' % page
            print(url)
            links, contents = get_links_and_contents(url)
            all_contents.extend(contents)
            # NOTE(review): labels are counted from `links` while the
            # documents come from `contents`; this assumes both have the
            # same length.
            all_labels.extend([label] * len(links))

    res = get_words_matrix(all_contents)
    dictionary = corpora.Dictionary(res)
    dictionary.filter_extremes(no_below=10, no_above=0.2)

    # bows: [[(w_id1, w_id1_cnt), (w_id2, w_id2_cnt), ...], ...]
    bows = [dictionary.doc2bow(x) for x in res]
    X = np.array([(matutils.corpus2dense([vec],
                                         num_terms=len(dictionary)).T[0])
                  for vec in bows])
    y = np.array(all_labels)

    # cross validation
    skf = StratifiedKFold(y, n_folds=5, shuffle=True, random_state=7654)
    scores = []
    for train_index, test_index in skf:
        train_X, test_X = X[train_index], X[test_index]
        train_y, test_y = y[train_index], y[test_index]
        scores.append(evaluate_model(train_X, test_X, train_y, test_y))
    scores = np.array(scores)
    print(scores)
    print('結果', np.mean(scores))

    if save_flag:
        def save_as_pickle(file_path, obj):
            with open(file_path, 'wb') as f:
                pickle.dump(obj, f)

        # Save everything as pickles.
        cat_prob = calc_cat(y)
        word_cat_prob = calc_each_word_bar_cat(X, y)
        # dict that converts a label y into its category name
        cvt_y_to_category_name = {
            label: category.name
            for label, category in enumerate(categories)
        }

        dir_path = os.path.join(settings.BASE_DIR, '../')
        # The file names (including their spelling) are kept unchanged
        # because other code may load them by name.
        save_as_pickle(dir_path + 'dictionay.dump', dictionary)
        save_as_pickle(dir_path + 'cat_prob.dump', cat_prob)
        save_as_pickle(dir_path + 'word_cat_prob.dump', word_cat_prob)
        save_as_pickle(dir_path + 'cvt_y_to_category_name.dump',
                       cvt_y_to_category_name)
        print('------ Save completed. ------')
    else:
        print('------ not saved. -------')
def lstm_predict():
    """Produce out-of-fold LSTM probabilities for stacking and save them.

    Train and test texts are tokenized on spaces, tokens seen fewer than 20
    times are dropped, and the remaining vocabulary is mapped to rows of a
    frozen embedding matrix filled from the Word2Vec model at
    ``../feature/predict/100w2vModel.m``.  Sequences are post-padded to
    ``config['max_length']``.

    A fresh network is built for every one of the three stratified folds, so
    the prediction stored for a validation row never comes from a model that
    was trained on that row.  Test predictions are averaged over the folds.
    The stacked train+test probabilities are written to
    ``../feature/predict/lstm_prob2.csv`` in a single ``lstm`` column.
    """
    dim = 100
    n_folds = 3

    # preprocess for lstm
    print('preprocess data...')
    x_train, y_train = load_data()
    x_test, _test_id = load_testdata()
    x_train = preprocess_data(x_train)
    x_test = preprocess_data(x_test)
    n_train = len(x_train)

    data = pd.concat([x_train, x_test], axis=0).astype(str)
    texts = [document.split(' ') for document in data.values]

    # Drop rare tokens.
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] >= 20]
             for text in texts]

    vocab = set(word for doc in texts for word in doc)
    vocab_size = len(vocab)

    print('generate embedding_matrix...')
    # Index 0 is reserved for padding: with mask_zero=True the Embedding
    # layer treats 0 as "no token", so real words start at 1 and the matrix
    # needs vocab_size + 1 rows.
    embedding_matrix = np.zeros((vocab_size + 1, dim))
    word2index = {}
    w2v = Word2Vec.load('../feature/predict/100w2vModel.m')
    for i, word in enumerate(vocab, start=1):
        word2index[word] = i
        # the model is self-trained, so every kept word is looked up directly
        embedding_matrix[i] = w2v[word]

    print('generate encoded_texts...')
    encoded_texts = [[word2index[word] for word in doc] for doc in texts]

    max_length = config['max_length']
    print('generate padded_texts...')
    padded_texts = pad_sequences(encoded_texts, maxlen=max_length,
                                 padding='post')
    x_train = padded_texts[:n_train]
    x_test = padded_texts[n_train:]
    print(len(x_train), len(x_test), len(y_train))

    def build_model():
        """Return a freshly initialised, compiled LSTM classifier."""
        print('Construct lstm model...')
        net = Sequential()
        net.add(Embedding(input_dim=vocab_size + 1,
                          output_dim=dim,
                          mask_zero=True,
                          weights=[embedding_matrix],
                          input_length=max_length,
                          trainable=False))
        net.add(LSTM(units=50, activation='sigmoid',
                     recurrent_activation='hard_sigmoid'))
        net.add(Dropout(0.5))
        net.add(Dense(1))
        net.add(Activation('sigmoid'))
        print('Compiling the Model...')
        net.compile(loss='binary_crossentropy', optimizer='adam',
                    metrics=['accuracy'])
        return net

    print("Train...")
    skf = StratifiedKFold(y_train, n_folds=n_folds, shuffle=True)
    new_train = np.zeros((len(x_train), 1))
    new_test = np.zeros((len(x_test), 1))
    for i, (trainid, valid) in enumerate(skf):
        print('fold' + str(i))
        # A new model per fold keeps the out-of-fold predictions clean.
        model = build_model()
        model.fit(x_train[trainid], y_train[trainid],
                  batch_size=config['batch_size'],
                  epochs=config['n_epoch'],
                  verbose=1)
        new_train[valid] = model.predict_proba(x_train[valid])
        new_test += model.predict_proba(x_test)
    new_test /= n_folds

    stack = np.vstack([new_train, new_test])
    clf_stacks = pd.DataFrame(data=stack, columns=['lstm'])
    clf_stacks.to_csv('../feature/predict/lstm_prob2.csv', index=0)
from utility import formatAndPrintMetrics, cross_entropy
from sklearn.metrics import (confusion_matrix,
                             precision_recall_fscore_support,
                             accuracy_score)
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing

fullTrainFile = 'C:/myD/workarea/KaggleWallmartWorkarea/kaggle_wallmart/data_CSV/train_svm_light.v2.new.txt'
X, Y = get_data(fullTrainFile)

# Encode the class labels as consecutive integers.
le = preprocessing.LabelEncoder()
Y = le.fit(Y).transform(Y)

# Only the first of the three stratified folds is used as train/test split.
skf = StratifiedKFold(Y, n_folds=3, random_state=app_random_state_value)
skfList = list(skf)
train_index, test_index = skfList[0]

XD = X  # .todense()
xTr, yTr = XD[train_index], Y[train_index]
xTe, yTe = XD[test_index], Y[test_index]

# Candidate classifiers tried in turn; each assignment replaces the previous
# one, so only the random forest at the end stays bound to `clf`.
clf = MultinomialNB()
clf = SGDClassifier(loss="hinge", penalty="l2")
clf = GradientBoostingClassifier(n_estimators=200,
                                 learning_rate=0.8,
                                 max_depth=15,
                                 subsample=0.9,
                                 verbose=5,
                                 random_state=app_random_state_value)
clf = GaussianNB()
#yhTeGNB = yhTe
#yLogPGNB = yLogP
clf = RandomForestClassifier(n_estimators=500,
                             verbose=1,
                             n_jobs=4,
                             random_state=app_random_state_value)

#### Temp code to experiment on single class classification (binary 1/0):
#### encoded class 37 against everything else.
yTrMod = [int(label == 37) for label in yTr]
yTeMod = [int(label == 37) for label in yTe]
dataset = pd.read_json("../data/preprocessed.json")
dataset = dataset.reset_index(drop=True)


def in_arange(s, e, step):
    """Return ``np.arange(s, e, step)`` with the end point ``e`` included once.

    With a floating-point step ``np.arange`` can already produce a last
    value equal to ``e`` (e.g. ``np.arange(0.7, 1.0, 0.1)`` has four
    elements).  Appending ``e`` unconditionally would then duplicate the end
    point, so in that case the last value is snapped to ``e`` instead.
    """
    values = np.arange(s, e, step)
    # "Equal" is judged relative to the step size so that small steps are
    # not mistaken for rounding noise.
    if values.size and np.isclose(values[-1], e, rtol=0,
                                  atol=abs(step) * 1e-6):
        values[-1] = e
        return values
    return np.append(values, e)


#################################################
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold

# Unshuffled 10-fold split stratified on the target column.
skf = StratifiedKFold(dataset["highest_reaction"], n_folds=10)

# Earlier parameter grids, kept for reference:
# param_grid = {'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
#               'vect__min_df': in_arange(1, 30, 1),
#               'vect__max_df': in_arange(0.7, 1.0, 0.1),
#               'tfidf__use_idf': [True],
#               'clf__C': in_arange(0.1, 2.0, 0.3),
#               }
# param_grid = {'vect__ngram_range': [(1, 2)],
#               'vect__min_df': in_arange(1, 20, 1),
#               'vect__max_df': in_arange(0.01, 0.5, 0.1),
#               'tfidf__use_idf': [True],
#               'clf__C': in_arange(0.1, 2.0, 0.1),
#               }
print(data.shape)

outcome_var = 'ckd'
predictor_var = [c for c in columns if c not in ["ckd"]]
X = data[predictor_var]
y = data[outcome_var]
print(X)
print(y)

# Create the RFE object and compute a cross-validated score.
#svc = SVC(kernel="linear")
from sklearn import linear_model
model = linear_model.LogisticRegression(fit_intercept=True, multi_class="ovr")

# The "accuracy" scoring is proportional to the number of correct
# classifications
rfecv = RFECV(estimator=model,
              step=1,
              cv=StratifiedKFold(y, 2),
              scoring='accuracy')
rfecv.fit(X, y)

print('accuracy scoring: %s' % rfecv.scoring)
# ranking_ holds one rank per feature, so it is formatted with %s; %d would
# raise a TypeError for anything but a single number.
print("Ranking of the features : %s" % (rfecv.ranking_,))
print("Optimal number of features : %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
if np.isinf(x) or x > y_max: x = y_max x_row.append(x) X.append(x_row) y = [] for y_true in fs_list: if y_true >= TH: y.append(1) else: y.append(0) X = np.asarray(X) y = np.asarray(y) kf = StratifiedKFold(y,n_folds=10,random_state=0,shuffle=True) cm = np.zeros((2,2)) for tr_ind,ts_ind in kf: if clf == 'SVM': lr = SVC(cache_size=20000,kernel='rbf',C=1,probability=True) if clf == 'LR': lr = LogisticRegression(C=0.0001,n_jobs=-1) if clf == 'RF': lr = RandomForestClassifier(n_estimators=100,criterion='gini',max_features='sqrt',n_jobs=-1) lr.fit(X[tr_ind],y[tr_ind]) y_pred = lr.predict(X[ts_ind]) y_true = y[ts_ind] cm_one_fold = confusion_matrix(y_true,y_pred) if cm_one_fold.shape == (2,2):
'num_var22_ult3', # 0.03452566096423017 'saldo_medio_var5_hace3', # 0.04074650077760498 'saldo_medio_var5_hace2', # 0.04292379471228616 'SumZeros', # 0.04696734059097978 'saldo_var30', # 0.09611197511664074 'var38', # 0.1390357698289269 'var15' ] # 0.20964230171073095 features = train.columns[1:-1] todrop = list(set(tokeep).difference(set(features))) train.drop(todrop, inplace=True, axis=1) test.drop(todrop, inplace=True, axis=1) features = train.columns[1:-1] split = 10 skf = StratifiedKFold(train.TARGET.values, n_folds=split, shuffle=False, random_state=42) train_preds = None test_preds = None visibletrain = blindtrain = train index = 0 print('Change num_rounds to 350') num_rounds = 350 params = {} params["objective"] = "binary:logistic" params["eta"] = 0.03 params["subsample"] = 0.8 params["colsample_bytree"] = 0.7 params["silent"] = 1 params["max_depth"] = 5
def init():
    """Prepare the working directory and write the leave-one-site-out config.

    Copies the data files into ``WD``, builds four outer folds in which fold
    k tests on site k + 1 and trains on the other sites, keeps only the first
    10% of each outer training set, adds ``NFOLDS_INNER`` nested stratified
    folds inside every outer training set, and dumps the resulting
    configuration to ``config_cv_largerange.json``.  Finally the cluster job
    files are generated.

    The hard-coded sizes in the assertions are sanity checks for the specific
    data set this script was written for.
    """
    import collections

    os.makedirs(WD, exist_ok=True)
    shutil.copy(os.path.join(DATA_PATH, 'X.npy'), WD)
    shutil.copy(os.path.join(DATA_PATH, 'y.npy'), WD)

    # VBM
    if DATA_TYPE == "image":
        shutil.copy(os.path.join(DATA_PATH, 'mask.nii'), WD)
    elif DATA_TYPE == "mesh":
        shutil.copy(os.path.join(DATA_PATH, 'mask.npy'), WD)
        shutil.copy(os.path.join(DATA_PATH, 'lrh.pial.gii'), WD)
    shutil.copy(os.path.join(DATA_PATH, "Atv.npz"), WD)

    site = np.load(
        "/neurospin/brainomics/2016_schizConnect/analysis/all_studies+VIP/VBM/all_subjects/data/site.npy"
    )

    ## Create config file
    os.chdir(WD)
    y = np.load("y.npy")

    # The stratified folds only provide the list structure; the first four
    # entries are replaced by leave-one-site-out splits below.
    cv_outer = [[tr, te] for tr, te in StratifiedKFold(
        y.ravel(), n_folds=NFOLDS_OUTER, random_state=42)]

    # (site id, expected training size, expected size after subsampling)
    # CV00 tests on COBRE, CV01 on NMORPHch, CV02 on NUSDAST, CV03 on VIP.
    site_folds = [
        (1, 442, 44),
        (2, 526, 53),
        (3, 336, 34),
        (4, 514, 51),
    ]
    for fold, (site_id, n_train, n_subsampled) in enumerate(site_folds):
        cv_outer[fold][0] = np.transpose(np.where(site != site_id)).ravel()
        cv_outer[fold][1] = np.transpose(np.where(site == site_id)).ravel()
        assert len(cv_outer[fold][0]) == n_train
        # Keep only the first 10% of the training indices.
        keep = int(np.around(len(cv_outer[fold][0]) * 0.1))
        cv_outer[fold][0] = cv_outer[fold][0][:keep]
        assert len(cv_outer[fold][0]) == n_subsampled

    cv = collections.OrderedDict()
    for cv_outer_i, (tr_val, te) in enumerate(cv_outer):
        cv["cv%02d/all" % (cv_outer_i)] = [tr_val, te]
        cv_inner = StratifiedKFold(y[tr_val].ravel(),
                                   n_folds=NFOLDS_INNER,
                                   random_state=42)
        for cv_inner_i, (tr, val) in enumerate(cv_inner):
            # Inner indices are relative to tr_val; map them back.
            cv["cv%02d/cvnested%02d" % ((cv_outer_i), cv_inner_i)] = [
                tr_val[tr], tr_val[val]
            ]
    # JSON cannot serialize arrays: convert every index set to a list.
    for k in cv:
        cv[k] = [cv[k][0].tolist(), cv[k][1].tolist()]

    C_range = [[100], [10], [1], [1e-1], [1e-2], [1e-3], [1e-4], [1e-5],
               [1e-6], [1e-7], [1e-8], [1e-9]]

    config = dict(data=dict(X="X.npy", y="y.npy"),
                  params=C_range,
                  resample=cv,
                  structure_linear_operator_tv="Atv.npz",
                  map_output="results",
                  user_func=user_func_filename)
    # The context manager guarantees the config is flushed and closed.
    with open(os.path.join(WD, "config_cv_largerange.json"), "w") as config_file:
        json.dump(config, config_file)

    # Build utils files: sync (push/pull) and PBS
    import brainomics.cluster_gabriel as clust_utils
    cmd = "mapreduce.py --map %s/config_cv_largerange.json" % WD_CLUSTER
    clust_utils.gabriel_make_qsub_job_files(WD,
                                            cmd,
                                            walltime="250:00:00",
                                            suffix="_cv_largerange",
                                            freecores=2)
# count += 1 # for i in negative_sample_index: # data_set[count][0] = whole_negative_index[i][0] # data_set[count][1] = whole_negative_index[i][1] # data_set[count][2] = 0 # count += 1 data_set = np.load('dataset/' + opts.r + '_train.npy') if opts.t == 'unique': pass else: test_auc_fold = [] test_aupr_fold = [] rs = np.random.randint(0, 1000, 1)[0] kf = StratifiedKFold(data_set[:, 2], n_folds=10, shuffle=True, random_state=rs) for train_index, test_index in kf: DTItrain, DTItest = data_set[train_index], data_set[test_index] DTItrain, DTIvalid = train_test_split(DTItrain, test_size=0.05, random_state=rs) v_auc, v_aupr, t_auc, t_aupr = train_and_evaluate( DTItrain=DTItrain, DTIvalid=DTIvalid, DTItest=DTItest, graph=graph, num_steps=2000) test_auc_fold.append(t_auc)
predict_data = pd.read_csv("../data/predict_data.csv")

########## tune the para ##########
# learning_rate
lambdas = [0.0001, 0.001, 0.01, 0.1, 1]
# n_estimators
ntree_list = [50, 100, 250, 500]
# max_depth
depth = [10, 25, 50]
param_grid = dict(learning_rate=lambdas,
                  n_estimators=ntree_list,
                  max_depth=depth)

train_data, train_label = get_lb_ft(train1, "Y_midprice")

# The folds must be built from the labels that are actually fitted below,
# otherwise the fold indices do not refer to rows of train_data.
# `True` replaces the undefined name `TRUE`.
cv = StratifiedKFold(train_label,
                     n_folds=3,
                     random_state=20151204,
                     shuffle=True)
grid = GridSearchCV(GradientBoostingClassifier(),
                    param_grid=param_grid,
                    cv=cv,
                    n_jobs=-1)
grid.fit(train_data, train_label)

print(grid.grid_scores_)
print("The best parameters are %s with a score of %0.4f" %
      (grid.best_params_, grid.best_score_))

# fit the best model
clf = GradientBoostingClassifier(
    learning_rate=grid.best_params_['learning_rate'],
    n_estimators=grid.best_params_['n_estimators'],
    max_depth=grid.best_params_['max_depth'])
###### transform all the categorical variables with one hot ###### transformation and standardize the numerical variable ###### transform the target variable ############################################################## df = pd.read_csv('../data/census-income.data', header=None) X_ = dataProcess(df, catList, numList) le = LabelEncoder() y_ = le.fit_transform(df[41].values) ############################################################## ###### play with stratified K fold ############################################################## skf = StratifiedKFold(y_, n_folds=5, shuffle=True, random_state=np.random.seed(10)) for train_index_s, test_index_s in skf: print "length(train_index_s): ", len(train_index_s) print "Counter(train_index_s): ", Counter(y_[train_index_s]) raw_input("press return") ############################################################## ###### re-balanced the data ############################################################## # new_train_index = dataBalance(y_,0.01) # X = X_[new_train_index,:]
def train_predict(train_file, test_file, predict_valid_file,
                  predict_test_file, n_iter=100, hidden=4, lrate=.1,
                  n_fold=5):
    """Cross-validate the online neural network, retrain it and predict.

    For each of the ``n_fold`` stratified folds a new ``NN`` streams the
    training file ``n_iter`` times: rows belonging to the validation fold are
    only predicted (into the out-of-fold vector), all other rows are
    predicted and then used for an update.  The validation log loss of the
    last epoch of every fold is averaged and logged.

    Afterwards a network is retrained on all rows and used to predict the
    test file.  Out-of-fold and test predictions are saved with
    ``np.savetxt`` to ``predict_valid_file`` and ``predict_test_file``.
    """
    _, y_val = load_svmlight_file(train_file)

    cv = StratifiedKFold(y_val, n_folds=n_fold, shuffle=True,
                         random_state=2015)

    # Log the first epoch, the last epoch and roughly every tenth of the run.
    # max() keeps the interval at 1 or more, so runs with n_iter < 10 do not
    # divide by zero.
    log_every = max(1, n_iter // 10)

    logging.info('Cross validation...')
    p_val = np.zeros_like(y_val)
    lloss = 0.
    for i_trn, i_val in cv:
        # Membership is tested once per row and epoch; a set makes that O(1)
        # instead of a linear scan of the index array.
        val_rows = set(np.asarray(i_val).tolist())

        clf = NN(n=10000, h=hidden, a=lrate, seed=2015)

        logging.info('Epoch\tTrain\tValid')
        logging.info('=========================')
        for i_iter in range(n_iter):
            lloss_trn = 0.
            cnt_trn = 0
            for i, (x, y) in enumerate(clf.read_sparse(train_file)):
                if i in val_rows:
                    p_val[i] = clf.predict(x)
                else:
                    p = clf.predict(x)
                    clf.update(x, p - y)
                    lloss_trn += logloss(y, p)
                    cnt_trn += 1

            lloss_trn /= cnt_trn
            lloss_val = log_loss(y_val[i_val], p_val[i_val])

            if (i_iter == 0) or ((i_iter + 1) % log_every == 0) \
                    or (i_iter == n_iter - 1):
                logging.info('#{:4d}\t{:.4f}\t{:.4f}'.format(
                    i_iter + 1, lloss_trn, lloss_val))

        lloss += lloss_val

    logging.info('Log Loss = {:.4f}'.format(lloss / n_fold))

    logging.info('Retraining with 100% data...')
    clf = NN(n=10000, h=hidden, a=lrate, seed=2015)
    for i_iter in range(n_iter):
        for x, y in clf.read_sparse(train_file):
            p = clf.predict(x)
            clf.update(x, p - y)
        logging.info('#{:4d}'.format(i_iter + 1))

    _, y_tst = load_svmlight_file(test_file)
    p_tst = np.zeros_like(y_tst)
    for i, (x, _) in enumerate(clf.read_sparse(test_file)):
        p_tst[i] = clf.predict(x)

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
if not os.path.exists(WD):
    os.makedirs(WD)
os.chdir(WD)

#############################################################################
## Create config file
y = np.load(INPUT_DATA_y)

# Reuse the folds of an existing configuration so that a re-run evaluates on
# the same splits; otherwise create five stratified folds.
if os.path.exists("config.json"):
    # The context manager closes the file even if json.load raises.
    with open("config.json", "r") as inf:
        old_conf = json.load(inf)
    cv = old_conf["resample"]
else:
    cv = [[tr.tolist(), te.tolist()]
          for tr, te in StratifiedKFold(y.ravel(), n_folds=5)]
if cv[0] is not None:  # Make sure first fold is None
    cv.insert(0, None)

# parameters grid
# Re-run with
tv_range = np.hstack([np.arange(0, 1., .1), [0.05, 0.01, 0.005, 0.001]])
# Each row is an (l1, l2, tv) ratio triple.
ratios = np.array([[1., 0., 1], [0., 1., 1], [.5, .5, 1], [.9, .1, 1],
                   [.1, .9, 1], [.01, .99, 1], [.001, .999, 1]])
alphas = [.01, .05, .1, .5, 1.]
k_range = [100, 1000, 10000, 100000, -1]
# For every tv value the l1 and l2 ratios are scaled by (1 - tv) and the
# third column becomes tv itself.
l1l2tv = [
    np.array([[float(1 - tv), float(1 - tv), tv]]) * ratios
    for tv in tv_range
]
# Pure TV penalty.
l1l2tv.append(np.array([[0., 0., 1.]]))
l1l2tv = np.concatenate(l1l2tv)
y = np.array(y_list) h5f = h5py.File('/data/MIMIC/Xy_seq' + str(sequence_length) + '.h5', 'w') h5f.create_dataset('X', data=X) h5f.create_dataset('y', data=y) h5f.close() else: h5f = h5py.File('/data/MIMIC/Xy_seq' + str(sequence_length) + '.h5', 'r') X = h5f['X'][:] y = h5f['y'][:] print('Train model') cv = StratifiedKFold(y, n_folds=5, random_state=123) roc_auc = {'lstm': []} config = tf.ConfigProto() config.gpu_options.allow_growth = True with tf.Session(config=config) as sess: for j, (train, test) in enumerate(cv): roc_auc['lstm'].append( lstm_fit_predict(X[train], y[train], X[test], y[test], roc_auc)) print('Cross fold: ', j, roc_auc) pkl.dump( roc_auc, open( '/data/MIMIC/lstm_encounter_scores_' + str(sequence_length) +
from util import plot_tree NUM_FOLDS = 10 PROP_VALIDATION = 0.1 #10% NUM_TREES = 500 NUM_PARAMETERS = 10 # Withhold some proportion of the data set for validation later (stratify) sss = StratifiedShuffleSplit(target, n_iter=1, test_size=PROP_VALIDATION) for train_index, test_index in sss: X_train, X_test = data.iloc[train_index], data.iloc[test_index] y_train, y_test = target[train_index], target[test_index] # Initialise folds on remaining data and store indexes for use later kf = StratifiedKFold(y_train, n_folds=NUM_FOLDS) fold_indexes = [] for train_index, test_index in kf: fold_indexes.append({"train_index": train_index, "test_index": test_index}) # Init score storing structures importance_scores = np.zeros([NUM_FOLDS, len(all_parameters)]) importance_scores_stdev = np.zeros([NUM_FOLDS, len(all_parameters)]) tree_cv_scores = np.zeros(NUM_FOLDS) parameter_union = np.zeros(len(all_parameters), dtype=int) # For each fold, build a forest, skim the NUM_PARAMETERS best features as # measured by the classifier's feature_importance property, then fit and score a # single decision tree on that feature set. for n_fold, indexer in enumerate(fold_indexes): print "\n[FRST] Constructing Forest#%d" % (n_fold + 1)
# Column of every k-mer, built once. `kmers.index(...)` is a linear scan of
# the k-mer list and was executed for every window of every sequence; the
# dict gives the same answer (first occurrence wins) in O(1).
kmer_column = {}
for column, kmer in enumerate(kmers):
    kmer_column.setdefault(kmer, column)

# Fill one row of k-mer counts per sequence, then normalise the row so it
# sums to 1 (relative k-mer frequencies).
count = 0
for seq in nseqs:
    seq = str(seq)
    for i in range(len(seq) - k + 1):
        window = tuple(seq[i:i + k])
        idx = kmer_column.get(window)
        if idx is None:
            # Same exception type list.index() raised for an unknown k-mer.
            raise ValueError("%r is not in list" % (window,))
        kmers_d[count, idx] += 1
    kmers_d[count, :] = np.divide(kmers_d[count, :], kmers_d[count, :].sum())
    count += 1

#y = np_utils.to_categorical(y, max(y)+1)
# Out-of-fold predictions of the two SVMs, one entry per sample.
test_predicted1 = np.zeros((labels.shape[0], ))
test_predicted2 = np.zeros((labels.shape[0], ))
i = 0
# For each one of the 10 folds, fit both classifiers and predict the
# held-out samples.
for train_idx, test_idx in StratifiedKFold(y, 10, True):
    i += 1
    print("fold " + str(i))
    #train_idx = get_balanced_classes(train_idx, np.argmax(y, axis=1))
    X_train, X_test = kmers_d[train_idx, :], kmers_d[test_idx, :]
    y_train, y_test = y[train_idx], y[test_idx]
    #Build new network
    #model = build_lstm(4)
    model1 = SVC(kernel='linear', class_weight='balanced', C=10)
    model2 = SVC(kernel='rbf', class_weight='balanced', C=10)
    model1.fit(X_train, y_train)
    model2.fit(X_train, y_train)
    test_predicted1[test_idx] = model1.predict(X_test)
    test_predicted2[test_idx] = model2.predict(X_test)
    #print(accuracy_score(np.argmax(y_test, axis=1), test_predicted[test_idx]))
# just applying it on the test set.
scaler = Scaler()
X = scaler.fit_transform(X)

# Coarse logarithmic (base 10) grids are a sensible first search; a base-2
# grid tunes more finely but costs far more fits.
C_range = np.power(10., np.arange(-3, 8))
gamma_range = np.power(10., np.arange(-5, 4))

param_grid = {'gamma': gamma_range, 'C': C_range}

# Exhaustive search over the grid with 5-fold stratified cross-validation.
folds = StratifiedKFold(y=Y, k=5)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=folds)
grid.fit(X, Y)

print("The best classifier is: ", grid.best_estimator_)

# grid_scores_ holds parameter settings together with their scores; keep
# only element 1 of each entry and arrange the values as a
# (len(C_range), len(gamma_range)) matrix for plotting.
score_dict = grid.grid_scores_
scores = np.array([x[1] for x in score_dict]).reshape(
    len(C_range), len(gamma_range))

# Canvas for the score plot.
pl.figure(figsize=(8, 6))
def main(argv): # Define command line options p = optparse.OptionParser(description='Pyxit', prog='PyXit (PYthon piXiT)', version='PyXit 0.1') p.add_option('--dir_ls', type="string", dest="dir_ls", help="The learning set directory") p.add_option('--dir_ts', type="string", dest="dir_ts", help="The training set directory") p.add_option('--cv_k_folds', type="int", dest="cv_k_folds", help="The number of folds") p.add_option( '--cv_shuffle', default=False, action="store_true", dest="cv_shuffle", help="Whether cross-validation is performed using ShuffleSplit.") p.add_option('--cv_shuffle_test_fraction', default=0.1, type="float", dest="cv_shuffle_test_fraction", help="The proportion of data in shuffled test splits.") p.add_option('--pyxit_n_subwindows', default=10, type="int", dest="pyxit_n_subwindows", help="number of subwindows") p.add_option('--pyxit_min_size', default=0.5, type="float", dest="pyxit_min_size", help="min size") p.add_option('--pyxit_max_size', default=1.0, type="float", dest="pyxit_max_size", help="max size") p.add_option('--pyxit_target_width', default=16, type="int", dest="pyxit_target_width", help="target width") p.add_option('--pyxit_target_height', default=16, type="int", dest="pyxit_target_height", help="target height") p.add_option('--pyxit_interpolation', default=2, type="int", dest="pyxit_interpolation", help="interpolation method 1,2,3,4") p.add_option('--pyxit_transpose', default=False, action="store_true", dest="pyxit_transpose", help="transpose subwindows") p.add_option('--pyxit_colorspace', default=2, type="int", dest="pyxit_colorspace", help="colorspace 0=RGB, 1=TRGB, 2=HSV") p.add_option('--pyxit_fixed_size', default=False, action="store_true", dest="pyxit_fixed_size", help="extract fixed size subwindows") p.add_option('--pyxit_n_jobs', default=1, type="int", dest="pyxit_n_jobs", help="number of jobs") p.add_option('--pyxit_save_to', type="string", dest="pyxit_save_to", help="file to save the model into") 
p.add_option('--forest_n_estimators', default=10, type="int", dest="forest_n_estimators", help="number of base estimators (T)") p.add_option('--forest_max_features', default=1, type="int", dest="forest_max_features", help="max features at test node (k)") p.add_option('--forest_min_samples_split', default=1, type="int", dest="forest_min_samples_split", help="minimum node sample size (nmin)") p.add_option('--forest_shared_mem', default=False, action="store_true", dest="forest_shared_mem", help="shared mem") p.add_option( '--svm', default=0, dest="svm", help= "final svm classifier: 0=nosvm, 1=libsvm, 2=liblinear, 3=lr-l1, 4=lr-l2", type="int") p.add_option('--svm_c', default=1.0, type="float", dest="svm_c", help="svm C") p.add_option('--quiet', action="store_false", default=True, dest="verbose", help="Turn off verbose mode") p.add_option('--verbose', action="store_true", default=True, dest="verbose", help="Turn on verbose mode") options, arguments = p.parse_args(args=argv) # Check for errors in the options e = None if not options.dir_ls: e = "--dir_ls needs to be set." elif options.dir_ts and options.cv_k_folds: e = "--dir_ts and --cv_k_folds cannot be set at the same time." elif options.pyxit_save_to and options.cv_k_folds: e = "--pyxit_save_to and --cv_k_folds cannot be set at the time." if e: print "Error: %s" % e print "Run with -h option for help." sys.exit(1) if options.verbose: print "[pyxit.main] Options = ", options # Load data if options.verbose: print "[pyxit.main] Loading data..." X, y = build_from_dir(options.dir_ls) classes = np.unique(y) n_classes = len(classes) y_original = y y = np.searchsorted(classes, y) # Instantiate classifiers if options.verbose: print "[pyxit.main] Initializing PyxitClassifier..." 
forest = ExtraTreesClassifier( n_estimators=options.forest_n_estimators, max_features=options.forest_max_features, min_samples_split=options.forest_min_samples_split, n_jobs=options.pyxit_n_jobs, verbose=options.verbose) pyxit = PyxitClassifier(base_estimator=forest, n_subwindows=options.pyxit_n_subwindows, min_size=options.pyxit_min_size, max_size=options.pyxit_max_size, target_width=options.pyxit_target_width, target_height=options.pyxit_target_height, interpolation=options.pyxit_interpolation, transpose=options.pyxit_transpose, colorspace=options.pyxit_colorspace, fixed_size=options.pyxit_fixed_size, n_jobs=options.pyxit_n_jobs, verbose=options.verbose) if options.svm: if options.svm == SVM_LIBSVM: svm = SVC(probability=True, C=options.svm_c, kernel="linear") if options.svm == SVM_LIBLINEAR: svm = LinearSVC(C=options.svm_c) if options.svm == SVM_LRL1: svm = LogisticRegression(penalty="l1", C=options.svm_c) if options.svm == SVM_LRL2: svm = LogisticRegression(penalty="l2", C=options.svm_c) if options.svm == ET: svm = ExtraTreesClassifier( n_estimators=1000, max_features="sqrt", #max_features=1000, min_samples_split=2, n_jobs=options.pyxit_n_jobs, verbose=options.verbose) if options.svm == RF: svm = RandomForestClassifier( n_estimators=1000, #max_features=1000, max_features="sqrt", min_samples_split=2, n_jobs=options.pyxit_n_jobs, verbose=options.verbose) if options.svm == NN: svm = neighbors.KNeighborsClassifier(10) if options.verbose: print "[pyxit.main] PyxitClassifier =" print pyxit if options.svm: print "[pyxit.main] SVM =" print svm # Build and evaluate if options.dir_ls and not options.dir_ts and not options.cv_k_folds: if options.pyxit_save_to: fd = open(options.pyxit_save_to, "wb") pickle.dump(classes, fd, protocol=pickle.HIGHEST_PROTOCOL) if options.verbose: print "[pyxit.main] Fitting PyxitClassifier on %s" % options.dir_ls _X, _y = pyxit.extract_subwindows(X, y) pyxit.fit(X, y, _X=_X, _y=_y) if options.verbose: print "[pyxit.main] Saving 
PyxitClassifier into %s" % options.pyxit_save_to if options.pyxit_save_to: pickle.dump(pyxit, fd, protocol=pickle.HIGHEST_PROTOCOL) if options.svm: Xt = pyxit.transform(X, _X=_X, reset=True) if options.verbose: print "[pyxit.main] Fitting SVC on %s" % options.dir_ls svm.fit(Xt, y) if options.verbose: print "[pyxit.main] Saving SVM into %s" % options.pyxit_save_to if options.pyxit_save_to: pickle.dump(svm, fd, protocol=pickle.HIGHEST_PROTOCOL) if options.pyxit_save_to: fd.close() elif options.dir_ts: if options.pyxit_save_to: fd = open(options.pyxit_save_to, "wb") pickle.dump(classes, fd, protocol=pickle.HIGHEST_PROTOCOL) if options.verbose: print "[pyxit.main] Fitting PyxitClassifier on %s" % options.dir_ls _X, _y = pyxit.extract_subwindows(X, y) pyxit.fit(X, y, _X=_X, _y=_y) if options.pyxit_save_to: pickle.dump(pyxit, fd, protocol=pickle.HIGHEST_PROTOCOL) if options.svm: Xt = pyxit.transform(X, _X=_X, reset=True) if options.verbose: print "[pyxit.main] Fitting SVC on %s" % options.dir_ls svm.fit(Xt, y) if options.pyxit_save_to: pickle.dump(svm, fd, protocol=pickle.HIGHEST_PROTOCOL) if options.pyxit_save_to: fd.close() if options.verbose: print "[pyxit.main] Testing on %s" % options.dir_ts X_test, y_test = build_from_dir(options.dir_ts) y_test = np.searchsorted(classes, y_test) _X_test, _y_test = pyxit.extract_subwindows(X_test, y_test) y_true = y_test all_tested = np.ones(len(y_true), dtype=np.bool) if not options.svm: y_predict = pyxit.predict(X_test, _X=_X_test) y_proba = pyxit.predict_proba(X_test, _X=_X_test) else: Xt = pyxit.transform(X_test, _X=_X_test) y_predict = svm.predict(Xt) if options.svm != SVM_LIBLINEAR: y_proba = svm.predict_proba(Xt) elif options.cv_k_folds: if options.verbose: print "[pyxit.main] K-Fold cross-validation (K=%d)" % options.cv_k_folds _X, _y = pyxit.extract_subwindows(X, y) i = 1 step = 100. 
/ options.cv_k_folds y_true = y y_predict = np.empty(y_true.shape, dtype=y.dtype) y_proba = np.empty((y_true.shape[0], n_classes)) all_tested = np.zeros(len(y_true), dtype=np.bool) cm = np.zeros((n_classes, n_classes), dtype=np.int32) if not options.cv_shuffle: cv = StratifiedKFold(y_true, options.cv_k_folds) else: cv = ShuffleSplit(len(X), n_iter=options.cv_k_folds, test_size=options.cv_shuffle_test_fraction) for train, test in cv: all_tested[test] = True _train = pyxit.extend_mask(train) _test = pyxit.extend_mask(test) if options.verbose: print "[pyxit.main] Fitting PyxitClassifier on fold %d" % i pyxit.fit(X[train], y[train], _X=_X[_train], _y=_y[_train]) if options.svm: Xt = pyxit.transform(X[train], _X=_X[_train], reset=True) if options.verbose: print "[pyxit.main] Fitting SVC on fold %d" % i svm.fit(Xt, y[train]) if options.verbose: print "[pyxit.main] Testing on fold %d" % i if not options.svm: y_predict[test] = pyxit.predict(X[test], _X=_X[_test]) y_proba[test] = pyxit.predict_proba(X[test], _X=_X[_test]) else: Xt = pyxit.transform(X[test], _X=_X[_test]) y_predict[test] = np.asarray(svm.predict(Xt), dtype=y.dtype) if hasattr(svm, "predict_proba"): y_proba[test] = svm.predict_proba(Xt) print svm if options.verbose: print "[pyxit.main] Classification error on fold %d = %f" % ( i, 1.0 * np.sum(y_true[test] != y_predict[test]) / len(y_true[test])) print "[pyxit.main] Cumulated confusion matrix =" cm += confusion_matrix(y_true[test], y_predict[test]) print_cm(cm, classes) i += 1 # Output some results if "all_tested" in locals(): if options.verbose: print "---" print "[pyxit.main] Test coverage =", sum(all_tested) / ( 1.0 * len(all_tested)) print "[pyxit.main] Overall classification error = %f" % ( 1.0 * np.sum(y_true[all_tested] != y_predict[all_tested]) / len(y_true[all_tested])) print "[pyxit.main] Overall confusion matrix =" print_cm( confusion_matrix(y_true[all_tested], y_predict[all_tested]), classes) #y_true = classes.take(y_true[all_tested], axis=0) 
y_predict = classes.take(y_predict[all_tested], axis=0) y_proba = np.max(y_proba, axis=1) d = {} for i in xrange(len(X)): d[X[i]] = (int(y_predict[i]), y_proba[i]) return d
def xgbLocalCVModel(taskName, config): params = config['params'] config['task'] = taskName trainFeature, testFeature, trainLabel, trainUid, testUid = readFeature( config) if taskName == 'gender': rounds = config['roundsGender'] elif taskName == 'age': rounds = config['roundsAge'] else: rounds = config['roundsEdu'] if config['multiClass'] == True: params['num_class'] = len(np.unique(trainLabel)) print params['num_class'] else: params['scale_pos_weight'] = (float)(len( trainLabel[trainLabel == 0])) / len(trainLabel[trainLabel == 1]) print params['scale_pos_weight'] if config['prob'] == True: params['objective'] = 'multi:softprob' else: params['objective'] = 'multi:softmax' print 'CV On XGB Model....' kfold = StratifiedKFold(y=trainLabel, n_folds=config['folds'], shuffle=True, random_state=params['seed']) f = 0 predict = [] true = [] uid = [] for index1, index2 in kfold: print 'fold:' + str(f) print index1, index2 localTrainFeature = trainFeature[index1, :] localTestFeature = trainFeature[index2, :] localTrainLabel = trainLabel[index1] localTestLabel = trainLabel[index2] localTestUid = trainUid[index2] uid = np.append(uid, localTestUid) print 'Build, Train and Predict XGB Model.....' #print localTrainFeature.shape[1] localPredict = xgbLocalModel(localTrainFeature, localTestFeature, localTrainLabel, localTestLabel, params, config, rounds) if config['prob'] == True: if f == 0: predict = localPredict else: predict = np.concatenate((predict, localPredict), axis=0) else: print error(localTestLabel, localPredict) predict = np.append(predict, localPredict) true = np.append(true, localTestLabel) f += 1 if config['prob'] == True: return predict, uid else: print "Total error" + str(error(true, predict)) return predict, uid
# Command-line options: three paths and three integers.
train_file, test_file, pred_file = (
    opts[flag] for flag in ('--train', '--test', '--pred'))
epoch, cv, nfolds = (
    int(opts[flag]) for flag in ('--epoch', '--cv', '--folds'))
target_col = 'target'

# With cross-validation switched off the data is still split in two folds.
if cv == 0:
    nfolds = 2

X, y, y_coded, ids_train, scaler = load_train_data(train_file)
X_test, ids_test = load_test_data(test_file, scaler)

num_classes = len(y[0])
num_features = X.shape[1]

skf = StratifiedKFold(y_coded, nfolds, random_state=2015)

# Training ids re-ordered as the folds hand them out as validation rows.
ids_train_folds = np.empty(0)
for train_index, valid_index in skf:
    ids_train_folds = np.append(ids_train_folds, ids_train[valid_index])

#train = train.reindex(np.random.permutation(train.index))

# Booster parameters.
param = {
    'objective': 'binary:logistic',
    'eta': 0.1,
    'booster': 'gblinear',
    'max_depth': 12,
    'eval_metric': 'logloss',
    'silent': 1,
    'nthread': 6,
    'min_child_weight': 1,
}
def stackFrame(data, config, clf_List):
    """Build stacking features for every model and each of the three labels.

    For each model and each label ``trainLabel1..3`` the training rows
    receive out-of-fold class probabilities, and the test rows receive the
    fold-averaged class probabilities. The three per-label blocks are joined
    column-wise and written to
    ``../feature/stack-my/<modelName>_{train,test}_prob.csv``.

    Args:
        data: dict holding 'trainFeature', 'testFeature' and
            'trainLabel1'..'trainLabel3'.
        config: dict holding 'folds', 'modelName' and 'taskList';
            ``config['task']`` is overwritten for every label.
        clf_List: models to stack; the string 'xgb' selects
            ``xgbStackModel``, any other entry is used through
            ``fit`` / ``predict_proba`` / ``predict``.
    """
    # -- features shared by every model
    train_x = data['trainFeature']
    test_x = data['testFeature']
    n_folds = config['folds']

    for model_idx, clf in enumerate(clf_List):
        model_name = config['modelName'][model_idx]
        LogInfo("Model-" + model_name)

        # One probability block per label, joined column-wise afterwards.
        train_blocks = []
        test_blocks = []
        for label_idx in range(3):
            label_key = 'trainLabel' + str(label_idx + 1)
            LogInfo(label_key)
            labels = data[label_key]
            folds = list(StratifiedKFold(labels, n_folds))
            config['task'] = config['taskList'][label_idx]

            n_classes = len(np.unique(labels))
            oof_proba = np.zeros((train_x.shape[0], n_classes))
            test_proba_sum = np.zeros((test_x.shape[0], n_classes))

            for fold_idx, (fit_rows, held_rows) in enumerate(folds):
                LogInfo("Fold-" + str(fold_idx))
                fit_x, fit_y = train_x[fit_rows], labels[fit_rows]
                held_x, held_y = train_x[held_rows], labels[held_rows]

                if clf == 'xgb':
                    held_proba, test_proba = xgbStackModel(
                        fit_x, held_x, fit_y, held_y, test_x, config)
                else:
                    clf.fit(fit_x, fit_y)
                    held_proba = clf.predict_proba(held_x)
                    held_pred = clf.predict(held_x)
                    test_proba = clf.predict_proba(test_x)
                    evalerror(held_pred, held_y)

                test_proba_sum += test_proba
                oof_proba[held_rows, :] = held_proba

            train_blocks.append(oof_proba)
            # Average the test predictions over the folds.
            test_blocks.append(test_proba_sum / n_folds)

        stacked_train = np.concatenate(train_blocks, axis=1)
        stacked_test = np.concatenate(test_blocks, axis=1)

        train = pd.DataFrame(
            stacked_train,
            columns=getColName(stacked_train.shape[1], model_name))
        test = pd.DataFrame(
            stacked_test,
            columns=getColName(stacked_test.shape[1], model_name))

        train.to_csv('../feature/stack-my/' + model_name + '_train_prob.csv',
                     index=False)
        test.to_csv('../feature/stack-my/' + model_name + '_test_prob.csv',
                    index=False)
dtype=np.float32, usecols=np.concatenate([[ 0 ], important_indices[important_indices >= n_date_features] + 1 - 1156])).values ], axis=1) y = pd.read_csv("../input/train_numeric.csv", index_col=0, dtype=np.float32, usecols=[0, 969]).values.ravel() # In[ ]: clf = XGBClassifier(max_depth=5, base_score=0.005) cv = StratifiedKFold(y, n_folds=3) preds = np.ones(y.shape[0]) for i, (train, test) in enumerate(cv): preds[test] = clf.fit(X[train], y[train]).predict_proba(X[test])[:, 1] print("fold {}, ROC AUC: {:.3f}".format( i, roc_auc_score(y[test], preds[test]))) print(roc_auc_score(y, preds)) # In[ ]: # pick the best threshold out-of-fold thresholds = np.linspace(0.01, 0.99, 50) mcc = np.array([matthews_corrcoef(y, preds > thr) for thr in thresholds]) plt.plot(thresholds, mcc) best_threshold = thresholds[mcc.argmax()] print(mcc.max())
test.drop(labels = ["v22",'v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81','v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1, inplace = True) train = np.asarray(train, dtype=np.float32) labels = labels.ravel() X = train ; y = labels; X_submission = test; n_folds = 2 skf = list(StratifiedKFold(y, n_folds)) # BLEND 1 clfs = [ RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'), RandomForestClassifier(n_estimators=80, max_features = "auto" ,min_samples_split = 30 , n_jobs=-1, criterion='entropy'), RandomForestClassifier(n_estimators=150, max_features = 80 ,min_samples_split = 50 , n_jobs=-1, criterion='entropy'), RandomForestClassifier(n_estimators=50, max_features = "auto" ,min_samples_split = 70 , n_jobs=-1, criterion='entropy'), ExtraTreesClassifier(n_estimators=120, n_jobs=-1, max_depth=50 , max_features = 60 , min_samples_leaf=40 , criterion='gini'), ExtraTreesClassifier(n_estimators=150, n_jobs=-1, max_depth=100 , max_features = 80 , min_samples_leaf=40 , criterion='entropy'), ExtraTreesClassifier(n_estimators=100, n_jobs=-1, max_depth=100 , max_features = 30 , min_samples_leaf=30 , criterion='entropy'), ExtraTreesClassifier(n_estimators=150, n_jobs=-1, max_depth=120 , max_features = "auto" , min_samples_leaf=20 , criterion='entropy') ] print "Creating train and test sets for blending."
testing_reduced.shape #Hyperparameters tuning run_gs = False if run_gs: parameter_grid = { 'max_depth': [4, 6, 8], 'n_estimators': [50, 10], 'max_features': ['sqrt', 'auto', 'log2'], 'min_samples_split': [1, 3, 10], 'min_samples_leaf': [1, 3, 10], 'bootstrap': [True, False], } forest = RandomForestClassifier() cross_validation = StratifiedKFold(targets, n_folds=5) grid_search = GridSearchCV(forest, scoring='accuracy', param_grid=parameter_grid, cv=cross_validation) grid_search.fit(training, targets) model = grid_search parameters = grid_search.best_params_ print('Best score: {}'.format(grid_search.best_score_)) print('Best parameters: {}'.format(grid_search.best_params_)) else: parameters = { 'bootstrap': False,
# Gradient boosting with 200 estimators, scored by the evaluate_model helper
# (defined outside this excerpt).
gbc=GradientBoostingClassifier(n_estimators=200)
evaluate_model(gbc)
# NOTE(review): the bare string below looks like output pasted from a run of
# the call above; it is a no-op expression statement. Confirm what each
# number means against evaluate_model.
''' 0.971107544141252 0.8097014925373134 [0.65921788 0.82681564 0.81460674 0.8258427 0.85310734] 0.797066906117377 '''

# Recursive feature elimination with cross-validation around a 100-tree
# random forest: one feature is removed per step, and candidates are scored
# by accuracy on a 2-fold stratified split of train_y.
single_model = RandomForestClassifier(n_estimators=100)
# model = GradientBoostingClassifier()
# model = LogisticRegression(C=0.5, penalty='l2', tol=1e-9)
rfecv = RFECV( estimator = single_model , step = 1 , cv = StratifiedKFold( train_y , 2 ) , scoring = 'accuracy' )
evaluate_model(rfecv)
# NOTE(review): presumably the recorded output of the RFECV evaluation above;
# confirm.
''' 0.9983948635634029 0.7910447761194029 [0.7877095 0.79888268 0.84269663 0.80898876 0.83050847] 0.8137572093211297 '''

# Hard-voting (majority) ensemble over three classifiers; `lr` and `rfc` are
# defined outside this excerpt.
from sklearn.ensemble import VotingClassifier
voc = VotingClassifier([('lr', lr), ('rf', rfc), ('gbc', gbc)], voting='hard')
evaluate_model(voc)