def run(input_train, input_test, output_name):
    """
    Train a ridge classifier and write test item IDs ranked by score.

    The model is fitted on one-hot encoded ``subcategory`` values stacked
    with unigram/bigram counts of ``description``. Test items are then
    written to ``output_name`` (one ID per line under an ``id`` header),
    ordered from highest to lowest classifier score, for Kaggle submission.

    -------
    input_train : 'full path of the training file'
        Read with ``pd.read_table``; must provide the columns
        ``subcategory``, ``description`` and ``is_blocked``.
    input_test : 'full path of the testing file'
        Read with ``pd.read_table``; must provide the columns
        ``subcategory``, ``description`` and ``itemid``.
    output_name : 'full path of the output file'
    """
    data = pd.read_table(input_train)
    test = pd.read_table(input_test)
    testItemIds = test.itemid
    response = data.is_blocked

    dummies = sparse.csc_matrix(pd.get_dummies(data.subcategory))
    pretestdummies = pd.get_dummies(test.subcategory)
    # NOTE(review): these two subcategories are dropped by name, presumably
    # because they occur only in the test file and would otherwise make the
    # test feature matrix wider than the training one -- confirm against data.
    testdummies = sparse.csc_matrix(
        pretestdummies.drop(['Растения', 'Товары для компьютера'], axis=1))
    words = np.array(data.description, str)
    testwords = np.array(test.description, str)
    # Free the raw frames before building the (large) n-gram matrices.
    del data, test

    vect = text.CountVectorizer(decode_error=u'ignore',
                                strip_accents='unicode',
                                ngram_range=(1, 2))
    # The vocabulary is built from train and test text together.
    corpus = np.concatenate((words, testwords))
    vect.fit(corpus)
    counts = vect.transform(words)
    features = sparse.hstack((dummies, counts))

    clf = RidgeClassifier()
    clf.fit(features, response)

    testcounts = vect.transform(testwords)
    testFeatures = sparse.hstack((testdummies, testcounts))

    # RidgeClassifier does not implement predict_proba; the signed
    # decision_function value is what ranks items by confidence.
    predicted_scores = clf.decision_function(testFeatures)
    if predicted_scores.ndim == 2:
        # More than two classes: keep the score of the class at index 1,
        # the column the original probability lookup selected.
        predicted_scores = predicted_scores[:, 1]

    # The context manager closes the file even if a write fails.
    with open(output_name, 'w') as f:
        f.write("id\n")
        for pred_score, item_id in sorted(zip(predicted_scores, testItemIds),
                                          reverse=True):
            f.write("%d\n" % (item_id))
dev_accuracy = sum( (predictions == y_test).astype(int)) / predictions.shape[0] else: temp_classifier = classifier x_train, x_test, y_train, y_test = train_test_split(X_train_all, Y_train_all, test_size=0.1) temp_classifier.fit(x_train, y_train) dev_accuracy = temp_classifier.score(x_test, y_test) predictions = temp_classifier.predict(x_test) # if ridge_option: # print(temp_classifier.decision_function(x_test)) try: y_proba[i] = classifier.predict_proba(x_test) except: scores = classifier.decision_function(x_test) y_proba[i] = scores / (1 + scores) y_mistake[i] = np.mean( y_proba[i][y_proba[i].argmax(axis=1) != y_test].max(axis=1)) testArray = np.array([ np.mean(y_proba[i][y_test == j][y_proba[i][y_test == j].argmax( axis=1) != j].max(axis=1)) for j in range(4) ]) y_mistake_perClass[i] = testArray confusion_matrices[i] = confusion_matrix(y_test, predictions) print('Fold N°', str(i)) print('SCORE : ', dev_accuracy)
# Rows from position 3,000,000 onward are held out as the test portion.
test = df_prob_age.iloc[3000000:]

age_model_lst = []   # fitted model of every fold
age_score_lst = []   # validation accuracy of every fold
# Out-of-fold class probabilities; only filled when the model can produce them.
age_oof = np.zeros((3000000, 10), dtype='float32')

for count, (train_idx, valid_idx) in enumerate(folds):
    X_train = train_val.iloc[train_idx].values
    X_val = train_val.iloc[valid_idx].values
    y_train = y_age[train_idx]
    y_val = y_age[valid_idx]

    print(f'Training Fold {count}...')
    model = RidgeClassifier(alpha=0.5)
    # Alternative estimator tried by the author:
    # model = LogisticRegression(n_jobs=30)
    model.fit(X_train, y_train)

    try:
        val_pred_prob = model.predict_proba(X_val)
    except AttributeError:
        # Estimators without predict_proba (RidgeClassifier is one) fall back
        # to hard predictions; the age_oof rows of this fold stay at zero.
        print(model.coef_)
        val_pred = model.predict(X_val)
    else:
        age_oof[valid_idx] = val_pred_prob
        # argmax yields column positions, used here in place of
        # model.predict(X_val).
        val_pred = np.argmax(val_pred_prob, axis=1)

    acc = accuracy_score(y_val, val_pred)
    print(f"Fold-{count}: Acc =", acc)
    age_score_lst.append(acc)
    age_model_lst.append(model)

print(np.mean(age_score_lst))

age_pred = pd.DataFrame()
age_pred['user_id'] = testdata['user_id'].copy()
# Cross-validated ROC AUC of the model on the training features.
score = cross_val_score(model, X_values, y_train, cv=kfold, n_jobs=1,
                        scoring='roc_auc', verbose=0)
print('score {:.4}'.format(score.mean()))

# Results noted by the author for different column-selection conditions
# (presumably CV score followed by test ROC AUC -- confirm):
# score 0.7853 roc auc: 0.783  col_name.startswith('number') or col_name.startswith('dt') or col_name.startswith('onehot')
# score 0.781  roc auc: 0.7787 if col_name.startswith('number')
# score 0.7832 roc auc: 0.7809 col_name.startswith('number') or col_name.startswith('onehot')
# score 0.7854 roc auc: 0.7831 col_name.startswith('number') or col_name.startswith('dt')

# Test data.
result = df_test_d[['target']].copy()
# A preceding predict_proba call was removed: its result was overwritten by
# this assignment, and it raises on estimators that lack predict_proba.
result['prediction'] = model._predict_proba_lr(X_test)[:, 1]
metric = roc_auc_score(result['target'], result['prediction'])
print('roc auc: {:.4}'.format(metric))

# Training data.
result = df_X_d[['target']].copy()
result['prediction'] = model._predict_proba_lr(X_values)[:, 1]
metric = roc_auc_score(result['target'], result['prediction'])
print('roc auc: {:.4}'.format(metric))

# Values noted by the author:
# roc auc: 0.7132
# roc auc: 0.9765
def ridge(X_train, y_train, X_test, params):
    """
    Fit a RidgeClassifier and return a score in (0, 1) for each test row.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``RidgeClassifier.fit``.
    X_test
        Features to score.
    params
        Indexable; ``params[0]`` is converted to ``float`` and used as the
        regularisation strength ``alpha``.

    Returns
    -------
    One value per row of ``X_test`` for the class at index 1: the logistic
    transform of the decision function (row-normalised when there are more
    than two classes). These are not calibrated probabilities.
    """
    model = RidgeClassifier(alpha=float(params[0])).fit(X_train, y_train)
    # RidgeClassifier does not implement predict_proba, so squash the signed
    # decision values into (0, 1) with a logistic sigmoid instead.
    scores = model.decision_function(X_test)
    proba = 1.0 / (1.0 + np.exp(-scores))
    if proba.ndim == 1:
        # Binary problem: the decision value already refers to classes_[1].
        return proba
    # Multiclass: one column per class; normalise rows, keep column 1.
    proba /= proba.sum(axis=1, keepdims=True)
    return proba[:, 1]