#!/usr/bin/env python
import numpy as np
from sklearn import datasets
from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC

from titanic import answer

if __name__ == '__main__':
    newsgroups = datasets.fetch_20newsgroups(
        subset='all',
        categories=['alt.atheism', 'sci.space']
    )
    y = newsgroups.target
    vectorizer = TfidfVectorizer()
    tf_idf_features = vectorizer.fit_transform(newsgroups.data)
    feature_mapping = vectorizer.get_feature_names()
    # Search C over 10^-5 .. 10^5.
    grid = {'C': np.power(10.0, np.arange(-5, 6))}
    cv = KFold(y.size, n_folds=5, random_state=241)
    clf = SVC(kernel='linear', random_state=241)
    gs = GridSearchCV(clf, grid, scoring='accuracy', cv=cv)
    gs.fit(tf_idf_features, y)
    parameter_C = max(gs.grid_scores_, key=lambda x: x.mean_validation_score).parameters['C']
    print(parameter_C)
    new_clf = SVC(parameter_C, kernel='linear', random_state=241)
    new_clf = new_clf.fit(tf_idf_features, y)
    # Ten features with the largest absolute weights in the linear SVM.
    weights = sorted(zip(new_clf.coef_.indices, new_clf.coef_.data),
                     key=lambda x: abs(x[1]), reverse=True)[:10]
    print(weights)
    word_indexes = [index for index, _ in weights]
    valuable_words = sorted([feature_mapping[x] for x in word_indexes], key=str.lower)
    answer(' '.join(valuable_words), 'text_analyze_response.txt')
#!/usr/bin/env python
import operator

import numpy as np
from sklearn import cross_validation, datasets
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import scale

from titanic import answer

if __name__ == '__main__':
    data = datasets.load_boston()
    X = scale(data.data)
    accuracies = {}
    # Try 200 values of the Minkowski metric parameter p on [1, 10].
    for i in np.linspace(1, 10, 200):
        knr = KNeighborsRegressor(weights='distance', p=i)
        kf = cross_validation.KFold(data.data.shape[0], 5, shuffle=True, random_state=42)
        scores = cross_validation.cross_val_score(knr, X, data.target, cv=kf,
                                                  scoring='mean_squared_error')
        accuracies[i] = scores.mean()
    # The 'mean_squared_error' scorer returns negated MSE, so max() picks the best p.
    best_p, accuracy = max(accuracies.items(), key=operator.itemgetter(1))
    answer(best_p, 'boston_metric.txt')
#!/usr/bin/env python
import numpy as np
from sklearn.svm import SVC

from titanic import answer

if __name__ == '__main__':
    train_data = np.genfromtxt('svm-data.csv', delimiter=',')
    X_train_data = train_data[:, 1:]
    Y_train_data = train_data[:, 0]
    clf = SVC(random_state=241, C=100000, kernel='linear')
    clf = clf.fit(X_train_data, Y_train_data)
    # clf.support_ holds zero-based support-vector indices; the answer
    # expects one-based object numbers.
    answer(' '.join([str(x + 1) for x in clf.support_]), 'svm_learn_response.txt')
#!/usr/bin/env python
import os
from math import log10

import numpy as np
from skimage import img_as_float
from skimage.io import imread, imsave
from sklearn.cluster import KMeans

from titanic import answer

# assimilate, MeanAssimilator and MedianAssimilator come from a project
# module not shown in this listing; a hedged sketch follows this script.

if __name__ == '__main__':
    image = imread('parrots_4.jpg')
    image = img_as_float(image)
    # Flatten the H x W x 3 image into an (H*W, 3) array of RGB pixels.
    train_data = np.vstack(tuple(image.tolist()))
    min_clusters = 0
    for i in range(1, 20 + 1):
        clr = KMeans(i, init='k-means++', random_state=241)
        train_res = clr.fit_predict(train_data)
        mean_assimilated_colors = assimilate(train_data, train_res, MeanAssimilator())
        median_assimilated_colors = assimilate(train_data, train_res, MedianAssimilator())
        # PSNR for unit-range images: 10 * log10(MAX^2 / MSE) with MAX = 1.
        deltas = (train_data - mean_assimilated_colors) ** 2
        mean_mse = sum([x.sum() for x in deltas]) / (train_data.shape[0] * train_data.shape[1])
        mean_psnr = 10 * log10(1 / mean_mse)
        deltas = (train_data - median_assimilated_colors) ** 2
        median_mse = sum([x.sum() for x in deltas]) / (train_data.shape[0] * train_data.shape[1])
        median_psnr = 10 * log10(1 / median_mse)
        imsave(os.path.join(os.getcwd(), 'mean_assimilated_colors-%s.jpg' % i),
               np.reshape(mean_assimilated_colors, image.shape))
        imsave(os.path.join(os.getcwd(), 'median_assimilated_colors-%s.jpg' % i),
               np.reshape(median_assimilated_colors, image.shape))
        if mean_psnr > 20 or median_psnr > 20:
            min_clusters = i
            break
    answer(str(min_clusters), 'clustering.txt')
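# The assimilation helpers used by the clustering script above are
# project-local and not shown in this listing. Below is a minimal sketch,
# assuming each assimilator replaces every pixel with an aggregate colour of
# its KMeans cluster; the names come from the script, the bodies are guesses.
import numpy as np


class MeanAssimilator:
    def __call__(self, pixels):
        # Mean colour of one cluster's pixels.
        return pixels.mean(axis=0)


class MedianAssimilator:
    def __call__(self, pixels):
        # Per-channel median colour of one cluster's pixels.
        return np.median(pixels, axis=0)


def assimilate(pixels, labels, assimilator):
    # Replace each pixel with the aggregated colour of the cluster
    # KMeans assigned it to.
    result = np.empty_like(pixels)
    for label in np.unique(labels):
        mask = labels == label
        result[mask] = assimilator(pixels[mask])
    return result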
#!/usr/bin/env python
import pandas
from pandas import DataFrame
from scipy.sparse import hstack
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge

from titanic import answer

if __name__ == '__main__':
    vectorizer = TfidfVectorizer(min_df=5)
    enc = DictVectorizer()
    data_train = pandas.read_csv('salary-train.csv', index_col=None)  # type: DataFrame
    data_test = pandas.read_csv('salary-test-mini.csv', index_col=None)  # type: DataFrame
    # Lower-case the three text columns and strip non-alphanumeric characters.
    for key in data_train.keys()[:3]:
        data_train[key] = data_train[key].str.lower()
    data_train.replace('[^a-zA-Z0-9]', ' ', regex=True, inplace=True)
    data_train['LocationNormalized'].fillna('nan', inplace=True)
    data_train['ContractTime'].fillna('nan', inplace=True)
    data_test['LocationNormalized'].fillna('nan', inplace=True)
    data_test['ContractTime'].fillna('nan', inplace=True)
    # One-hot encode the categorical columns.
    X_train_categ = enc.fit_transform(
        data_train[['LocationNormalized', 'ContractTime']].to_dict('records'))
    X_test_categ = enc.transform(
        data_test[['LocationNormalized', 'ContractTime']].to_dict('records'))
    tf_idf_features = vectorizer.fit_transform(data_train['FullDescription'])
    tf_idf_features_test = vectorizer.transform(data_test['FullDescription'])
    # Stack the tf-idf text features with the one-hot categorical features.
    train_features = hstack((tf_idf_features, X_train_categ), format='csr')
    test_features = hstack((tf_idf_features_test, X_test_categ), format='csr')
    regressor = Ridge(random_state=241, alpha=1)
    regressor.fit(train_features, data_train['SalaryNormalized'])
    res = regressor.predict(test_features)
    answer('%0.2f %0.2f' % (res[0], res[1]), 'salary_res.txt')
    answer('%s %s' % (res[0], res[1]), 'salary_res_2.txt')
#!/usr/bin/env python
import operator

import numpy as np
from sklearn import cross_validation
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import scale

from titanic import answer


def get_accuracies(X: np.array, Y: np.array):
    # Mean 5-fold cross-validation accuracy for k = 1..50 neighbours.
    for i in range(1, 51):
        clf = KNeighborsClassifier(i)
        kf = cross_validation.KFold(len(X), 5, shuffle=True, random_state=42)
        scores = cross_validation.cross_val_score(clf, X, Y, cv=kf)
        yield scores.mean()


if __name__ == '__main__':
    data = np.genfromtxt('wine.data', delimiter=',')
    classes = data[:, 0]
    features = data[:, 1:]
    accuracies = list(get_accuracies(features, classes))
    n_neighbors, accuracy = max(enumerate(accuracies), key=operator.itemgetter(1))
    answer(n_neighbors + 1, 'wine_kNN_1.txt')
    answer(accuracy, 'wine_kNN_2.txt')
    # Repeat with standardized features.
    accuracies = list(get_accuracies(scale(features), classes))
    n_neighbors, accuracy = max(enumerate(accuracies), key=operator.itemgetter(1))
    answer(n_neighbors + 1, 'wine_kNN_3.txt')
    answer(accuracy, 'wine_kNN_4.txt')
#!/usr/bin/env python
# The original listing starts mid-function; the imports, constants and helper
# definitions below are a reconstruction. The constant values and the helper
# bodies are assumptions consistent with how the surviving code uses them.
from math import exp

import numpy as np
from scipy.spatial.distance import euclidean
from sklearn.metrics import roc_auc_score

from titanic import answer

LEARNING_RATE = 0.1          # assumed gradient step size
REGULARIZATION_COEFF = 10    # assumed L2 coefficient
THRESHOLD = 1e-5             # assumed convergence threshold
MAX_ITERATIONS = 10000       # assumed iteration cap


def _diff_base(y, x1, x2, weight1, weight2):
    # Assumed gradient term: 1 - sigmoid(y * <w, x>).
    return 1 - 1 / (1 + exp(-y * (weight1 * x1 + weight2 * x2)))


def _algorithm(x1, x2, weight1, weight2):
    # Assumed scoring function: sigmoid of the linear combination.
    return 1 / (1 + exp(-weight1 * x1 - weight2 * x2))


def _compute_weights(x_train_data, y_train_data, regularization_coeff):
    l = len(x_train_data)
    weight1, weight2 = 0.0, 0.0
    for _ in range(MAX_ITERATIONS):
        w1_new = (
            weight1 +
            (LEARNING_RATE / l) * sum([y * xi[0] * _diff_base(y, xi[0], xi[1], weight1, weight2)
                                       for xi, y in zip(x_train_data, y_train_data)]) -
            LEARNING_RATE * regularization_coeff * weight1
        )
        w2_new = (
            weight2 +
            (LEARNING_RATE / l) * sum([y * xi[1] * _diff_base(y, xi[0], xi[1], weight1, weight2)
                                       for xi, y in zip(x_train_data, y_train_data)]) -
            LEARNING_RATE * regularization_coeff * weight2
        )
        # Stop once the step in weight space is small enough.
        if euclidean([weight1, weight2], [w1_new, w2_new]) <= THRESHOLD:
            return weight1, weight2
        weight1, weight2 = w1_new, w2_new
    return weight1, weight2


if __name__ == '__main__':
    train_data = np.genfromtxt('data-logistic.csv', delimiter=',')
    X_train_data = train_data[:, 1:]
    Y_train_data = train_data[:, 0]
    w1, w2 = _compute_weights(X_train_data, Y_train_data, REGULARIZATION_COEFF)
    w1_, w2_ = _compute_weights(X_train_data, Y_train_data, 0)
    # AUC without regularization first, then with L2 regularization.
    answer('%s %s' % (
        roc_auc_score(Y_train_data, [_algorithm(x[0], x[1], w1_, w2_) for x in X_train_data]),
        roc_auc_score(Y_train_data, [_algorithm(x[0], x[1], w1, w2) for x in X_train_data])
    ), 'logistic_res.txt')
#!/usr/bin/env python
import pandas
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve

from titanic import answer

if __name__ == "__main__":
    data = pandas.read_csv("classification.csv")
    # Confusion-matrix cells: rows where truth and prediction agree or disagree.
    true_positive = data[(data["true"] == 1) & (data["pred"] == 1)]
    false_positive = data[(data["true"] == 0) & (data["pred"] == 1)]
    false_negative = data[(data["true"] == 1) & (data["pred"] == 0)]
    true_negative = data[(data["true"] == 0) & (data["pred"] == 0)]
    tp_count = true_positive.shape[0]
    fp_count = false_positive.shape[0]
    fn_count = false_negative.shape[0]
    tn_count = true_negative.shape[0]
    answer("%s %s %s %s" % (tp_count, fp_count, fn_count, tn_count),
           "accuracy_metrics_classification_1.txt")
    accuracy = (tp_count + tn_count) / sum([tp_count, fp_count, fn_count, tn_count])
    precision = tp_count / (tp_count + fp_count)
    recall = tp_count / (tp_count + fn_count)
    f_score = f1_score(data["true"], data["pred"])
    answer("%s %s %s %s" % (accuracy, precision, recall, f_score),
           "accuracy_metrics_classification_2.txt")
    scores = pandas.read_csv("scores.csv")
    roc_auc_scores = dict(
        score_logreg=roc_auc_score(scores["true"], scores["score_logreg"]),
        score_svm=roc_auc_score(scores["true"], scores["score_svm"]),
        score_knn=roc_auc_score(scores["true"], scores["score_knn"]),
        score_tree=roc_auc_score(scores["true"], scores["score_tree"]),
    )
#!/usr/bin/env python
import numpy as np
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import StandardScaler

from titanic import answer

if __name__ == '__main__':
    train_data = np.genfromtxt('perceptron-train.csv', delimiter=',')
    test_data = np.genfromtxt('perceptron-test.csv', delimiter=',')
    X_train_data = train_data[:, 1:]
    Y_train_data = train_data[:, 0]
    X_test_data = test_data[:, 1:]
    Y_test_data = test_data[:, 0]
    scaler = StandardScaler()
    clf = Perceptron(random_state=241)
    clf.fit(X_train_data, Y_train_data)
    # score() already returns a single accuracy value.
    score = clf.score(X_test_data, Y_test_data)
    print(score)
    # Standardize features (fit on train, apply to test) and retrain.
    X_train_data_scaled = scaler.fit_transform(X_train_data)
    X_test_data_scaled = scaler.transform(X_test_data)
    clf.fit(X_train_data_scaled, Y_train_data)
    scaled_score = clf.score(X_test_data_scaled, Y_test_data)
    print(score, scaled_score)
    answer(scaled_score - score, 'feature_normalization.txt')
#!/usr/bin/env python
import pandas
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor

from titanic import answer

if __name__ == '__main__':
    data_train = pandas.read_csv('abalone.csv', index_col=None)  # type: DataFrame
    # Encode sex numerically: M -> 1, F -> -1, everything else (infant) -> 0.
    data_train['Sex'] = data_train['Sex'].map(lambda x: 1 if x == 'M' else (-1 if x == 'F' else 0))
    data = data_train.values[:, :-1]
    target = data_train.values[:, -1]
    # Find the smallest number of trees whose cross-validated R^2 exceeds 0.52.
    for i in range(1, 50):
        knr = RandomForestRegressor(i, random_state=1)
        kf = KFold(len(target), 5, shuffle=True, random_state=1)
        scores = cross_val_score(estimator=knr, X=data, y=target, scoring='r2', cv=kf)
        accuracy = scores.mean()
        if accuracy > 0.52:
            break
    answer(i, 'forest_res.txt')
#!/usr/bin/env python
import numpy as np
import pandas
from pandas import DataFrame
from sklearn.decomposition import PCA

from titanic import answer

if __name__ == '__main__':
    data_train = pandas.read_csv('close_prices.csv', index_col=None)  # type: DataFrame
    data_indexes = pandas.read_csv('djia_index.csv', index_col=None)  # type: DataFrame
    X_train = data_train.values[:, 1:]
    # Smallest number of components explaining more than 90% of the variance.
    for i in range(1, X_train.shape[1]):
        pca = PCA(i)
        pca.fit(X_train)
        print(i, pca.explained_variance_ratio_)
        if sum(pca.explained_variance_ratio_) > 0.9:
            break
    answer(i, 'pca_1.txt')
    transformed_features = pca.transform(X_train)
    # Pearson correlation between the first component and the Dow Jones index.
    pearson_c = np.corrcoef([transformed_features[:, 0], data_indexes['^DJI']])[1, 0]
    answer(pearson_c, 'pca_2.txt')
    # Company with the largest weight in the first component.
    index = np.argmax(pca.components_[0])
    answer(data_train.keys()[1:][index], 'pca_3.txt')
#!/usr/bin/env python
# The original listing starts mid-loop; the imports, data loading and the
# learning-rate loop header below are a reconstruction. The file name, split
# parameters, learning-rate grid and estimator settings are assumptions
# inferred from the variable names the surviving code uses.
from math import exp

import numpy as np
from matplotlib.pylab import figure, legend, plot, savefig
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import log_loss

from titanic import answer

if __name__ == '__main__':
    data = np.genfromtxt('gbm-data.csv', delimiter=',', skip_header=1)  # assumed file name
    X_data_train, X_data_test, Y_data_train, Y_data_test = train_test_split(
        data[:, 1:], data[:, 0], test_size=0.8, random_state=241)  # assumed split
    for learning_rate in [1, 0.5, 0.3, 0.2, 0.1]:  # assumed grid; 0.2 is referenced below
        clf = GradientBoostingClassifier(n_estimators=250, learning_rate=learning_rate,
                                         random_state=241)  # assumed settings
        clf.fit(X_data_train, Y_data_train)
        # Log-loss after each boosting iteration; decision-function outputs
        # are passed through a sigmoid to obtain probabilities.
        train_losses = []
        for pred in clf.staged_decision_function(X_data_train):
            train_losses.append(log_loss(Y_data_train, [1 / (1 + exp(-x)) for x in pred]))
        train_losses = np.array(train_losses)
        test_losses = []
        for pred in clf.staged_decision_function(X_data_test):
            test_losses.append(log_loss(Y_data_test, [1 / (1 + exp(-x)) for x in pred]))
        test_losses = np.array(test_losses)
        figure()
        plot(test_losses, 'g', linewidth=2)
        plot(train_losses, 'r', linewidth=2)
        legend(['test', 'train'])
        savefig('image-%s.png' % learning_rate)
        if learning_rate == 0.2:
            # Iteration with the minimal test log-loss, and the loss itself.
            answer2_argmin = np.argmin(test_losses)
            answer2_value = test_losses.min()
            f_clf = RandomForestClassifier(random_state=241, n_estimators=answer2_argmin)
            f_clf.fit(X_data_train, Y_data_train)
            rf_min_loss = log_loss(Y_data_test, f_clf.predict_proba(X_data_test))
    answer('overfitting', 'gradient_boost_decision_trees-1.txt')
    answer('%s %s' % (answer2_value, answer2_argmin), 'gradient_boost_decision_trees-2.txt')
    answer(rf_min_loss, 'gradient_boost_decision_trees-3.txt')
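# Every script in this listing imports `answer` from a project-local `titanic`
# module that is not shown here. Below is a minimal sketch consistent with how
# it is called throughout (an answer value plus an output file name); the real
# helper may do more, e.g. write into a dedicated answers directory.
def answer(value, filename):
    # Record a single submission value in the given file.
    with open(filename, 'w') as out:
        out.write(str(value))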