Example #1
# imports reconstructed from usage (old scikit-learn API: cross_validation / grid_search)
import numpy as np
from sklearn import datasets
from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC

from titanic import answer

if __name__ == '__main__':
    newsgroups = datasets.fetch_20newsgroups(
        subset='all',
        categories=['alt.atheism', 'sci.space']
    )
    y = newsgroups.target
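    # TF-IDF-vectorize the raw texts; feature_mapping maps a column index back to its term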
    vectorizer = TfidfVectorizer()
    tf_idf_features = vectorizer.fit_transform(newsgroups.data)
    feature_mapping = vectorizer.get_feature_names()

    grid = {'C': np.power(10.0, np.arange(-5, 6))}
    cv = KFold(y.size, n_folds=5, random_state=241)
    clf = SVC(kernel='linear', random_state=241)
    gs = GridSearchCV(clf, grid, scoring='accuracy', cv=cv)
    gs.fit(tf_idf_features, y)

    parameter_C = max(gs.grid_scores_, key=lambda x: x.mean_validation_score).parameters['C']
    print(parameter_C)
    new_clf = SVC(parameter_C, kernel='linear', random_state=241)
    new_clf = new_clf.fit(tf_idf_features, y)
    weights = sorted(zip(new_clf.coef_.indices, new_clf.coef_.data), key=lambda x: abs(x[1]),
                     reverse=True)[:10]
    print(weights)
    word_indexes = ([x for x, y in weights])
    valueable_words = [feature_mapping[x] for x in word_indexes]
    valueable_words = sorted(valueable_words, key=str.lower)
    answer(' '.join(valueable_words), 'text_analyze_response.txt')
Example #2
#!/usr/bin/env python
import operator
from titanic import answer

from sklearn import datasets
from sklearn import cross_validation
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import scale
import numpy as np

if __name__ == '__main__':
    data = datasets.load_boston()
    X = scale(data.data)
    accuracies = {}
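    # sweep the Minkowski power p of the distance metric over [1, 10]; with the old
    # 'mean_squared_error' scorer cross_val_score returns negated MSE, so max() picks the best p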
    for i in np.linspace(1, 10, 200):
        knr = KNeighborsRegressor(weights='distance', p=i)
        kf = cross_validation.KFold(data.data.shape[0], 5, shuffle=True, random_state=42)
        scores = cross_validation.cross_val_score(knr, X, data.target, cv=kf,
                                                  scoring='mean_squared_error')
        accuracies[i] = scores.mean()
    best_p, accuracy = max(accuracies.items(), key=operator.itemgetter(1))
    answer(best_p, 'boston_metric.txt')
Example #3
#!/usr/bin/env python
from sklearn.svm import SVC
from titanic import answer
import numpy as np

if __name__ == '__main__':
    train_data = np.genfromtxt('svm-data.csv', delimiter=',')
    X_train_data = features = train_data[:, 1:]
    Y_train_data = train_data[:, 0]
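    # linear SVM with a very large C (near hard-margin); support_ holds the indices of the support vectors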
    clf = SVC(random_state=241, C=100000, kernel='linear')
    clf = clf.fit(X_train_data, Y_train_data)
    answer(' '.join([str(x + 1) for x in clf.support_]),
           'svm_learn_response.txt')
Example #4
import pandas

from titanic import answer

from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve

if __name__ == '__main__':
    data = pandas.read_csv('classification.csv')
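    # confusion-matrix cells built from the true and predicted label columns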
    true_positive = data[(data['true'] == 1) & (data['pred'] == 1)]
    false_positive = data[(data['true'] == 0) & (data['pred'] == 1)]
    false_negative = data[(data['true'] == 1) & (data['pred'] == 0)]
    true_negative = data[(data['true'] == 0) & (data['pred'] == 0)]

    tp_count = true_positive.shape[0]
    fp_count = false_positive.shape[0]
    fn_count = false_negative.shape[0]
    tn_count = true_negative.shape[0]
    answer('%s %s %s %s' % (tp_count, fp_count, fn_count, tn_count),
           'accuracy_metrics_classification_1.txt')

    accuracy = (tp_count + tn_count) / sum(
        [tp_count, fp_count, fn_count, tn_count])
    precision = tp_count / (tp_count + fp_count)
    recall = tp_count / (tp_count + fn_count)
    f_score = f1_score(data['true'], data['pred'])
    answer('%s %s %s %s' % (accuracy, precision, recall, f_score),
           'accuracy_metrics_classification_2.txt')

    scores = pandas.read_csv('scores.csv')
    roc_auc_scores = dict(
        score_logreg=roc_auc_score(scores['true'], scores['score_logreg']),
        score_svm=roc_auc_score(scores['true'], scores['score_svm']),
        score_knn=roc_auc_score(scores['true'], scores['score_knn']),
        score_tree=roc_auc_score(scores['true'], scores['score_tree']),
    )
Example #5
# imports inferred from usage; imread, imsave and img_as_float are assumed to come from
# scikit-image, and assimilate / MeanAssimilator / MedianAssimilator are project-local
# helpers that are not part of this snippet
import os
from math import log10

import numpy as np
from skimage import img_as_float
from skimage.io import imread, imsave
from sklearn.cluster import KMeans

from titanic import answer

if __name__ == '__main__':
    image = imread('parrots_4.jpg')
    image = img_as_float(image)
    min_clusters = 0
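    # grow the number of K-Means clusters until the recolored image reaches PSNR > 20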
    for i in range(1, 20 + 1):
        train_data = np.vstack(tuple(image.tolist()))
        clr = KMeans(i, init='k-means++', random_state=241)
        train_res = clr.fit_predict(train_data)

        mean_assimilated_colors = assimilate(train_data, train_res, MeanAssimilator())
        median_assimilated_colors = assimilate(train_data, train_res, MedianAssimilator())

        deltas = (train_data - mean_assimilated_colors) ** 2
        mean_mse = sum([x.sum() for x in deltas]) / (train_data.shape[0] * train_data.shape[1])
        mean_psnr = 10 * log10(1/mean_mse)

        deltas = (train_data - median_assimilated_colors) ** 2
        median_mse = sum([x.sum() for x in deltas]) / (train_data.shape[0] * train_data.shape[1])
        median_psnr = 10 * log10(1/median_mse)

        imsave(os.path.join(os.getcwd(), 'mean_assimilated_colors-%s.jpg' % i),
               np.reshape(mean_assimilated_colors, image.shape))
        imsave(os.path.join(os.getcwd(), 'median_assimilated_colors-%s.jpg' % i),
               np.reshape(median_assimilated_colors, image.shape))

        if mean_psnr > 20 or median_psnr > 20:
            min_clusters = i
            break

    answer(str(min_clusters), 'clustering.txt')
Example #6
# the snippet starts mid-script: the heading and the imports below are reconstructed
# from the code that follows
import pandas
from pandas import DataFrame
from scipy.sparse import hstack
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge

from titanic import answer

if __name__ == '__main__':
    vectorizer = TfidfVectorizer(min_df=5)
    enc = DictVectorizer()
    data_train = pandas.read_csv('salary-train.csv', index_col=None)  # type: DataFrame
    data_test = pandas.read_csv('salary-test-mini.csv', index_col=None)  # type: DataFrame

    for key in data_train.keys()[:3]:
        data_train[key] = data_train[key].str.lower()
    data_train.replace('[^a-zA-Z0-9]', ' ', regex=True, inplace=True)
    data_train['LocationNormalized'].fillna('nan', inplace=True)
    data_train['ContractTime'].fillna('nan', inplace=True)

    data_test['LocationNormalized'].fillna('nan', inplace=True)
    data_test['ContractTime'].fillna('nan', inplace=True)

    X_train_categ = enc.fit_transform(data_train[['LocationNormalized', 'ContractTime']]
                                      .to_dict('records'))
    X_test_categ = enc.transform(data_test[['LocationNormalized', 'ContractTime']]
                                 .to_dict('records'))

    tf_idf_features = vectorizer.fit_transform(data_train['FullDescription'])
    tf_idf_features_test = vectorizer.transform(data_test['FullDescription'])

    train_features = hstack((tf_idf_features, X_train_categ), format='csr')
    test_features = hstack((tf_idf_features_test, X_test_categ), format='csr')
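    # Ridge regression on the stacked sparse matrix: TF-IDF text features + one-hot categorical features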
    regressor = Ridge(random_state=241, alpha=1)
    regressor.fit(train_features, data_train['SalaryNormalized'])
    res = regressor.predict(test_features)

    answer('%0.2f %0.2f' % (res[0], res[1]), 'salary_res.txt')
    answer('%s %s' % (res[0], res[1]), 'salary_res_2.txt')
Example #7
import operator
from sklearn import cross_validation
from titanic import answer
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.preprocessing import scale


def get_accuracies(X: np.array, Y: np.array):
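    """Yield the 5-fold cross-validated accuracy of k-NN for k = 1..50."""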
    for i in range(1, 51):
        clf = KNeighborsClassifier(i)
        # use the length of the labels rather than the module-level `data` global
        kf = cross_validation.KFold(len(Y), 5, shuffle=True, random_state=42)
        scores = cross_validation.cross_val_score(clf, X, Y, cv=kf)
        yield scores.mean()


if __name__ == '__main__':
    data = np.genfromtxt('wine.data', delimiter=',')
    classes = data[:, 0]
    features = data[:, 1:]

    accuracies = [x for x in get_accuracies(features, classes)]
    n_neighbors, accuracy = max(enumerate(accuracies), key=operator.itemgetter(1))
    answer(n_neighbors+1, 'wine_kNN_1.txt')
    answer(accuracy, 'wine_kNN_2.txt')

    accuracies = [x for x in get_accuracies(scale(features), classes)]
    n_neighbors, accuracy = max(enumerate(accuracies), key=operator.itemgetter(1))
    answer(n_neighbors+1, 'wine_kNN_3.txt')
    answer(accuracy, 'wine_kNN_4.txt')
Example #8
        train_data = np.vstack(tuple(image.tolist()))
        clr = KMeans(i, init='k-means++', random_state=241)
        train_res = clr.fit_predict(train_data)

        mean_assimilated_colors = assimilate(train_data, train_res,
                                             MeanAssimilator())
        median_assimilated_colors = assimilate(train_data, train_res,
                                               MedianAssimilator())

        deltas = (train_data - mean_assimilated_colors)**2
        mean_mse = sum([x.sum() for x in deltas
                        ]) / (train_data.shape[0] * train_data.shape[1])
        mean_psnr = 10 * log10(1 / mean_mse)

        deltas = (train_data - median_assimilated_colors)**2
        median_mse = sum([x.sum() for x in deltas
                          ]) / (train_data.shape[0] * train_data.shape[1])
        median_psnr = 10 * log10(1 / median_mse)

        imsave(os.path.join(os.getcwd(), 'mean_assimilated_colors-%s.jpg' % i),
               np.reshape(mean_assimilated_colors, image.shape))
        imsave(
            os.path.join(os.getcwd(), 'median_assimilated_colors-%s.jpg' % i),
            np.reshape(median_assimilated_colors, image.shape))

        if mean_psnr > 20 or median_psnr > 20:
            min_clusters = i
            break

    answer(str(min_clusters), 'clustering.txt')
        w1_new = (
            weight1 +
            (LEARNING_RATE / l) * sum([y * xi[0] * _diff_base(y, xi[0], xi[1], weight1, weight2)
                                       for xi, y in zip(x_train_data, y_train_data)]) -
                                       for xi, y in zip(x_train_data, y_train_data)]) -
            LEARNING_RATE * regularization_coeff * weight1
        )
        w2_new = (
            weight2 +
            (LEARNING_RATE / l) * sum([y * xi[1] * _diff_base(y, xi[0], xi[1], weight1, weight2)
                                       for xi, y in zip(x_train_data, y_train_data)]) -
            LEARNING_RATE * regularization_coeff * weight2
        )

        if euclidean([weight1, weight2], [w1_new, w2_new]) <= THRESHOLD:
            return weight1, weight2

        weight1, weight2 = w1_new, w2_new
        # emperical_risk = _compute_emperical_risk(x_train_data, y_train_data, weight1, weight2,
        #                                          regularization_coeff)
    return weight1, weight2


if __name__ == '__main__':
    train_data = np.genfromtxt('data-logistic.csv', delimiter=',')
    X_train_data = train_data[:, 1:]
    Y_train_data = train_data[:, 0]
    w1, w2 = _compute_weights(X_train_data, Y_train_data, REGULARIZATION_COEFF)
    w1_, w2_ = _compute_weights(X_train_data, Y_train_data, 0)
    answer('%s %s' % (
        roc_auc_score(Y_train_data, [_algorithm(x[0], x[1], w1_, w2_) for x in X_train_data]),
        roc_auc_score(Y_train_data, [_algorithm(x[0], x[1], w1, w2) for x in X_train_data])
    ), 'logistic_res.txt')
import pandas

from titanic import answer

from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve

if __name__ == "__main__":
    data = pandas.read_csv("classification.csv")
    true_positive = data[data["true"] == 1][data["pred"] == 1]
    false_positive = data[data["true"] == 0][data["pred"] == 1]
    false_negative = data[data["true"] == 1][data["pred"] == 0]
    true_negative = data[data["true"] == 0][data["pred"] == 0]

    tp_count = true_positive.shape[0]
    fp_count = false_positive.shape[0]
    fn_count = false_negative.shape[0]
    tn_count = true_negative.shape[0]
    answer("%s %s %s %s" % (tp_count, fp_count, fn_count, tn_count), "accuracy_metrics_classification_1.txt")

    accuracy = (tp_count + tn_count) / sum([tp_count, fp_count, fn_count, tn_count])
    precision = tp_count / (tp_count + fp_count)
    recall = tp_count / (tp_count + fn_count)
    f_score = f1_score(data["true"], data["pred"])
    answer("%s %s %s %s" % (accuracy, precision, recall, f_score), "accuracy_metrics_classification_2.txt")

    scores = pandas.read_csv("scores.csv")
    roc_auc_scores = dict(
        score_logreg=roc_auc_score(scores["true"], scores["score_logreg"]),
        score_svm=roc_auc_score(scores["true"], scores["score_svm"]),
        score_knn=roc_auc_score(scores["true"], scores["score_knn"]),
        score_tree=roc_auc_score(scores["true"], scores["score_tree"]),
    )
Example #11
#!/usr/bin/env python
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
import numpy as np

from titanic import answer

if __name__ == '__main__':
    train_data = np.genfromtxt('perceptron-train.csv', delimiter=',')
    test_data = np.genfromtxt('perceptron-test.csv', delimiter=',')

    X_train_data = features = train_data[:, 1:]
    Y_train_data = train_data[:, 0]
    X_test_data = features = test_data[:, 1:]
    Y_test_data = test_data[:, 0]

    scaler = StandardScaler()
    clf = Perceptron(random_state=241)
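    # compare test accuracy on raw features vs. standardized features; the answer is the improvement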

    clf.fit(X_train_data, Y_train_data)
    scores = clf.score(X_test_data, Y_test_data)
    print(scores)

    X_train_data_scaled = scaler.fit_transform(X_train_data)
    X_test_data_scaled = scaler.transform(X_test_data)

    clf.fit(X_train_data_scaled, Y_train_data)
    scaled_scores = clf.score(X_test_data_scaled, Y_test_data)
    print(scores, scaled_scores)
    answer(scaled_scores - scores, 'feature_normalization.txt')
Example #12
#!/usr/bin/env python
from sklearn.svm import SVC
from titanic import answer
import numpy as np

if __name__ == '__main__':
    train_data = np.genfromtxt('svm-data.csv', delimiter=',')
    X_train_data = features = train_data[:, 1:]
    Y_train_data = train_data[:, 0]
    clf = SVC(random_state=241, C=100000, kernel='linear')
    clf = clf.fit(X_train_data, Y_train_data)
    answer(' '.join([str(x+1) for x in clf.support_]), 'svm_learn_response.txt')
Example #13
#!/usr/bin/env python
import pandas
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor

from titanic import answer

if __name__ == '__main__':
    data_train = pandas.read_csv('abalone.csv',
                                 index_col=None)  # type: DataFrame
    data_train['Sex'] = data_train['Sex'].map(lambda x: 1 if x == 'M' else
                                              (-1 if x == 'F' else 0))
    data = data_train.values[:, :-1]
    target = data_train.values[:, -1]

    i = None
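    # add trees one by one until the 5-fold cross-validated R^2 exceeds 0.52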
    for i in range(1, 50):
        knr = RandomForestRegressor(i, random_state=1)
        kf = KFold(len(target), 5, shuffle=True, random_state=1)
        scores = cross_val_score(estimator=knr,
                                 X=data,
                                 y=target,
                                 scoring='r2',
                                 cv=kf)
        accuracy = scores.mean()
        if accuracy > 0.52:
            break

    answer(i, 'forest_res.txt')
Example #14
import operator

import numpy as np
from sklearn import cross_validation
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import scale

from titanic import answer


def get_accuracies(X: np.array, Y: np.array):
    for i in range(1, 51):
        clf = KNeighborsClassifier(i)
        kf = cross_validation.KFold(len(Y),
                                    5,
                                    shuffle=True,
                                    random_state=42)
        scores = cross_validation.cross_val_score(clf, X, Y, cv=kf)
        yield scores.mean()


if __name__ == '__main__':
    data = np.genfromtxt('wine.data', delimiter=',')
    classes = data[:, 0]
    features = data[:, 1:]

    accuracies = [x for x in get_accuracies(features, classes)]
    n_neighbors, accuracy = max(enumerate(accuracies),
                                key=operator.itemgetter(1))
    answer(n_neighbors + 1, 'wine_kNN_1.txt')
    answer(accuracy, 'wine_kNN_2.txt')

    accuracies = [x for x in get_accuracies(scale(features), classes)]
    n_neighbors, accuracy = max(enumerate(accuracies),
                                key=operator.itemgetter(1))
    answer(n_neighbors + 1, 'wine_kNN_3.txt')
    answer(accuracy, 'wine_kNN_4.txt')
Example #15
#!/usr/bin/env python
import pandas
from pandas import DataFrame
from titanic import answer
from sklearn.decomposition import PCA
import numpy as np

if __name__ == '__main__':
    data_train = pandas.read_csv('close_prices.csv',
                                 index_col=None)  # type: DataFrame
    data_indexes = pandas.read_csv('djia_index.csv',
                                   index_col=None)  # type: DataFrame
    X_train = data_train.values[:, 1:]
    pca = None
    i = None
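    # take the smallest number of principal components that explains more than 90% of the variance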
    for i in range(1, X_train.shape[1]):
        pca = PCA(i)
        pca.fit(X_train)
        print(i, pca.explained_variance_ratio_)
        if sum(pca.explained_variance_ratio_) > 0.9:
            break

    answer(i, 'pca_1.txt')
    transformed_features = pca.transform(X_train)
    pearson_c = np.corrcoef([transformed_features[:, 0],
                             data_indexes['^DJI']])[1, 0]
    answer(pearson_c, 'pca_2.txt')

    index = np.argmax(pca.components_[0])
    answer(data_train.keys()[1:][index], 'pca_3.txt')
Example #16
    # vectorizer and enc are not defined in this snippet; restored from the fuller
    # copy of the same script above
    vectorizer = TfidfVectorizer(min_df=5)
    enc = DictVectorizer()
    data_train = pandas.read_csv('salary-train.csv',
                                 index_col=None)  # type: DataFrame
                                 index_col=None)  # type: DataFrame
    data_test = pandas.read_csv('salary-test-mini.csv',
                                index_col=None)  # type: DataFrame

    for key in data_train.keys()[:3]:
        data_train[key] = data_train[key].str.lower()
    data_train.replace('[^a-zA-Z0-9]', ' ', regex=True, inplace=True)
    data_train['LocationNormalized'].fillna('nan', inplace=True)
    data_train['ContractTime'].fillna('nan', inplace=True)

    data_test['LocationNormalized'].fillna('nan', inplace=True)
    data_test['ContractTime'].fillna('nan', inplace=True)

    X_train_categ = enc.fit_transform(
        data_train[['LocationNormalized', 'ContractTime']].to_dict('records'))
    X_test_categ = enc.transform(
        data_test[['LocationNormalized', 'ContractTime']].to_dict('records'))

    tf_idf_features = vectorizer.fit_transform(data_train['FullDescription'])
    tf_idf_features_test = vectorizer.transform(data_test['FullDescription'])

    train_features = hstack((tf_idf_features, X_train_categ), format='csr')
    test_features = hstack((tf_idf_features_test, X_test_categ), format='csr')
    regressor = Ridge(random_state=241, alpha=1)
    regressor.fit(train_features, data_train['SalaryNormalized'])
    res = regressor.predict(test_features)

    answer('%0.2f %0.2f' % (res[0], res[1]), 'salary_res.txt')
    answer('%s %s' % (res[0], res[1]), 'salary_res_2.txt')
        train_probs = clf.predict_proba(X_data_train)
        test_probs = clf.predict_proba(X_data_test)

        train_losts = []
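        # log-loss after each boosting stage: sigmoid of the staged decision function vs. the true labels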
        for pred in clf.staged_decision_function(X_data_train):
            train_losts.append(log_loss(Y_data_train, [1 / (1 + exp(-x)) for x in pred]))
        train_losts = np.array(train_losts)

        test_losts = []
        for pred in clf.staged_decision_function(X_data_test):
            test_losts.append(log_loss(Y_data_test, [1 / (1 + exp(-x)) for x in pred]))
        test_losts = np.array(test_losts)

        figure()
        plot(test_losts, 'g', linewidth=2)
        plot(train_losts, 'r', linewidth=2)
        legend(['test', 'train'])
        savefig('image-%s.png' % learning_rate)

        if learning_rate == 0.2:
            answer2_argmin = np.argmin(test_losts)
            answer2_value = test_losts.min()

    f_clf = RandomForestClassifier(random_state=241, n_estimators=answer2_argmin)
    f_clf.fit(X_data_train, Y_data_train)
    rf_min_loss = log_loss(Y_data_test, f_clf.predict_proba(X_data_test))

    answer('overfitting', 'gradient_boost_decision_trees-1.txt')
    answer('%s %s' % (answer2_value, answer2_argmin), 'gradient_boost_decision_trees-2.txt')
    answer(rf_min_loss, 'gradient_boost_decision_trees-3.txt')