def psnr_and_save(floats, clusters, means, x, y, means_type):
    # colorize, PSNR, directory, n and floats2D are defined earlier in this script.
    _floats = colorize(floats, clusters, means, x, y)
    imsave(arr=_floats, fname=directory + str(n) + '_' + means_type + '.jpg')
    psnr = PSNR(_floats, floats2D)
    print(str(n) + ' ' + means_type + ': ' + str(psnr))
    if psnr > 20.:
        # The first number of clusters that clears the 20 dB threshold is the answer.
        io_yandex.print_result(str(n), '1.txt')
        sys.exit(0)
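# PSNR is defined elsewhere in this script. A minimal sketch of the standard
# peak signal-to-noise ratio it presumably computes, assuming float images in
# [0, 1] (so MAX = 1):
import numpy

def PSNR(approx, original, max_value=1.0):
    # 10 * log10(MAX^2 / MSE), in decibels.
    mse = numpy.mean((numpy.asarray(approx, dtype=float)
                      - numpy.asarray(original, dtype=float)) ** 2)
    return 10. * numpy.log10(max_value ** 2 / mse)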
def calculate_most_important_value(df, importances):
    # Track the two largest importances as [index, value] pairs in one pass.
    first = [0, 0]
    second = [0, 0]
    index = 0
    for value in importances:
        if value > first[1]:
            second = first
            first = [index, value]
        elif value > second[1]:
            second = [index, value]
        index += 1
    result = df.columns[first[0]] + ' ' + df.columns[second[0]]
    io_yandex.print_result(result, "1b.txt")
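# The manual two-variable scan above can also be written with numpy; a sketch
# assuming `importances` and `df` are as in the function above:
import numpy

top_two = numpy.argsort(importances)[::-1][:2]  # indices of the two largest values, largest first
result = ' '.join(df.columns[i] for i in top_two)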
from mylib import io_yandex


def load_data(path):
    train = pandas.read_csv(path)
    #train = train.head(100)
    target, train = io_yandex.get_value_column(train, 'SalaryNormalized')
    # Normalise the free-text field: lower-case, keep only [a-z0-9].
    train['FullDescription'] = train['FullDescription'].str.lower()
    train['FullDescription'] = train['FullDescription'].replace('[^a-z0-9]', ' ', regex=True)
    train['LocationNormalized'].fillna('nan', inplace=True)
    train['ContractTime'].fillna('nan', inplace=True)
    return target, train


target, train = load_data('salary-train.csv')

tfidf_vectorizer = TfidfVectorizer(min_df=5)
train_text = tfidf_vectorizer.fit_transform(train['FullDescription'])

dict_vectorizer = DictVectorizer()
train_categ = dict_vectorizer.fit_transform(
    train[['LocationNormalized', 'ContractTime']].to_dict('records'))

train = hstack(blocks=[train_text, train_categ])

clf = Ridge(alpha=1, random_state=241)
clf.fit(train, target)

target, train = load_data('salary-test-mini.csv')
train_text = tfidf_vectorizer.transform(train['FullDescription'])
train_categ = dict_vectorizer.transform(
    train[['LocationNormalized', 'ContractTime']].to_dict('records'))
train = hstack(blocks=[train_text, train_categ])

target = clf.predict(train)
io_yandex.print_result(' '.join(map(io_yandex.two_digit_round, target)), '1.txt')
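# io_yandex.get_value_column lives in the shared mylib package. Judging by its
# call sites throughout the repo, it splits one column off a frame; a plausible
# sketch:
def get_value_column(df, column):
    # Return (values of `column`, frame without that column).
    return df[column], df.drop(column, axis=1)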
import os, sys

import pandas
from sklearn.svm import SVC

PACKAGE_PARENT = '..'
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))

from mylib import io_yandex


def load_data():
    data_train = pandas.read_csv('./data/svm-data.csv', header=None)
    classes_train, data_train = io_yandex.get_value_column(data_train, 0)
    return data_train, classes_train


def construct_svc():
    # Very large C: effectively a hard-margin SVM.
    return SVC(kernel='linear', C=100000, random_state=241)


data_train, classes_train = load_data()
clf = construct_svc()
#clf.fit(data_train[:20], classes_train[:20])
clf.fit(data_train, classes_train)
print(clf.support_)

# support_ is 0-based; the grader expects 1-based object numbers.
vectors = [x + 1 for x in clf.support_]
vectors.sort()
io_yandex.print_result(' '.join(map(str, vectors)), "1.txt")
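# If the support vectors themselves (rather than their indices) were of
# interest, sklearn exposes them directly:
print(clf.support_vectors_)  # one row per support vector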
def replace_sex(x):
    # Encode Sex numerically: male 1, infant 0, female -1.
    if x == 'M':
        return 1
    elif x == 'I':
        return 0
    elif x == 'F':
        return -1


def load_data(path):
    train = pandas.read_csv(path)
    #train = train.head(100)
    target, train = io_yandex.get_value_column(train, 'Rings')
    train['Sex'] = list(map(replace_sex, train['Sex']))
    return train, target


X, y = load_data('abalone.csv')

start = time.time()
for i in range(1, 51):
    clf = RandomForestRegressor(n_estimators=i, random_state=1)
    kf = KFold(len(y), n_folds=5, random_state=1, shuffle=True)
    score = mean(cross_val_score(clf, X, y, cv=kf, scoring='r2', n_jobs=-1))
    #print(i, score)
    if score > 0.52:
        # Smallest forest whose cross-validated R^2 exceeds 0.52.
        io_yandex.print_result(str(i), "1.txt")
        break
end = time.time()
print(end - start)
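# For reference, the per-element map above is equivalent to the pandas
# one-liner (shown as an alternative, not to be applied a second time):
#   train['Sex'] = train['Sex'].map({'M': 1, 'I': 0, 'F': -1})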
def print_n_neighbors_and_accuracies(df, classes, kf, path1, path2):
    n_neighbors, max_accuracy = calculate_max_accuracies(df, classes, kf)
    max_accuracy = io_yandex.two_digit_round(max_accuracy)
    io_yandex.print_result(str(n_neighbors), path1)
    io_yandex.print_result(max_accuracy, path2)
newsgroups = datasets.fetch_20newsgroups(
    subset='all', categories=['alt.atheism', 'sci.space'])

vectoriser = TfidfVectorizer()
train = vectoriser.fit_transform(newsgroups.data).toarray()

#grid = {'C': numpy.power(10.0, numpy.arange(-5, 6))}
#cv = KFold(len(newsgroups.data), n_folds=5, shuffle=True, random_state=241)
#clf = SVC(kernel='linear', random_state=241)
#gs = GridSearchCV(clf, grid, scoring='accuracy', cv=cv)
#gs.fit(train, newsgroups.target)
#print(gs.best_estimator_.coef_)
#coef = gs.best_estimator_.coef_
#coef_data = numpy.abs(coef.data)
#key_words = get_key_words(vectoriser, coef_data, 10)
#key_words.sort()
#print(gs.best_params_['C'])
#for a in gs.grid_scores_:
#    print(a.mean_validation_score)  # cross-validation quality estimate
#    print(a.parameters)  # parameter values

clf = SVC(kernel='linear', C=1., random_state=241)  # C > 1: best values
clf.fit(train, newsgroups.target)
coef = clf.coef_
coef_data = numpy.abs(coef[0])  # per-feature absolute weights (coef_ is dense here)
print(coef_data)
key_words = get_key_words(vectoriser, coef_data, 10)
key_words.sort()
io_yandex.print_result(','.join(map(str, key_words)), "1_2.txt")
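# get_key_words is defined elsewhere in this script; from its call site it
# likely maps the n largest weights back to vocabulary terms. A sketch under
# that assumption:
def get_key_words(vectoriser, weights, n):
    feature_names = numpy.asarray(vectoriser.get_feature_names())
    top = numpy.argsort(weights)[::-1][:n]  # indices of the n largest weights
    return list(feature_names[top])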
    return accuracy, precision, recall, f1


def calculate_roc_auc(true, *arg):
    for a in arg:
        yield roc_auc_score(true, a)


def largest_index(a):
    return numpy.argsort(a)[::-1][:1]


data = pandas.read_csv("./data/classification.csv")

tp, fp, fn, tn = calculate_prediction_type(data["true"], data["pred"])
io_yandex.print_result(
    " ".join(map(io_yandex.two_digit_round, [tp, fp, fn, tn])), "4_1.txt")

accuracy, precision, recall, f1 = calculate_scores(data["true"], data["pred"])
io_yandex.print_result(
    " ".join(map(io_yandex.two_digit_round, [accuracy, precision, recall, f1])),
    "4_2.txt")

data = pandas.read_csv("./data/scores.csv")
print(list(data.columns.values))

logreg, svm, knn, tree = calculate_roc_auc(
    data["true"], data["score_logreg"], data["score_svm"],
    data["score_knn"], data["score_tree"])
index = largest_index([logreg, svm, knn, tree]) + 1  # offset 1: first column is "true"
# "".join flattens the one-element column array into a plain string.
io_yandex.print_result("".join(data.columns.values[index]), "4_3.txt")

max_val = 0
max_name = data.columns.values[2]
for index in range(2, len(data.columns.values)):
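# calculate_prediction_type and calculate_scores are defined above this
# fragment; plausible sketches of their bodies, using standard sklearn.metrics
# calls, for reference:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def calculate_prediction_type(true, pred):
    # Binary confusion-matrix cells: TP, FP, FN, TN.
    tp = sum(1 for t, p in zip(true, pred) if t == 1 and p == 1)
    fp = sum(1 for t, p in zip(true, pred) if t == 0 and p == 1)
    fn = sum(1 for t, p in zip(true, pred) if t == 1 and p == 0)
    tn = sum(1 for t, p in zip(true, pred) if t == 0 and p == 0)
    return tp, fp, fn, tn

def calculate_scores(true, pred):
    return (accuracy_score(true, pred), precision_score(true, pred),
            recall_score(true, pred), f1_score(true, pred))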
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))

from mylib import io_yandex


def get_percent(dataframe, column_name, value):
    # Share of rows (in percent) where column_name equals value.
    s = dataframe.loc[dataframe[column_name] == value]
    s = float(len(s.index)) / float(len(dataframe.index)) * 100
    s = io_yandex.two_digit_round(s)
    return s


data = io_yandex.load_titanic_to_dataframe()
print(list(data.columns.values))

s = data['Sex'].value_counts()
s = str(s.iloc[0]) + ' ' + str(s.iloc[1])  # male and female counts
io_yandex.print_result(s, '1.txt')

s = get_percent(data, 'Survived', 1)
io_yandex.print_result(s, '2.txt')

s = get_percent(data, 'Pclass', 1)
io_yandex.print_result(s, '3.txt')

s = data.loc[data.Age.notnull()]
s = s['Age']
s = str(io_yandex.two_digit_round(float(numpy.mean(s, axis=0)))) + ' ' \
    + str(io_yandex.two_digit_round(float(numpy.median(s, axis=0))))
io_yandex.print_result(s, '4.txt')

s = scipy.stats.pearsonr(data['SibSp'], data['Parch'])
s = io_yandex.two_digit_round(s[0])  # pearsonr returns (correlation, p-value)
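# two_digit_round and print_result recur throughout the repo; from their usage
# they appear to format a number and write a one-line answer file. Possible
# sketches (the exact formatting is an assumption):
def two_digit_round(x):
    # Round to two decimals; render integral values without a decimal part.
    r = round(float(x), 2)
    return str(int(r)) if r == int(r) else str(r)

def print_result(text, path):
    # Write the answer string to a submission file.
    with open(path, 'w') as f:
        f.write(text)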
def draw_plot(learning_rate, train_loss, test_loss, index):
    # Plot train/test log-loss per boosting iteration and save to disk.
    plt.figure()
    plt.title('Learning rate = ' + str(learning_rate))
    plt.plot(test_loss, 'r', linewidth=2)
    plt.plot(train_loss, 'g', linewidth=2)
    plt.legend(['test', 'train'])
    plt.savefig(str(index) + '.png')


X_train, X_test, y_train, y_test = load_data('gbm-data.csv')
min_res = 1
for index, learning_rate in enumerate([1, 0.5, 0.3, 0.2, 0.1], start=1):
    train_loss, test_loss = fit_and_log_loss(X_train, y_train, learning_rate)
    draw_plot(learning_rate, train_loss, test_loss, index)
    if index == 4:  # learning_rate = 0.2
        min_res = numpy.argmin(test_loss)  # iteration with the smallest test log-loss
        io_yandex.print_result(
            io_yandex.two_digit_round(test_loss[min_res]) + ' ' + str(min_res),
            '2.txt')
io_yandex.print_result('overfitting', '1.txt')

# Random forest with as many trees as the best boosting iteration found above.
min_res = 37
clf = RandomForestClassifier(n_estimators=min_res, random_state=241)
clf.fit(X_train, y_train)
train_score = clf.predict_proba(X_train)
test_score = clf.predict_proba(X_test)
test_loss = log_loss(y_test, test_score)
io_yandex.print_result(io_yandex.two_digit_round(test_loss), '3.txt')
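# fit_and_log_loss is imported from elsewhere in the repo. For this
# gradient-boosting exercise the usual recipe is to score log_loss over
# staged_decision_function outputs passed through a sigmoid; a sketch under
# that assumption. The real helper presumably reaches X_test/y_test itself;
# here they are explicit parameters, and n_estimators=250 / random_state=241
# are assumptions:
import numpy
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss

def fit_and_log_loss(X_train, y_train, learning_rate, X_test, y_test):
    clf = GradientBoostingClassifier(n_estimators=250, learning_rate=learning_rate,
                                     random_state=241)
    clf.fit(X_train, y_train)

    def _sigmoid(margin):
        # Convert decision-function margins to class-1 probabilities.
        return 1. / (1. + numpy.exp(-margin))

    # One loss value per boosting iteration.
    train_loss = [log_loss(y_train, _sigmoid(m.ravel()))
                  for m in clf.staged_decision_function(X_train)]
    test_loss = [log_loss(y_test, _sigmoid(m.ravel()))
                 for m in clf.staged_decision_function(X_test)]
    return train_loss, test_loss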
from numpy import corrcoef, argmax

PACKAGE_PARENT = "../.."
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))

from mylib import io_yandex

train = pandas.read_csv('close_prices.csv')
target, train = io_yandex.get_value_column(train, 'date')
#print(train.head(5))

pca = PCA(n_components=10)
pca.fit(train)

ratio = 0.
number = 0
while ratio < 0.9 and number < len(pca.explained_variance_ratio_):
    ratio += pca.explained_variance_ratio_[number]
    number += 1
print(number, ratio)
io_yandex.print_result(str(number), '1_1.txt')

reduced = pca.transform(train)[:, 0]
real = pandas.read_csv('djia_index.csv')
real = real['^DJI']
correlation = corrcoef(reduced, real)[0, 1]
io_yandex.print_result(str(correlation), '1_2.txt')

company = train.columns[argmax(pca.components_[0])]
io_yandex.print_result(company, '1_3.txt')
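# The while-loop above is equivalent to a cumulative sum over the
# explained-variance ratios; a compact alternative for reference:
import numpy

cumulative = numpy.cumsum(pca.explained_variance_ratio_)
number = int(numpy.searchsorted(cumulative, 0.9)) + 1  # components for >= 90% variance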
    data_test = pandas.read_csv('../data/perceptron-test.csv', header=None)
    classes_test, data_test = io_yandex.get_value_column(data_test, 0)
    return data_train, classes_train, data_test, classes_test


def scale_data(*arg):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(arg[0])
    yield X_train_scaled
    for a in arg[1:]:
        yield scaler.transform(a)


def teach(data_train, classes_train, data_test):
    clf = Perceptron(random_state=241)
    clf.fit(data_train, classes_train)
    classes_predictions = clf.predict(data_test)
    return classes_predictions


data_train, classes_train, data_test, classes_test = load_data()
predictions = teach(data_train, classes_train, data_test)
non_scaled_accuracies = accuracy_score(classes_test, predictions)
print(non_scaled_accuracies)

data_train, data_test = scale_data(data_train, data_test)
predictions = teach(data_train, classes_train, data_test)
scaled_accuracies = accuracy_score(classes_test, predictions)
print(scaled_accuracies)

io_yandex.print_result(
    io_yandex.three_digit_round(scaled_accuracies - non_scaled_accuracies),
    "2_2.txt")
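# The head of load_data is cut off in this fragment. By symmetry with the
# test-set lines above, it most likely reads the training split the same way
# (the training path is a guess):
#
# def load_data():
#     data_train = pandas.read_csv('../data/perceptron-train.csv', header=None)
#     classes_train, data_train = io_yandex.get_value_column(data_train, 0)
#     ... (continues with the test-set lines shown above)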
from numpy import linspace, mean

PACKAGE_PARENT = '..'
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))

from mylib import io_yandex


def cross_validate(df):
    accuracies = []
    params = []
    kf = KFold(len(df.target), n_folds=5, shuffle=True, random_state=42)
    for p in linspace(1., 10., 100):
        regressor = KNeighborsRegressor(n_neighbors=5, weights='distance',
                                        metric='minkowski', p=p)
        # sklearn's 'mean_squared_error' scorer returns negated MSE,
        # so larger (closer to zero) is better.
        score = cross_val_score(regressor, X=df.data, y=df.target, cv=kf,
                                scoring='mean_squared_error')
        accuracies.append(mean(score))
        params.append(p)
    return accuracies, params


def find_best_param(df):
    # Return the Minkowski p with the best (largest) mean CV score.
    accuracies, params = cross_validate(df)
    max_accuracy = max(accuracies)
    index = accuracies.index(max_accuracy)
    return params[index]


df = sklearn.datasets.load_boston()
df.data = scale(df.data)
best_p = find_best_param(df)
io_yandex.print_result(io_yandex.one_digit_round(best_p), "1b.txt")