def load_data(path):
    """Read the Activity dataset and split it into train/test parts.

    Returns (X_train, X_test, y_train, y_test) with a fixed random seed
    so the split is reproducible.
    """
    frame = pandas.read_csv(path)
    # frame = frame.head(100)
    target, frame = io_yandex.get_value_column(frame, 'Activity')
    features = frame.values
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.8, random_state=241)
    return X_train, X_test, y_train, y_test
def load_data(path):
    """Load the salary dataset and return (SalaryNormalized, cleaned features).

    Text in FullDescription is lower-cased and every non-alphanumeric
    character is replaced with a space; missing categorical values become
    the literal string 'nan'.
    """
    frame = pandas.read_csv(path)
    # frame = frame.head(100)
    target, frame = io_yandex.get_value_column(frame, 'SalaryNormalized')
    lowered = frame['FullDescription'].str.lower()
    frame['FullDescription'] = lowered.replace('[^a-z0-9]', ' ', regex=True)
    frame['LocationNormalized'].fillna('nan', inplace=True)
    frame['ContractTime'].fillna('nan', inplace=True)
    return target, frame
def load_data(path):
    """Return (SalaryNormalized target, cleaned feature frame) for *path*.

    NOTE(review): this duplicates the load_data variant above — presumably
    these lines come from different exercise files concatenated together.
    """
    frame = pandas.read_csv(path)
    # frame = frame.head(100)
    target, frame = io_yandex.get_value_column(frame, 'SalaryNormalized')
    # Normalize free text: lower-case, strip everything but [a-z0-9].
    frame['FullDescription'] = frame['FullDescription'].str.lower()
    frame['FullDescription'] = frame['FullDescription'].replace(
        '[^a-z0-9]', ' ', regex=True)
    # Fill missing categories with an explicit 'nan' token.
    frame['LocationNormalized'].fillna('nan', inplace=True)
    frame['ContractTime'].fillna('nan', inplace=True)
    return target, frame
import os
import sys

import pandas
from sklearn.decomposition import PCA
from numpy import corrcoef, argmax

# Make the package two levels up importable for the mylib helper.
PACKAGE_PARENT = "../.."
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))
from mylib import io_yandex

# Fit a 10-component PCA on the Dow Jones component close prices.
prices = pandas.read_csv('close_prices.csv')
target, prices = io_yandex.get_value_column(prices, 'date')
# print(prices.head(5))
pca = PCA(n_components=10)
pca.fit(prices)

# Count how many components are needed to reach 90% explained variance.
ratio = 0.0
number = 0
while ratio < 0.9 and number < len(pca.explained_variance_ratio_):
    ratio += pca.explained_variance_ratio_[number]
    number += 1
print(number, ratio)
io_yandex.print_result(str(number), '1_1.txt')

# Correlate the first principal component with the actual DJIA index.
first_component = pca.transform(prices)[:, 0]
dji = pandas.read_csv('djia_index.csv')['^DJI']
correlation = corrcoef(first_component, dji)[0, 1]
io_yandex.print_result(str(correlation), '1_2.txt')
import os
import sys

import pandas
from sklearn.decomposition import PCA
from numpy import corrcoef, argmax

# NOTE(review): this script duplicates the one above — likely two copies of
# the same exercise file. Behavior kept identical.
PACKAGE_PARENT = "../.."
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))
from mylib import io_yandex

train = pandas.read_csv('close_prices.csv')
target, train = io_yandex.get_value_column(train, 'date')
# print(train.head(5))
pca = PCA(n_components=10)
pca.fit(train)

# Accumulate explained-variance ratios until 90% is covered.
ratio = 0.0
number = 0
while ratio < 0.9 and number < len(pca.explained_variance_ratio_):
    ratio += pca.explained_variance_ratio_[number]
    number += 1
print(number, ratio)
io_yandex.print_result(str(number), '1_1.txt')

# First principal component vs. the real Dow Jones index.
reduced = pca.transform(train)[:, 0]
real = pandas.read_csv('djia_index.csv')
real = real['^DJI']
correlation = corrcoef(reduced, real)[0, 1]
io_yandex.print_result(str(correlation), '1_2.txt')
return df def create_decision_tree(dataframe, value_column): clf = tree.DecisionTreeClassifier(random_state=241) return clf.fit(dataframe, value_column) def calculate_most_important_value(df, importances): first = [0, 0] #index value second = [0, 0] #index value index = 0 for value in importances: if value > first[1]: second = first first = [index, value] elif value > second[1]: second = [index, value] index += 1 result = df.columns[first[0]] + ' ' + df.columns[second[0]] io_yandex.print_result(result, "1b.txt") df = io_yandex.load_titanic_to_dataframe() df = prepare_data(df) is_survived, df = io_yandex.get_value_column(df,'Survived') clf = create_decision_tree(df, is_survived) importances = clf.feature_importances_ print(importances) calculate_most_important_value(df, importances) #Fare Sex
def load_data():
    """Read the SVM training set; column 0 holds the class labels.

    Returns (feature frame, label column).
    """
    frame = pandas.read_csv('./data/svm-data.csv', header=None)
    labels, frame = io_yandex.get_value_column(frame, 0)
    return frame, labels
def load_data(path):
    """Load the abalone dataset; return (features, Rings target).

    The categorical Sex column is converted to numbers via replace_sex
    (defined elsewhere in this file).
    """
    frame = pandas.read_csv(path)
    # frame = frame.head(100)
    target, frame = io_yandex.get_value_column(frame, 'Rings')
    frame['Sex'] = [replace_sex(sex) for sex in frame['Sex']]
    return frame, target
def load_data():
    """Read the perceptron train/test CSVs; column 0 is the class label.

    Returns (train features, train labels, test features, test labels).
    """
    frame_train = pandas.read_csv('../data/perceptron-train.csv', header=None)
    labels_train, frame_train = io_yandex.get_value_column(frame_train, 0)
    frame_test = pandas.read_csv('../data/perceptron-test.csv', header=None)
    labels_test, frame_test = io_yandex.get_value_column(frame_test, 0)
    return frame_train, labels_train, frame_test, labels_test
def cross_validate(df, classes, kf):
    """Return mean cross-validated accuracy for k = 1..50 neighbors.

    Entry i of the returned list is the mean fold score of a
    KNeighborsClassifier with n_neighbors = i + 1.
    """
    accuracies = []
    # BUG FIX: the loop header was corrupted to
    # `range(sklearn.metrics.r2_score1,51)` (a paste error) and raised
    # AttributeError/TypeError; k must run over 1..50 inclusive, matching
    # the "+ 1" index shift in calculate_max_accuracies below.
    for k in range(1, 51):
        classifier = KNeighborsClassifier(n_neighbors=k)
        score = cross_val_score(classifier, X=df, y=classes, cv=kf)
        accuracies.append(mean(score))
    return accuracies


def calculate_max_accuracies(df, classes, kf):
    """Return (best k, best mean accuracy) over k = 1..50."""
    accuracies = cross_validate(df, classes, kf)
    max_accuracy = max(accuracies)
    # List index 0 corresponds to k = 1, hence the +1 shift.
    n_neighbors = accuracies.index(max_accuracy) + 1
    # (removed leftover debug print of accuracies[2])
    return n_neighbors, max_accuracy


def print_n_neighbors_and_accuracies(df, classes, kf, path1, path2):
    """Write the best k to path1 and its two-digit-rounded accuracy to path2."""
    n_neighbors, max_accuracy = calculate_max_accuracies(df, classes, kf)
    max_accuracy = io_yandex.two_digit_round(max_accuracy)
    io_yandex.print_result(str(n_neighbors), path1)
    io_yandex.print_result(max_accuracy, path2)


# Compare kNN accuracy on the raw vs. scaled wine features.
df = io_yandex.load_wine_to_dataframe()
classes, df = io_yandex.get_value_column(df, 0)
kf = KFold(len(df.index), n_folds=5, shuffle=True, random_state=42)
print_n_neighbors_and_accuracies(df, classes, kf, "1.txt", "2.txt")
df = scale(df)
print_n_neighbors_and_accuracies(df, classes, kf, "3.txt", "4.txt")