from Ensemble.Predictor import Predictor import xgboost as xgb import sys test = False titanic_train_data = pd.read_csv(r"data/train.csv").drop(['Ticket'], axis=1) titanic_test_data = pd.read_csv(r"data/test.csv").drop(['Ticket'], axis=1) all_data = titanic_train_data.append(titanic_test_data).drop(['Survived'], axis=1) survived_data = titanic_train_data['Survived'] freq_port = all_data.Embarked.dropna().mode()[0] all_data.loc[:, 'Embarked'] = all_data['Embarked'].fillna(freq_port) all_data = ap.correct_age(all_data) #all_data = cp.correct_cabin(all_data) all_data.loc[:, 'Cabin'] = all_data['Cabin'].map(lambda x: 'U' if pd.isnull(x) else x[0]) all_data.Cabin.replace(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'U'], [0, 0, 0, 0, 0, 0, 0, 0, 1], inplace=True) all_data = tp.preprocessing(all_data) to_predict = all_data[survived_data.shape[0]:] train_x, test_x, train_y, test_y = train_test_split( all_data[0:survived_data.shape[0]], survived_data, test_size=0.4) with open('error.log', 'w+') as err_log:
# Titanic random-forest bagging script (reconstructed from a collapsed
# one-line paste; code tokens unchanged, only formatting and comments added).
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import numpy as np
# read data
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

titanic_train_data = pd.read_csv(r"data/train.csv")
titanic_test_data = pd.read_csv(r"data/test.csv")

# correct age
# NOTE(review): `ap` and `tp` are not imported in this view — presumably
# project-local age/preprocessing helpers; confirm against the full file.
titanic_train_data = ap.correct_age(titanic_train_data)
#titanic_train_data.Age.dropna(axis=0)
titanic_test_data = ap.correct_age(titanic_test_data)

# preprocess data
titanic_train_data = tp.preprocessing(titanic_train_data)
titanic_test_data = tp.preprocessing(titanic_test_data)

# split train data
# One uint32 slot per test row — presumably accumulates per-row votes across
# the 2000 resampling rounds below; the accumulation itself is not visible here.
predictions = np.zeros(titanic_test_data.shape[0], np.uint32)
counter = 0
# 2000 independent train/validation resplits of the training data.
for i in range(2000):
    counter += 1
    # NOTE(review): the argument list of this call is truncated in this view
    # of the file — reproduced as-is rather than guessed at.
    train_x, test_x, train_y, test_y = train_test_split(