"""Train galactic and extragalactic MLP classifiers on the training set.

Loads the training light curves and metadata, splits objects into
galactic / extragalactic populations, then runs the same four-step
pipeline on each split: feature engineering -> target preprocessing
-> standardization -> MLP training.
"""
import utils

# path_to_data = '/courses/cs342/Assignment2/'
path_to_data = ''


def _train_split(split_data, split_meta, split_target, is_galactic):
    """Run the full per-split pipeline and train that split's MLP.

    Parameters mirror the outputs of utils.gal_split_data; `is_galactic`
    is forwarded to utils.train_mlp so it can select the right model/labels.
    """
    features = utils.feature_engineering(split_data, split_meta)
    wtable, labels, classes, target_map = utils.preprocess_target(split_target)
    # Standardize AFTER feature engineering so engineered columns are scaled too.
    features = utils.standardize_data(features)
    utils.train_mlp(features, wtable, labels, classes, target_map, is_galactic)


train, train_meta = utils.load_train(path_to_data)
g_train, eg_train, g_meta, eg_meta, g_target, eg_target = utils.gal_split_data(
    train, train_meta, True)

# Identical pipeline for both populations; only the data and the
# galactic flag differ (previously duplicated inline).
_train_split(g_train, g_meta, g_target, True)
_train_split(eg_train, eg_meta, eg_target, False)
pd.read_csv(path_to_data + 'test_set.csv', chunksize=chunks, iterator=True)): if i_c != 0: data_chunk = pd.concat([straddler, data_chunk], ignore_index=True) arr = data_chunk['object_id'].unique() straddler = data_chunk.loc[data_chunk['object_id'] == arr[len(arr) - 1]] data_chunk = data_chunk[data_chunk.object_id != arr[len(arr) - 1]] data_chunk = data_chunk.reset_index(drop=True) meta_chunk = test_meta[test_meta['object_id'].isin( data_chunk['object_id'].unique())] meta_chunk = meta_chunk.reset_index(drop=True) g_data, eg_data, g_meta, eg_meta = utils.gal_split_data( data_chunk, meta_chunk, False) g_features = None eg_features = None if g_meta.shape[0] > 0: #make meta not drop object_id in the feature engineering function g_features = utils.feature_engineering(g_data, g_meta, False) if i_c == 0: g_features.to_csv('test_g_features.csv', header=True, mode='a', index=False) else: g_features.to_csv('test_g_features.csv', header=False,
"""Exploratory script: engineer features and fit a RandomForest baseline.

NOTE(review): this script deliberately calls sys.exit() after printing the
engineered extragalactic feature columns, so the RandomForest section below
it is currently unreachable debugging scaffolding -- remove the exit to
actually fit the model.
"""
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

import utils  # BUG FIX: `utils` was used below but never imported

# path_to_data = '/courses/cs342/Assignment2/'
path_to_data = ''

train, train_meta = utils.load_train(path_to_data)
g_train, eg_train, g_meta, eg_meta, g_target, eg_target = utils.gal_split_data(
    train, train_meta, True)

g_features = utils.feature_engineering(g_train, g_meta)
eg_features = utils.feature_engineering(eg_train, eg_meta)

# Debug checkpoint: inspect the engineered columns, then stop.
# (Was a Python-2 `print` statement; now a Python-3-compatible call.)
print(eg_features.columns)
sys.exit()

# X_train, X_test, y_train, y_test = train_test_split(
#     g_features, g_target, test_size=0.1, random_state=0)
g_clf = RandomForestClassifier(n_estimators=200, max_depth=25, random_state=0)
g_clf.fit(g_features, g_target)
# prediction = g_clf.predict(X_test)