Ejemplo n.º 1
0
import utils

# Alternative data location (disabled): '/courses/cs342/Assignment2/'
path_to_data = ''

# Read the training set and its metadata from path_to_data.
train, train_meta = utils.load_train(path_to_data)

# A single call yields two subsets (g_* and eg_*), each made of data,
# metadata and target.
# NOTE(review): the g/eg prefixes presumably mean galactic/extragalactic and
# the trailing True presumably selects "training" mode -- confirm against
# utils.gal_split_data.
subsets = utils.gal_split_data(train, train_meta, True)
g_train, eg_train, g_meta, eg_meta, g_target, eg_target = subsets

# --- g_* subset: build features, encode target, standardize, train ---
g_raw_features = utils.feature_engineering(g_train, g_meta)
g_encoded = utils.preprocess_target(g_target)
g_wtable, g_labels, g_classes, g_target_map = g_encoded
g_features = utils.standardize_data(g_raw_features)
# The last argument is True here and False for the eg_* subset below; its
# meaning is defined by utils.train_mlp.
utils.train_mlp(g_features, g_wtable, g_labels, g_classes, g_target_map, True)

# --- eg_* subset: same steps in the same order ---
eg_raw_features = utils.feature_engineering(eg_train, eg_meta)
eg_encoded = utils.preprocess_target(eg_target)
eg_wtable, eg_labels, eg_classes, eg_target_map = eg_encoded
eg_features = utils.standardize_data(eg_raw_features)
utils.train_mlp(
    eg_features, eg_wtable, eg_labels, eg_classes, eg_target_map, False)
Ejemplo n.º 2
0
        pd.read_csv(path_to_data + 'test_set.csv',
                    chunksize=chunks,
                    iterator=True)):
    if i_c != 0:
        data_chunk = pd.concat([straddler, data_chunk], ignore_index=True)

    arr = data_chunk['object_id'].unique()
    straddler = data_chunk.loc[data_chunk['object_id'] == arr[len(arr) - 1]]
    data_chunk = data_chunk[data_chunk.object_id != arr[len(arr) - 1]]
    data_chunk = data_chunk.reset_index(drop=True)

    meta_chunk = test_meta[test_meta['object_id'].isin(
        data_chunk['object_id'].unique())]
    meta_chunk = meta_chunk.reset_index(drop=True)

    g_data, eg_data, g_meta, eg_meta = utils.gal_split_data(
        data_chunk, meta_chunk, False)

    g_features = None
    eg_features = None

    if g_meta.shape[0] > 0:
        #make meta not drop object_id in the feature engineering function
        g_features = utils.feature_engineering(g_data, g_meta, False)
        if i_c == 0:
            g_features.to_csv('test_g_features.csv',
                              header=True,
                              mode='a',
                              index=False)
        else:
            g_features.to_csv('test_g_features.csv',
                              header=False,
Ejemplo n.º 3
0
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier

# Alternative data location (disabled): '/courses/cs342/Assignment2/'
path_to_data = ''

# Read the training set and its metadata from path_to_data.
train, train_meta = utils.load_train(path_to_data)

# Split into the g_* and eg_* subsets (data, metadata and target for each).
# NOTE(review): the trailing True presumably selects "training" mode, in which
# targets are returned as well -- confirm against utils.gal_split_data.
g_train, eg_train, g_meta, eg_meta, g_target, eg_target = utils.gal_split_data(
    train, train_meta, True)

g_features = utils.feature_engineering(g_train, g_meta)

eg_features = utils.feature_engineering(eg_train, eg_meta)
# Parenthesised single-argument print: same output under Python 2, and valid
# syntax under Python 3 (the bare print statement is a SyntaxError there).
print(eg_features.columns)

# NOTE(review): debugging stop. The script exits here, so the classifier
# training below never runs. Remove these two lines to re-enable it.
import sys
sys.exit()

#X_train, X_test, y_train, y_test = train_test_split(g_features, g_target, test_size = 0.1, random_state=0)

# Random forest fitted on the full g_* feature table with a fixed seed so that
# runs are repeatable.
g_clf = RandomForestClassifier(n_estimators=200, max_depth=25, random_state=0)
g_clf.fit(g_features, g_target)

#prediction = g_clf.predict(X_test)