Example #1
0
        if 'name' not in ignored:
            if 'name' in entry:
                if entry['name']:
                    name_tokens = entry['name'].translate(table, string.punctuation).lower().split()
                    for t in name_tokens:
                        entry['name=' + t] = 1.0

                del entry['name']

# Name of the field treated as the target (presumably the label column
# of the CSV files below -- confirm against the code that consumes it).
target_field = 'survived'

# Locations of the three input CSV files.
training_file = 'data/train.csv'
test_file = 'data/test.csv'
other_file = 'data/titanic3.csv'

# Load each file.  csv_to_row_dicts returns a pair: a mapping (it is used
# via .values() below) and a second value kept as `headers` for the
# training file only.
# NOTE(review): the meaning of the ['name', 'ticket'] argument is defined
# by csv_to_row_dicts, which is not visible here -- confirm.
training, headers = csv_to_row_dicts(training_file, ['name', 'ticket'])
test, _ = csv_to_row_dicts(test_file, ['name', 'ticket'])
other, _ = csv_to_row_dicts(other_file, ['name', 'ticket'])

# The result of the first comparison is discarded; presumably the call is
# made for its side effects on the arguments -- verify in compare_data.
compare_data(other, training)
cheat_test = compare_data(other, test)

# Materialize the values as real lists before concatenating: on Python 3
# dict.values() returns a view, and `view + view` raises TypeError.  On
# Python 2 list(...) simply copies the list, so behavior is unchanged.
analysis_set = list(training.values())
cheat_set = list(cheat_test.values())
full_set = analysis_set + cheat_set

# Delete Uninteresting Variables
ignore_fields = ['boat', 'home.dest', 'body', 'passengerid', 'survived',
    # 'sex',
    # 'pclass',
    # 'parch',
from ml_utils.data.sklearn_compatible import PercentileCategorizer
from ml_utils.learners.utils import multiclass_prediction
from ml_utils.metrics.metrics import logloss
from ml_utils.analysis.analysis import val_frequency_hist, analyze_date_format, write_val_hist
from py_utils.utils import is_num

import string, csv, re
from calendar import monthrange

import numpy as np
from sklearn import pipeline, feature_extraction, ensemble, linear_model, tree
from sklearn.cross_validation import LabelShuffleSplit as Splitter

# Input CSV locations.  Only the training file is loaded in this section;
# the test-file load is left commented out below.
test_file = 'data/test.csv'
training_file = 'data/train.csv'

# Load the training rows; the second element of the returned pair is kept
# as `headers`.
# NOTE(review): the effect of display/row_limit is defined by
# csv_to_row_dicts, which is not visible here -- confirm.
training, headers = csv_to_row_dicts(
    training_file,
    row_limit=0,
    display=True,
)
# test, _ = csv_to_row_dicts(test_file, display=True)

# Values of the loaded mapping, without their keys.
training_set = training.values()
# test_set = test.values()

if 0:
    full_set = training_set + test_set

    Addresses = {};    Mismatches = [];    N_mismatches = 0
    for entry in training_set:
        add_data = {'address': '', 'x': '', 'y': ''}
        for key in entry:
            if key in ['address', 'x', 'y']:
                if is_num(entry[key]):
                    add_data[key] = "%.2f" % float(entry[key])