Example 1
def make_final_sets():
    X_train, X_test, y_train, y_test = get_cleaned_train_test_df()

    full_pipeline = make_final_transformation_pipe()
    X_train_processed_values = full_pipeline.fit_transform(X_train)
    X_test_processed_values = full_pipeline.transform(X_test)

    # Add column names to build the processed dataframe
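    # NOTE: newer scikit-learn releases renamed get_feature_names() to
    # get_feature_names_out(); adjust the call below to match your version.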
    region_ohe_features = list(
        full_pipeline.named_transformers_["nom"].get_feature_names())
    column_names = CONTINUOUS_FEATURES + ORDINAL_FEATURES + region_ohe_features
    X_train_processed = pd.DataFrame(X_train_processed_values,
                                     columns=column_names)
    X_test_processed = pd.DataFrame(X_test_processed_values,
                                    columns=column_names)

    # Drop one of the ohe features to limit correlations in the data set
    for df in (X_train_processed, X_test_processed):
        df.drop("x0_EUROPE", axis=1, inplace=True)

    # Save the data
    df_train_processed = X_train_processed.join(y_train.reset_index(drop=True))
    df_train_processed.to_pickle(data_path("processed", "train_processed.pkl"))

    df_test_processed = X_test_processed.join(y_test.reset_index(drop=True))
    df_test_processed.to_pickle(data_path("processed", "test_processed.pkl"))

    return df_train_processed, df_test_processed
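
A minimal usage sketch, not part of the original snippet: it assumes make_final_sets is importable from the project and that the target column is "mpg", as in the other examples here.

# Hypothetical usage of make_final_sets(); the import path is an assumption.
# from src.data import make_final_sets
from sklearn.linear_model import LinearRegression

df_train, df_test = make_final_sets()
X_tr, y_tr = df_train.drop("mpg", axis=1), df_train["mpg"]
X_te, y_te = df_test.drop("mpg", axis=1), df_test["mpg"]

model = LinearRegression().fit(X_tr, y_tr)
print(model.score(X_te, y_te))  # R^2 on the held-out test set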
Example 2
    def transform_address_test():
        from src.utils import data_path
        import pandas as pd
        train_path = data_path('train.csv')
        train_frame = pd.read_csv(train_path)

        print(train_frame['Address'].apply(__address_to_abbs))
Example 4
def transform_set(name, train=True):
    train_path = data_path(name)
    train_frame = pd.read_csv(train_path)
    if train:
        del train_frame['Descript']
        del train_frame['Resolution']
    del train_frame['Address']

    train_frame['X'] = normalize_features(train_frame['X'])
    train_frame['Y'] = normalize_features(train_frame['Y'])
    train_frame['Times'] = train_frame['Dates'].apply(transform_normalized_time)

    train_frame['Year'] = train_frame['Dates'].apply(transform_data_to('year'))
    train_frame['Month'] = train_frame['Dates'].apply(transform_data_to('month'))
    del train_frame['Dates']

    transformer = OneHotTransformer(categorical(train_frame), train_frame.columns)
    transformer.fit(train_frame)
    result = transformer.transform_frame(train_frame)

    not_regex = "^Dates|^PdDistrict|^DayOfWeek|^Resolution|^X|^Y"
    train_transformed = result.filter(regex=not_regex)
    label_transformed = None
    if train: label_transformed = result.filter(regex="^Category")

    return train_transformed, label_transformed
Example 5
def transform_set(name, train=True):
    train_path = data_path(name)
    train_frame = pd.read_csv(train_path)

    categories = None
    if train: categories = train_frame['Category']

    if train:
        del train_frame['Descript']
        del train_frame['Resolution']
        del train_frame['Category']
    del train_frame['Address']
    if not train: del train_frame['Id']

    train_frame['X'] = normalize_features(train_frame['X'])
    train_frame['Y'] = normalize_features(train_frame['Y'])
    train_frame['Times'] = train_frame['Dates'].apply(
        transform_normalized_time)
    # train_frame = transform_address(train_frame)
    print(train_frame)

    # train_frame['Year'] = train_frame['Dates'].apply(transform_data_to('year'))
    # train_frame['Month'] = train_frame['Dates'].apply(transform_data_to('month'))
    del train_frame['Dates']

    transformer = OneHotTransformer(categorical(train_frame),
                                    train_frame.columns)
    transformer.fit(train_frame)
    train_transformed = transformer.transform_frame(train_frame)

    return train_transformed, categories
Example 6
def create_submission(prediction, file_name):
    with open(data_path(file_name), 'w') as f:
        f.write("{},{}\n".format("id", ",".join(classes)))
        for i in range(len(prediction)):
            f.write("{},{}\n".format(str(i),
                                     ",".join([str(j)
                                               for j in prediction[i]])))
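
A hypothetical call, to show the expected shape of the arguments; the names clf and X_test and the output file name are assumptions, and classes must already be defined as in the surrounding module.

# Hypothetical usage: write one probability per class for every test row.
prediction = clf.predict_proba(X_test)
create_submission(prediction, 'submission.csv')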
Example 9
def load_raw_data(file_name="auto-mpg.data"):
    file_path = data_path("raw", file_name)
    return pd.read_csv(
        file_path,
        delim_whitespace=True,
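        # NOTE: delim_whitespace is deprecated on recent pandas; sep=r"\s+" is the equivalent.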
        header=None,
        names=[
            "mpg",
            "cylinders",
            "displacement",
            "horsepower",
            "weight",
            "acceleration",
            "year",
            "origin",
            "name",
        ],
    )
Example 10
def transform_set(name, train=True):
    train_path = data_path(name)
    train_frame = pd.read_csv(train_path)

    categories = None
    if train: categories = train_frame['Category']

    if train:
        del train_frame['Descript']
        del train_frame['Resolution']
        del train_frame['Category']
    del train_frame['Address']
    if not train: del train_frame['Id']

    train_frame['X'] = normalize_features(train_frame['X'])
    train_frame['Y'] = normalize_features(train_frame['Y'])
    train_frame['Times'] = train_frame['Dates'].apply(transform_normalized_time)
    # train_frame = transform_address(train_frame)
    # print(train_frame)

    train_frame['Year'] = train_frame['Dates'].apply(transform_data_to('year'))
    print("sdfasdfsadfasdfasdf")
    print(train_frame.columns)
    # train_frame['Month'] = train_frame['Dates'].apply(transform_data_to('month'))
    del train_frame['Dates']

    transformer = OneHotTransformer(categorical(train_frame), train_frame.columns)
    transformer.fit(train_frame)
    train_transformed = transformer.transform_frame(train_frame)

    label_transformed = None
    if train:
        values = sorted(list(set(categories)))
        mapping = {value: index for index, value in enumerate(values)}
        label_transformed = [mapping[cat] for cat in categories]
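        # (equivalent to fitting sklearn.preprocessing.LabelEncoder on Category)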

    # print(train_transformed.columns)
    return train_transformed, label_transformed
Example 11
import pandas as pd
from src.utils import data_path
from collections import Counter

df = pd.read_csv(data_path('train.csv'))

# columns = list(df.columns)
#
# from itertools import groupby
# import re
#
# grouped = groupby(columns, key=lambda name: re.split('_|[0-9]', name)[0])
#
# for key, group in grouped:
#     print(key, list(group))


def desc(name):
    var = df[[name]]
    varV = df[name]

    print(set(varV))
    print(var.describe())
    print(len(var))
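    # -999999 appears to be used as a missing-value sentinel in this data set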
    print(len(var[varV == -999999]))
    counter = Counter(varV)
    print(counter.most_common())


# desc('var3')
# desc('var36')
Example 12
from sklearn.linear_model import LogisticRegression
import pandas as pd

from src.submission import make_submission
from src.utils import data_path
from sklearn import metrics, cross_validation
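# NOTE: sklearn.cross_validation was replaced by sklearn.model_selection in 0.18,
# and the 'log_loss' scorer used below is now called 'neg_log_loss'.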

df = pd.read_csv(data_path('train.csv'))
df_test = pd.read_csv(data_path('test.csv'))

clf = LogisticRegression()

target = df['TARGET']
del df['TARGET']
scores = cross_validation.cross_val_score(clf,
                                          df,
                                          target,
                                          cv=5,
                                          scoring='log_loss')
print(scores.all())
print(scores)
print(scores.mean())
clf.fit(df, target)

print(len(df_test))
print(len(clf.predict_proba(df_test)))

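# each predict_proba row is (P(class 0), P(class 1)); keep the positive-class probability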
prediction = [pred for _, pred in clf.predict_proba(df_test)]

# make_submission('baseline.csv', df_test['ID'], prediction)
Example 13
def get_cleaned_train_test_df():
    clean_data_path = data_path("interim", "data_cleaned.pkl")
    df = pd.read_pickle(clean_data_path)
    X = df.drop("mpg", axis=1)
    y = df["mpg"]
    return train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
Example 14
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from src.utils import data_path, setup
import pandas as pd

setup()


train_path = data_path('train.csv')
train_frame = pd.read_csv(train_path)
train_frame['Descript'] = train_frame['Descript'].apply(
    lambda des: re.sub(r'[(),]', '', des))

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])

text_clf = text_clf.fit(train_frame['Descript'], train_frame["Category"])
print(train_frame.iloc[0]['Descript'], train_frame.iloc[0]['Category'])

prediction = text_clf.predict_proba(train_frame['Descript'])

print(prediction[0])
print(text_clf.classes_)

# TODO: validate
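
A hedged sketch of the validation the TODO above asks for, using a simple hold-out split; the sklearn.model_selection names are assumptions about the installed version, not part of the original script.

# Hold-out validation for the Descript -> Category pipeline defined above.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_tr, X_val, y_tr, y_val = train_test_split(
    train_frame['Descript'], train_frame['Category'],
    test_size=0.2, random_state=42)
text_clf.fit(X_tr, y_tr)
print(accuracy_score(y_val, text_clf.predict(X_val)))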
Example 15
def make_submission(file_name, test_id, prediction):
    with open(data_path(file_name), 'w') as f:
        f.write("ID,TARGET\n")
        for id, pred in zip(test_id, prediction):
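            # predict_proba rows look like (P(0), P(1)); keep the positive-class probability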
            if hasattr(pred, '__iter__'): pred = pred[1]
            f.write('{},{}\n'.format(str(id), str(pred)))
Example 16
def create_dataset(paths):
    frames = [pd.read_csv(data_path(path)) for path in paths]
    return pd.concat(frames, axis=1)
Example 18
# from sklearn.externals import joblib
import numpy as np
import pandas as pd

# joblib.dump(clf, 'filename.pkl')
from src.utils import data_path

train = pd.read_csv(data_path('train.csv'))
labels = train['Category']
del train

classifiers_outputs_train = ['train1.pkl']
classifiers_outputs_test = ['test1.pkl']
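# NOTE: despite the .pkl extension, these files are read below with pd.read_csv,
# so they are assumed to be plain CSV dumps of per-classifier predictions.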


def create_dataset(paths):
    frames = [pd.read_csv(data_path(path)) for path in paths]
    return pd.concat(frames, axis=1)


train_set = create_dataset(classifiers_outputs_train)
train_set.to_csv('join_train.csv')

test_set = create_dataset(classifiers_outputs_test)
test_set.to_csv('join_test.csv')
Example 19
from sklearn.preprocessing import MultiLabelBinarizer
from collections import Counter

from src.submission import create_submission
from src.utils import data_path, setup
import pandas as pd
import numpy as np

setup(pd)


def to_singleton(iterable):
    return [[elem] for elem in iterable]


train_path = data_path('train.csv')
train_frame = pd.read_csv(data_path('train.csv'))

submission_size = 884262

classes = ['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY', 'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
           'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING',
           'FRAUD', 'GAMBLING', 'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON',
           'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY',
           'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY',
           'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS']

category = train_frame['Category']
mapping = {clazz: num for (num, clazz) in enumerate(classes)}

most_freq_class = Counter(category).most_common()[0][0]
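
A hedged continuation sketch, not in the original: how these pieces could feed a most-frequent-class baseline through create_submission (defined in an earlier example); the output file name is an assumption.

# Hypothetical baseline: probability 1 for the most frequent class, 0 for the rest.
row = [1.0 if clazz == most_freq_class else 0.0 for clazz in classes]
prediction = [row] * submission_size
create_submission(prediction, 'baseline.csv')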
Example 20
import numpy as np
from sklearn import cross_validation
from sklearn.decomposition import PCA, KernelPCA
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression


from src.utils import data_path

df = pd.read_csv(data_path('train.csv'))
df_test = pd.read_csv(data_path('test.csv'))

target = df['TARGET']
del df['TARGET']
# del df['ID']
id = df_test['ID']
# del df_test['ID']

pca = PCA(n_components=250)
train_pcaed = pca.fit_transform(df, target)

random_forest = RandomForestClassifier(n_estimators=30, max_depth=5, max_features=20)
random_forest.fit(train_pcaed, target)
forested = random_forest.predict_proba(train_pcaed)
# pipe = Pipeline(steps=[('pca', pca), ('random_forest', random_forest)])
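
A hedged continuation, not in the original: apply the already-fitted PCA and forest to the test set. It assumes df_test carries the same columns the PCA was fitted on (the ID columns were left in place above).

# Hypothetical continuation; the column layout of df_test is an assumption.
test_pcaed = pca.transform(df_test)
test_prediction = random_forest.predict_proba(test_pcaed)[:, 1]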
Example 21
from src.utils import data_path, setup
import pandas as pd
import numpy as np

setup(pd)


train_path = data_path('train.csv')
train_frame = pd.read_csv(data_path('train.csv'))

columns = list(train_frame.columns)

print(columns)
# print(train_frame['Address'].describe())

# no nulls in data

print(train_frame.head())  # .ix was removed from pandas; head() shows the first rows
print(set(train_frame['DayOfWeek']))
#
# print(train_frame[['X', 'Y']].describe())
#
# from sklearn.preprocessing import Normalizer, maxabs_scale, minmax_scale
#
# normalizer = Normalizer()
#
# X = train_frame[['X']]
#
# normalized = normalizer.fit_transform(train_frame[['X']])
#
# print(np.unique(normalized))