from matplotlib.cbook import mkdirs
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
import pandas as pd

from src.OneHotTransformer import OneHotTransformer, categorical
from src.submission import create_submission
from src.transformers import normalize_features, transform_normalized_time
from src.utils import data_path, setup
from src.validation import cross_validation

setup(pd)


# For now use only Dates, DayOfWeek, PdDistrict, X and Y as features.
def transform_set(name, train=True):
    """Load a data set, drop train-only columns and one-hot encode it."""
    train_path = data_path(name)
    train_frame = pd.read_csv(train_path)
    if train:
        # These columns exist only in the training set and are not
        # available at prediction time, so drop them.
        del train_frame['Descript']
        del train_frame['Resolution']
        del train_frame['Address']
    # Scale the coordinates and normalize the timestamps.
    train_frame['X'] = normalize_features(train_frame['X'])
    train_frame['Y'] = normalize_features(train_frame['Y'])
    train_frame['Dates'] = train_frame['Dates'].apply(transform_normalized_time)
    # One-hot encode the categorical columns.
    transformer = OneHotTransformer(categorical(train_frame), train_frame.columns)
    transformer.fit(train_frame)
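
# A minimal sketch, not part of the original file: assuming an already
# encoded feature matrix and label vector (hypothetical inputs here, e.g.
# whatever transform_set ultimately produces), this is one way the imported
# OneVsRestClassifier and LinearSVC could be trained and persisted with
# joblib. The function name and the default model path are illustrative only.
def train_linear_svc(features, labels, model_path='linear_svc.pkl'):
    clf = OneVsRestClassifier(LinearSVC())
    clf.fit(features, labels)
    joblib.dump(clf, model_path)
    return clf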
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import pandas as pd

from src.utils import data_path, setup

setup()

train_path = data_path('train.csv')
train_frame = pd.read_csv(train_path)

# Strip parentheses and commas out of the crime descriptions.
train_frame['Descript'] = train_frame['Descript'].apply(
    lambda des: re.sub(r'[(),]', '', des))

# Bag of words -> tf-idf -> multinomial naive Bayes.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])
text_clf = text_clf.fit(train_frame['Descript'], train_frame['Category'])

print(train_frame.iloc[0]['Descript'], train_frame.iloc[0]['Category'])
prediction = text_clf.predict_proba(train_frame['Descript'])
print(prediction[0])
print(text_clf.classes_)

# TODO: validate
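
# A minimal sketch of the validation the TODO above calls for, not part of
# the original script. It assumes scikit-learn >= 0.18 (for
# sklearn.model_selection) and scores with multi-class log loss, a metric
# commonly used for this kind of category-prediction task.
from sklearn.model_selection import cross_val_score

scores = cross_val_score(text_clf, train_frame['Descript'],
                         train_frame['Category'],
                         scoring='neg_log_loss', cv=3)
print('mean log loss over 3 folds: %.4f' % -scores.mean())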