from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model import SGDClassifier from sklearn.linear_model import LogisticRegression from sklearn.externals import joblib from os.path import basename """ import os os.chdir('C:/Users/ngaude/Documents/GitHub/kaggle/cdiscount/') """ ######################## # Normalisation ######################## """ normalize_file(ddir + 'test.csv',header(test=True)) normalize_file(ddir + 'validation.csv',header()) normalize_file(ddir + 'training_shuffled.csv',header()) """ def score(df,vec,cla,target): X = vec.transform(iterText(df)) Y = list(df[target]) sc = cla.score(X,Y) return sc def vectorizer(df): # 1M max_features should fit in memory, # OvA will be at max 184 classes, # so we can fit coef_ = 1M*184*8B ~ 1GB in memory easily
from utils import ddir, normalize_file

# Normalise the raw CSV inputs in place (test set is headed differently,
# hence header(test=True)).
# NOTE(review): `header` is called but not imported in this span —
# presumably it also lives in utils; confirm before running standalone.
normalize_file(ddir + 'test.csv', header(test=True))
normalize_file(ddir + 'training_shuffled.csv', header())
import numpy as np import pandas as pd from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model import SGDClassifier from sklearn.linear_model import LogisticRegression from sklearn.externals import joblib from os.path import basename """ import os os.chdir('C:/Users/ngaude/Documents/GitHub/kaggle/cdiscount/') """ ######################## # Normalisation ######################## """ normalize_file(ddir + 'test.csv',header(test=True)) normalize_file(ddir + 'validation.csv',header()) normalize_file(ddir + 'training_shuffled.csv',header()) """ def score(df, vec, cla, target): X = vec.transform(iterText(df)) Y = list(df[target]) sc = cla.score(X, Y) return sc def vectorizer(df): # 1M max_features should fit in memory,