# Example #1
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
from os.path import basename

"""
import os
os.chdir('C:/Users/ngaude/Documents/GitHub/kaggle/cdiscount/')
"""

########################
# Normalisation
########################

"""
normalize_file(ddir + 'test.csv',header(test=True))
normalize_file(ddir + 'validation.csv',header())
normalize_file(ddir + 'training_shuffled.csv',header())
"""

def score(df, vec, cla, target):
    """Return the classifier's accuracy on the rows of *df*.

    df:     dataframe whose *target* column holds the true labels.
    vec:    fitted vectorizer; its transform() consumes iterText(df).
    cla:    fitted classifier exposing score(X, Y).
    target: name of the label column in *df*.
    """
    features = vec.transform(iterText(df))
    labels = list(df[target])
    return cla.score(features, labels)

def vectorizer(df):
    # 1M max_features should fit in memory, 
    # OvA will be at max 184 classes, 
    # so we can fit coef_ =  1M*184*8B ~ 1GB in memory easily
# Example #2
from utils import ddir,normalize_file

# Normalise the raw CSV inputs in place before training/prediction.
# NOTE(review): only `ddir` and `normalize_file` are imported above;
# `header` is presumably also defined in utils — confirm and import it,
# otherwise these lines raise NameError.
normalize_file(ddir + 'test.csv',header(test=True))
normalize_file(ddir + 'training_shuffled.csv',header())
# Example #3
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
from os.path import basename
"""
import os
os.chdir('C:/Users/ngaude/Documents/GitHub/kaggle/cdiscount/')
"""

########################
# Normalisation
########################
"""
normalize_file(ddir + 'test.csv',header(test=True))
normalize_file(ddir + 'validation.csv',header())
normalize_file(ddir + 'training_shuffled.csv',header())
"""


def score(df, vec, cla, target):
    """Compute mean accuracy of *cla* on *df*.

    Transforms the dataframe's text via the fitted vectorizer *vec*,
    takes the true labels from column *target*, and delegates to the
    classifier's own score() method.
    """
    design_matrix = vec.transform(iterText(df))
    true_labels = [y for y in df[target]]
    accuracy = cla.score(design_matrix, true_labels)
    return accuracy


def vectorizer(df):
    # 1M max_features should fit in memory,